The following program illustrates the large nonlinear least squares solvers on a system with significant sparse structure in the Jacobian. The cost function is given by
\Phi(x) &= 1/2 \sum_{i=1}^{p+1} f_i^2
f_i &= \sqrt{\alpha} (x_i - 1), 1 \le i \le p
f_{p+1} &= ||x||^2 - 1/4
with \alpha = 10^{-5}. The residual f_{p+1} imposes a constraint on the p parameters x, to ensure that ||x||^2 \approx {1 \over 4}. The (p+1)-by-p Jacobian for this system is given by
J(x) = [ \sqrt{alpha} I_p; 2 x^T ]
and the normal equations matrix is given by
J^T J = [ \alpha I_p + 4 x x^T ]
Finally, the second directional derivative of f for the geodesic acceleration method is given by
fvv = [ 0; 2 ||v||^2 ]
Since the upper p-by-p block of J is diagonal, this sparse structure should be exploited in the nonlinear solver. For comparison, the following program solves the system for p = 2000 using the dense direct Cholesky solver based on the normal equations matrix J^T J, as well as the iterative Steihaug-Toint solver, based on sparse matrix-vector products J u and J^T u. The program output is shown below.
Method NITER NFEV NJUEV NJTJEV NAEV Init Cost Final cost cond(J) Final |x|^2 Time (s) levenberg-marquardt 25 31 26 26 0 7.1218e+18 1.9555e-02 447.50 2.5044e-01 46.28 levenberg-marquardt+accel 22 23 45 23 22 7.1218e+18 1.9555e-02 447.64 2.5044e-01 33.92 dogleg 37 87 36 36 0 7.1218e+18 1.9555e-02 447.59 2.5044e-01 56.05 double-dogleg 35 88 34 34 0 7.1218e+18 1.9555e-02 447.62 2.5044e-01 52.65 2D-subspace 37 88 36 36 0 7.1218e+18 1.9555e-02 447.71 2.5044e-01 59.75 steihaug-toint 35 88 345 0 0 7.1218e+18 1.9555e-02 inf 2.5044e-01 0.09
The first five rows use methods based on factoring the dense J^T J matrix while the last row uses the iterative Steihaug-Toint method. While the number of Jacobian matrix-vector products (NJUEV) is less for the dense methods, the added time to construct and factor the J^T J matrix (NJTJEV) results in a much larger runtime than the iterative method (see last column).
The program is given below.
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_blas.h>
#include <gsl/gsl_multilarge_nlinear.h>
#include <gsl/gsl_spblas.h>
#include <gsl/gsl_spmatrix.h>
/* parameters for functions */
struct model_params
{
  double alpha;
  gsl_spmatrix *J;
};
/* penalty function */
int
penalty_f (const gsl_vector * x, void *params, gsl_vector * f)
{
  struct model_params *par = (struct model_params *) params;
  const double sqrt_alpha = sqrt(par->alpha);
  const size_t p = x->size;
  size_t i;
  double sum = 0.0;
  for (i = 0; i < p; ++i)
    {
      double xi = gsl_vector_get(x, i);
      gsl_vector_set(f, i, sqrt_alpha*(xi - 1.0));
      sum += xi * xi;
    }
  gsl_vector_set(f, p, sum - 0.25);
  return GSL_SUCCESS;
}
int
penalty_df (CBLAS_TRANSPOSE_t TransJ, const gsl_vector * x,
            const gsl_vector * u, void * params, gsl_vector * v,
            gsl_matrix * JTJ)
{
  struct model_params *par = (struct model_params *) params;
  const size_t p = x->size;
  size_t j;
  /* store 2*x in last row of J */
  for (j = 0; j < p; ++j)
    {
      double xj = gsl_vector_get(x, j);
      gsl_spmatrix_set(par->J, p, j, 2.0 * xj);
    }
  /* compute v = op(J) u */
  if (v)
    gsl_spblas_dgemv(TransJ, 1.0, par->J, u, 0.0, v);
  if (JTJ)
    {
      gsl_vector_view diag = gsl_matrix_diagonal(JTJ);
      /* compute J^T J = [ alpha*I_p + 4 x x^T ] */
      gsl_matrix_set_zero(JTJ);
      /* store 4 x x^T in lower half of JTJ */
      gsl_blas_dsyr(CblasLower, 4.0, x, JTJ);
      /* add alpha to diag(JTJ) */
      gsl_vector_add_constant(&diag.vector, par->alpha);
    }
  return GSL_SUCCESS;
}
int
penalty_fvv (const gsl_vector * x, const gsl_vector * v,
             void *params, gsl_vector * fvv)
{
  const size_t p = x->size;
  double normv = gsl_blas_dnrm2(v);
  gsl_vector_set_zero(fvv);
  gsl_vector_set(fvv, p, 2.0 * normv * normv);
  (void)params; /* avoid unused parameter warning */
  return GSL_SUCCESS;
}
void
solve_system(const gsl_vector *x0, gsl_multilarge_nlinear_fdf *fdf,
             gsl_multilarge_nlinear_parameters *params)
{
  const gsl_multilarge_nlinear_type *T = gsl_multilarge_nlinear_trust;
  const size_t max_iter = 200;
  const double xtol = 1.0e-8;
  const double gtol = 1.0e-8;
  const double ftol = 1.0e-8;
  const size_t n = fdf->n;
  const size_t p = fdf->p;
  gsl_multilarge_nlinear_workspace *work =
    gsl_multilarge_nlinear_alloc(T, params, n, p);
  gsl_vector * f = gsl_multilarge_nlinear_residual(work);
  gsl_vector * x = gsl_multilarge_nlinear_position(work);
  int info;
  double chisq0, chisq, rcond, xsq;
  struct timeval tv0, tv1;
  gettimeofday(&tv0, NULL);
  /* initialize solver */
  gsl_multilarge_nlinear_init(x0, fdf, work);
  /* store initial cost */
  gsl_blas_ddot(f, f, &chisq0);
  /* iterate until convergence */
  gsl_multilarge_nlinear_driver(max_iter, xtol, gtol, ftol,
                                NULL, NULL, &info, work);
  gettimeofday(&tv1, NULL);
  /* store final cost */
  gsl_blas_ddot(f, f, &chisq);
  /* compute final ||x||^2 */
  gsl_blas_ddot(x, x, &xsq);
  /* store cond(J(x)) */
  gsl_multilarge_nlinear_rcond(&rcond, work);
  /* print summary */
  fprintf(stderr, "%-25s %-5zu %-4zu %-5zu %-6zu %-4zu %-10.4e %-10.4e %-7.2f %-11.4e %.2f\n",
          gsl_multilarge_nlinear_trs_name(work),
          gsl_multilarge_nlinear_niter(work),
          fdf->nevalf,
          fdf->nevaldfu,
          fdf->nevaldf2,
          fdf->nevalfvv,
          chisq0,
          chisq,
          1.0 / rcond,
          xsq,
          (tv1.tv_sec - tv0.tv_sec) + 1.0e-6 * (tv1.tv_usec - tv0.tv_usec));
  gsl_multilarge_nlinear_free(work);
}
int
main (void)
{
  const size_t p = 2000;
  const size_t n = p + 1;
  gsl_vector *f = gsl_vector_alloc(n);
  gsl_vector *x = gsl_vector_alloc(p);
  /* allocate sparse Jacobian matrix with 2*p non-zero elements in triplet format */
  gsl_spmatrix *J = gsl_spmatrix_alloc_nzmax(n, p, 2 * p, GSL_SPMATRIX_TRIPLET);
  gsl_multilarge_nlinear_fdf fdf;
  gsl_multilarge_nlinear_parameters fdf_params =
    gsl_multilarge_nlinear_default_parameters();
  struct model_params params;
  size_t i;
  params.alpha = 1.0e-5;
  params.J = J;
  /* define function to be minimized */
  fdf.f = penalty_f;
  fdf.df = penalty_df;
  fdf.fvv = penalty_fvv;
  fdf.n = n;
  fdf.p = p;
  fdf.params = ¶ms;
  for (i = 0; i < p; ++i)
    {
      /* starting point */
      gsl_vector_set(x, i, i + 1.0);
      /* store sqrt(alpha)*I_p in upper p-by-p block of J */
      gsl_spmatrix_set(J, i, i, sqrt(params.alpha));
    }
  fprintf(stderr, "%-25s %-4s %-4s %-5s %-6s %-4s %-10s %-10s %-7s %-11s %-10s\n",
          "Method", "NITER", "NFEV", "NJUEV", "NJTJEV", "NAEV", "Init Cost",
          "Final cost", "cond(J)", "Final |x|^2", "Time (s)");
  
  fdf_params.scale = gsl_multilarge_nlinear_scale_levenberg;
  fdf_params.trs = gsl_multilarge_nlinear_trs_lm;
  solve_system(x, &fdf, &fdf_params);
  fdf_params.trs = gsl_multilarge_nlinear_trs_lmaccel;
  solve_system(x, &fdf, &fdf_params);
  fdf_params.trs = gsl_multilarge_nlinear_trs_dogleg;
  solve_system(x, &fdf, &fdf_params);
  fdf_params.trs = gsl_multilarge_nlinear_trs_ddogleg;
  solve_system(x, &fdf, &fdf_params);
  fdf_params.trs = gsl_multilarge_nlinear_trs_subspace2D;
  solve_system(x, &fdf, &fdf_params);
  fdf_params.trs = gsl_multilarge_nlinear_trs_cgst;
  solve_system(x, &fdf, &fdf_params);
  gsl_vector_free(f);
  gsl_vector_free(x);
  gsl_spmatrix_free(J);
  return 0;
}