Update Ceres to version 2.2.0

Brings many performance improvements and bug fixes.

Keyframe selection in bundle-adjustment.blend goes down from
4.5 seconds to 3.0 seconds on an M2 Ultra. The reconstruction itself
stays within 0.2 seconds.

The full change log can be found at http://ceres-solver.org/version_history.html

Pull Request: https://projects.blender.org/blender/blender/pulls/136896
Authored and committed by Sergey Sharybin on 2025-04-03 16:20:38 +02:00
parent 0eccadd452
commit 59991e54f5
371 changed files with 14807 additions and 7059 deletions

View File

@@ -17,11 +17,11 @@ set(INC_SYS
set(SRC
include/ceres/autodiff_cost_function.h
include/ceres/autodiff_first_order_function.h
include/ceres/autodiff_local_parameterization.h
include/ceres/autodiff_manifold.h
include/ceres/c_api.h
include/ceres/ceres.h
include/ceres/conditioned_cost_function.h
include/ceres/constants.h
include/ceres/context.h
include/ceres/cost_function.h
include/ceres/cost_function_to_functor.h
@@ -41,7 +41,6 @@ set(SRC
include/ceres/jet.h
include/ceres/jet_fwd.h
include/ceres/line_manifold.h
include/ceres/local_parameterization.h
include/ceres/loss_function.h
include/ceres/manifold.h
include/ceres/manifold_test_utils.h
@@ -66,6 +65,7 @@ set(SRC
include/ceres/internal/autodiff.h
include/ceres/internal/disable_warnings.h
include/ceres/internal/eigen.h
include/ceres/internal/euler_angles.h
include/ceres/internal/fixed_array.h
include/ceres/internal/householder_vector.h
include/ceres/internal/integer_sequence_algorithm.h
@@ -107,7 +107,6 @@ set(SRC
internal/ceres/canonical_views_clustering.cc
internal/ceres/canonical_views_clustering.h
internal/ceres/casts.h
internal/ceres/cgnr_linear_operator.h
internal/ceres/cgnr_solver.cc
internal/ceres/cgnr_solver.h
internal/ceres/compressed_col_sparse_matrix_utils.cc
@@ -118,7 +117,6 @@ set(SRC
internal/ceres/compressed_row_sparse_matrix.h
internal/ceres/concurrent_queue.h
internal/ceres/conditioned_cost_function.cc
internal/ceres/conjugate_gradients_solver.cc
internal/ceres/conjugate_gradients_solver.h
internal/ceres/context.cc
internal/ceres/context_impl.cc
@@ -131,9 +129,23 @@ set(SRC
internal/ceres/covariance.cc
internal/ceres/covariance_impl.cc
internal/ceres/covariance_impl.h
internal/ceres/cuda_block_sparse_crs_view.cc
internal/ceres/cuda_block_sparse_crs_view.h
internal/ceres/cuda_block_structure.cc
internal/ceres/cuda_block_structure.h
internal/ceres/cuda_buffer.h
internal/ceres/cxsparse.cc
internal/ceres/cxsparse.h
# internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
# internal/ceres/cuda_kernels_bsm_to_crs.h
internal/ceres/cuda_kernels_utils.h
# internal/ceres/cuda_kernels_vector_ops.cu.cc
internal/ceres/cuda_kernels_vector_ops.h
internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
internal/ceres/cuda_partitioned_block_sparse_crs_view.h
internal/ceres/cuda_sparse_matrix.cc
internal/ceres/cuda_sparse_matrix.h
internal/ceres/cuda_streamed_buffer.h
internal/ceres/cuda_vector.cc
internal/ceres/cuda_vector.h
internal/ceres/dense_cholesky.cc
internal/ceres/dense_cholesky.h
internal/ceres/dense_jacobian_writer.h
@@ -156,21 +168,25 @@ set(SRC
internal/ceres/dynamic_compressed_row_sparse_matrix.h
internal/ceres/dynamic_sparse_normal_cholesky_solver.cc
internal/ceres/dynamic_sparse_normal_cholesky_solver.h
internal/ceres/eigen_vector_ops.h
internal/ceres/eigensparse.cc
internal/ceres/eigensparse.h
internal/ceres/evaluation_callback.cc
internal/ceres/evaluator.cc
internal/ceres/evaluator.h
internal/ceres/execution_summary.h
internal/ceres/fake_bundle_adjustment_jacobian.cc
internal/ceres/fake_bundle_adjustment_jacobian.h
internal/ceres/file.cc
internal/ceres/file.h
internal/ceres/first_order_function.cc
internal/ceres/float_cxsparse.cc
internal/ceres/float_cxsparse.h
internal/ceres/float_suitesparse.cc
internal/ceres/float_suitesparse.h
internal/ceres/function_sample.cc
internal/ceres/function_sample.h
internal/ceres/generate_bundle_adjustment_tests.py
internal/ceres/generate_template_specializations.py
internal/ceres/generated
internal/ceres/gradient_checker.cc
internal/ceres/gradient_checking_cost_function.cc
internal/ceres/gradient_checking_cost_function.h
@@ -207,31 +223,34 @@ set(SRC
internal/ceres/linear_operator.h
internal/ceres/linear_solver.cc
internal/ceres/linear_solver.h
internal/ceres/local_parameterization.cc
internal/ceres/loss_function.cc
internal/ceres/low_rank_inverse_hessian.cc
internal/ceres/low_rank_inverse_hessian.h
internal/ceres/manifold.cc
internal/ceres/manifold_adapter.h
internal/ceres/map_util.h
internal/ceres/minimizer.cc
internal/ceres/minimizer.h
internal/ceres/normal_prior.cc
internal/ceres/pair_hash.h
internal/ceres/parallel_for.h
internal/ceres/parallel_for_cxx.cc
internal/ceres/parallel_for_nothreads.cc
internal/ceres/parallel_for_openmp.cc
internal/ceres/parallel_invoke.cc
internal/ceres/parallel_invoke.h
internal/ceres/parallel_utils.cc
internal/ceres/parallel_utils.h
internal/ceres/parallel_vector_ops.cc
internal/ceres/parallel_vector_ops.h
internal/ceres/parameter_block.h
internal/ceres/parameter_block_ordering.cc
internal/ceres/parameter_block_ordering.h
internal/ceres/partition_range_for_parallel_for.h
internal/ceres/partitioned_matrix_view.cc
internal/ceres/partitioned_matrix_view.h
internal/ceres/partitioned_matrix_view_impl.h
internal/ceres/partitioned_matrix_view_template.py
internal/ceres/polynomial.cc
internal/ceres/polynomial.h
internal/ceres/power_series_expansion_preconditioner.cc
internal/ceres/power_series_expansion_preconditioner.h
internal/ceres/preconditioner.cc
internal/ceres/preconditioner.h
internal/ceres/preprocessor.cc
@@ -242,7 +261,6 @@ set(SRC
internal/ceres/program.cc
internal/ceres/program.h
internal/ceres/program_evaluator.h
internal/ceres/random.h
internal/ceres/reorder_program.cc
internal/ceres/reorder_program.h
internal/ceres/residual_block.cc
@@ -254,6 +272,7 @@ set(SRC
internal/ceres/schur_eliminator.cc
internal/ceres/schur_eliminator.h
internal/ceres/schur_eliminator_impl.h
internal/ceres/schur_eliminator_template.py
internal/ceres/schur_jacobi_preconditioner.cc
internal/ceres/schur_jacobi_preconditioner.h
internal/ceres/schur_templates.cc

View File

@@ -1,5 +1,5 @@
Ceres Solver - A fast non-linear least squares minimizer
Copyright 2015 Google Inc. All rights reserved.
Copyright 2023 Google Inc. All rights reserved.
http://ceres-solver.org/
Redistribution and use in source and binary forms, with or without

View File

@@ -1,6 +1,6 @@
Project: Ceres Solver
URL: http://ceres-solver.org/
License: SPDX:BSD-3-Clause
Upstream version 2.1.0
Copyright: Copyright 2015 Google Inc. All rights reserved.
Upstream version 2.2.0
Copyright: Copyright 2023 Google Inc. All rights reserved.
Local modifications: None

View File

@@ -50,9 +50,6 @@
// If defined, Ceres was compiled without SuiteSparse.
#define CERES_NO_SUITESPARSE
// If defined, Ceres was compiled without CXSparse.
#define CERES_NO_CXSPARSE
// If defined, Ceres was compiled without CUDA.
#define CERES_NO_CUDA
@@ -61,7 +58,6 @@
#if defined(CERES_NO_SUITESPARSE) && \
defined(CERES_NO_ACCELERATE_SPARSE) && \
defined(CERES_NO_CXSPARSE) && \
!defined(CERES_USE_EIGEN_SPARSE) // NOLINT
// If defined Ceres was compiled without any sparse linear algebra support.
#define CERES_NO_SPARSE
@@ -74,12 +70,11 @@
// routines.
// #define CERES_NO_CUSTOM_BLAS
// If defined, Ceres was compiled without multithreading support.
// #define CERES_NO_THREADS
// If defined Ceres was compiled with OpenMP multithreading.
// #define CERES_USE_OPENMP
// If defined Ceres was compiled with modern C++ multithreading.
#define CERES_USE_CXX_THREADS
// If defined, Ceres was compiled with a version of SuiteSparse/CHOLMOD without
// the Partition module (requires METIS).
#define CERES_NO_CHOLMOD_PARTITION
// If defined Ceres was compiled without support for METIS via Eigen.
#define CERES_NO_EIGEN_METIS
// If defined, Ceres was compiled with a version MSVC >= 2005 which
// deprecated the standard POSIX names for bessel functions, replacing them
@@ -88,22 +83,6 @@
#define CERES_MSVC_USE_UNDERSCORE_PREFIXED_BESSEL_FUNCTIONS
#endif
#if defined(CERES_USE_OPENMP)
#if defined(CERES_USE_CXX_THREADS) || defined(CERES_NO_THREADS)
#error CERES_USE_OPENMP is mutually exclusive to CERES_USE_CXX_THREADS and CERES_NO_THREADS
#endif
#elif defined(CERES_USE_CXX_THREADS)
#if defined(CERES_USE_OPENMP) || defined(CERES_NO_THREADS)
#error CERES_USE_CXX_THREADS is mutually exclusive to CERES_USE_OPENMP, CERES_USE_CXX_THREADS and CERES_NO_THREADS
#endif
#elif defined(CERES_NO_THREADS)
#if defined(CERES_USE_OPENMP) || defined(CERES_USE_CXX_THREADS)
#error CERES_NO_THREADS is mutually exclusive to CERES_USE_OPENMP and CERES_USE_CXX_THREADS
#endif
#else
# error One of CERES_USE_OPENMP, CERES_USE_CXX_THREADS or CERES_NO_THREADS must be defined.
#endif
// CERES_NO_SPARSE should be automatically defined by config.h if Ceres was
// compiled without any sparse back-end. Verify that it has not subsequently
// been inconsistently redefined.
@@ -111,9 +90,6 @@
#if !defined(CERES_NO_SUITESPARSE)
#error CERES_NO_SPARSE requires CERES_NO_SUITESPARSE.
#endif
#if !defined(CERES_NO_CXSPARSE)
#error CERES_NO_SPARSE requires CERES_NO_CXSPARSE
#endif
#if !defined(CERES_NO_ACCELERATE_SPARSE)
#error CERES_NO_SPARSE requires CERES_NO_ACCELERATE_SPARSE
#endif

View File

@@ -33,6 +33,7 @@
# define CERES_DEPRECATED_NO_EXPORT CERES_NO_EXPORT CERES_DEPRECATED
#endif
/* NOLINTNEXTLINE(readability-avoid-unconditional-preprocessor-if) */
#if 0 /* DEFINE_NO_DEPRECATED */
# ifndef CERES_NO_DEPRECATED
# define CERES_NO_DEPRECATED

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,158 +0,0 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sergey.vfx@gmail.com (Sergey Sharybin)
// mierle@gmail.com (Keir Mierle)
// sameeragarwal@google.com (Sameer Agarwal)
#ifndef CERES_PUBLIC_AUTODIFF_LOCAL_PARAMETERIZATION_H_
#define CERES_PUBLIC_AUTODIFF_LOCAL_PARAMETERIZATION_H_
#include <memory>
#include "ceres/internal/autodiff.h"
#include "ceres/local_parameterization.h"
namespace ceres {
// WARNING: LocalParameterizations are deprecated, so is
// AutoDiffLocalParameterization. They will be removed from Ceres Solver in
// version 2.2.0. Please use Manifolds and AutoDiffManifold instead.
// Create local parameterization with Jacobians computed via automatic
// differentiation. For more information on local parameterizations,
// see include/ceres/local_parameterization.h
//
// To get an auto differentiated local parameterization, you must define
// a class with a templated operator() (a functor) that computes
//
// x_plus_delta = Plus(x, delta);
//
// the template parameter T. The autodiff framework substitutes appropriate
// "Jet" objects for T in order to compute the derivative when necessary, but
// this is hidden, and you should write the function as if T were a scalar type
// (e.g. a double-precision floating point number).
//
// The function must write the computed value in the last argument (the only
// non-const one) and return true to indicate success.
//
// For example, Quaternions have a three dimensional local
// parameterization. It's plus operation can be implemented as (taken
// from internal/ceres/auto_diff_local_parameterization_test.cc)
//
// struct QuaternionPlus {
// template<typename T>
// bool operator()(const T* x, const T* delta, T* x_plus_delta) const {
// const T squared_norm_delta =
// delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2];
//
// T q_delta[4];
// if (squared_norm_delta > T(0.0)) {
// T norm_delta = sqrt(squared_norm_delta);
// const T sin_delta_by_delta = sin(norm_delta) / norm_delta;
// q_delta[0] = cos(norm_delta);
// q_delta[1] = sin_delta_by_delta * delta[0];
// q_delta[2] = sin_delta_by_delta * delta[1];
// q_delta[3] = sin_delta_by_delta * delta[2];
// } else {
// // We do not just use q_delta = [1,0,0,0] here because that is a
// // constant and when used for automatic differentiation will
// // lead to a zero derivative. Instead we take a first order
// // approximation and evaluate it at zero.
// q_delta[0] = T(1.0);
// q_delta[1] = delta[0];
// q_delta[2] = delta[1];
// q_delta[3] = delta[2];
// }
//
// QuaternionProduct(q_delta, x, x_plus_delta);
// return true;
// }
// };
//
// Then given this struct, the auto differentiated local
// parameterization can now be constructed as
//
// LocalParameterization* local_parameterization =
// new AutoDiffLocalParameterization<QuaternionPlus, 4, 3>;
//                                                   |  |
//                        Global Size ---------------+  |
//                        Local Size -------------------+
//
// WARNING: Since the functor will get instantiated with different types for
// T, you must to convert from other numeric types to T before mixing
// computations with other variables of type T. In the example above, this is
// seen where instead of using k_ directly, k_ is wrapped with T(k_).
template <typename Functor, int kGlobalSize, int kLocalSize>
class CERES_DEPRECATED_WITH_MSG("Use AutoDiffManifold instead.")
AutoDiffLocalParameterization : public LocalParameterization {
public:
AutoDiffLocalParameterization() : functor_(new Functor()) {}
// Takes ownership of functor.
explicit AutoDiffLocalParameterization(Functor* functor)
: functor_(functor) {}
bool Plus(const double* x,
const double* delta,
double* x_plus_delta) const override {
return (*functor_)(x, delta, x_plus_delta);
}
bool ComputeJacobian(const double* x, double* jacobian) const override {
double zero_delta[kLocalSize];
for (int i = 0; i < kLocalSize; ++i) {
zero_delta[i] = 0.0;
}
double x_plus_delta[kGlobalSize];
for (int i = 0; i < kGlobalSize; ++i) {
x_plus_delta[i] = 0.0;
}
const double* parameter_ptrs[2] = {x, zero_delta};
double* jacobian_ptrs[2] = {nullptr, jacobian};
return internal::AutoDifferentiate<
kGlobalSize,
internal::StaticParameterDims<kGlobalSize, kLocalSize>>(
*functor_, parameter_ptrs, kGlobalSize, x_plus_delta, jacobian_ptrs);
}
int GlobalSize() const override { return kGlobalSize; }
int LocalSize() const override { return kLocalSize; }
const Functor& functor() const { return *functor_; }
private:
std::unique_ptr<Functor> functor_;
};
} // namespace ceres
#endif // CERES_PUBLIC_AUTODIFF_LOCAL_PARAMETERIZATION_H_
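
Code still using this deleted header ports to the Manifold API. A minimal
sketch of the equivalent, assuming the quaternion conventions of
ceres/rotation.h (the functor name is illustrative, and Ceres 2.2 also ships
a ready-made ceres::QuaternionManifold for this exact case):

#include "ceres/autodiff_manifold.h"
#include "ceres/rotation.h"

// A Manifold functor supplies Plus and its inverse, Minus; autodiff
// produces the Jacobians of both.
struct QuaternionFunctor {
  template <typename T>
  bool Plus(const T* x, const T* delta, T* x_plus_delta) const {
    T q_delta[4];
    ceres::AngleAxisToQuaternion(delta, q_delta);  // exp map of the tangent
    ceres::QuaternionProduct(q_delta, x, x_plus_delta);
    return true;
  }

  template <typename T>
  bool Minus(const T* y, const T* x, T* y_minus_x) const {
    // For a unit quaternion the conjugate is the inverse.
    const T x_inverse[4] = {x[0], -x[1], -x[2], -x[3]};
    T q_delta[4];
    ceres::QuaternionProduct(y, x_inverse, q_delta);
    ceres::QuaternionToAngleAxis(q_delta, y_minus_x);  // log map
    return true;
  }
};

// Ambient size 4, tangent size 3, the same sizes as the deprecated
// AutoDiffLocalParameterization<QuaternionPlus, 4, 3> above.
ceres::Manifold* manifold =
    new ceres::AutoDiffManifold<QuaternionFunctor, 4, 3>;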

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
/* Ceres Solver - A fast non-linear least squares minimizer
* Copyright 2019 Google Inc. All rights reserved.
* Copyright 2023 Google Inc. All rights reserved.
* http://ceres-solver.org/
*
* Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -34,11 +34,12 @@
#ifndef CERES_PUBLIC_CERES_H_
#define CERES_PUBLIC_CERES_H_
// IWYU pragma: begin_exports
#include "ceres/autodiff_cost_function.h"
#include "ceres/autodiff_first_order_function.h"
#include "ceres/autodiff_local_parameterization.h"
#include "ceres/autodiff_manifold.h"
#include "ceres/conditioned_cost_function.h"
#include "ceres/constants.h"
#include "ceres/context.h"
#include "ceres/cost_function.h"
#include "ceres/cost_function_to_functor.h"
@@ -56,7 +57,6 @@
#include "ceres/iteration_callback.h"
#include "ceres/jet.h"
#include "ceres/line_manifold.h"
#include "ceres/local_parameterization.h"
#include "ceres/loss_function.h"
#include "ceres/manifold.h"
#include "ceres/numeric_diff_cost_function.h"
@@ -70,5 +70,6 @@
#include "ceres/sphere_manifold.h"
#include "ceres/types.h"
#include "ceres/version.h"
// IWYU pragma: end_exports
#endif // CERES_PUBLIC_CERES_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -26,24 +26,17 @@
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)
// Author: hellston20a@gmail.com (H S Helson Go)
#include "ceres/float_cxsparse.h"
#ifndef CERES_PUBLIC_CONSTANTS_H_
#define CERES_PUBLIC_CONSTANTS_H_
#include <memory>
// TODO(HSHelson): This header should no longer be necessary once C++20's
// <numbers> (e.g. std::numbers::pi_v) becomes usable
namespace ceres::constants {
template <typename T>
inline constexpr T pi_v(3.141592653589793238462643383279502884);
inline constexpr double pi = pi_v<double>;
} // namespace ceres::constants
#if !defined(CERES_NO_CXSPARSE)
namespace ceres {
namespace internal {
std::unique_ptr<SparseCholesky> FloatCXSparseCholesky::Create(
OrderingType ordering_type) {
LOG(FATAL) << "FloatCXSparseCholesky is not available.";
return {};
}
} // namespace internal
} // namespace ceres
#endif // !defined(CERES_NO_CXSPARSE)
#endif // CERES_PUBLIC_CONSTANTS_H_
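
A small usage sketch of the new constants header (variable names are
illustrative):

#include "ceres/constants.h"

// pi is a plain double; pi_v<T> provides it at other floating-point
// precisions.
constexpr double circumference = 2.0 * ceres::constants::pi * 1.5;
constexpr float pi_f = ceres::constants::pi_v<float>;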

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -120,7 +120,7 @@ class CostFunctionToFunctor {
if (parameter_block_sizes.size() == num_parameter_blocks) {
for (int block = 0; block < num_parameter_blocks; ++block) {
CHECK_EQ(ParameterDims::GetDim(block), parameter_block_sizes[block])
<< "Parameter block size missmatch. The specified static parameter "
<< "Parameter block size mismatch. The specified static parameter "
"block dimension does not match the one from the cost function.";
}
}

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -146,7 +146,7 @@ class CovarianceImpl;
// a. The rank deficiency arises from overparameterization. e.g., a
// four dimensional quaternion used to parameterize SO(3), which is
// a three dimensional manifold. In cases like this, the user should
// use an appropriate LocalParameterization/Manifold. Not only will this lead
// use an appropriate Manifold. Not only will this lead
// to better numerical behaviour of the Solver, it will also expose
// the rank deficiency to the Covariance object so that it can
// handle it correctly.
@@ -246,6 +246,20 @@ class CERES_EXPORT Covariance {
// used.
CovarianceAlgorithmType algorithm_type = SPARSE_QR;
// During QR factorization, if a column with Euclidean norm less
// than column_pivot_threshold is encountered it is treated as
// zero.
//
// If column_pivot_threshold < 0, then an automatic default value
// of 20*(m+n)*eps*sqrt(max(diag(J'*J))) is used. Here m and n are
// the number of rows and columns of the Jacobian (J)
// respectively.
//
// This is an advanced option meant for users who know enough
// about their Jacobian matrices that they can determine a value
// better than the default.
double column_pivot_threshold = -1;
// If the Jacobian matrix is near singular, then inverting J'J
// will result in unreliable results, e.g, if
//
@@ -266,7 +280,7 @@ class CERES_EXPORT Covariance {
//
// min_sigma / max_sigma < sqrt(min_reciprocal_condition_number)
//
// where min_sigma and max_sigma are the minimum and maxiumum
// where min_sigma and max_sigma are the minimum and maximum
// singular values of J respectively.
//
// 2. SPARSE_QR
@@ -394,11 +408,9 @@ class CERES_EXPORT Covariance {
const double* parameter_block2,
double* covariance_block) const;
// Return the block of the cross-covariance matrix corresponding to
// parameter_block1 and parameter_block2.
// Returns cross-covariance in the tangent space if a local
// parameterization is associated with either parameter block;
// else returns cross-covariance in the ambient space.
// Returns the block of the cross-covariance in the tangent space if a
// manifold is associated with either parameter block; else returns
// cross-covariance in the ambient space.
//
// Compute must be called before the first call to
// GetCovarianceBlock and the pair <parameter_block1,
@@ -430,9 +442,8 @@ class CERES_EXPORT Covariance {
double* covariance_matrix) const;
// Return the covariance matrix corresponding to parameter_blocks
// in the tangent space if a local parameterization is associated
// with one of the parameter blocks else returns the covariance
// matrix in the ambient space.
// in the tangent space if a manifold is associated with one of the parameter
// blocks else returns the covariance matrix in the ambient space.
//
// Compute must be called before calling GetCovarianceMatrix and all
// parameter_blocks must have been present in the vector
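
A sketch of how the new column_pivot_threshold option above might be set
(the threshold value here is arbitrary, purely for illustration):

#include "ceres/covariance.h"

void ConfigureCovariance() {
  ceres::Covariance::Options options;
  options.algorithm_type = ceres::SPARSE_QR;
  // Negative (the default) keeps the automatic 20*(m+n)*eps-based value;
  // override only with problem-specific knowledge of the Jacobian.
  options.column_pivot_threshold = 1e-12;
  ceres::Covariance covariance(options);
}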

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -368,7 +368,7 @@ class BiCubicInterpolator {
//
// f001, f002, f011, f012, ...
//
// A commonly occuring example are color images (RGB) where the three
// A commonly occurring example are color images (RGB) where the three
// channels are stored interleaved.
//
// If kInterleaved = false, then it is stored as

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -264,11 +264,23 @@ class DynamicAutoDiffCostFunction final : public DynamicCostFunction {
return true;
}
const CostFunctor& functor() const { return *functor_; }
private:
std::unique_ptr<CostFunctor> functor_;
Ownership ownership_;
};
// Deduction guide that allows the user to avoid explicitly specifying the
// template parameter of DynamicAutoDiffCostFunction. The class can instead be
// instantiated as follows:
//
// new DynamicAutoDiffCostFunction{new MyCostFunctor{}};
//
template <typename CostFunctor>
DynamicAutoDiffCostFunction(CostFunctor* functor, Ownership ownership)
-> DynamicAutoDiffCostFunction<CostFunctor>;
} // namespace ceres
#endif // CERES_PUBLIC_DYNAMIC_AUTODIFF_COST_FUNCTION_H_
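
A usage sketch of the new deduction guide (MyCostFunctor is the hypothetical
functor named in the comment above):

#include "ceres/dynamic_autodiff_cost_function.h"

struct MyCostFunctor {
  template <typename T>
  bool operator()(T const* const* parameters, T* residuals) const {
    residuals[0] = T(10.0) - parameters[0][0];
    return true;
  }
};

ceres::DynamicCostFunction* MakeCostFunction() {
  // The functor type is deduced; no explicit template argument needed.
  auto* cost_function =
      new ceres::DynamicAutoDiffCostFunction{new MyCostFunctor{}};
  cost_function->AddParameterBlock(1);
  cost_function->SetNumResiduals(1);
  return cost_function;
}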

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -76,7 +76,7 @@ namespace ceres {
// cost_function.AddParameterBlock(5);
// cost_function.AddParameterBlock(10);
// cost_function.SetNumResiduals(21);
template <typename CostFunctor, NumericDiffMethodType method = CENTRAL>
template <typename CostFunctor, NumericDiffMethodType kMethod = CENTRAL>
class DynamicNumericDiffCostFunction final : public DynamicCostFunction {
public:
explicit DynamicNumericDiffCostFunction(
@@ -134,7 +134,7 @@ class DynamicNumericDiffCostFunction final : public DynamicCostFunction {
for (size_t block = 0; block < block_sizes.size(); ++block) {
if (jacobians[block] != nullptr &&
!NumericDiff<CostFunctor,
method,
kMethod,
ceres::DYNAMIC,
internal::DynamicParameterDims,
ceres::DYNAMIC,

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -66,8 +66,12 @@ class CERES_EXPORT EvaluationCallback {
// Called before Ceres requests residuals or jacobians for a given setting of
// the parameters. User parameters (the double* values provided to the cost
// functions) are fixed until the next call to PrepareForEvaluation(). If
// new_evaluation_point == true, then this is a new point that is different
// functions) are fixed until the next call to PrepareForEvaluation().
//
// If evaluate_jacobians == true, then the user provided CostFunctions will be
// asked to evaluate one or more of their Jacobians.
//
// If new_evaluation_point == true, then this is a new point that is different
// from the last evaluated point. Otherwise, it is the same point that was
// evaluated previously (either jacobian or residual) and the user can use
// cached results from previous evaluations.
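
A minimal sketch of a callback built on the clarified semantics (the
shared-state update is left abstract):

#include "ceres/evaluation_callback.h"

class MyEvaluationCallback final : public ceres::EvaluationCallback {
 public:
  void PrepareForEvaluation(bool evaluate_jacobians,
                            bool new_evaluation_point) override {
    if (new_evaluation_point) {
      // Parameters changed: refresh any expensive state shared by the
      // cost functions; otherwise previously cached values stay valid.
      RecomputeSharedState();
    }
    (void)evaluate_jacobians;  // could gate derivative-only setup
  }

 private:
  void RecomputeSharedState() { /* hypothetical */ }
};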

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -25,7 +25,7 @@
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
// Copyright 2007 Google Inc. All Rights Reserved.
// Copyright 2023 Google Inc. All Rights Reserved.
//
// Authors: wjr@google.com (William Rucklidge),
// keir@google.com (Keir Mierle),
@@ -44,7 +44,6 @@
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "ceres/internal/fixed_array.h"
#include "ceres/local_parameterization.h"
#include "ceres/manifold.h"
#include "glog/logging.h"
@@ -59,37 +58,15 @@ namespace ceres {
// ------------------------------------ < relative_precision
// max(J_actual(i, j), J_numeric(i, j))
//
// where J_actual(i, j) is the jacobian as computed by the supplied cost
// function (by the user) multiplied by the local parameterization Jacobian
// and J_numeric is the jacobian as computed by finite differences, multiplied
// by the local parameterization Jacobian as well.
// where J_actual(i, j) is the Jacobian as computed by the supplied cost
// function (by the user) multiplied by the manifold Jacobian and J_numeric is
// the Jacobian as computed by finite differences, multiplied by the manifold
// Jacobian as well.
//
// How to use: Fill in an array of pointers to parameter blocks for your
// CostFunction, and then call Probe(). Check that the return value is 'true'.
class CERES_EXPORT GradientChecker {
public:
// This constructor will not take ownership of the cost function or local
// parameterizations.
//
// function: The cost function to probe.
//
// local_parameterizations: A vector of local parameterizations, one for each
// parameter block. May be nullptr or contain nullptrs to indicate that the
// respective parameter does not have a local parameterization.
//
// options: Options to use for numerical differentiation.
//
// NOTE: This constructor is deprecated and will be removed in the next public
// release of Ceres Solver. Please transition to using the Manifold based
// version.
CERES_DEPRECATED_WITH_MSG(
"Local Parameterizations are deprecated. Use the constructor that uses "
"Manifolds instead.")
GradientChecker(
const CostFunction* function,
const std::vector<const LocalParameterization*>* local_parameterizations,
const NumericDiffOptions& options);
// This will not take ownership of the cost function or manifolds.
//
// function: The cost function to probe.
@@ -102,7 +79,6 @@ class CERES_EXPORT GradientChecker {
GradientChecker(const CostFunction* function,
const std::vector<const Manifold*>* manifolds,
const NumericDiffOptions& options);
~GradientChecker();
// Contains results from a call to Probe for later inspection.
struct CERES_EXPORT ProbeResults {
@@ -166,17 +142,6 @@ class CERES_EXPORT GradientChecker {
GradientChecker(const GradientChecker&) = delete;
void operator=(const GradientChecker&) = delete;
// This bool is used to determine whether the constructor with the
// LocalParameterizations is called or the one with Manifolds is called. If
// the former, then the vector of manifolds is a vector of ManifoldAdapter
// objects which we own and should be deleted. If the latter then they are
// real Manifold objects owned by the caller and will not be deleted.
//
// This bool is only needed during the LocalParameterization to Manifold
// transition, once this transition is complete the LocalParameterization
// based constructor and this bool will be removed.
const bool delete_manifolds_ = false;
std::vector<const Manifold*> manifolds_;
const CostFunction* function_;
std::unique_ptr<CostFunction> finite_diff_cost_function_;
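
The probing workflow described above, as a hedged sketch: assume
cost_function is an existing CostFunction with a single 4-parameter block
constrained to the unit quaternions.

#include <vector>
#include "ceres/gradient_checker.h"
#include "glog/logging.h"

bool CheckGradients(const ceres::CostFunction& cost_function) {
  ceres::QuaternionManifold manifold;
  std::vector<const ceres::Manifold*> manifolds = {&manifold};
  ceres::NumericDiffOptions numeric_diff_options;
  ceres::GradientChecker checker(
      &cost_function, &manifolds, numeric_diff_options);

  double parameters[4] = {1.0, 0.0, 0.0, 0.0};  // identity quaternion
  double* parameter_blocks[1] = {parameters};
  ceres::GradientChecker::ProbeResults results;
  if (!checker.Probe(parameter_blocks, 1e-9, &results)) {
    LOG(ERROR) << "Gradient check failed:\n" << results.error_log;
    return false;
  }
  return true;
}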

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -36,7 +36,6 @@
#include "ceres/first_order_function.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
#include "ceres/local_parameterization.h"
#include "ceres/manifold.h"
namespace ceres {
@@ -90,47 +89,19 @@ class FirstOrderFunction;
// };
//
// ceres::GradientProblem problem(new Rosenbrock());
//
// NOTE: We are currently in the process of transitioning from
// LocalParameterization to Manifolds in the Ceres API. During this period,
// GradientProblem will support using both Manifold and LocalParameterization
// objects interchangably. For methods in the API affected by this change, see
// their documentation below.
class CERES_EXPORT GradientProblem {
public:
// Takes ownership of the function.
explicit GradientProblem(FirstOrderFunction* function);
// Takes ownership of the function and the parameterization.
//
// NOTE: This constructor is deprecated and will be removed in the next public
// release of Ceres Solver. Please move to using the Manifold based
// constructor.
CERES_DEPRECATED_WITH_MSG(
"LocalParameterizations are deprecated. Please use the constructor that "
"uses Manifold instead.")
GradientProblem(FirstOrderFunction* function,
LocalParameterization* parameterization);
// Takes ownership of the function and the manifold.
GradientProblem(FirstOrderFunction* function, Manifold* manifold);
int NumParameters() const;
// Dimension of the manifold (and its tangent space).
//
// During the transition from LocalParameterization to Manifold, this method
// reports the LocalSize of the LocalParameterization or the TangentSize of
// the Manifold object associated with this problem.
int NumTangentParameters() const;
// Dimension of the manifold (and its tangent space).
//
// NOTE: This method is deprecated and will be removed in the next public
// release of Ceres Solver. Please move to using NumTangentParameters()
// instead.
int NumLocalParameters() const { return NumTangentParameters(); }
// This call is not thread safe.
bool Evaluate(const double* parameters, double* cost, double* gradient) const;
bool Plus(const double* x, const double* delta, double* x_plus_delta) const;
@@ -138,42 +109,11 @@ class CERES_EXPORT GradientProblem {
const FirstOrderFunction* function() const { return function_.get(); }
FirstOrderFunction* mutable_function() { return function_.get(); }
// NOTE: During the transition from LocalParameterization to Manifold we need
// to support both The LocalParameterization and Manifold based constructors.
//
// When the user uses the LocalParameterization, internally the solver will
// wrap it in a ManifoldAdapter object and return it when manifold or
// mutable_manifold are called.
//
// As a result this method will return a non-nullptr result if a Manifold or a
// LocalParameterization was used when constructing the GradientProblem.
const Manifold* manifold() const { return manifold_.get(); }
Manifold* mutable_manifold() { return manifold_.get(); }
// If the problem is constructed without a LocalParameterization or with a
// Manifold this method will return a nullptr.
//
// NOTE: This method is deprecated and will be removed in the next public
// release of Ceres Solver.
CERES_DEPRECATED_WITH_MSG("Use Manifolds instead.")
const LocalParameterization* parameterization() const {
return parameterization_.get();
}
// If the problem is constructed without a LocalParameterization or with a
// Manifold this method will return a nullptr.
//
// NOTE: This method is deprecated and will be removed in the next public
// release of Ceres Solver.
CERES_DEPRECATED_WITH_MSG("Use Manifolds instead.")
LocalParameterization* mutable_parameterization() {
return parameterization_.get();
}
private:
std::unique_ptr<FirstOrderFunction> function_;
CERES_DEPRECATED_WITH_MSG("")
std::unique_ptr<LocalParameterization> parameterization_;
std::unique_ptr<Manifold> manifold_;
std::unique_ptr<double[]> scratch_;
};
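
With the LocalParameterization overloads gone, construction uses Manifolds
only. A sketch (Rosenbrock is the FirstOrderFunction from the example
comment above; the Euclidean manifold choice is purely illustrative):

#include "ceres/gradient_problem.h"

// Unconstrained problem: the manifold argument can simply be omitted.
ceres::GradientProblem unconstrained(new Rosenbrock());

// Problem whose 2 parameters live on a (here trivial) manifold.
ceres::GradientProblem constrained(
    new Rosenbrock(), new ceres::EuclideanManifold<2>());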

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -305,10 +305,6 @@ class CERES_EXPORT GradientProblemSolver {
// Number of parameters in the problem.
int num_parameters = -1;
// Dimension of the tangent space of the problem.
CERES_DEPRECATED_WITH_MSG("Use num_tangent_parameters.")
int num_local_parameters = -1;
// Dimension of the tangent space of the problem.
int num_tangent_parameters = -1;

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2020 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -38,8 +38,7 @@
#include "ceres/internal/fixed_array.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// StaticFixedArray selects the best array implementation based on template
// arguments. If the size is not known at compile-time, pass
@@ -91,7 +90,6 @@ struct ArraySelector<T, num_elements, max_num_elements_on_stack, false, false>
}
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_PUBLIC_INTERNAL_ARRAY_SELECTOR_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -164,8 +164,7 @@
#define CERES_AUTODIFF_MAX_RESIDUALS_ON_STACK 20
#endif
namespace ceres {
namespace internal {
namespace ceres::internal {
// Extends src by a 1st order perturbation for every dimension and puts it in
// dst. The size of src is N. Since this is also used for perturbations in
@@ -359,7 +358,6 @@ inline bool AutoDifferentiate(const Functor& functor,
return true;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_PUBLIC_INTERNAL_AUTODIFF_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -0,0 +1,199 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
#ifndef CERES_PUBLIC_INTERNAL_EULER_ANGLES_H_
#define CERES_PUBLIC_INTERNAL_EULER_ANGLES_H_
#include <type_traits>
namespace ceres {
namespace internal {
// The EulerSystem struct represents an Euler Angle Convention in compile time.
// It acts like a trait structure and is also used as a tag for dispatching
// Euler angle conversion function templates
//
// Internally, it implements the convention laid out in "Euler angle
// conversion", Ken Shoemake, Graphics Gems IV, where a choice of axis for the
// first rotation (out of 3) and 3 binary choices compactly specify all 24
// rotation conventions
//
// - InnerAxis: Axis for the first rotation. This is specified by struct tags
// axis::X, axis::Y, and axis::Z
//
// - Parity: Defines the parity of the axis permutation. The axis sequence has
// Even parity if the second axis of rotation is 'greater-than' the first axis
// of rotation according to the order X<Y<Z<X, otherwise it has Odd parity.
// This is specified by struct tags Even and Odd
//
// - AngleConvention: Defines whether Proper Euler Angles (originally defined
// by Euler, which has the last axis repeated, i.e. ZYZ, ZXZ, etc), or
// Tait-Bryan Angles (introduced by the nautical and aerospace fields, i.e.
// using ZYX for roll-pitch-yaw) are used. This is specified by struct Tags
// ProperEuler and TaitBryan.
//
// - FrameConvention: Defines whether the three rotations are in a global
// frame of reference (extrinsic) or in a body centred frame of reference
// (intrinsic). This is specified by struct tags Extrinsic and Intrinsic
namespace axis {
struct X : std::integral_constant<int, 0> {};
struct Y : std::integral_constant<int, 1> {};
struct Z : std::integral_constant<int, 2> {};
} // namespace axis
struct Even;
struct Odd;
struct ProperEuler;
struct TaitBryan;
struct Extrinsic;
struct Intrinsic;
template <typename InnerAxisType,
typename ParityType,
typename AngleConventionType,
typename FrameConventionType>
struct EulerSystem {
static constexpr bool kIsParityOdd = std::is_same_v<ParityType, Odd>;
static constexpr bool kIsProperEuler =
std::is_same_v<AngleConventionType, ProperEuler>;
static constexpr bool kIsIntrinsic =
std::is_same_v<FrameConventionType, Intrinsic>;
static constexpr int kAxes[3] = {
InnerAxisType::value,
(InnerAxisType::value + 1 + static_cast<int>(kIsParityOdd)) % 3,
(InnerAxisType::value + 2 - static_cast<int>(kIsParityOdd)) % 3};
};
} // namespace internal
// Define human readable aliases to the type of the tags
using ExtrinsicXYZ = internal::EulerSystem<internal::axis::X,
internal::Even,
internal::TaitBryan,
internal::Extrinsic>;
using ExtrinsicXYX = internal::EulerSystem<internal::axis::X,
internal::Even,
internal::ProperEuler,
internal::Extrinsic>;
using ExtrinsicXZY = internal::EulerSystem<internal::axis::X,
internal::Odd,
internal::TaitBryan,
internal::Extrinsic>;
using ExtrinsicXZX = internal::EulerSystem<internal::axis::X,
internal::Odd,
internal::ProperEuler,
internal::Extrinsic>;
using ExtrinsicYZX = internal::EulerSystem<internal::axis::Y,
internal::Even,
internal::TaitBryan,
internal::Extrinsic>;
using ExtrinsicYZY = internal::EulerSystem<internal::axis::Y,
internal::Even,
internal::ProperEuler,
internal::Extrinsic>;
using ExtrinsicYXZ = internal::EulerSystem<internal::axis::Y,
internal::Odd,
internal::TaitBryan,
internal::Extrinsic>;
using ExtrinsicYXY = internal::EulerSystem<internal::axis::Y,
internal::Odd,
internal::ProperEuler,
internal::Extrinsic>;
using ExtrinsicZXY = internal::EulerSystem<internal::axis::Z,
internal::Even,
internal::TaitBryan,
internal::Extrinsic>;
using ExtrinsicZXZ = internal::EulerSystem<internal::axis::Z,
internal::Even,
internal::ProperEuler,
internal::Extrinsic>;
using ExtrinsicZYX = internal::EulerSystem<internal::axis::Z,
internal::Odd,
internal::TaitBryan,
internal::Extrinsic>;
using ExtrinsicZYZ = internal::EulerSystem<internal::axis::Z,
internal::Odd,
internal::ProperEuler,
internal::Extrinsic>;
/* Rotating axes */
using IntrinsicZYX = internal::EulerSystem<internal::axis::X,
internal::Even,
internal::TaitBryan,
internal::Intrinsic>;
using IntrinsicXYX = internal::EulerSystem<internal::axis::X,
internal::Even,
internal::ProperEuler,
internal::Intrinsic>;
using IntrinsicYZX = internal::EulerSystem<internal::axis::X,
internal::Odd,
internal::TaitBryan,
internal::Intrinsic>;
using IntrinsicXZX = internal::EulerSystem<internal::axis::X,
internal::Odd,
internal::ProperEuler,
internal::Intrinsic>;
using IntrinsicXZY = internal::EulerSystem<internal::axis::Y,
internal::Even,
internal::TaitBryan,
internal::Intrinsic>;
using IntrinsicYZY = internal::EulerSystem<internal::axis::Y,
internal::Even,
internal::ProperEuler,
internal::Intrinsic>;
using IntrinsicZXY = internal::EulerSystem<internal::axis::Y,
internal::Odd,
internal::TaitBryan,
internal::Intrinsic>;
using IntrinsicYXY = internal::EulerSystem<internal::axis::Y,
internal::Odd,
internal::ProperEuler,
internal::Intrinsic>;
using IntrinsicYXZ = internal::EulerSystem<internal::axis::Z,
internal::Even,
internal::TaitBryan,
internal::Intrinsic>;
using IntrinsicZXZ = internal::EulerSystem<internal::axis::Z,
internal::Even,
internal::ProperEuler,
internal::Intrinsic>;
using IntrinsicXYZ = internal::EulerSystem<internal::axis::Z,
internal::Odd,
internal::TaitBryan,
internal::Intrinsic>;
using IntrinsicZYZ = internal::EulerSystem<internal::axis::Z,
internal::Odd,
internal::ProperEuler,
internal::Intrinsic>;
} // namespace ceres
#endif // CERES_PUBLIC_INTERNAL_EULER_ANGLES_H_
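
Because everything above is constexpr, the conventions can be sanity-checked
at compile time; e.g., the roll-pitch-yaw convention resolves to the Z, Y, X
axis sequence (a sketch exercising only the public aliases):

#include "ceres/internal/euler_angles.h"

static_assert(ceres::ExtrinsicZYX::kAxes[0] == 2 &&  // Z first
              ceres::ExtrinsicZYX::kAxes[1] == 1 &&  // then Y
              ceres::ExtrinsicZYX::kAxes[2] == 0);   // then X
static_assert(!ceres::ExtrinsicZYX::kIsIntrinsic);
// Proper Euler conventions repeat the first axis last, e.g. ZXZ.
static_assert(ceres::ExtrinsicZXZ::kIsProperEuler);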

View File

@@ -41,8 +41,7 @@
#include "ceres/internal/memory.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
constexpr static auto kFixedArrayUseDefault = static_cast<size_t>(-1);
@@ -372,8 +371,8 @@ class FixedArray {
return std::addressof(ptr->array);
}
static_assert(sizeof(StorageElement) == sizeof(value_type), "");
static_assert(alignof(StorageElement) == alignof(value_type), "");
static_assert(sizeof(StorageElement) == sizeof(value_type));
static_assert(alignof(StorageElement) == alignof(value_type));
class NonEmptyInlinedStorage {
public:
@@ -461,7 +460,6 @@ template <typename T, size_t N, typename A>
constexpr typename FixedArray<T, N, A>::size_type
FixedArray<T, N, A>::inline_elements;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_PUBLIC_INTERNAL_FIXED_ARRAY_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://code.google.com/p/ceres-solver/
//
// Redistribution and use in source and binary forms, with or without
@@ -34,8 +34,7 @@
#include "Eigen/Core"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Algorithm 5.1.1 from 'Matrix Computations' by Golub et al. (Johns Hopkins
// Studies in Mathematical Sciences) but using the nth element of the input
@@ -90,7 +89,6 @@ typename Derived::PlainObject ApplyHouseholderVector(
return (y - v * (beta * (v.transpose() * y)));
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_PUBLIC_INTERNAL_HOUSEHOLDER_VECTOR_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,70 +40,7 @@
#include "ceres/jet_fwd.h"
namespace ceres {
namespace internal {
// Implementation of calculating the sum of an integer sequence.
// Recursively instantiate SumImpl and calculate the sum of the N first
// numbers. This reduces the number of instantiations and speeds up
// compilation.
//
// Examples:
// 1) integer_sequence<int, 5>:
// Value = 5
//
// 2) integer_sequence<int, 4, 2>:
// Value = 4 + 2 + SumImpl<integer_sequence<int>>::Value
// Value = 4 + 2 + 0
//
// 3) integer_sequence<int, 2, 1, 4>:
// Value = 2 + 1 + SumImpl<integer_sequence<int, 4>>::Value
// Value = 2 + 1 + 4
template <typename Seq>
struct SumImpl;
// Strip of and sum the first number.
template <typename T, T N, T... Ns>
struct SumImpl<std::integer_sequence<T, N, Ns...>> {
static constexpr T Value =
N + SumImpl<std::integer_sequence<T, Ns...>>::Value;
};
// Strip of and sum the first two numbers.
template <typename T, T N1, T N2, T... Ns>
struct SumImpl<std::integer_sequence<T, N1, N2, Ns...>> {
static constexpr T Value =
N1 + N2 + SumImpl<std::integer_sequence<T, Ns...>>::Value;
};
// Strip of and sum the first four numbers.
template <typename T, T N1, T N2, T N3, T N4, T... Ns>
struct SumImpl<std::integer_sequence<T, N1, N2, N3, N4, Ns...>> {
static constexpr T Value =
N1 + N2 + N3 + N4 + SumImpl<std::integer_sequence<T, Ns...>>::Value;
};
// Only one number is left. 'Value' is just that number ('recursion' ends).
template <typename T, T N>
struct SumImpl<std::integer_sequence<T, N>> {
static constexpr T Value = N;
};
// No number is left. 'Value' is the identity element (for sum this is zero).
template <typename T>
struct SumImpl<std::integer_sequence<T>> {
static constexpr T Value = T(0);
};
// Calculate the sum of an integer sequence. The resulting sum will be stored in
// 'Value'.
template <typename Seq>
class Sum {
using T = typename Seq::value_type;
public:
static constexpr T Value = SumImpl<Seq>::Value;
};
namespace ceres::internal {
// Implementation of calculating an exclusive scan (exclusive prefix sum) of an
// integer sequence. Exclusive means that the i-th input element is not included
@@ -232,40 +169,11 @@ struct RemoveValue
template <typename Sequence, typename Sequence::value_type ValueToRemove>
using RemoveValue_t = typename RemoveValue<Sequence, ValueToRemove>::type;
// Determines whether the values of an integer sequence are all the same.
// Returns true if all elements of Values are equal to HeadValue.
//
// The integer sequence must contain at least one value. The predicate is
// undefined for empty sequences. The evaluation result of the predicate for a
// sequence containing only one value is defined to be true.
template <typename... Sequence>
struct AreAllEqual;
// The predicate result for a sequence containing one element is defined to be
// true.
template <typename T, T Value>
struct AreAllEqual<std::integer_sequence<T, Value>> : std::true_type {};
// Recursion end.
template <typename T, T Value1, T Value2>
struct AreAllEqual<std::integer_sequence<T, Value1, Value2>>
: std::integral_constant<bool, Value1 == Value2> {};
// Recursion for sequences containing at least two elements.
template <typename T, T Value1, T Value2, T... Values>
// clang-format off
struct AreAllEqual<std::integer_sequence<T, Value1, Value2, Values...> >
: std::integral_constant
<
bool,
AreAllEqual<std::integer_sequence<T, Value1, Value2> >::value &&
AreAllEqual<std::integer_sequence<T, Value2, Values...> >::value
>
// clang-format on
{};
// Convenience variable template for AreAllEqual.
template <class Sequence>
constexpr bool AreAllEqual_v = AreAllEqual<Sequence>::value;
// Returns true if Values is empty.
template <typename T, T HeadValue, T... Values>
inline constexpr bool AreAllEqual_v = ((HeadValue == Values) && ...);
// Predicate determining whether an integer sequence is either empty or all
// values are equal.
@@ -279,13 +187,13 @@ struct IsEmptyOrAreAllEqual<std::integer_sequence<T>> : std::true_type {};
// General case for sequences containing at least one value.
template <typename T, T HeadValue, T... Values>
struct IsEmptyOrAreAllEqual<std::integer_sequence<T, HeadValue, Values...>>
: AreAllEqual<std::integer_sequence<T, HeadValue, Values...>> {};
: std::integral_constant<bool, AreAllEqual_v<T, HeadValue, Values...>> {};
// Convenience variable template for IsEmptyOrAreAllEqual.
template <class Sequence>
constexpr bool IsEmptyOrAreAllEqual_v = IsEmptyOrAreAllEqual<Sequence>::value;
inline constexpr bool IsEmptyOrAreAllEqual_v =
IsEmptyOrAreAllEqual<Sequence>::value;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_PUBLIC_INTERNAL_INTEGER_SEQUENCE_ALGORITHM_H_
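
The removed recursive SumImpl/AreAllEqual machinery collapses into fold
expressions with identical behaviour; a compile-time sketch:

#include "ceres/internal/integer_sequence_algorithm.h"

// All values equal to the head value -> true.
static_assert(ceres::internal::AreAllEqual_v<int, 3, 3, 3>);
// Any mismatch -> false.
static_assert(!ceres::internal::AreAllEqual_v<int, 3, 3, 4>);
// A lone head value: the fold over && is empty, hence true.
static_assert(ceres::internal::AreAllEqual_v<int, 7>);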

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -42,17 +42,6 @@
namespace ceres {
namespace internal {
// Predicate that determines whether T is a Jet.
template <typename T, typename E = void>
struct IsJet : std::false_type {};
template <typename T, int N>
struct IsJet<Jet<T, N>> : std::true_type {};
// Convenience variable template for IsJet.
template <typename T>
constexpr bool IsJet_v = IsJet<T>::value;
// Predicate that determines whether any of the Types is a Jet.
template <typename... Types>
struct AreAnyJet : std::false_type {};
@@ -65,7 +54,7 @@ struct AreAnyJet<Jet<T, N>, Types...> : std::true_type {};
// Convenience variable template for AreAnyJet.
template <typename... Types>
constexpr bool AreAnyJet_v = AreAnyJet<Types...>::value;
inline constexpr bool AreAnyJet_v = AreAnyJet<Types...>::value;
// Extracts the underlying floating-point from a type T.
template <typename T, typename E = void>
@@ -84,27 +73,8 @@ using UnderlyingScalar_t = typename UnderlyingScalar<T>::type;
//
// Specifically, the predicate applies std::is_same recursively to pairs of
// Types in the pack.
//
// The predicate is defined only for template packs containing at least two
// types.
template <typename T1, typename T2, typename... Types>
// clang-format off
struct AreAllSame : std::integral_constant
<
bool,
AreAllSame<T1, T2>::value &&
AreAllSame<T2, Types...>::value
>
// clang-format on
{};
// AreAllSame pairwise test.
template <typename T1, typename T2>
struct AreAllSame<T1, T2> : std::is_same<T1, T2> {};
// Convenience variable template for AreAllSame.
template <typename... Types>
constexpr bool AreAllSame_v = AreAllSame<Types...>::value;
template <typename T1, typename... Types>
inline constexpr bool AreAllSame_v = (std::is_same<T1, Types>::value && ...);
// Determines the rank of a type. This allows to ensure that types passed as
// arguments are compatible to each other. The rank of Jet is determined by the
@@ -124,7 +94,7 @@ struct Rank<Jet<T, N>> : std::integral_constant<int, N> {};
// Convenience variable template for Rank.
template <typename T>
constexpr int Rank_v = Rank<T>::value;
inline constexpr int Rank_v = Rank<T>::value;
// Constructs an integer sequence of ranks for each of the Types in the pack.
template <typename... Types>
@@ -186,7 +156,8 @@ struct CompatibleJetOperands<> : std::false_type {};
// This trait is a candidate for a concept definition once C++20 features can
// be used.
template <typename... Types>
constexpr bool CompatibleJetOperands_v = CompatibleJetOperands<Types...>::value;
inline constexpr bool CompatibleJetOperands_v =
CompatibleJetOperands<Types...>::value;
// Type trait ensuring at least one of the types is a Jet,
// the underlying scalar types are compatible among each other and Jet
@@ -216,7 +187,8 @@ struct PromotableJetOperands : std::integral_constant
// This trait is a candidate for a concept definition once C++20 features can
// be used.
template <typename... Types>
constexpr bool PromotableJetOperands_v = PromotableJetOperands<Types...>::value;
inline constexpr bool PromotableJetOperands_v =
PromotableJetOperands<Types...>::value;
} // namespace ceres
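The fold expression replaces the recursive pairwise AreAllSame specializations with a single pack expansion. A minimal standalone sketch of the equivalent pattern (the name here is hypothetical):

#include <type_traits>

// True iff every type in Types is the same as T1; an empty pack is
// trivially true.
template <typename T1, typename... Types>
inline constexpr bool AllSameAs_v = (std::is_same<T1, Types>::value && ...);

static_assert(AllSameAs_v<double, double, double>);
static_assert(!AllSameAs_v<double, float, double>);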

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2020 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -40,8 +40,7 @@
} while (false)
#endif // CERES_HAVE_EXCEPTIONS
namespace ceres {
namespace internal {
namespace ceres::internal {
template <typename Allocator, typename Iterator, typename... Args>
void ConstructRange(Allocator& alloc,
@@ -84,7 +83,6 @@ void CopyRange(Allocator& alloc,
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_PUBLIC_INTERNAL_MEMORY_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -47,8 +47,7 @@
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// This is split from the main class because C++ doesn't allow partial template
// specializations for member functions. The alternative is to repeat the main
@@ -502,7 +501,6 @@ struct EvaluateJacobianForParameterBlocks<ParameterDims,
}
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_PUBLIC_INTERNAL_NUMERIC_DIFF_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -36,22 +36,7 @@
#include "ceres/internal/integer_sequence_algorithm.h"
namespace ceres {
namespace internal {
// Checks, whether the given parameter block sizes are valid. Valid means every
// dimension is bigger than zero.
constexpr bool IsValidParameterDimensionSequence(std::integer_sequence<int>) {
return true;
}
template <int N, int... Ts>
constexpr bool IsValidParameterDimensionSequence(
std::integer_sequence<int, N, Ts...>) {
return (N <= 0) ? false
: IsValidParameterDimensionSequence(
std::integer_sequence<int, Ts...>());
}
namespace ceres::internal {
// Helper class that represents the parameter dimensions. The parameter
// dimensions are either dynamic or the sizes are known at compile time. It is
@@ -70,8 +55,7 @@ class ParameterDims {
// The parameter dimensions are only valid if all parameter block dimensions
// are greater than zero.
static constexpr bool kIsValid =
IsValidParameterDimensionSequence(Parameters());
static constexpr bool kIsValid = ((Ns > 0) && ...);
static_assert(kIsValid,
"Invalid parameter block dimension detected. Each parameter "
"block dimension must be bigger than zero.");
@@ -81,8 +65,7 @@ class ParameterDims {
static_assert(kIsDynamic || kNumParameterBlocks > 0,
"At least one parameter block must be specified.");
static constexpr int kNumParameters =
Sum<std::integer_sequence<int, Ns...>>::Value;
static constexpr int kNumParameters = (Ns + ... + 0);
static constexpr int GetDim(int dim) { return params_[dim]; }
@@ -118,7 +101,6 @@ template <int... Ns>
using StaticParameterDims = ParameterDims<false, Ns...>;
using DynamicParameterDims = ParameterDims<true>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_PUBLIC_INTERNAL_PARAMETER_DIMS_H_
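Both recursive helpers collapse into C++17 fold expressions. A minimal standalone sketch of the two patterns used above (the struct name is hypothetical):

template <int... Ns>
struct Dims {
  static constexpr bool kIsValid = ((Ns > 0) && ...);    // all positive
  static constexpr int kNumParameters = (Ns + ... + 0);  // sum, 0 if empty
};

static_assert(Dims<4, 3, 3>::kIsValid);
static_assert(Dims<4, 3, 3>::kNumParameters == 10);
static_assert(!Dims<4, 0>::kIsValid);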

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -47,14 +47,6 @@
#define CERES_GET_FLAG(X) X
#endif
// Indicates whether C++17 is currently active
#ifndef CERES_HAS_CPP17
#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
#define CERES_HAS_CPP17
#endif // __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >=
// 201703L)
#endif // !defined(CERES_HAS_CPP17)
// Indicates whether C++20 is currently active
#ifndef CERES_HAS_CPP20
#if __cplusplus >= 202002L || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
@@ -85,4 +77,15 @@
//
#define CERES_PREVENT_MACRO_SUBSTITUTION // Yes, it's empty
// CERES_DISABLE_DEPRECATED_WARNING and CERES_RESTORE_DEPRECATED_WARNING allow
// to temporarily disable deprecation warnings
#if defined(_MSC_VER)
#define CERES_DISABLE_DEPRECATED_WARNING \
_Pragma("warning(push)") _Pragma("warning(disable : 4996)")
#define CERES_RESTORE_DEPRECATED_WARNING _Pragma("warning(pop)")
#else // defined(_MSC_VER)
#define CERES_DISABLE_DEPRECATED_WARNING
#define CERES_RESTORE_DEPRECATED_WARNING
#endif // defined(_MSC_VER)
#endif // CERES_PUBLIC_INTERNAL_PORT_H_
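A minimal sketch of how the new macro pair is intended to wrap a deprecated call (the function here is hypothetical; on non-MSVC compilers both macros expand to nothing):

[[deprecated("use NewApi() instead")]] void OldApi();

inline void CallWithoutWarning() {
  CERES_DISABLE_DEPRECATED_WARNING
  OldApi();  // suppresses MSVC C4996 for this call
  CERES_RESTORE_DEPRECATED_WARNING
}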

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,6 +32,7 @@
#ifndef CERES_PUBLIC_INTERNAL_SPHERE_MANIFOLD_HELPERS_H_
#define CERES_PUBLIC_INTERNAL_SPHERE_MANIFOLD_HELPERS_H_
#include "ceres/constants.h"
#include "ceres/internal/householder_vector.h"
// This module contains functions to compute the SphereManifold plus and minus
@@ -58,26 +59,23 @@
// used in order to allow also Eigen::Ref and Eigen block expressions to
// be passed to the function.
namespace ceres {
namespace internal {
namespace ceres::internal {
template <typename VT, typename XT, typename DeltaT, typename XPlusDeltaT>
inline void ComputeSphereManifoldPlus(const VT& v,
double beta,
const XT& x,
const DeltaT& delta,
double norm_delta,
const double norm_delta,
XPlusDeltaT* x_plus_delta) {
constexpr int AmbientDim = VT::RowsAtCompileTime;
// Map the delta from the minimum representation to the over parameterized
// homogeneous vector. See B.2 p.25 equation (106) - (107) for more details.
const double norm_delta_div_2 = 0.5 * norm_delta;
const double sin_delta_by_delta =
std::sin(norm_delta_div_2) / norm_delta_div_2;
const double sin_delta_by_delta = std::sin(norm_delta) / norm_delta;
Eigen::Matrix<double, AmbientDim, 1> y(v.size());
y << 0.5 * sin_delta_by_delta * delta, std::cos(norm_delta_div_2);
y << sin_delta_by_delta * delta, std::cos(norm_delta);
// Apply the delta update to remain on the sphere.
*x_plus_delta = x.norm() * ApplyHouseholderVector(y, v, beta);
@@ -99,11 +97,11 @@ inline void ComputeSphereManifoldPlusJacobian(const VT& x,
// have trouble deducing the type of v automatically.
ComputeHouseholderVector<VT, double, AmbientSpaceDim>(x, &v, &beta);
// The Jacobian is equal to J = 0.5 * H.leftCols(size_ - 1) where H is the
// The Jacobian is equal to J = H.leftCols(size_ - 1) where H is the
// Householder matrix (H = I - beta * v * v').
for (int i = 0; i < tangent_size; ++i) {
(*jacobian).col(i) = -0.5 * beta * v(i) * v;
(*jacobian)(i, i) += 0.5;
(*jacobian).col(i) = -beta * v(i) * v;
(*jacobian)(i, i) += 1.0;
}
(*jacobian) *= x.norm();
}
@@ -116,18 +114,19 @@ inline void ComputeSphereManifoldMinus(
AmbientSpaceDim == Eigen::Dynamic ? Eigen::Dynamic : AmbientSpaceDim - 1;
using AmbientVector = Eigen::Matrix<double, AmbientSpaceDim, 1>;
const int tanget_size = v.size() - 1;
const int tangent_size = v.size() - 1;
const AmbientVector hy = ApplyHouseholderVector(y, v, beta) / x.norm();
// Calculate y - x. See B.2 p.25 equation (108).
double y_last = hy[tanget_size];
double hy_norm = hy.template head<TangentSpaceDim>(tanget_size).norm();
const double y_last = hy[tangent_size];
const double hy_norm = hy.template head<TangentSpaceDim>(tangent_size).norm();
if (hy_norm == 0.0) {
y_minus_x->setZero();
y_minus_x->data()[tangent_size - 1] = y_last >= 0 ? 0.0 : constants::pi;
} else {
*y_minus_x = 2.0 * std::atan2(hy_norm, y_last) / hy_norm *
hy.template head<TangentSpaceDim>(tanget_size);
*y_minus_x = std::atan2(hy_norm, y_last) / hy_norm *
hy.template head<TangentSpaceDim>(tangent_size);
}
}
@@ -147,16 +146,18 @@ inline void ComputeSphereManifoldMinusJacobian(const VT& x,
// have trouble deducing the type of v automatically.
ComputeHouseholderVector<VT, double, AmbientSpaceDim>(x, &v, &beta);
// The Jacobian is equal to J = 2.0 * H.leftCols(size_ - 1) where H is the
// The Jacobian is equal to J = H.leftCols(size_ - 1) where H is the
// Householder matrix (H = I - beta * v * v').
for (int i = 0; i < tangent_size; ++i) {
(*jacobian).row(i) = -2.0 * beta * v(i) * v;
(*jacobian)(i, i) += 2.0;
// NOTE: The transpose is used for correctness (the product is expected to
// be a row vector), although here there seems to be no difference between
// transposing or not for Eigen (possibly a compile-time auto fix).
(*jacobian).row(i) = -beta * v(i) * v.transpose();
(*jacobian)(i, i) += 1.0;
}
(*jacobian) /= x.norm();
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif
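The change drops the half-angle convention: the tangent vector now maps through sin(|delta|)/|delta| and cos(|delta|) directly, and the Plus/Minus Jacobians lose their 0.5/2.0 scaling accordingly. A minimal standalone sketch of the updated mapping for a 2-D tangent space (assuming Eigen; the zero-delta guard is added here for illustration):

#include <cmath>
#include <Eigen/Core>

Eigen::Vector3d SphereManifoldY(const Eigen::Vector2d& delta) {
  const double norm_delta = delta.norm();
  // Guard the removable singularity at |delta| == 0, where sinc(0) == 1.
  const double sinc_delta =
      norm_delta == 0.0 ? 1.0 : std::sin(norm_delta) / norm_delta;
  Eigen::Vector3d y;
  y << sinc_delta * delta, std::cos(norm_delta);
  return y;
}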

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,8 +40,7 @@
#include "ceres/cost_function.h"
#include "ceres/internal/parameter_dims.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// For fixed size cost functors
template <typename Functor, typename T, int... Indices>
@@ -50,7 +49,7 @@ inline bool VariadicEvaluateImpl(const Functor& functor,
T* output,
std::false_type /*is_dynamic*/,
std::integer_sequence<int, Indices...>) {
static_assert(sizeof...(Indices),
static_assert(sizeof...(Indices) > 0,
"Invalid number of parameter blocks. At least one parameter "
"block must be specified.");
return functor(input[Indices]..., output);
@@ -107,7 +106,29 @@ inline bool VariadicEvaluate(const Functor& functor,
return VariadicEvaluateImpl<ParameterDims>(functor, input, output, &functor);
}
} // namespace internal
} // namespace ceres
// When differentiating dynamically sized CostFunctions, VariadicEvaluate
// expects a functor with the signature:
//
// bool operator()(double const* const* parameters, double* cost) const
//
// However for NumericDiffFirstOrderFunction, the functor has the signature
//
// bool operator()(double const* parameters, double* cost) const
//
// This thin wrapper adapts the latter to the former.
template <typename Functor>
class FirstOrderFunctorAdapter {
public:
explicit FirstOrderFunctorAdapter(const Functor& functor)
: functor_(functor) {}
bool operator()(double const* const* parameters, double* cost) const {
return functor_(*parameters, cost);
}
private:
const Functor& functor_;
};
} // namespace ceres::internal
#endif // CERES_PUBLIC_INTERNAL_VARIADIC_EVALUATE_H_
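A minimal sketch of the adapter in use (the functor is hypothetical):

// A functor with the NumericDiffFirstOrderFunction signature:
// bool operator()(double const* parameters, double* cost) const.
struct QuadraticCost {
  bool operator()(double const* x, double* cost) const {
    *cost = x[0] * x[0] + x[1] * x[1];
    return true;
  }
};

inline void AdapterExample() {
  QuadraticCost functor;
  ceres::internal::FirstOrderFunctorAdapter<QuadraticCost> adapter(functor);
  const double x[2] = {1.0, 2.0};
  const double* blocks[1] = {x};
  double cost = 0.0;
  adapter(blocks, &cost);  // forwards blocks[0] as functor(x, &cost)
}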

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -724,7 +724,6 @@ inline Jet<T, N> hypot(const Jet<T, N>& x, const Jet<T, N>& y) {
return Jet<T, N>(tmp, x.a / tmp * x.v + y.a / tmp * y.v);
}
#ifdef CERES_HAS_CPP17
// Like sqrt(x^2 + y^2 + z^2),
// but acts to prevent underflow/overflow for small/large x/y/z.
// Note that the function is non-smooth at x=y=z=0,
@@ -744,7 +743,6 @@ inline Jet<T, N> hypot(const Jet<T, N>& x,
const T tmp = hypot(x.a, y.a, z.a);
return Jet<T, N>(tmp, x.a / tmp * x.v + y.a / tmp * y.v + z.a / tmp * z.v);
}
#endif // defined(CERES_HAS_CPP17)
// Like x * y + z but rounded only once.
template <typename T, int N>
@@ -757,28 +755,76 @@ inline Jet<T, N> fma(const Jet<T, N>& x,
return Jet<T, N>(fma(x.a, y.a, z.a), y.a * x.v + x.a * y.v + z.v);
}
// Returns the larger of the two arguments. NaNs are treated as missing data.
// Return value of fmax() and fmin() on equality
// ---------------------------------------------
//
// There is arguably no good answer to what fmax() & fmin() should return on
// equality, which for Jets by definition ONLY compares the scalar parts. We
// choose what we think is the least worst option (averaging as Jets) which
// minimises undesirable/unexpected behaviour in practice, and also supports client
// code written against Ceres versions prior to type promotion being supported
// in Jet comparisons (< v2.1).
//
// The std::max() convention of returning the first argument on equality is
// problematic, as it means that the derivative component may or may not be
// preserved (when comparing a Jet with a scalar) depending upon the ordering.
//
// Always returning the Jet in {Jet, scalar} cases on equality is problematic
// as it is inconsistent with the behaviour that would be obtained if the scalar
// was first cast to Jet and the {Jet, Jet} case was used. Prior to type
// promotion (Ceres v2.1) client code would typically cast constants to Jets
// e.g: fmax(x, T(2.0)) which means the {Jet, Jet} case predominates, and we
// still want the result to be order independent.
//
// Our intuition is that preserving a non-zero derivative is best, even if
// its value does not match either of the inputs. Averaging achieves this
// whilst ensuring argument ordering independence. This is also the approach
// used by the Jax library, and TensorFlow's reduce_max().
// Returns the larger of the two arguments, with Jet averaging on equality.
// NaNs are treated as missing data.
//
// NOTE: This function is NOT subject to any of the error conditions specified
// in `math_errhandling`.
// in `math_errhandling`.
template <typename Lhs,
typename Rhs,
std::enable_if_t<CompatibleJetOperands_v<Lhs, Rhs>>* = nullptr>
inline decltype(auto) fmax(const Lhs& f, const Rhs& g) {
inline decltype(auto) fmax(const Lhs& x, const Rhs& y) {
using J = std::common_type_t<Lhs, Rhs>;
return (isnan(g) || isgreater(f, g)) ? J{f} : J{g};
// As x == y may set FP exceptions in the presence of NaNs when used with
// non-default compiler options, we avoid its use here.
if (isnan(x) || isnan(y) || islessgreater(x, y)) {
return isnan(x) || isless(x, y) ? J{y} : J{x};
}
// x == y (scalar parts) return the average of their Jet representations.
#if defined(CERES_HAS_CPP20)
return midpoint(J{x}, J{y});
#else
return (J{x} + J{y}) * typename J::Scalar(0.5);
#endif // defined(CERES_HAS_CPP20)
}
// Returns the smaller of the two arguments. NaNs are treated as missing data.
// Returns the smaller of the two arguments, with Jet averaging on equality.
// NaNs are treated as missing data.
//
// NOTE: This function is NOT subject to any of the error conditions specified
// in `math_errhandling`.
// in `math_errhandling`.
template <typename Lhs,
typename Rhs,
std::enable_if_t<CompatibleJetOperands_v<Lhs, Rhs>>* = nullptr>
inline decltype(auto) fmin(const Lhs& f, const Rhs& g) {
inline decltype(auto) fmin(const Lhs& x, const Rhs& y) {
using J = std::common_type_t<Lhs, Rhs>;
return (isnan(f) || isless(g, f)) ? J{g} : J{f};
// As x == y may set FP exceptions in the presence of NaNs when used with
// non-default compiler options, we avoid its use here.
if (isnan(x) || isnan(y) || islessgreater(x, y)) {
return isnan(x) || isgreater(x, y) ? J{y} : J{x};
}
// x == y (scalar parts) return the average of their Jet representations.
#if defined(CERES_HAS_CPP20)
return midpoint(J{x}, J{y});
#else
return (J{x} + J{y}) * typename J::Scalar(0.5);
#endif // defined(CERES_HAS_CPP20)
}
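A minimal sketch of the equal-scalar-part averaging described above (values chosen for illustration):

#include "ceres/jet.h"

inline void FmaxAveragingExample() {
  using J = ceres::Jet<double, 1>;
  const J x(2.0, 0);  // value 2, unit derivative w.r.t. parameter 0
  const J y(2.0);     // value 2, zero derivative (a constant)
  // Scalar parts are equal, so the result averages the two Jets:
  const J m = fmax(x, y);  // m.a == 2.0, m.v[0] == 0.5
  (void)m;
}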
// Returns the positive difference (f - g) of two arguments and zero if f <= g.
@@ -804,7 +850,7 @@ template <typename T, int N>
inline Jet<T, N> erf(const Jet<T, N>& x) {
// We evaluate the constant as follows:
// 2 / sqrt(pi) = 1 / sqrt(atan(1.))
// On POSIX sytems it is defined as M_2_SQRTPI, but this is not
// On POSIX systems it is defined as M_2_SQRTPI, but this is not
// portable and the type may not be T. The above expression
// evaluates to full precision with IEEE arithmetic and, since it's
// constant, the compiler can generate exactly the same code. gcc
@@ -828,25 +874,19 @@ inline Jet<T, N> erfc(const Jet<T, N>& x) {
// function errors in client code (the specific warning is suppressed when
// Ceres itself is built).
inline double BesselJ0(double x) {
#if defined(CERES_MSVC_USE_UNDERSCORE_PREFIXED_BESSEL_FUNCTIONS)
return _j0(x);
#else
CERES_DISABLE_DEPRECATED_WARNING
return j0(x);
#endif
CERES_RESTORE_DEPRECATED_WARNING
}
inline double BesselJ1(double x) {
#if defined(CERES_MSVC_USE_UNDERSCORE_PREFIXED_BESSEL_FUNCTIONS)
return _j1(x);
#else
CERES_DISABLE_DEPRECATED_WARNING
return j1(x);
#endif
CERES_RESTORE_DEPRECATED_WARNING
}
inline double BesselJn(int n, double x) {
#if defined(CERES_MSVC_USE_UNDERSCORE_PREFIXED_BESSEL_FUNCTIONS)
return _jn(n, x);
#else
CERES_DISABLE_DEPRECATED_WARNING
return jn(n, x);
#endif
CERES_RESTORE_DEPRECATED_WARNING
}
// For the formulae of the derivatives of the Bessel functions see the book:
@@ -1264,8 +1304,13 @@ struct numeric_limits<ceres::Jet<T, N>> {
static constexpr bool is_bounded = std::numeric_limits<T>::is_bounded;
static constexpr bool is_modulo = std::numeric_limits<T>::is_modulo;
// has_denorm (and has_denorm_loss, not defined for Jet) has been deprecated
// in C++23, though without an intent to remove the declaration. Disable
// deprecation warnings temporarily just for the corresponding symbols.
CERES_DISABLE_DEPRECATED_WARNING
static constexpr std::float_denorm_style has_denorm =
std::numeric_limits<T>::has_denorm;
CERES_RESTORE_DEPRECATED_WARNING
static constexpr std::float_round_style round_style =
std::numeric_limits<T>::round_style;
@@ -1335,6 +1380,7 @@ struct NumTraits<ceres::Jet<T, N>> {
}
static inline int digits10() { return NumTraits<T>::digits10(); }
static inline int max_digits10() { return NumTraits<T>::max_digits10(); }
enum {
IsComplex = 0,

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -156,7 +156,7 @@ bool LineManifold<AmbientSpaceDimension>::Plus(const double* x_ptr,
//
// The direction update function Plus_d is the same as for the SphereManifold:
//
// d* = H_{v(d)} [0.5 sinc(0.5 |delta_d|) delta_d, cos(0.5 |delta_d|)]^T
// d* = H_{v(d)} [sinc(|delta_d|) delta_d, cos(|delta_d|)]^T
//
// where H is the householder matrix
// H_{v} = I - (2 / |v|^2) v v^T
@@ -165,7 +165,7 @@ bool LineManifold<AmbientSpaceDimension>::Plus(const double* x_ptr,
//
// The origin point update function Plus_o is defined as
//
// o* = o + H_{v(d)} [0.5 delta_o, 0]^T.
// o* = o + H_{v(d)} [delta_o, 0]^T.
Eigen::Map<const AmbientVector> o(x_ptr, size_);
Eigen::Map<const AmbientVector> d(x_ptr + size_, size_);
@@ -208,11 +208,8 @@ bool LineManifold<AmbientSpaceDimension>::Plus(const double* x_ptr,
// perpendicular to the line direction. This is achieved by using the
// householder matrix of the direction and allow only movements
// perpendicular to e_n.
//
// The factor of 0.5 is used to be consistent with the line direction
// update.
AmbientVector y(size_);
y << 0.5 * delta_o, 0;
y << delta_o, 0;
o_plus_delta += internal::ApplyHouseholderVector(y, v, beta);
return true;
@@ -266,7 +263,7 @@ bool LineManifold<AmbientSpaceDimension>::Minus(const double* y_ptr,
AmbientVector delta_o = y_o - x_o;
const AmbientVector h_delta_o =
2.0 * internal::ApplyHouseholderVector(delta_o, v, beta);
internal::ApplyHouseholderVector(delta_o, v, beta);
y_minus_x_o = h_delta_o.template head<TangentSpaceDimension>(size_ - 1);
return true;

View File

@@ -1,371 +0,0 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: keir@google.com (Keir Mierle)
// sameeragarwal@google.com (Sameer Agarwal)
#ifndef CERES_PUBLIC_LOCAL_PARAMETERIZATION_H_
#define CERES_PUBLIC_LOCAL_PARAMETERIZATION_H_
#include <array>
#include <memory>
#include <vector>
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
#include "ceres/internal/port.h"
namespace ceres {
// WARNING: LocalParameterizations are deprecated. They will be removed from
// Ceres Solver in version 2.2.0. Please use Manifolds instead.
// Purpose: Sometimes parameter blocks x can overparameterize a problem
//
// min f(x)
// x
//
// In that case it is desirable to choose a parameterization for the
// block itself to remove the null directions of the cost. More
// generally, if x lies on a manifold of a smaller dimension than the
// ambient space that it is embedded in, then it is numerically and
// computationally more effective to optimize it using a
// parameterization that lives in the tangent space of that manifold
// at each point.
//
// For example, a sphere in three dimensions is a 2 dimensional
// manifold, embedded in a three dimensional space. At each point on
// the sphere, the plane tangent to it defines a two dimensional
// tangent space. For a cost function defined on this sphere, given a
// point x, moving in the direction normal to the sphere at that point
// is not useful. Thus a better way to do a local optimization is to
// optimize over two dimensional vector delta in the tangent space at
// that point and then "move" to the point x + delta, where the move
// operation involves projecting back onto the sphere. Doing so
// removes a redundant dimension from the optimization, making it
// numerically more robust and efficient.
//
// More generally we can define a function
//
// x_plus_delta = Plus(x, delta),
//
// where x_plus_delta has the same size as x, and delta is of size
// less than or equal to that of x. The function Plus generalizes the
// definition of vector addition. Thus it satisfies the identity
//
// Plus(x, 0) = x, for all x.
//
// A trivial version of Plus is when delta is of the same size as x
// and
//
// Plus(x, delta) = x + delta
//
// A more interesting case is when x is a two dimensional vector and the
// user wishes to hold the first coordinate constant. Then delta is a
// scalar and Plus is defined as
//
// Plus(x, delta) = x + [0] * delta
// [1]
//
// An example that occurs commonly in Structure from Motion problems
// is when camera rotations are parameterized using Quaternion. There,
// it is useful to only make updates orthogonal to that 4-vector
// defining the quaternion. One way to do this is to let delta be a 3
// dimensional vector and define Plus to be
//
// Plus(x, delta) = [cos(|delta|), sin(|delta|) delta / |delta|] * x
//
// The multiplication between the two 4-vectors on the RHS is the
// standard quaternion product.
//
// Given f and a point x, optimizing f can now be restated as
//
// min f(Plus(x, delta))
// delta
//
// Given a solution delta to this problem, the optimal value is then
// given by
//
// x* = Plus(x, delta)
//
// The class LocalParameterization defines the function Plus and its
// Jacobian which is needed to compute the Jacobian of f w.r.t delta.
class CERES_DEPRECATED_WITH_MSG(
"LocalParameterizations will be removed from the Ceres Solver API in "
"version 2.2.0. Use Manifolds instead.")
CERES_EXPORT LocalParameterization {
public:
virtual ~LocalParameterization();
// Generalization of the addition operation,
//
// x_plus_delta = Plus(x, delta)
//
// with the condition that Plus(x, 0) = x.
//
virtual bool Plus(const double* x,
const double* delta,
double* x_plus_delta) const = 0;
// The jacobian of Plus(x, delta) w.r.t delta at delta = 0.
//
// jacobian is a row-major GlobalSize() x LocalSize() matrix.
virtual bool ComputeJacobian(const double* x, double* jacobian) const = 0;
// local_matrix = global_matrix * jacobian
//
// global_matrix is a num_rows x GlobalSize row major matrix.
// local_matrix is a num_rows x LocalSize row major matrix.
// jacobian(x) is the matrix returned by ComputeJacobian at x.
//
// This is only used by GradientProblem. For most normal uses, it is
// okay to use the default implementation.
virtual bool MultiplyByJacobian(const double* x,
const int num_rows,
const double* global_matrix,
double* local_matrix) const;
// Size of x.
virtual int GlobalSize() const = 0;
// Size of delta.
virtual int LocalSize() const = 0;
};
// Some basic parameterizations
// Identity Parameterization: Plus(x, delta) = x + delta
class CERES_DEPRECATED_WITH_MSG("Use EuclideanManifold instead.")
CERES_EXPORT IdentityParameterization : public LocalParameterization {
public:
explicit IdentityParameterization(int size);
bool Plus(const double* x,
const double* delta,
double* x_plus_delta) const override;
bool ComputeJacobian(const double* x, double* jacobian) const override;
bool MultiplyByJacobian(const double* x,
const int num_cols,
const double* global_matrix,
double* local_matrix) const override;
int GlobalSize() const override { return size_; }
int LocalSize() const override { return size_; }
private:
const int size_;
};
// Hold a subset of the parameters inside a parameter block constant.
class CERES_DEPRECATED_WITH_MSG("Use SubsetManifold instead.")
CERES_EXPORT SubsetParameterization : public LocalParameterization {
public:
explicit SubsetParameterization(int size,
const std::vector<int>& constant_parameters);
bool Plus(const double* x,
const double* delta,
double* x_plus_delta) const override;
bool ComputeJacobian(const double* x, double* jacobian) const override;
bool MultiplyByJacobian(const double* x,
const int num_cols,
const double* global_matrix,
double* local_matrix) const override;
int GlobalSize() const override {
return static_cast<int>(constancy_mask_.size());
}
int LocalSize() const override { return local_size_; }
private:
const int local_size_;
std::vector<char> constancy_mask_;
};
// Plus(x, delta) = [cos(|delta|), sin(|delta|) delta / |delta|] * x
// with * being the quaternion multiplication operator. Here we assume
// that the first element of the quaternion vector is the real (cos
// theta) part.
class CERES_DEPRECATED_WITH_MSG("Use QuaternionManifold instead.")
CERES_EXPORT QuaternionParameterization : public LocalParameterization {
public:
bool Plus(const double* x,
const double* delta,
double* x_plus_delta) const override;
bool ComputeJacobian(const double* x, double* jacobian) const override;
int GlobalSize() const override { return 4; }
int LocalSize() const override { return 3; }
};
// Implements the quaternion local parameterization for Eigen's representation
// of the quaternion. Eigen uses a different internal memory layout for the
// elements of the quaternion than what is commonly used. Specifically, Eigen
// stores the elements in memory as [x, y, z, w] where the real part is last
// whereas it is typically stored first. Note, when creating an Eigen quaternion
// through the constructor the elements are accepted in w, x, y, z order. Since
// Ceres operates on parameter blocks which are raw double pointers this
// difference is important and requires a different parameterization.
//
// Plus(x, delta) = [sin(|delta|) delta / |delta|, cos(|delta|)] * x
// with * being the quaternion multiplication operator.
class CERES_DEPRECATED_WITH_MSG("Use EigenQuaternionManifold instead.")
CERES_EXPORT EigenQuaternionParameterization
: public ceres::LocalParameterization {
public:
bool Plus(const double* x,
const double* delta,
double* x_plus_delta) const override;
bool ComputeJacobian(const double* x, double* jacobian) const override;
int GlobalSize() const override { return 4; }
int LocalSize() const override { return 3; }
};
// This provides a parameterization for homogeneous vectors which are commonly
// used in Structure from Motion problems. One example where they are used is
// in representing points whose triangulation is ill-conditioned. Here it is
// advantageous to use an over-parameterization since homogeneous vectors can
// represent points at infinity.
//
// The plus operator is defined as
// Plus(x, delta) =
// [sin(0.5 * |delta|) * delta / |delta|, cos(0.5 * |delta|)] * x
//
// with * defined as an operator which applies the update orthogonal to x to
// remain on the sphere. We assume that the last element of x is the scalar
// component. The size of the homogeneous vector is required to be greater than
// 1.
class CERES_DEPRECATED_WITH_MSG("Use SphereManifold instead.") CERES_EXPORT
HomogeneousVectorParameterization : public LocalParameterization {
public:
explicit HomogeneousVectorParameterization(int size);
bool Plus(const double* x,
const double* delta,
double* x_plus_delta) const override;
bool ComputeJacobian(const double* x, double* jacobian) const override;
int GlobalSize() const override { return size_; }
int LocalSize() const override { return size_ - 1; }
private:
const int size_;
};
// This provides a parameterization for lines, where the line is
// over-parameterized by an origin point and a direction vector. So the
// parameter vector size needs to be two times the ambient space dimension,
// where the first half is interpreted as the origin point and the second half
// as the direction.
//
// The plus operator for the line direction is the same as for the
// HomogeneousVectorParameterization. The update of the origin point is
// perpendicular to the line direction before the update.
//
// This local parameterization is a special case of the affine Grassmannian
// manifold (see https://en.wikipedia.org/wiki/Affine_Grassmannian_(manifold))
// for the case Graff_1(R^n).
template <int AmbientSpaceDimension>
class CERES_DEPRECATED_WITH_MSG("Use LineManifold instead.")
LineParameterization : public LocalParameterization {
public:
static_assert(AmbientSpaceDimension >= 2,
"The ambient space must be at least 2");
bool Plus(const double* x,
const double* delta,
double* x_plus_delta) const override;
bool ComputeJacobian(const double* x, double* jacobian) const override;
int GlobalSize() const override { return 2 * AmbientSpaceDimension; }
int LocalSize() const override { return 2 * (AmbientSpaceDimension - 1); }
};
// Construct a local parameterization by taking the Cartesian product
// of a number of other local parameterizations. This is useful, when
// a parameter block is the cartesian product of two or more
// manifolds. For example the parameters of a camera consist of a
// rotation and a translation, i.e., SO(3) x R^3.
//
// Example usage:
//
// ProductParameterization product_param(new QuaternionParameterization(),
// new IdentityParameterization(3));
//
// is the local parameterization for a rigid transformation, where the
// rotation is represented using a quaternion.
//
class CERES_DEPRECATED_WITH_MSG("Use ProductManifold instead.")
CERES_EXPORT ProductParameterization : public LocalParameterization {
public:
ProductParameterization(const ProductParameterization&) = delete;
ProductParameterization& operator=(const ProductParameterization&) = delete;
//
// NOTE: The constructor takes ownership of the input local
// parameterizations.
//
template <typename... LocalParams>
explicit ProductParameterization(LocalParams*... local_params)
: local_params_(sizeof...(LocalParams)) {
constexpr int kNumLocalParams = sizeof...(LocalParams);
static_assert(kNumLocalParams >= 2,
"At least two local parameterizations must be specified.");
using LocalParameterizationPtr = std::unique_ptr<LocalParameterization>;
// Wrap all raw pointers into std::unique_ptr for exception safety.
std::array<LocalParameterizationPtr, kNumLocalParams> local_params_array{
LocalParameterizationPtr(local_params)...};
// Initialize internal state.
for (int i = 0; i < kNumLocalParams; ++i) {
LocalParameterizationPtr& param = local_params_[i];
param = std::move(local_params_array[i]);
buffer_size_ =
std::max(buffer_size_, param->LocalSize() * param->GlobalSize());
global_size_ += param->GlobalSize();
local_size_ += param->LocalSize();
}
}
bool Plus(const double* x,
const double* delta,
double* x_plus_delta) const override;
bool ComputeJacobian(const double* x, double* jacobian) const override;
int GlobalSize() const override { return global_size_; }
int LocalSize() const override { return local_size_; }
private:
std::vector<std::unique_ptr<LocalParameterization>> local_params_;
int local_size_{0};
int global_size_{0};
int buffer_size_{0};
};
} // namespace ceres
// clang-format off
#include "ceres/internal/reenable_warnings.h"
// clang-format on
#include "ceres/internal/line_parameterization.h"
#endif // CERES_PUBLIC_LOCAL_PARAMETERIZATION_H_
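With this header removed, code that attached a LocalParameterization to a parameter block migrates to the corresponding Manifold. A minimal sketch of the quaternion case:

#include "ceres/manifold.h"
#include "ceres/problem.h"

inline void AttachQuaternionManifold(ceres::Problem& problem, double* q) {
  // Previously: problem.AddParameterBlock(q, 4,
  //                 new ceres::QuaternionParameterization());
  problem.AddParameterBlock(q, 4, new ceres::QuaternionManifold());
}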

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -42,24 +42,54 @@
namespace ceres {
// Matchers and macros for help with testing Manifold objects.
// Matchers and macros to simplify testing of custom Manifold objects using the
// gtest testing framework.
//
// Testing a Manifold has two parts.
//
// 1. Checking that Manifold::Plus is correctly defined. This requires per
// manifold tests.
// 1. Checking that Manifold::Plus() and Manifold::Minus() are correctly
// defined. This requires per manifold tests.
//
// 2. The other methods of the manifold have mathematical properties that make
// it compatible with Plus, as described in:
// them compatible with Plus() and Minus(), as described in [1].
//
// "Integrating Generic Sensor Fusion Algorithms with Sound State
// Representations through Encapsulation of Manifolds"
// By C. Hertzberg, R. Wagner, U. Frese and L. Schroder
// https://arxiv.org/pdf/1107.1119.pdf
// To verify these general requirements for a custom Manifold, use the
// EXPECT_THAT_MANIFOLD_INVARIANTS_HOLD() macro from within a gtest test. Note
// that additional domain-specific tests may also be prudent, e.g. to verify the
// behaviour of a Quaternion Manifold about pi.
//
// These tests are implemented using generic matchers defined below which can
// all be called by the macro EXPECT_THAT_MANIFOLD_INVARIANTS_HOLD(manifold, x,
// delta, y, tolerance). See manifold_test.cc for example usage.
// [1] "Integrating Generic Sensor Fusion Algorithms with Sound State
// Representations through Encapsulation of Manifolds", C. Hertzberg,
// R. Wagner, U. Frese and L. Schroder, https://arxiv.org/pdf/1107.1119.pdf
// Verifies the general requirements for a custom Manifold are satisfied to
// within the specified (numerical) tolerance.
//
// Example usage for a custom Manifold, ExampleManifold:
//
// TEST(ExampleManifold, ManifoldInvariantsHold) {
// constexpr double kTolerance = 1.0e-9;
// ExampleManifold manifold;
// ceres::Vector x = ceres::Vector::Zero(manifold.AmbientSize());
// ceres::Vector y = ceres::Vector::Zero(manifold.AmbientSize());
// ceres::Vector delta = ceres::Vector::Zero(manifold.TangentSize());
// EXPECT_THAT_MANIFOLD_INVARIANTS_HOLD(manifold, x, delta, y, kTolerance);
// }
#define EXPECT_THAT_MANIFOLD_INVARIANTS_HOLD(manifold, x, delta, y, tolerance) \
::ceres::Vector zero_tangent = \
::ceres::Vector::Zero(manifold.TangentSize()); \
EXPECT_THAT(manifold, ::ceres::XPlusZeroIsXAt(x, tolerance)); \
EXPECT_THAT(manifold, ::ceres::XMinusXIsZeroAt(x, tolerance)); \
EXPECT_THAT(manifold, ::ceres::MinusPlusIsIdentityAt(x, delta, tolerance)); \
EXPECT_THAT(manifold, \
::ceres::MinusPlusIsIdentityAt(x, zero_tangent, tolerance)); \
EXPECT_THAT(manifold, ::ceres::PlusMinusIsIdentityAt(x, x, tolerance)); \
EXPECT_THAT(manifold, ::ceres::PlusMinusIsIdentityAt(x, y, tolerance)); \
EXPECT_THAT(manifold, ::ceres::HasCorrectPlusJacobianAt(x, tolerance)); \
EXPECT_THAT(manifold, ::ceres::HasCorrectMinusJacobianAt(x, tolerance)); \
EXPECT_THAT(manifold, ::ceres::MinusPlusJacobianIsIdentityAt(x, tolerance)); \
EXPECT_THAT(manifold, \
::ceres::HasCorrectRightMultiplyByPlusJacobianAt(x, tolerance));
// Checks that the invariant Plus(x, 0) == x holds.
MATCHER_P2(XPlusZeroIsXAt, x, tolerance, "") {
@@ -69,7 +99,7 @@ MATCHER_P2(XPlusZeroIsXAt, x, tolerance, "") {
Vector actual = Vector::Zero(ambient_size);
Vector zero = Vector::Zero(tangent_size);
EXPECT_TRUE(arg.Plus(x.data(), zero.data(), actual.data()));
const double n = (actual - x).norm();
const double n = (actual - Vector{x}).norm();
const double d = x.norm();
const double diffnorm = (d == 0.0) ? n : (n / d);
if (diffnorm > tolerance) {
@@ -159,7 +189,7 @@ MATCHER_P3(MinusPlusIsIdentityAt, x, delta, tolerance, "") {
Vector actual = Vector::Zero(tangent_size);
EXPECT_TRUE(arg.Minus(x_plus_delta.data(), x.data(), actual.data()));
const double n = (actual - delta).norm();
const double n = (actual - Vector{delta}).norm();
const double d = delta.norm();
const double diffnorm = (d == 0.0) ? n : (n / d);
if (diffnorm > tolerance) {
@@ -184,7 +214,7 @@ MATCHER_P3(PlusMinusIsIdentityAt, x, y, tolerance, "") {
Vector actual = Vector::Zero(ambient_size);
EXPECT_TRUE(arg.Plus(x.data(), y_minus_x.data(), actual.data()));
const double n = (actual - y).norm();
const double n = (actual - Vector{y}).norm();
const double d = y.norm();
const double diffnorm = (d == 0.0) ? n : (n / d);
if (diffnorm > tolerance) {
@@ -312,17 +342,4 @@ MATCHER_P2(HasCorrectRightMultiplyByPlusJacobianAt, x, tolerance, "") {
return true;
}
#define EXPECT_THAT_MANIFOLD_INVARIANTS_HOLD(manifold, x, delta, y, tolerance) \
Vector zero_tangent = Vector::Zero(manifold.TangentSize()); \
EXPECT_THAT(manifold, XPlusZeroIsXAt(x, tolerance)); \
EXPECT_THAT(manifold, XMinusXIsZeroAt(x, tolerance)); \
EXPECT_THAT(manifold, MinusPlusIsIdentityAt(x, delta, tolerance)); \
EXPECT_THAT(manifold, MinusPlusIsIdentityAt(x, zero_tangent, tolerance)); \
EXPECT_THAT(manifold, PlusMinusIsIdentityAt(x, x, tolerance)); \
EXPECT_THAT(manifold, PlusMinusIsIdentityAt(x, y, tolerance)); \
EXPECT_THAT(manifold, HasCorrectPlusJacobianAt(x, tolerance)); \
EXPECT_THAT(manifold, HasCorrectMinusJacobianAt(x, tolerance)); \
EXPECT_THAT(manifold, MinusPlusJacobianIsIdentityAt(x, tolerance)); \
EXPECT_THAT(manifold, HasCorrectRightMultiplyByPlusJacobianAt(x, tolerance));
} // namespace ceres

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -61,7 +61,7 @@ class CERES_EXPORT NormalPrior final : public CostFunction {
public:
// Check that the number of rows in the vector b are the same as the
// number of columns in the matrix A, crash otherwise.
NormalPrior(const Matrix& A, const Vector& b);
NormalPrior(const Matrix& A, Vector b);
bool Evaluate(double const* const* parameters,
double* residuals,
double** jacobians) const override;
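Taking b by value makes it a sink parameter, so callers can move instead of copy. A minimal usage sketch:

#include <utility>
#include "ceres/normal_prior.h"

inline ceres::CostFunction* MakePrior(const ceres::Matrix& A,
                                      ceres::Vector b) {
  // b is moved into the prior rather than copied.
  return new ceres::NormalPrior(A, std::move(b));
}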

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -176,7 +176,7 @@
namespace ceres {
template <typename CostFunctor,
NumericDiffMethodType method = CENTRAL,
NumericDiffMethodType kMethod = CENTRAL,
int kNumResiduals = 0, // Number of residuals, or ceres::DYNAMIC
int... Ns> // Parameters dimensions for each block.
class NumericDiffCostFunction final
@@ -236,7 +236,7 @@ class NumericDiffCostFunction final
}
internal::EvaluateJacobianForParameterBlocks<ParameterDims>::
template Apply<method, kNumResiduals>(
template Apply<kMethod, kNumResiduals>(
functor_.get(),
residuals,
options_,

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -42,6 +42,7 @@
#include "ceres/internal/variadic_evaluate.h"
#include "ceres/numeric_diff_options.h"
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
@@ -99,19 +100,55 @@ namespace ceres {
// "QuadraticCostFunctor", "CENTRAL, 4", describe the finite
// differencing scheme as "central differencing" and the functor as
// computing its cost from a 4 dimensional input.
//
// If the size of the parameter vector is not known at compile time, then an
// alternate construction syntax can be used:
//
// FirstOrderFunction* function
// = new NumericDiffFirstOrderFunction<MyScalarCostFunctor, CENTRAL>(
// new QuadraticCostFunctor(1.0), 4);
//
// Note that instead of passing 4 as a template argument, it is now passed as
// the second argument to the constructor.
template <typename FirstOrderFunctor,
NumericDiffMethodType method,
int kNumParameters>
NumericDiffMethodType kMethod,
int kNumParameters = DYNAMIC>
class NumericDiffFirstOrderFunction final : public FirstOrderFunction {
public:
// Constructor for the case where the parameter size is known at compile time.
explicit NumericDiffFirstOrderFunction(
FirstOrderFunctor* functor,
Ownership ownership = TAKE_OWNERSHIP,
const NumericDiffOptions& options = NumericDiffOptions())
: functor_(functor), ownership_(ownership), options_(options) {
: functor_(functor),
num_parameters_(kNumParameters),
ownership_(ownership),
options_(options) {
static_assert(kNumParameters != DYNAMIC,
"Number of parameters must be static when defined via the "
"template parameter. Use the other constructor for "
"dynamically sized functions.");
static_assert(kNumParameters > 0, "kNumParameters must be positive");
}
// Constructor for the case where the parameter size is specified at run time.
explicit NumericDiffFirstOrderFunction(
FirstOrderFunctor* functor,
int num_parameters,
Ownership ownership = TAKE_OWNERSHIP,
const NumericDiffOptions& options = NumericDiffOptions())
: functor_(functor),
num_parameters_(num_parameters),
ownership_(ownership),
options_(options) {
static_assert(
kNumParameters == DYNAMIC,
"Template parameter must be DYNAMIC when using this constructor. If "
"you want to provide the number of parameters statically use the other "
"constructor.");
CHECK_GT(num_parameters, 0);
}
~NumericDiffFirstOrderFunction() override {
if (ownership_ != TAKE_OWNERSHIP) {
functor_.release();
@@ -121,12 +158,8 @@ class NumericDiffFirstOrderFunction final : public FirstOrderFunction {
bool Evaluate(const double* const parameters,
double* cost,
double* gradient) const override {
using ParameterDims = internal::StaticParameterDims<kNumParameters>;
constexpr int kNumResiduals = 1;
// Get the function value (cost) at the point to evaluate.
if (!internal::VariadicEvaluate<ParameterDims>(
*functor_, &parameters, cost)) {
if (!(*functor_)(parameters, cost)) {
return false;
}
@@ -135,27 +168,47 @@ class NumericDiffFirstOrderFunction final : public FirstOrderFunction {
}
// Create a copy of the parameters which will get mutated.
internal::FixedArray<double, 32> parameters_copy(kNumParameters);
std::copy_n(parameters, kNumParameters, parameters_copy.data());
internal::FixedArray<double, 32> parameters_copy(num_parameters_);
std::copy_n(parameters, num_parameters_, parameters_copy.data());
double* parameters_ptr = parameters_copy.data();
internal::EvaluateJacobianForParameterBlocks<
ParameterDims>::template Apply<method, kNumResiduals>(functor_.get(),
cost,
options_,
kNumResiduals,
&parameters_ptr,
&gradient);
return true;
constexpr int kNumResiduals = 1;
if constexpr (kNumParameters == DYNAMIC) {
internal::FirstOrderFunctorAdapter<FirstOrderFunctor> fofa(*functor_);
return internal::NumericDiff<
internal::FirstOrderFunctorAdapter<FirstOrderFunctor>,
kMethod,
kNumResiduals,
internal::DynamicParameterDims,
0,
DYNAMIC>::EvaluateJacobianForParameterBlock(&fofa,
cost,
options_,
kNumResiduals,
0,
num_parameters_,
&parameters_ptr,
gradient);
} else {
return internal::EvaluateJacobianForParameterBlocks<
internal::StaticParameterDims<kNumParameters>>::
template Apply<kMethod, 1>(functor_.get(),
cost,
options_,
kNumResiduals,
&parameters_ptr,
&gradient);
}
}
int NumParameters() const override { return kNumParameters; }
int NumParameters() const override { return num_parameters_; }
const FirstOrderFunctor& functor() const { return *functor_; }
private:
std::unique_ptr<FirstOrderFunctor> functor_;
Ownership ownership_;
NumericDiffOptions options_;
const int num_parameters_;
const Ownership ownership_;
const NumericDiffOptions options_;
};
} // namespace ceres
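A minimal sketch of the new run-time-sized construction alongside the static one (the functor is hypothetical):

struct Rosenbrock {
  bool operator()(const double* x, double* cost) const {
    const double a = 10.0 * (x[1] - x[0] * x[0]);
    const double b = 1.0 - x[0];
    *cost = a * a + b * b;
    return true;
  }
};

inline ceres::FirstOrderFunction* MakeStatic() {
  // Size fixed at compile time via the template parameter.
  return new ceres::NumericDiffFirstOrderFunction<Rosenbrock, ceres::CENTRAL,
                                                  2>(new Rosenbrock);
}

inline ceres::FirstOrderFunction* MakeDynamic() {
  // kNumParameters defaults to DYNAMIC; the size becomes a constructor
  // argument instead.
  return new ceres::NumericDiffFirstOrderFunction<Rosenbrock, ceres::CENTRAL>(
      new Rosenbrock, /*num_parameters=*/2);
}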

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2021 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -53,7 +53,6 @@ namespace ceres {
class CostFunction;
class EvaluationCallback;
class LossFunction;
class LocalParameterization;
class Manifold;
class Solver;
struct CRSMatrix;
@@ -118,29 +117,17 @@ using ResidualBlockId = internal::ResidualBlock*;
// problem.AddResidualBlock(new MyBinaryCostFunction(...), nullptr, x2, x3);
//
// Please see cost_function.h for details of the CostFunction object.
//
// NOTE: We are currently in the process of transitioning from
// LocalParameterization to Manifolds in the Ceres API. During this period,
// Problem will support using both Manifold and LocalParameterization objects
// interchangably. In particular, adding a LocalParameterization to a parameter
// block is the same as adding a Manifold to that parameter block. For methods
// in the API affected by this change, see their documentation below.
class CERES_EXPORT Problem {
public:
struct CERES_EXPORT Options {
// These flags control whether the Problem object owns the CostFunctions,
// LossFunctions, LocalParameterizations, and Manifolds passed into the
// Problem.
// LossFunctions, and Manifolds passed into the Problem.
//
// If set to TAKE_OWNERSHIP, then the problem object will delete the
// corresponding object on destruction. The destructor is careful to delete
// the pointers only once, since sharing objects is allowed.
Ownership cost_function_ownership = TAKE_OWNERSHIP;
Ownership loss_function_ownership = TAKE_OWNERSHIP;
CERES_DEPRECATED_WITH_MSG(
"Local Parameterizations are deprecated. Use Manifold and "
"manifold_ownership instead.")
Ownership local_parameterization_ownership = TAKE_OWNERSHIP;
Ownership manifold_ownership = TAKE_OWNERSHIP;
// If true, trades memory for faster RemoveResidualBlock() and
@@ -271,66 +258,23 @@ class CERES_EXPORT Problem {
// pointer but a different size will result in a crash.
void AddParameterBlock(double* values, int size);
// Add a parameter block with appropriate size and parameterization to the
// problem. It is okay for local_parameterization to be nullptr.
//
// Repeated calls with the same arguments are ignored. Repeated calls
// with the same double pointer but a different size results in a crash
// (unless Solver::Options::diable_all_safety_checks is set to true).
//
// Repeated calls with the same double pointer and size but different
// LocalParameterization is equivalent to calling
// SetParameterization(local_parameterization), i.e., any previously
// associated LocalParameterization or Manifold object will be replaced with
// the local_parameterization.
//
// NOTE:
// ----
//
// This method is deprecated and will be removed in the next public
// release of Ceres Solver. Please move to using the Manifold based version of
// AddParameterBlock.
//
// During the transition from LocalParameterization to Manifold, internally
// the LocalParameterization is treated as a Manifold by wrapping it using a
// ManifoldAdapter object. So HasManifold() will return true, GetManifold()
// will return the wrapped object and ParameterBlockTangentSize() will return
// the LocalSize of the LocalParameterization.
CERES_DEPRECATED_WITH_MSG(
"LocalParameterizations are deprecated. Use the version with Manifolds "
"instead.")
void AddParameterBlock(double* values,
int size,
LocalParameterization* local_parameterization);
// Add a parameter block with appropriate size and Manifold to the
// problem. It is okay for manifold to be nullptr.
//
// Repeated calls with the same arguments are ignored. Repeated calls
// with the same double pointer but a different size results in a crash
// (unless Solver::Options::diable_all_safety_checks is set to true).
// (unless Solver::Options::disable_all_safety_checks is set to true).
//
// Repeated calls with the same double pointer and size but different Manifold
// is equivalent to calling SetManifold(manifold), i.e., any previously
// associated LocalParameterization or Manifold object will be replaced with
// the manifold.
//
// Note:
// ----
//
// During the transition from LocalParameterization to Manifold, calling
// AddParameterBlock with a Manifold when a LocalParameterization is already
// associated with the parameter block is okay. It is equivalent to calling
// SetManifold(manifold), i.e., any previously associated
// LocalParameterization or Manifold object will be replaced with the
// manifold.
// associated Manifold object will be replaced with the manifold.
void AddParameterBlock(double* values, int size, Manifold* manifold);
// Remove a parameter block from the problem. The LocalParameterization or
// Manifold of the parameter block, if it exists, will persist until the
// deletion of the problem (similar to cost/loss functions in residual block
// removal). Any residual blocks that depend on the parameter are also
// removed, as described above in RemoveResidualBlock().
// Remove a parameter block from the problem. The Manifold of the parameter
// block, if it exists, will persist until the deletion of the problem
// (similar to cost/loss functions in residual block removal). Any residual
// blocks that depend on the parameter are also removed, as described above
// in RemoveResidualBlock().
//
// If Problem::Options::enable_fast_removal is true, then the removal is fast
// (almost constant time). Otherwise, removing a parameter block will incur a
@@ -361,76 +305,15 @@ class CERES_EXPORT Problem {
// Returns true if a parameter block is set constant, and false otherwise. A
// parameter block may be set constant in two ways: either by calling
// SetParameterBlockConstant or by associating a LocalParameterization or
// Manifold with a zero dimensional tangent space with it.
// SetParameterBlockConstant or by associating a Manifold with a zero
// dimensional tangent space with it.
bool IsParameterBlockConstant(const double* values) const;
// Set the LocalParameterization for the parameter block. Calling
// SetParameterization with nullptr will clear any previously set
// LocalParameterization or Manifold for the parameter block.
//
// Repeated calls will cause any previously associated LocalParameterization
// or Manifold object to be replaced with the local_parameterization.
//
// The local_parameterization is owned by the Problem by default (See
// Problem::Options to override this behaviour).
//
// It is acceptable to set the same LocalParameterization for multiple
// parameter blocks; the destructor is careful to delete
// LocalParamaterizations only once.
//
// NOTE:
// ----
//
// This method is deprecated and will be removed in the next public
// release of Ceres Solver. Please move to using the SetManifold instead.
//
// During the transition from LocalParameterization to Manifold, internally
// the LocalParameterization is treated as a Manifold by wrapping it using a
// ManifoldAdapter object. So HasManifold() will return true, GetManifold()
// will return the wrapped object and ParameterBlockTangentSize will return
// the same value of ParameterBlockLocalSize.
CERES_DEPRECATED_WITH_MSG(
"LocalParameterizations are deprecated. Use SetManifold instead.")
void SetParameterization(double* values,
LocalParameterization* local_parameterization);
// Get the LocalParameterization object associated with this parameter block.
// If there is no LocalParameterization associated then nullptr is returned.
//
// NOTE: This method is deprecated and will be removed in the next public
// release of Ceres Solver. Use GetManifold instead.
//
// Note also that if a LocalParameterization is associated with a parameter
// block, HasManifold will return true and GetManifold will return the
// LocalParameterization wrapped in a ManifoldAdapter.
//
// The converse is NOT true, i.e., if a Manifold is associated with a
// parameter block, HasParameterization will return false and
// GetParameterization will return a nullptr.
CERES_DEPRECATED_WITH_MSG(
"LocalParameterizations are deprecated. Use GetManifold "
"instead.")
const LocalParameterization* GetParameterization(const double* values) const;
// Returns true if a LocalParameterization is associated with this parameter
// block, false otherwise.
//
// NOTE: This method is deprecated and will be removed in the next public
// release of Ceres Solver. Use HasManifold instead.
//
// Note also that if a Manifold is associated with the parameter block, this
// method will return false.
CERES_DEPRECATED_WITH_MSG(
"LocalParameterizations are deprecated. Use HasManifold instead.")
bool HasParameterization(const double* values) const;
// Set the Manifold for the parameter block. Calling SetManifold with nullptr
// will clear any previously set LocalParameterization or Manifold for the
// parameter block.
// will clear any previously set Manifold for the parameter block.
//
// Repeated calls will result in any previously associated
// LocalParameterization or Manifold object to be replaced with the manifold.
// Repeated calls will result in any previously associated Manifold object to
// be replaced with the manifold.
//
// The manifold is owned by the Problem by default (See Problem::Options to
// override this behaviour).
@@ -440,18 +323,11 @@ class CERES_EXPORT Problem {
// Get the Manifold object associated with this parameter block.
//
// If there is no Manifold Or LocalParameterization object associated then
// nullptr is returned.
//
// NOTE: During the transition from LocalParameterization to Manifold,
// internally the LocalParameterization is treated as a Manifold by wrapping
// it using a ManifoldAdapter object. So calling GetManifold on a parameter
// block with a LocalParameterization associated with it will return the
// LocalParameterization wrapped in a ManifoldAdapter
// If there is no Manifold object associated then nullptr is returned.
const Manifold* GetManifold(const double* values) const;
// Returns true if a Manifold or a LocalParameterization is associated with
// this parameter block, false otherwise.
// Returns true if a Manifold is associated with this parameter block, false
// otherwise.
bool HasManifold(const double* values) const;
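//
// Editorial sketch (not part of the upstream header): attaching a Manifold to
// a hypothetical 4-element quaternion block q and querying it:
//
//   double q[4] = {1.0, 0.0, 0.0, 0.0};
//   problem.AddParameterBlock(q, 4, new ceres::QuaternionManifold);
//   CHECK(problem.HasManifold(q));
//   CHECK_EQ(problem.ParameterBlockTangentSize(q), 3);  // ambient size is 4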
// Set the lower/upper bound for the parameter at position "index".
@@ -484,19 +360,9 @@ class CERES_EXPORT Problem {
// The size of the parameter block.
int ParameterBlockSize(const double* values) const;
// The dimension of the tangent space of the LocalParameterization or Manifold
// for the parameter block. If there is no LocalParameterization or Manifold
// associated with this parameter block, then ParameterBlockLocalSize =
// ParameterBlockSize.
CERES_DEPRECATED_WITH_MSG(
"LocalParameterizations are deprecated. Use ParameterBlockTangentSize "
"instead.")
int ParameterBlockLocalSize(const double* values) const;
// The dimenion of the tangent space of the LocalParameterization or Manifold
// for the parameter block. If there is no LocalParameterization or Manifold
// associated with this parameter block, then ParameterBlockTangentSize =
// ParameterBlockSize.
// The dimension of the tangent space of the Manifold for the parameter block.
// If there is no Manifold associated with this parameter block, then
// ParameterBlockTangentSize = ParameterBlockSize.
int ParameterBlockTangentSize(const double* values) const;
// Is the given parameter block present in this problem or not?
@@ -596,11 +462,11 @@ class CERES_EXPORT Problem {
//
// is the way to do so.
//
// Note 2: If no LocalParameterizations or Manifolds are used, then the size
// of the gradient vector (and the number of columns in the jacobian) is the
// sum of the sizes of all the parameter blocks. If a parameter block has a
// LocalParameterization or Manifold, then it contributes "TangentSize"
// entries to the gradient vector (and the number of columns in the jacobian).
// Note 2: If no Manifolds are used, then the size of the gradient vector (and
// the number of columns in the jacobian) is the sum of the sizes of all the
// parameter blocks. If a parameter block has a Manifold, then it contributes
// "TangentSize" entries to the gradient vector (and the number of columns in
// the jacobian).
//
// Note 3: This function cannot be called while the problem is being solved,
// for example it cannot be called from an IterationCallback at the end of an
@@ -631,11 +497,10 @@ class CERES_EXPORT Problem {
// returns false, the caller should expect the output memory locations to have
// been modified.
//
// The returned cost and jacobians have had robustification and
// LocalParameterization/Manifold applied already; for example, the jacobian
// for a 4-dimensional quaternion parameter using the
// "QuaternionParameterization" is num_residuals by 3 instead of num_residuals
// by 4.
// The returned cost and jacobians have had robustification and Manifold
// applied already; for example, the jacobian for a 4-dimensional quaternion
// parameter using the "QuaternionParameterization" is num_residuals by 3
// instead of num_residuals by 4.
//
// apply_loss_function as the name implies allows the user to switch the
// application of the loss function on and off.
@@ -672,9 +537,13 @@ class CERES_EXPORT Problem {
double* residuals,
double** jacobians) const;
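//
// Editorial sketch (not part of the upstream header): evaluating a single
// residual block at the current parameter values. residual_block_id is a
// hypothetical ResidualBlockId returned by AddResidualBlock; jacobians may be
// nullptr when only the cost and residuals are needed.
//
//   double cost;
//   double residuals[2];  // size of this residual block, illustrative
//   problem.EvaluateResidualBlock(residual_block_id,
//                                 /*apply_loss_function=*/true,
//                                 &cost, residuals, /*jacobians=*/nullptr);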
// Returns reference to the options with which the Problem was constructed.
const Options& options() const;
// Returns pointer to Problem implementation
internal::ProblemImpl* mutable_impl();
private:
friend class Solver;
friend class Covariance;
std::unique_ptr<internal::ProblemImpl> impl_;
};

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -257,28 +257,21 @@ class ProductManifold final : public Manifold {
template <typename T, std::size_t N>
static std::array<T, N> ExclusiveScan(const std::array<T, N>& values) {
std::array<T, N> result;
// TODO Replace with std::exclusive_scan once all platforms have full C++17
// STL support.
T init = 0;
// TODO Replace by std::exclusive_scan once C++17 is available
for (std::size_t i = 0; i != N; ++i) {
result[i] = init;
init += values[i];
}
return result;
}
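// For example (editorial note), ExclusiveScan applied to {3, 1, 4} yields
// {0, 3, 4}: each output entry is the sum of all preceding input entries.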
// TODO Replace by std::void_t once C++17 is available
template <typename... Types>
struct Void {
using type = void;
};
template <typename T, typename E = void>
struct IsDereferenceable : std::false_type {};
template <typename T>
struct IsDereferenceable<T, typename Void<decltype(*std::declval<T>())>::type>
struct IsDereferenceable<T, std::void_t<decltype(*std::declval<T>())>>
: std::true_type {};
template <typename T,
@@ -311,7 +304,6 @@ class ProductManifold final : public Manifold {
int tangent_size_;
};
#ifdef CERES_HAS_CPP17
// C++17 deduction guide that allows the user to avoid explicitly specifying
// the template parameters of ProductManifold. The class can instead be
// instantiated as follows:
@@ -321,7 +313,6 @@ class ProductManifold final : public Manifold {
template <typename Manifold0, typename Manifold1, typename... Manifolds>
ProductManifold(Manifold0&&, Manifold1&&, Manifolds&&...)
-> ProductManifold<Manifold0, Manifold1, Manifolds...>;
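// For example (editorial sketch), both template arguments are deduced in
//
//   ceres::ProductManifold manifold{ceres::QuaternionManifold{},
//                                   ceres::EuclideanManifold<3>{}};
//
// which instantiates ProductManifold<QuaternionManifold, EuclideanManifold<3>>.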
#endif
} // namespace ceres

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -47,8 +47,9 @@
#include <algorithm>
#include <cmath>
#include <limits>
#include "ceres/constants.h"
#include "ceres/internal/euler_angles.h"
#include "glog/logging.h"
namespace ceres {
@@ -60,7 +61,7 @@ namespace ceres {
//
// the expression M(i, j) is equivalent to
//
// arrary[i * row_stride + j * col_stride]
// array[i * row_stride + j * col_stride]
//
// Conversion functions to and from rotation matrices accept
// MatrixAdapters to permit using row-major and column-major layouts,
@@ -136,6 +137,71 @@ template <typename T, int row_stride, int col_stride>
void EulerAnglesToRotationMatrix(
const T* euler, const MatrixAdapter<T, row_stride, col_stride>& R);
// Convert a generic Euler Angle sequence (in radians) to a 3x3 rotation matrix.
//
// Euler Angles define a sequence of 3 rotations about a sequence of axes,
// typically taken to be the X, Y, or Z axes. The last axis may be the same as
// the first axis (e.g. ZYZ) per Euler's original definition of his angles
// (proper Euler angles) or not (e.g. ZYX / yaw-pitch-roll), per common usage in
// the nautical and aerospace fields (Tait-Bryan angles). The three rotations
// may be in a global frame of reference (Extrinsic) or in a body fixed frame of
// reference (Intrinsic) that moves with the rotating object.
//
// Internally, Euler Axis sequences are classified by Ken Shoemake's scheme from
// "Euler angle conversion", Graphics Gems IV, where a choice of axis for the
// first rotation and 3 binary choices:
// 1. Parity of the axis permutation. The axis sequence has Even parity if the
// second axis of rotation is 'greater-than' the first axis of rotation
// according to the order X<Y<Z<X, otherwise it has Odd parity.
// 2. Proper Euler Angles vs. Tait-Bryan Angles
// 3. Extrinsic Rotations vs. Intrinsic Rotations
// compactly represent all 24 possible Euler Angle Conventions.
//
// One template parameter: EulerSystem must be explicitly given. This parameter
// is a tag named by 'Extrinsic' or 'Intrinsic' followed by three characters in
// the set '[XYZ]', specifying the axis sequence, e.g. ceres::ExtrinsicYZY
// (robotic arms), ceres::IntrinsicZYX (for aerospace), etc.
//
// The order of elements in the input array 'euler' follows the axis sequence.
template <typename EulerSystem, typename T>
inline void EulerAnglesToRotation(const T* euler, T* R);
template <typename EulerSystem, typename T, int row_stride, int col_stride>
void EulerAnglesToRotation(const T* euler,
const MatrixAdapter<T, row_stride, col_stride>& R);
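//
// Editorial sketch (not part of the upstream header): intrinsic ZYX
// (yaw-pitch-roll) angles, in radians, to a row-major rotation matrix. The
// angle values are illustrative.
//
//   const double euler[3] = {0.1, 0.2, 0.3};  // order follows the axis tag
//   double R[9];  // row-major 3x3
//   ceres::EulerAnglesToRotation<ceres::IntrinsicZYX>(euler, R);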
// Convert a 3x3 rotation matrix to a generic Euler Angle sequence (in radians)
//
// Euler Angles define a sequence of 3 rotations about a sequence of axes,
// typically taken to be the X, Y, or Z axes. The last axis may be the same as
// the first axis (e.g. ZYZ) per Euler's original definition of his angles
// (proper Euler angles) or not (e.g. ZYX / yaw-pitch-roll), per common usage in
// the nautical and aerospace fields (Tait-Bryan angles). The three rotations
// may be in a global frame of reference (Extrinsic) or in a body fixed frame of
// reference (Intrinsic) that moves with the rotating object.
//
// Internally, Euler Axis sequences are classified by Ken Shoemake's scheme from
// "Euler angle conversion", Graphics Gems IV, where a choice of axis for the
// first rotation and 3 binary choices:
// 1. Oddness of the axis permutation, which defines whether the second axis is
// 'greater-than' the first axis according to the order X>Y>Z>X
// 2. Proper Euler Angles vs. Tait-Bryan Angles
// 3. Extrinsic Rotations vs. Intrinsic Rotations
// compactly represent all 24 possible Euler Angle Conventions.
//
// One template parameter: EulerSystem must be explicitly given. This parameter
// is a tag named by 'Extrinsic' or 'Intrinsic' followed by three characters in
// the set '[XYZ]', specifying the axis sequence, e.g. ceres::ExtrinsicYZY
// (robotic arms), ceres::IntrinsicZYX (for aerospace), etc.
//
// The order of elements in the output array 'euler' follows the axis sequence.
template <typename EulerSystem, typename T>
inline void RotationMatrixToEulerAngles(const T* R, T* euler);
template <typename EulerSystem, typename T, int row_stride, int col_stride>
void RotationMatrixToEulerAngles(
const MatrixAdapter<const T, row_stride, col_stride>& R, T* euler);
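//
// Editorial sketch (not part of the upstream header): recovering the angles
// from the matrix produced above; for the same EulerSystem tag this inverts
// EulerAnglesToRotation, up to angle-range normalization.
//
//   double recovered[3];
//   ceres::RotationMatrixToEulerAngles<ceres::IntrinsicZYX>(R, recovered);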
// Convert a 4-vector to a 3x3 scaled rotation matrix.
//
// The choice of rotation is such that the quaternion [1 0 0 0] goes to an
@@ -247,14 +313,15 @@ MatrixAdapter<T, 3, 1> RowMajorAdapter3x3(T* pointer) {
template <typename T>
inline void AngleAxisToQuaternion(const T* angle_axis, T* quaternion) {
using std::fpclassify;
using std::hypot;
const T& a0 = angle_axis[0];
const T& a1 = angle_axis[1];
const T& a2 = angle_axis[2];
const T theta_squared = a0 * a0 + a1 * a1 + a2 * a2;
const T theta = hypot(a0, a1, a2);
// For points not at the origin, the full conversion is numerically stable.
if (theta_squared > T(0.0)) {
const T theta = sqrt(theta_squared);
if (fpclassify(theta) != FP_ZERO) {
const T half_theta = theta * T(0.5);
const T k = sin(half_theta) / theta;
quaternion[0] = cos(half_theta);
@@ -276,15 +343,16 @@ inline void AngleAxisToQuaternion(const T* angle_axis, T* quaternion) {
template <typename T>
inline void QuaternionToAngleAxis(const T* quaternion, T* angle_axis) {
using std::fpclassify;
using std::hypot;
const T& q1 = quaternion[1];
const T& q2 = quaternion[2];
const T& q3 = quaternion[3];
const T sin_squared_theta = q1 * q1 + q2 * q2 + q3 * q3;
const T sin_theta = hypot(q1, q2, q3);
// For quaternions representing non-zero rotation, the conversion
// is numerically stable.
if (sin_squared_theta > T(0.0)) {
const T sin_theta = sqrt(sin_squared_theta);
if (fpclassify(sin_theta) != FP_ZERO) {
const T& cos_theta = quaternion[0];
// If cos_theta is negative, theta is greater than pi/2, which
@@ -385,13 +453,14 @@ inline void AngleAxisToRotationMatrix(const T* angle_axis, T* R) {
template <typename T, int row_stride, int col_stride>
void AngleAxisToRotationMatrix(
const T* angle_axis, const MatrixAdapter<T, row_stride, col_stride>& R) {
using std::fpclassify;
using std::hypot;
static const T kOne = T(1.0);
const T theta2 = DotProduct(angle_axis, angle_axis);
if (theta2 > T(std::numeric_limits<double>::epsilon())) {
const T theta = hypot(angle_axis[0], angle_axis[1], angle_axis[2]);
if (fpclassify(theta) != FP_ZERO) {
// We want to be careful to only evaluate the square root if the
// norm of the angle_axis vector is greater than zero. Otherwise
// we get a division by zero.
const T theta = sqrt(theta2);
const T wx = angle_axis[0] / theta;
const T wy = angle_axis[1] / theta;
const T wz = angle_axis[2] / theta;
@@ -411,7 +480,7 @@ void AngleAxisToRotationMatrix(
R(2, 2) = costheta + wz*wz*(kOne - costheta);
// clang-format on
} else {
// Near zero, we switch to using the first order Taylor expansion.
// At zero, we switch to using the first order Taylor expansion.
R(0, 0) = kOne;
R(1, 0) = angle_axis[2];
R(2, 0) = -angle_axis[1];
@@ -424,6 +493,141 @@ void AngleAxisToRotationMatrix(
}
}
template <typename EulerSystem, typename T>
inline void EulerAnglesToRotation(const T* euler, T* R) {
EulerAnglesToRotation<EulerSystem>(euler, RowMajorAdapter3x3(R));
}
template <typename EulerSystem, typename T, int row_stride, int col_stride>
void EulerAnglesToRotation(const T* euler,
const MatrixAdapter<T, row_stride, col_stride>& R) {
using std::cos;
using std::sin;
const auto [i, j, k] = EulerSystem::kAxes;
T ea[3];
ea[1] = euler[1];
if constexpr (EulerSystem::kIsIntrinsic) {
ea[0] = euler[2];
ea[2] = euler[0];
} else {
ea[0] = euler[0];
ea[2] = euler[2];
}
if constexpr (EulerSystem::kIsParityOdd) {
ea[0] = -ea[0];
ea[1] = -ea[1];
ea[2] = -ea[2];
}
const T ci = cos(ea[0]);
const T cj = cos(ea[1]);
const T ch = cos(ea[2]);
const T si = sin(ea[0]);
const T sj = sin(ea[1]);
const T sh = sin(ea[2]);
const T cc = ci * ch;
const T cs = ci * sh;
const T sc = si * ch;
const T ss = si * sh;
if constexpr (EulerSystem::kIsProperEuler) {
R(i, i) = cj;
R(i, j) = sj * si;
R(i, k) = sj * ci;
R(j, i) = sj * sh;
R(j, j) = -cj * ss + cc;
R(j, k) = -cj * cs - sc;
R(k, i) = -sj * ch;
R(k, j) = cj * sc + cs;
R(k, k) = cj * cc - ss;
} else {
R(i, i) = cj * ch;
R(i, j) = sj * sc - cs;
R(i, k) = sj * cc + ss;
R(j, i) = cj * sh;
R(j, j) = sj * ss + cc;
R(j, k) = sj * cs - sc;
R(k, i) = -sj;
R(k, j) = cj * si;
R(k, k) = cj * ci;
}
}
template <typename EulerSystem, typename T>
inline void RotationMatrixToEulerAngles(const T* R, T* euler) {
RotationMatrixToEulerAngles<EulerSystem>(RowMajorAdapter3x3(R), euler);
}
template <typename EulerSystem, typename T, int row_stride, int col_stride>
void RotationMatrixToEulerAngles(
const MatrixAdapter<const T, row_stride, col_stride>& R, T* euler) {
using std::atan2;
using std::fpclassify;
using std::hypot;
const auto [i, j, k] = EulerSystem::kAxes;
T ea[3];
if constexpr (EulerSystem::kIsProperEuler) {
const T sy = hypot(R(i, j), R(i, k));
if (fpclassify(sy) != FP_ZERO) {
ea[0] = atan2(R(i, j), R(i, k));
ea[1] = atan2(sy, R(i, i));
ea[2] = atan2(R(j, i), -R(k, i));
} else {
ea[0] = atan2(-R(j, k), R(j, j));
ea[1] = atan2(sy, R(i, i));
ea[2] = T(0.0);
}
} else {
const T cy = hypot(R(i, i), R(j, i));
if (fpclassify(cy) != FP_ZERO) {
ea[0] = atan2(R(k, j), R(k, k));
ea[1] = atan2(-R(k, i), cy);
ea[2] = atan2(R(j, i), R(i, i));
} else {
ea[0] = atan2(-R(j, k), R(j, j));
ea[1] = atan2(-R(k, i), cy);
ea[2] = T(0.0);
}
}
if constexpr (EulerSystem::kIsParityOdd) {
ea[0] = -ea[0];
ea[1] = -ea[1];
ea[2] = -ea[2];
}
euler[1] = ea[1];
if constexpr (EulerSystem::kIsIntrinsic) {
euler[0] = ea[2];
euler[2] = ea[0];
} else {
euler[0] = ea[0];
euler[2] = ea[2];
}
// Proper Euler angles are defined for angles in
// [-pi, pi) x [0, pi / 2) x [-pi, pi),
// which is enforced here.
if constexpr (EulerSystem::kIsProperEuler) {
const T kPi(constants::pi);
const T kTwoPi(2.0 * kPi);
if (euler[1] < T(0.0) || ea[1] > kPi) {
euler[0] += kPi;
euler[1] = -euler[1];
euler[2] -= kPi;
}
for (int i = 0; i < 3; ++i) {
if (euler[i] < -kPi) {
euler[i] += kTwoPi;
} else if (euler[i] > kPi) {
euler[i] -= kTwoPi;
}
}
}
}
template <typename T>
inline void EulerAnglesToRotationMatrix(const T* euler,
const int row_stride_parameter,
@@ -589,9 +793,12 @@ inline void AngleAxisRotatePoint(const T angle_axis[3],
const T pt[3],
T result[3]) {
DCHECK_NE(pt, result) << "Inplace rotation is not supported.";
using std::fpclassify;
using std::hypot;
const T theta2 = DotProduct(angle_axis, angle_axis);
if (theta2 > T(std::numeric_limits<double>::epsilon())) {
const T theta = hypot(angle_axis[0], angle_axis[1], angle_axis[2]);
if (fpclassify(theta) != FP_ZERO) {
// Away from zero, use the Rodrigues formula
//
// result = pt costheta +
@@ -602,7 +809,6 @@ inline void AngleAxisRotatePoint(const T angle_axis[3],
// norm of the angle_axis vector is greater than zero. Otherwise
// we get a division by zero.
//
const T theta = sqrt(theta2);
const T costheta = cos(theta);
const T sintheta = sin(theta);
const T theta_inverse = T(1.0) / theta;
@@ -623,7 +829,7 @@ inline void AngleAxisRotatePoint(const T angle_axis[3],
result[1] = pt[1] * costheta + w_cross_pt[1] * sintheta + w[1] * tmp;
result[2] = pt[2] * costheta + w_cross_pt[2] * sintheta + w[2] * tmp;
} else {
// Near zero, the first order Taylor approximation of the rotation
// At zero, the first order Taylor approximation of the rotation
// matrix R corresponding to a vector w and angle theta is
//
// R = I + hat(w) * sin(theta)
@@ -635,7 +841,7 @@ inline void AngleAxisRotatePoint(const T angle_axis[3],
// and actually performing multiplication with the point pt, gives us
// R * pt = pt + angle_axis x pt.
//
// Switching to the Taylor expansion near zero provides meaningful
// Switching to the Taylor expansion at zero provides meaningful
// derivatives when evaluated using Jets.
//
// Explicitly inlined evaluation of the cross product for

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -64,8 +64,6 @@ class CERES_EXPORT Solver {
// with a message describing the problem.
bool IsValid(std::string* error) const;
// Minimizer options ----------------------------------------
// Ceres supports the two major families of optimization strategies -
// Trust Region and Line Search.
//
@@ -378,88 +376,144 @@ class CERES_EXPORT Solver {
DenseLinearAlgebraLibraryType dense_linear_algebra_library_type = EIGEN;
// Ceres supports using multiple sparse linear algebra libraries for sparse
// matrix ordering and factorizations. Currently, SUITE_SPARSE and CX_SPARSE
// are the valid choices, depending on whether they are linked into Ceres at
// build time.
// matrix ordering and factorizations.
SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type =
#if !defined(CERES_NO_SUITESPARSE)
SUITE_SPARSE;
#elif defined(CERES_USE_EIGEN_SPARSE)
EIGEN_SPARSE;
#elif !defined(CERES_NO_CXSPARSE)
CX_SPARSE;
#elif !defined(CERES_NO_ACCELERATE_SPARSE)
ACCELERATE_SPARSE;
#elif defined(CERES_USE_EIGEN_SPARSE)
EIGEN_SPARSE;
#else
NO_SPARSE;
#endif
// The order in which variables are eliminated in a linear solver
// can have a significant of impact on the efficiency and accuracy
// of the method. e.g., when doing sparse Cholesky factorization,
// can have a significant impact on the efficiency and accuracy of
// the method. e.g., when doing sparse Cholesky factorization,
// there are matrices for which a good ordering will give a
// Cholesky factor with O(n) storage, whereas a bad ordering will
// result in a completely dense factor.
//
// Ceres allows the user to provide varying amounts of hints to
// the solver about the variable elimination ordering to use. This
// can range from no hints, where the solver is free to decide the
// best possible ordering based on the user's choices like the
// linear solver being used, to an exact order in which the
// variables should be eliminated, and a variety of possibilities
// in between.
// Sparse direct solvers like SPARSE_NORMAL_CHOLESKY and
// SPARSE_SCHUR use a fill reducing ordering of the columns and
// rows of the matrix being factorized before computing the
// numeric factorization.
//
// Instances of the ParameterBlockOrdering class are used to
// communicate this information to Ceres.
// This enum controls the type of algorithm used to compute
// this fill reducing ordering. There is no single algorithm
// that works on all matrices, so determining which algorithm
// works better is a matter of empirical experimentation.
//
// Formally an ordering is an ordered partitioning of the
// parameter blocks, i.e, each parameter block belongs to exactly
// one group, and each group has a unique non-negative integer
// associated with it, that determines its order in the set of
// groups.
// The exact behaviour of this setting is affected by the value of
// linear_solver_ordering as described below.
LinearSolverOrderingType linear_solver_ordering_type = AMD;
// Besides specifying the fill reducing ordering via
// linear_solver_ordering_type, Ceres allows the user to provide varying
// amounts of hints to the linear solver about the variable elimination
// ordering to use. This can range from no hints, where the solver is free
// to decide the best possible ordering based on the user's choices like the
// linear solver being used, to an exact order in which the variables should
// be eliminated, and a variety of possibilities in between.
//
// Given such an ordering, Ceres ensures that the parameter blocks in
// the lowest numbered group are eliminated first, and then the
// parameter blocks in the next lowest numbered group and so on. Within
// each group, Ceres is free to order the parameter blocks as it
// chooses.
// Instances of the ParameterBlockOrdering class are used to communicate
// this information to Ceres.
//
// If nullptr, then all parameter blocks are assumed to be in the
// same group and the solver is free to decide the best
// ordering.
// Formally an ordering is an ordered partitioning of the parameter blocks,
// i.e., each parameter block belongs to exactly one group, and each group
// has a unique non-negative integer associated with it, that determines its
// order in the set of groups.
//
// e.g. Consider the linear system
//
// x + y = 3
// 2x + 3y = 7
//
// There are two ways in which it can be solved. First eliminating x
// from the two equations, solving for y and then back substituting
// for x, or first eliminating y, solving for x and back substituting
// for y. The user can construct three orderings here.
// There are two ways in which it can be solved. First eliminating x from
// the two equations, solving for y and then back substituting for x, or
// first eliminating y, solving for x and back substituting for y. The user
// can construct three orderings here.
//
// {0: x}, {1: y} - eliminate x first.
// {0: y}, {1: x} - eliminate y first.
// {0: x, y} - Solver gets to decide the elimination order.
//
// Thus, to have Ceres determine the ordering automatically using
// heuristics, put all the variables in group 0 and to control the
// ordering for every variable, create groups 0..N-1, one per
// variable, in the desired order.
// Thus, to have Ceres determine the ordering automatically, put all the
// variables in group 0 and to control the ordering for every variable
// create groups 0 ... N-1, one per variable, in the desired
// order.
//
// linear_solver_ordering == nullptr and an ordering where all the parameter
// blocks are in one elimination group mean the same thing - the solver is
// free to choose what it thinks is the best elimination ordering. Therefore
// in the following we will only consider the case where
// linear_solver_ordering is nullptr.
//
// The exact interpretation of this information depends on the values of
// linear_solver_ordering_type and linear_solver_type/preconditioner_type
// and sparse_linear_algebra_library_type.
//
// Bundle Adjustment
// -----------------
// =================
//
// A particular case of interest is bundle adjustment, where the user
// has two options. The default is to not specify an ordering at all,
// the solver will see that the user wants to use a Schur type solver
// and figure out the right elimination ordering.
// If the user is using one of the Schur solvers (DENSE_SCHUR,
// SPARSE_SCHUR, ITERATIVE_SCHUR) and chooses to specify an
// ordering, it must have one important property. The lowest
// numbered elimination group must form an independent set in the
// graph corresponding to the Hessian, or in other words, no two
// parameter blocks in the first elimination group should
// co-occur in the same residual block. For the best performance,
// this elimination group should be as large as possible. For
// standard bundle adjustment problems, this corresponds to the
// first elimination group containing all the 3d points, and the
// second containing all the camera parameter blocks.
//
// But if the user already knows what parameter blocks are points and
// what are cameras, they can save preprocessing time by partitioning
// the parameter blocks into two groups, one for the points and one
// for the cameras, where the group containing the points has an id
// smaller than the group containing cameras.
// If the user leaves the choice to Ceres, then the solver uses an
// approximate maximum independent set algorithm to identify the first
// elimination group.
//
// sparse_linear_algebra_library_type = SUITE_SPARSE
// =================================================
//
// linear_solver_ordering_type = AMD
// ---------------------------------
//
// A Constrained Approximate Minimum Degree (CAMD) ordering is used where the
// parameter blocks in the lowest numbered group are eliminated first, and
// then the parameter blocks in the next lowest numbered group and so
// on. Within each group, CAMD is free to order the parameter blocks as it
// chooses.
//
// linear_solver_ordering_type = NESDIS
// -------------------------------------
//
// a. linear_solver_type = SPARSE_NORMAL_CHOLESKY or
// linear_solver_type = CGNR and preconditioner_type = SUBSET
//
// The value of linear_solver_ordering is ignored and a Nested Dissection
// algorithm is used to compute a fill reducing ordering.
//
// b. linear_solver_type = SPARSE_SCHUR/DENSE_SCHUR/ITERATIVE_SCHUR
//
// ONLY the lowest group is used to compute the Schur complement, and
// Nested Dissection is used to compute a fill reducing ordering for the
// Schur Complement (or its preconditioner).
//
// sparse_linear_algebra_library_type = EIGEN_SPARSE or ACCELERATE_SPARSE
// ======================================================================
//
// a. linear_solver_type = SPARSE_NORMAL_CHOLESKY or
// linear_solver_type = CGNR and preconditioner_type = SUBSET
//
// then the value of linear_solver_ordering is ignored and AMD or NESDIS is
// used to compute a fill reducing ordering as requested by the user.
//
// b. linear_solver_type = SPARSE_SCHUR/DENSE_SCHUR/ITERATIVE_SCHUR
//
// ONLY the lowest group is used to compute the Schur complement, and AMD
// or NESDIS is used to compute a fill reducing ordering for the Schur
// Complement (or its preconditioner).
std::shared_ptr<ParameterBlockOrdering> linear_solver_ordering;
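//
// Editorial sketch (not part of the upstream header): a bundle adjustment
// style ordering with all point blocks in group 0 (eliminated first) and all
// camera blocks in group 1. points and cameras are hypothetical containers
// of parameter block pointers.
//
//   auto ordering = std::make_shared<ceres::ParameterBlockOrdering>();
//   for (double* point : points) ordering->AddElementToGroup(point, 0);
//   for (double* camera : cameras) ordering->AddElementToGroup(camera, 1);
//   options.linear_solver_ordering = ordering;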
// Use an explicitly computed Schur complement matrix with
@@ -500,12 +554,6 @@ class CERES_EXPORT Solver {
// Jacobian matrix and generally speaking, there is no performance
// penalty for doing so.
// In some rare cases, it is worth using a more complicated
// reordering algorithm which has slightly better runtime
// performance at the expense of an extra copy of the Jacobian
// matrix. Setting use_postordering to true enables this tradeoff.
bool use_postordering = false;
// Some non-linear least squares problems are symbolically dense but
// numerically sparse. i.e. at any given state only a small number
// of jacobian entries are non-zero, but the position and number of
@@ -521,11 +569,6 @@ class CERES_EXPORT Solver {
// This setting only affects the SPARSE_NORMAL_CHOLESKY solver.
bool dynamic_sparsity = false;
// TODO(sameeragarwal): Further expand the documentation for the
// following two options.
// NOTE1: EXPERIMENTAL FEATURE, UNDER DEVELOPMENT, USE AT YOUR OWN RISK.
//
// If use_mixed_precision_solves is true, the Gauss-Newton matrix
// is computed in double precision, but its factorization is
// computed in single precision. This can result in significant
@@ -536,16 +579,57 @@ class CERES_EXPORT Solver {
// If use_mixed_precision_solves is true, we recommend setting
// max_num_refinement_iterations to 2-3.
//
// NOTE2: The following two options are currently only applicable
// if sparse_linear_algebra_library_type is EIGEN_SPARSE or
// ACCELERATE_SPARSE, and linear_solver_type is SPARSE_NORMAL_CHOLESKY
// or SPARSE_SCHUR.
// This option is available when the linear solver uses sparse or dense
// Cholesky factorization, except when sparse_linear_algebra_library_type =
// SUITE_SPARSE.
bool use_mixed_precision_solves = false;
// Number of steps of the iterative refinement process to run when
// computing the Gauss-Newton step.
int max_num_refinement_iterations = 0;
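//
// Editorial sketch (not part of the upstream header): enabling mixed
// precision solves with the 2-3 refinement iterations recommended above.
//
//   options.use_mixed_precision_solves = true;
//   options.max_num_refinement_iterations = 3;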
// Minimum number of iterations for which the linear solver should
// run, even if the convergence criterion is satisfied.
int min_linear_solver_iterations = 0;
// Maximum number of iterations for which the linear solver should
// run. If the solver does not converge in less than
// max_linear_solver_iterations, then it returns MAX_ITERATIONS,
// as its termination type.
int max_linear_solver_iterations = 500;
// Maximum number of iterations performed by SCHUR_POWER_SERIES_EXPANSION.
// Each iteration corresponds to one more term in the power series expansion
// od the inverse of the Schur complement. This value controls the maximum
// number of iterations whether it is used as a preconditioner or just to
// initialize the solution for ITERATIVE_SCHUR.
int max_num_spse_iterations = 5;
// Use SCHUR_POWER_SERIES_EXPANSION to initialize the solution for
// ITERATIVE_SCHUR. This option can be set true regardless of what
// preconditioner is being used.
bool use_spse_initialization = false;
// When use_spse_initialization is true, this parameter along with
// max_num_spse_iterations controls the number of
// SCHUR_POWER_SERIES_EXPANSION iterations performed for initialization. It
// is not used to control the preconditioner.
double spse_tolerance = 0.1;
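//
// Editorial sketch (not part of the upstream header): using the power series
// expansion both as the preconditioner and to initialize ITERATIVE_SCHUR.
//
//   options.linear_solver_type = ceres::ITERATIVE_SCHUR;
//   options.preconditioner_type = ceres::SCHUR_POWER_SERIES_EXPANSION;
//   options.use_spse_initialization = true;
//   options.max_num_spse_iterations = 5;
//   options.spse_tolerance = 0.1;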
// Forcing sequence parameter. The truncated Newton solver uses
// this number to control the relative accuracy with which the
// Newton step is computed.
//
// This constant is passed to ConjugateGradientsSolver which uses
// it to terminate the iterations when
//
// (Q_i - Q_{i-1})/Q_i < eta/i
double eta = 1e-1;
// Normalize the jacobian using Jacobi scaling before calling
// the linear least squares solver.
bool jacobi_scaling = true;
// Some non-linear least squares problems have additional
// structure in the way the parameter blocks interact that it is
// beneficial to modify the way the trust region step is computed.
@@ -629,32 +713,6 @@ class CERES_EXPORT Solver {
// iterations is disabled.
double inner_iteration_tolerance = 1e-3;
// Minimum number of iterations for which the linear solver should
// run, even if the convergence criterion is satisfied.
int min_linear_solver_iterations = 0;
// Maximum number of iterations for which the linear solver should
// run. If the solver does not converge in less than
// max_linear_solver_iterations, then it returns MAX_ITERATIONS,
// as its termination type.
int max_linear_solver_iterations = 500;
// Forcing sequence parameter. The truncated Newton solver uses
// this number to control the relative accuracy with which the
// Newton step is computed.
//
// This constant is passed to ConjugateGradientsSolver which uses
// it to terminate the iterations when
//
// (Q_i - Q_{i-1})/Q_i < eta/i
double eta = 1e-1;
// Normalize the jacobian using Jacobi scaling before calling
// the linear least squares solver.
bool jacobi_scaling = true;
// Logging options ---------------------------------------------------------
LoggingType logging_type = PER_MINIMIZER_ITERATION;
// By default the Minimizer progress is logged to VLOG(1), which
@@ -791,10 +849,9 @@ class CERES_EXPORT Solver {
// IterationSummary for each minimizer iteration in order.
std::vector<IterationSummary> iterations;
// Number of minimizer iterations in which the step was
// accepted. Unless use_non_monotonic_steps is true this is also
// the number of steps in which the objective function value/cost
// went down.
// Number of minimizer iterations in which the step was accepted. Unless
// use_nonmonotonic_steps is true this is also the number of steps in which
// the objective function value/cost went down.
int num_successful_steps = -1;
// Number of minimizer iterations in which the step was rejected
@@ -884,7 +941,7 @@ class CERES_EXPORT Solver {
// Dimension of the tangent space of the problem (or the number of
// columns in the Jacobian for the problem). This is different
// from num_parameters if a parameter block is associated with a
// LocalParameterization/Manifold.
// Manifold.
int num_effective_parameters = -1;
// Number of residual blocks in the problem.
@@ -905,7 +962,7 @@ class CERES_EXPORT Solver {
// number of columns in the Jacobian for the reduced
// problem). This is different from num_parameters_reduced if a
// parameter block in the reduced problem is associated with a
// LocalParameterization/Manifold.
// Manifold.
int num_effective_parameters_reduced = -1;
// Number of residual blocks in the reduced problem.
@@ -922,8 +979,7 @@ class CERES_EXPORT Solver {
int num_threads_given = -1;
// Number of threads actually used by the solver for Jacobian and
// residual evaluation. This number is not equal to
// num_threads_given if OpenMP is not available.
// residual evaluation.
int num_threads_used = -1;
// Type of the linear solver requested by the user.
@@ -946,6 +1002,10 @@ class CERES_EXPORT Solver {
SPARSE_NORMAL_CHOLESKY;
#endif
bool mixed_precision_solves_used = false;
LinearSolverOrderingType linear_solver_ordering_type = AMD;
// Size of the elimination groups given by the user as hints to
// the linear solver.
std::vector<int> linear_solver_ordering_given;
@@ -1005,7 +1065,7 @@ class CERES_EXPORT Solver {
PreconditionerType preconditioner_type_used = IDENTITY;
// Type of clustering algorithm used for visibility based
// preconditioning. Only meaningful when the preconditioner_type
// preconditioning. Only meaningful when the preconditioner_type_used
// is CLUSTER_JACOBI or CLUSTER_TRIDIAGONAL.
VisibilityClusteringType visibility_clustering_type = CANONICAL_VIEWS;

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -114,12 +114,17 @@ class SphereManifold final : public Manifold {
static constexpr int TangentSpaceDimension =
AmbientSpaceDimension > 0 ? AmbientSpaceDimension - 1 : Eigen::Dynamic;
// NOTE: Eigen does not allow a RowMajor column vector; in that case the
// storage order is changed.
static constexpr int SafeRowMajor =
TangentSpaceDimension == 1 ? Eigen::ColMajor : Eigen::RowMajor;
using AmbientVector = Eigen::Matrix<double, AmbientSpaceDimension, 1>;
using TangentVector = Eigen::Matrix<double, TangentSpaceDimension, 1>;
using MatrixPlusJacobian = Eigen::Matrix<double,
AmbientSpaceDimension,
TangentSpaceDimension,
Eigen::RowMajor>;
SafeRowMajor>;
using MatrixMinusJacobian = Eigen::Matrix<double,
TangentSpaceDimension,
AmbientSpaceDimension,

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2021 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -248,10 +248,9 @@ class TinySolver {
jtj_regularized_ = jtj_;
const Scalar min_diagonal = 1e-6;
const Scalar max_diagonal = 1e32;
for (int i = 0; i < lm_diagonal_.rows(); ++i) {
lm_diagonal_[i] = std::sqrt(
u * (std::min)((std::max)(jtj_(i, i), min_diagonal), max_diagonal));
jtj_regularized_(i, i) += lm_diagonal_[i] * lm_diagonal_[i];
for (int i = 0; i < dx_.rows(); ++i) {
jtj_regularized_(i, i) +=
u * (std::min)((std::max)(jtj_(i, i), min_diagonal), max_diagonal);
}
// TODO(sameeragarwal): Check for failure and deal with it.
@@ -338,7 +337,7 @@ class TinySolver {
// linear system. This allows reusing the intermediate storage across solves.
LinearSolver linear_solver_;
Scalar cost_;
Parameters dx_, x_new_, g_, jacobi_scaling_, lm_diagonal_, lm_step_;
Parameters dx_, x_new_, g_, jacobi_scaling_, lm_step_;
Eigen::Matrix<Scalar, NUM_RESIDUALS, 1> residuals_, f_x_new_;
Eigen::Matrix<Scalar, NUM_RESIDUALS, NUM_PARAMETERS> jacobian_;
Eigen::Matrix<Scalar, NUM_PARAMETERS, NUM_PARAMETERS> jtj_, jtj_regularized_;
@@ -385,7 +384,6 @@ class TinySolver {
x_new_.resize(num_parameters);
g_.resize(num_parameters);
jacobi_scaling_.resize(num_parameters);
lm_diagonal_.resize(num_parameters);
lm_step_.resize(num_parameters);
residuals_.resize(num_residuals);
f_x_new_.resize(num_residuals);

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -171,7 +171,7 @@ class TinySolverAutoDiffFunction {
const CostFunctor& cost_functor_;
// The number of residuals at runtime.
// This will be overriden if NUM_RESIDUALS == Eigen::Dynamic.
// This will be overridden if NUM_RESIDUALS == Eigen::Dynamic.
int num_residuals_ = kNumResiduals;
// To evaluate the cost function with jets, temporary storage is needed. These

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -67,8 +67,7 @@ enum LinearSolverType {
// Eigen.
DENSE_QR,
// Solve the normal equations using a sparse cholesky solver; requires
// SuiteSparse or CXSparse.
// Solve the normal equations using a sparse cholesky solver;
SPARSE_NORMAL_CHOLESKY,
// Specialized solvers, specific to problems with a generalized
@@ -98,7 +97,7 @@ enum PreconditionerType {
// Block diagonal of the Gauss-Newton Hessian.
JACOBI,
// Note: The following three preconditioners can only be used with
// Note: The following four preconditioners can only be used with
// the ITERATIVE_SCHUR solver. They are well suited for Structure
// from Motion problems.
@@ -106,6 +105,10 @@ enum PreconditionerType {
// only be used with the ITERATIVE_SCHUR solver.
SCHUR_JACOBI,
// Use power series expansion to approximate the inversion of Schur complement
// as a preconditioner.
SCHUR_POWER_SERIES_EXPANSION,
// Visibility clustering based preconditioners.
//
// The following two preconditioners use the visibility structure of
@@ -134,7 +137,7 @@ enum PreconditionerType {
// well the matrix Q approximates J'J, or how well the chosen
// residual blocks approximate the non-linear least squares
// problem.
SUBSET,
SUBSET
};
enum VisibilityClusteringType {
@@ -165,11 +168,6 @@ enum SparseLinearAlgebraLibraryType {
// minimum degree ordering.
SUITE_SPARSE,
// A lightweight replacement for SuiteSparse, which does not require
// a LAPACK/BLAS implementation. Consequently, its performance is
// also a bit lower than SuiteSparse.
CX_SPARSE,
// Eigen's sparse linear algebra routines. In particular Ceres uses
// the Simplicial LDLT routines.
EIGEN_SPARSE,
@@ -177,12 +175,39 @@ enum SparseLinearAlgebraLibraryType {
// Apple's Accelerate framework sparse linear algebra routines.
ACCELERATE_SPARSE,
// Nvidia's cuSPARSE library.
CUDA_SPARSE,
// No sparse linear solver should be used. This does not necessarily
// imply that Ceres was built without any sparse library, although that
// is the likely use case, merely that one should not be used.
NO_SPARSE
};
// The order in which variables are eliminated in a linear solver
// can have a significant impact on the efficiency and accuracy
// of the method. e.g., when doing sparse Cholesky factorization,
// there are matrices for which a good ordering will give a
// Cholesky factor with O(n) storage, whereas a bad ordering will
// result in a completely dense factor.
//
// So sparse direct solvers like SPARSE_NORMAL_CHOLESKY and
// SPARSE_SCHUR and preconditioners like SUBSET, CLUSTER_JACOBI &
// CLUSTER_TRIDIAGONAL use a fill reducing ordering of the columns and
// rows of the matrix being factorized before actually computing the
// numeric factorization.
//
// This enum controls the class of algorithm used to compute this
// fill reducing ordering. There is no single algorithm that works
// on all matrices, so determining which algorithm works better is a
// matter of empirical experimentation.
enum LinearSolverOrderingType {
// Approximate Minimum Degree.
AMD,
// Nested Dissection.
NESDIS
};
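// For example (editorial sketch), requesting Nested Dissection:
//
//   options.linear_solver_ordering_type = ceres::NESDIS;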
enum DenseLinearAlgebraLibraryType {
EIGEN,
LAPACK,
@@ -467,6 +492,11 @@ CERES_EXPORT const char* SparseLinearAlgebraLibraryTypeToString(
CERES_EXPORT bool StringToSparseLinearAlgebraLibraryType(
std::string value, SparseLinearAlgebraLibraryType* type);
CERES_EXPORT const char* LinearSolverOrderingTypeToString(
LinearSolverOrderingType type);
CERES_EXPORT bool StringToLinearSolverOrderingType(
std::string value, LinearSolverOrderingType* type);
CERES_EXPORT const char* DenseLinearAlgebraLibraryTypeToString(
DenseLinearAlgebraLibraryType type);
CERES_EXPORT bool StringToDenseLinearAlgebraLibraryType(

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2021 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,7 +32,7 @@
#define CERES_PUBLIC_VERSION_H_
#define CERES_VERSION_MAJOR 2
#define CERES_VERSION_MINOR 1
#define CERES_VERSION_MINOR 2
#define CERES_VERSION_REVISION 0
// Classic CPP stringification; the extra level of indirection allows the

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -61,7 +61,7 @@ const char* SparseStatusToString(SparseStatus_t status) {
CASESTR(SparseParameterError);
CASESTR(SparseStatusReleased);
default:
return "UKNOWN";
return "UNKNOWN";
}
}
} // namespace.
@@ -114,12 +114,12 @@ AccelerateSparse<Scalar>::CreateSparseMatrixTransposeView(
// Accelerate's columnStarts is a long*, not an int*. These types might be
// different (e.g. ARM on iOS) so always make a copy.
column_starts_.resize(A->num_rows() + 1); // +1 for final column length.
std::copy_n(A->rows(), column_starts_.size(), &column_starts_[0]);
std::copy_n(A->rows(), column_starts_.size(), column_starts_.data());
ASSparseMatrix At;
At.structure.rowCount = A->num_cols();
At.structure.columnCount = A->num_rows();
At.structure.columnStarts = &column_starts_[0];
At.structure.columnStarts = column_starts_.data();
At.structure.rowIndices = A->mutable_cols();
At.structure.attributes.transpose = false;
At.structure.attributes.triangle = SparseUpperTriangle;
@@ -127,8 +127,8 @@ AccelerateSparse<Scalar>::CreateSparseMatrixTransposeView(
At.structure.attributes._reserved = 0;
At.structure.attributes._allocatedBySparse = 0;
At.structure.blockSize = 1;
if (std::is_same<Scalar, double>::value) {
At.data = reinterpret_cast<Scalar*>(A->mutable_values());
if constexpr (std::is_same_v<Scalar, double>) {
At.data = A->mutable_values();
} else {
values_ =
ConstVectorRef(A->values(), A->num_nonzeros()).template cast<Scalar>();
@@ -139,8 +139,23 @@ AccelerateSparse<Scalar>::CreateSparseMatrixTransposeView(
template <typename Scalar>
typename AccelerateSparse<Scalar>::SymbolicFactorization
AccelerateSparse<Scalar>::AnalyzeCholesky(ASSparseMatrix* A) {
return SparseFactor(SparseFactorizationCholesky, A->structure);
AccelerateSparse<Scalar>::AnalyzeCholesky(OrderingType ordering_type,
ASSparseMatrix* A) {
SparseSymbolicFactorOptions sfoption;
sfoption.control = SparseDefaultControl;
sfoption.orderMethod = SparseOrderDefault;
sfoption.order = nullptr;
sfoption.ignoreRowsAndColumns = nullptr;
sfoption.malloc = malloc;
sfoption.free = free;
sfoption.reportError = nullptr;
if (ordering_type == OrderingType::AMD) {
sfoption.orderMethod = SparseOrderAMD;
} else if (ordering_type == OrderingType::NESDIS) {
sfoption.orderMethod = SparseOrderMetis;
}
return SparseFactor(SparseFactorizationCholesky, A->structure, sfoption);
}
template <typename Scalar>
@@ -190,7 +205,7 @@ AppleAccelerateCholesky<Scalar>::~AppleAccelerateCholesky() {
template <typename Scalar>
CompressedRowSparseMatrix::StorageType
AppleAccelerateCholesky<Scalar>::StorageType() const {
return CompressedRowSparseMatrix::LOWER_TRIANGULAR;
return CompressedRowSparseMatrix::StorageType::LOWER_TRIANGULAR;
}
template <typename Scalar>
@@ -199,7 +214,7 @@ LinearSolverTerminationType AppleAccelerateCholesky<Scalar>::Factorize(
CHECK_EQ(lhs->storage_type(), StorageType());
if (lhs == nullptr) {
*message = "Failure: Input lhs is nullptr.";
return LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
}
typename SparseTypesTrait<Scalar>::SparseMatrix as_lhs =
as_.CreateSparseMatrixTransposeView(lhs);
@@ -207,13 +222,14 @@ LinearSolverTerminationType AppleAccelerateCholesky<Scalar>::Factorize(
if (!symbolic_factor_) {
symbolic_factor_ = std::make_unique<
typename SparseTypesTrait<Scalar>::SymbolicFactorization>(
as_.AnalyzeCholesky(&as_lhs));
as_.AnalyzeCholesky(ordering_type_, &as_lhs));
if (symbolic_factor_->status != SparseStatusOK) {
*message = StringPrintf(
"Apple Accelerate Failure : Symbolic factorisation failed: %s",
SparseStatusToString(symbolic_factor_->status));
FreeSymbolicFactorization();
return LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
}
}
@@ -230,10 +246,10 @@ LinearSolverTerminationType AppleAccelerateCholesky<Scalar>::Factorize(
"Apple Accelerate Failure : Numeric factorisation failed: %s",
SparseStatusToString(numeric_factor_->status));
FreeNumericFactorization();
return LINEAR_SOLVER_FAILURE;
return LinearSolverTerminationType::FAILURE;
}
return LINEAR_SOLVER_SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
template <typename Scalar>
@@ -246,8 +262,8 @@ LinearSolverTerminationType AppleAccelerateCholesky<Scalar>::Solve(
typename SparseTypesTrait<Scalar>::DenseVector as_rhs_and_solution;
as_rhs_and_solution.count = num_cols;
if (std::is_same<Scalar, double>::value) {
as_rhs_and_solution.data = reinterpret_cast<Scalar*>(solution);
if constexpr (std::is_same_v<Scalar, double>) {
as_rhs_and_solution.data = solution;
std::copy_n(rhs, num_cols, solution);
} else {
scalar_rhs_and_solution_ =
@@ -259,7 +275,7 @@ LinearSolverTerminationType AppleAccelerateCholesky<Scalar>::Solve(
VectorRef(solution, num_cols) =
scalar_rhs_and_solution_.template cast<double>();
}
return LINEAR_SOLVER_SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
template <typename Scalar>

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -55,18 +55,18 @@ struct SparseTypesTrait {};
template <>
struct SparseTypesTrait<double> {
typedef DenseVector_Double DenseVector;
typedef SparseMatrix_Double SparseMatrix;
typedef SparseOpaqueSymbolicFactorization SymbolicFactorization;
typedef SparseOpaqueFactorization_Double NumericFactorization;
using DenseVector = DenseVector_Double;
using SparseMatrix = SparseMatrix_Double;
using SymbolicFactorization = SparseOpaqueSymbolicFactorization;
using NumericFactorization = SparseOpaqueFactorization_Double;
};
template <>
struct SparseTypesTrait<float> {
typedef DenseVector_Float DenseVector;
typedef SparseMatrix_Float SparseMatrix;
typedef SparseOpaqueSymbolicFactorization SymbolicFactorization;
typedef SparseOpaqueFactorization_Float NumericFactorization;
using DenseVector = DenseVector_Float;
using SparseMatrix = SparseMatrix_Float;
using SymbolicFactorization = SparseOpaqueSymbolicFactorization;
using NumericFactorization = SparseOpaqueFactorization_Float;
};
template <typename Scalar>
@@ -91,7 +91,8 @@ class AccelerateSparse {
// objects internally).
ASSparseMatrix CreateSparseMatrixTransposeView(CompressedRowSparseMatrix* A);
// Computes a symbolic factorisation of A that can be used in Solve().
SymbolicFactorization AnalyzeCholesky(ASSparseMatrix* A);
SymbolicFactorization AnalyzeCholesky(OrderingType ordering_type,
ASSparseMatrix* A);
// Compute the numeric Cholesky factorization of A, given its
// symbolic factorization.
NumericFactorization Cholesky(ASSparseMatrix* A,

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -38,14 +38,12 @@
#include "ceres/stringprintf.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
using std::string;
namespace ceres::internal {
bool IsArrayValid(const int size, const double* x) {
bool IsArrayValid(const int64_t size, const double* x) {
if (x != nullptr) {
for (int i = 0; i < size; ++i) {
for (int64_t i = 0; i < size; ++i) {
if (!std::isfinite(x[i]) || (x[i] == kImpossibleValue)) {
return false;
}
@@ -54,12 +52,12 @@ bool IsArrayValid(const int size, const double* x) {
return true;
}
int FindInvalidValue(const int size, const double* x) {
int64_t FindInvalidValue(const int64_t size, const double* x) {
if (x == nullptr) {
return size;
}
for (int i = 0; i < size; ++i) {
for (int64_t i = 0; i < size; ++i) {
if (!std::isfinite(x[i]) || (x[i] == kImpossibleValue)) {
return i;
}
@@ -68,16 +66,18 @@ int FindInvalidValue(const int size, const double* x) {
return size;
}
void InvalidateArray(const int size, double* x) {
void InvalidateArray(const int64_t size, double* x) {
if (x != nullptr) {
for (int i = 0; i < size; ++i) {
for (int64_t i = 0; i < size; ++i) {
x[i] = kImpossibleValue;
}
}
}
void AppendArrayToString(const int size, const double* x, string* result) {
for (int i = 0; i < size; ++i) {
void AppendArrayToString(const int64_t size,
const double* x,
std::string* result) {
for (int64_t i = 0; i < size; ++i) {
if (x == nullptr) {
StringAppendF(result, "Not Computed ");
} else {
@@ -90,18 +90,17 @@ void AppendArrayToString(const int size, const double* x, string* result) {
}
}
void MapValuesToContiguousRange(const int size, int* array) {
void MapValuesToContiguousRange(const int64_t size, int* array) {
std::vector<int> unique_values(array, array + size);
std::sort(unique_values.begin(), unique_values.end());
unique_values.erase(std::unique(unique_values.begin(), unique_values.end()),
unique_values.end());
for (int i = 0; i < size; ++i) {
for (int64_t i = 0; i < size; ++i) {
array[i] =
std::lower_bound(unique_values.begin(), unique_values.end(), array[i]) -
unique_values.begin();
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -43,30 +43,30 @@
#ifndef CERES_INTERNAL_ARRAY_UTILS_H_
#define CERES_INTERNAL_ARRAY_UTILS_H_
#include <cstdint>
#include <string>
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Fill the array x with an impossible value that the user code is
// never expected to compute.
CERES_NO_EXPORT void InvalidateArray(int size, double* x);
CERES_NO_EXPORT void InvalidateArray(const int64_t size, double* x);
// Check if all the entries of the array x are valid, i.e. all the
// values in the array should be finite and none of them should be
// equal to the "impossible" value used by InvalidateArray.
CERES_NO_EXPORT bool IsArrayValid(int size, const double* x);
CERES_NO_EXPORT bool IsArrayValid(const int64_t size, const double* x);
// If the array contains an invalid value, return the index for it,
// otherwise return size.
CERES_NO_EXPORT int FindInvalidValue(const int size, const double* x);
CERES_NO_EXPORT int64_t FindInvalidValue(const int64_t size, const double* x);
// Utility routine to print an array of doubles to a string. If the
// array pointer is nullptr, it is treated as an array of zeros.
CERES_NO_EXPORT void AppendArrayToString(const int size,
CERES_NO_EXPORT void AppendArrayToString(const int64_t size,
const double* x,
std::string* result);
@@ -83,10 +83,9 @@ CERES_NO_EXPORT void AppendArrayToString(const int size,
// gets mapped to
//
// [1 0 2 3 0 1 3]
CERES_NO_EXPORT void MapValuesToContiguousRange(int size, int* array);
CERES_NO_EXPORT void MapValuesToContiguousRange(const int64_t size, int* array);
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -38,8 +38,7 @@
#include "ceres/residual_block.h"
#include "ceres/sparse_matrix.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
void BlockEvaluatePreparer::Init(int const* const* jacobian_layout,
int max_derivatives_per_residual_block) {
@@ -78,5 +77,4 @@ void BlockEvaluatePreparer::Prepare(const ResidualBlock* residual_block,
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -39,8 +39,7 @@
#include "ceres/internal/export.h"
#include "ceres/scratch_evaluate_preparer.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class ResidualBlock;
class SparseMatrix;
@@ -72,7 +71,6 @@ class CERES_NO_EXPORT BlockEvaluatePreparer {
ScratchEvaluatePreparer scratch_evaluate_preparer_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_BLOCK_EVALUATE_PREPARER_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -30,71 +30,197 @@
#include "ceres/block_jacobi_preconditioner.h"
#include <memory>
#include <mutex>
#include <utility>
#include <vector>
#include "Eigen/Dense"
#include "ceres/block_random_access_diagonal_matrix.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/block_structure.h"
#include "ceres/casts.h"
#include "ceres/internal/eigen.h"
#include "ceres/parallel_for.h"
#include "ceres/small_blas.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
BlockJacobiPreconditioner::BlockJacobiPreconditioner(
const BlockSparseMatrix& A) {
const CompressedRowBlockStructure* bs = A.block_structure();
std::vector<int> blocks(bs->cols.size());
for (int i = 0; i < blocks.size(); ++i) {
blocks[i] = bs->cols[i].size;
}
m_ = std::make_unique<BlockRandomAccessDiagonalMatrix>(blocks);
BlockSparseJacobiPreconditioner::BlockSparseJacobiPreconditioner(
Preconditioner::Options options, const BlockSparseMatrix& A)
: options_(std::move(options)) {
m_ = std::make_unique<BlockRandomAccessDiagonalMatrix>(
A.block_structure()->cols, options_.context, options_.num_threads);
}
BlockJacobiPreconditioner::~BlockJacobiPreconditioner() = default;
BlockSparseJacobiPreconditioner::~BlockSparseJacobiPreconditioner() = default;
bool BlockJacobiPreconditioner::UpdateImpl(const BlockSparseMatrix& A,
const double* D) {
bool BlockSparseJacobiPreconditioner::UpdateImpl(const BlockSparseMatrix& A,
const double* D) {
const CompressedRowBlockStructure* bs = A.block_structure();
const double* values = A.values();
m_->SetZero();
for (int i = 0; i < bs->rows.size(); ++i) {
const int row_block_size = bs->rows[i].block.size;
const std::vector<Cell>& cells = bs->rows[i].cells;
for (const auto& cell : cells) {
const int block_id = cell.block_id;
const int col_block_size = bs->cols[block_id].size;
int r, c, row_stride, col_stride;
CellInfo* cell_info =
m_->GetCell(block_id, block_id, &r, &c, &row_stride, &col_stride);
MatrixRef m(cell_info->values, row_stride, col_stride);
ConstMatrixRef b(values + cell.position, row_block_size, col_block_size);
m.block(r, c, col_block_size, col_block_size) += b.transpose() * b;
}
}
ParallelFor(options_.context,
0,
bs->rows.size(),
options_.num_threads,
[this, bs, values](int i) {
const int row_block_size = bs->rows[i].block.size;
const std::vector<Cell>& cells = bs->rows[i].cells;
for (const auto& cell : cells) {
const int block_id = cell.block_id;
const int col_block_size = bs->cols[block_id].size;
int r, c, row_stride, col_stride;
CellInfo* cell_info = m_->GetCell(
block_id, block_id, &r, &c, &row_stride, &col_stride);
MatrixRef m(cell_info->values, row_stride, col_stride);
ConstMatrixRef b(
values + cell.position, row_block_size, col_block_size);
auto lock =
MakeConditionalLock(options_.num_threads, cell_info->m);
// clang-format off
MatrixTransposeMatrixMultiply<Eigen::Dynamic, Eigen::Dynamic,
Eigen::Dynamic,Eigen::Dynamic, 1>(
values + cell.position, row_block_size,col_block_size,
values + cell.position, row_block_size,col_block_size,
cell_info->values,r, c,row_stride,col_stride);
// clang-format on
}
});
if (D != nullptr) {
// Add the diagonal.
int position = 0;
for (int i = 0; i < bs->cols.size(); ++i) {
const int block_size = bs->cols[i].size;
int r, c, row_stride, col_stride;
CellInfo* cell_info = m_->GetCell(i, i, &r, &c, &row_stride, &col_stride);
MatrixRef m(cell_info->values, row_stride, col_stride);
m.block(r, c, block_size, block_size).diagonal() +=
ConstVectorRef(D + position, block_size).array().square().matrix();
position += block_size;
}
ParallelFor(options_.context,
0,
bs->cols.size(),
options_.num_threads,
[this, bs, D](int i) {
const int block_size = bs->cols[i].size;
int r, c, row_stride, col_stride;
CellInfo* cell_info =
m_->GetCell(i, i, &r, &c, &row_stride, &col_stride);
MatrixRef m(cell_info->values, row_stride, col_stride);
m.block(r, c, block_size, block_size).diagonal() +=
ConstVectorRef(D + bs->cols[i].position, block_size)
.array()
.square()
.matrix();
});
}
m_->Invert();
return true;
}
void BlockJacobiPreconditioner::RightMultiply(const double* x,
double* y) const {
m_->RightMultiply(x, y);
BlockCRSJacobiPreconditioner::BlockCRSJacobiPreconditioner(
Preconditioner::Options options, const CompressedRowSparseMatrix& A)
: options_(std::move(options)), locks_(A.col_blocks().size()) {
auto& col_blocks = A.col_blocks();
// Compute the number of non-zeros in the preconditioner. This is needed so
// that we can construct the CompressedRowSparseMatrix.
const int m_nnz = SumSquaredSizes(col_blocks);
m_ = std::make_unique<CompressedRowSparseMatrix>(
A.num_cols(), A.num_cols(), m_nnz);
const int num_col_blocks = col_blocks.size();
// Populate the sparsity structure of the preconditioner matrix.
int* m_cols = m_->mutable_cols();
int* m_rows = m_->mutable_rows();
m_rows[0] = 0;
for (int i = 0, idx = 0; i < num_col_blocks; ++i) {
// For each column block populate a diagonal block in the preconditioner.
// Note that because of the way the CompressedRowSparseMatrix format
// works, the entire diagonal block is laid out contiguously in memory as a
// row-major matrix. We will use this when updating the block.
auto& block = col_blocks[i];
for (int j = 0; j < block.size; ++j) {
for (int k = 0; k < block.size; ++k, ++idx) {
m_cols[idx] = block.position + k;
}
m_rows[block.position + j + 1] = idx;
}
}
// In reality we only need num_col_blocks locks, however that would require
// that in UpdateImpl we are able to look up the column block from its
// first column. To spare ourselves that map we will instead spend a few
// extra lock objects.
std::vector<std::mutex> locks(A.num_cols());
locks_.swap(locks);
CHECK_EQ(m_rows[A.num_cols()], m_nnz);
}
} // namespace internal
} // namespace ceres
BlockCRSJacobiPreconditioner::~BlockCRSJacobiPreconditioner() = default;
bool BlockCRSJacobiPreconditioner::UpdateImpl(
const CompressedRowSparseMatrix& A, const double* D) {
const auto& col_blocks = A.col_blocks();
const auto& row_blocks = A.row_blocks();
const int num_col_blocks = col_blocks.size();
const int num_row_blocks = row_blocks.size();
const int* a_rows = A.rows();
const int* a_cols = A.cols();
const double* a_values = A.values();
double* m_values = m_->mutable_values();
const int* m_rows = m_->rows();
m_->SetZero();
ParallelFor(
options_.context,
0,
num_row_blocks,
options_.num_threads,
[this, row_blocks, a_rows, a_cols, a_values, m_values, m_rows](int i) {
const int row = row_blocks[i].position;
const int row_block_size = row_blocks[i].size;
const int row_nnz = a_rows[row + 1] - a_rows[row];
ConstMatrixRef row_block(
a_values + a_rows[row], row_block_size, row_nnz);
int c = 0;
while (c < row_nnz) {
const int idx = a_rows[row] + c;
const int col = a_cols[idx];
const int col_block_size = m_rows[col + 1] - m_rows[col];
// We make use of the fact that the entire diagonal block is
// stored contiguously in memory as a row-major matrix.
MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size);
// We do not have a row_stride version of
// MatrixTransposeMatrixMultiply, otherwise we could use it
// here to further speed up the following expression.
auto b = row_block.middleCols(c, col_block_size);
auto lock = MakeConditionalLock(options_.num_threads, locks_[col]);
m.noalias() += b.transpose() * b;
c += col_block_size;
}
});
ParallelFor(
options_.context,
0,
num_col_blocks,
options_.num_threads,
[col_blocks, m_rows, m_values, D](int i) {
const int col = col_blocks[i].position;
const int col_block_size = col_blocks[i].size;
MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size);
if (D != nullptr) {
m.diagonal() +=
ConstVectorRef(D + col, col_block_size).array().square().matrix();
}
// TODO(sameeragarwal): Deal with Cholesky inversion failure here and
// elsewhere.
m = m.llt().solve(Matrix::Identity(col_block_size, col_block_size));
});
return true;
}
} // namespace ceres::internal
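In equation form, a sketch of what UpdateImpl above computes for each column block j of A (with b ranging over the cells of that column block and d_j the matching slice of D):

\[
M_j \;=\; \sum_{b \,\in\, \text{column block } j} b^\top b \;+\; \operatorname{diag}(d_j)^2,
\qquad
\texttt{Invert()}: \; M_j \leftarrow M_j^{-1} \ \ (\text{via Cholesky, } M_j = L L^\top).
\]

Applying the preconditioner is then the blockwise product y_j += M_j x_j with the inverted blocks, which is why both loops can run as independent ParallelFor iterations over blocks.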

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -38,34 +38,30 @@
#include "ceres/internal/export.h"
#include "ceres/preconditioner.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class BlockSparseMatrix;
struct CompressedRowBlockStructure;
class CompressedRowSparseMatrix;
// A block Jacobi preconditioner. This is intended for use with
// conjugate gradients, or other iterative symmetric solvers. To use
// the preconditioner, create one by passing a BlockSparseMatrix "A"
// to the constructor. This fixes the sparsity pattern to the pattern
// of the matrix A^TA.
// conjugate gradients, or other iterative symmetric solvers.
// This version of the preconditioner is for use with BlockSparseMatrix
// Jacobians.
//
// Before each use of the preconditioner in a solve with conjugate gradients,
// update the matrix by running Update(A, D). The values of the matrix A are
// inspected to construct the preconditioner. The vector D is applied as the
// D^TD diagonal term.
class CERES_NO_EXPORT BlockJacobiPreconditioner
// TODO(https://github.com/ceres-solver/ceres-solver/issues/936):
// BlockSparseJacobiPreconditioner::RightMultiply will benefit from
// multithreading
class CERES_NO_EXPORT BlockSparseJacobiPreconditioner
: public BlockSparseMatrixPreconditioner {
public:
// A must remain valid while the BlockJacobiPreconditioner is.
explicit BlockJacobiPreconditioner(const BlockSparseMatrix& A);
BlockJacobiPreconditioner(const BlockJacobiPreconditioner&) = delete;
void operator=(const BlockJacobiPreconditioner&) = delete;
~BlockJacobiPreconditioner() override;
// Preconditioner interface
void RightMultiply(const double* x, double* y) const final;
explicit BlockSparseJacobiPreconditioner(Preconditioner::Options,
const BlockSparseMatrix& A);
~BlockSparseJacobiPreconditioner() override;
void RightMultiplyAndAccumulate(const double* x, double* y) const final {
return m_->RightMultiplyAndAccumulate(x, y);
}
int num_rows() const final { return m_->num_rows(); }
int num_cols() const final { return m_->num_rows(); }
const BlockRandomAccessDiagonalMatrix& matrix() const { return *m_; }
@@ -73,11 +69,35 @@ class CERES_NO_EXPORT BlockJacobiPreconditioner
private:
bool UpdateImpl(const BlockSparseMatrix& A, const double* D) final;
Preconditioner::Options options_;
std::unique_ptr<BlockRandomAccessDiagonalMatrix> m_;
};
} // namespace internal
} // namespace ceres
// This version of the preconditioner is for use with CompressedRowSparseMatrix
// Jacobians.
class CERES_NO_EXPORT BlockCRSJacobiPreconditioner
: public CompressedRowSparseMatrixPreconditioner {
public:
// A must remain valid while the BlockCRSJacobiPreconditioner is.
explicit BlockCRSJacobiPreconditioner(Preconditioner::Options options,
const CompressedRowSparseMatrix& A);
~BlockCRSJacobiPreconditioner() override;
void RightMultiplyAndAccumulate(const double* x, double* y) const final {
m_->RightMultiplyAndAccumulate(x, y);
}
int num_rows() const final { return m_->num_rows(); }
int num_cols() const final { return m_->num_rows(); }
const CompressedRowSparseMatrix& matrix() const { return *m_; }
private:
bool UpdateImpl(const CompressedRowSparseMatrix& A, const double* D) final;
Preconditioner::Options options_;
std::vector<std::mutex> locks_;
std::unique_ptr<CompressedRowSparseMatrix> m_;
};
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"
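Both preconditioner updates above serialize concurrent writes to a block with MakeConditionalLock. Its definition lives elsewhere in Ceres; a plausible model of the semantics relied on here (this sketch is an assumption, not the actual implementation):

#include <mutex>

// Assumed semantics: take the lock only when the update actually runs on
// multiple threads, so single-threaded solves pay no synchronization cost.
inline std::unique_lock<std::mutex> MakeConditionalLock(const int num_threads,
                                                        std::mutex& m) {
  return num_threads <= 1 ? std::unique_lock<std::mutex>()
                          : std::unique_lock<std::mutex>(m);
}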

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,6 +32,7 @@
#include <algorithm>
#include <memory>
#include <vector>
#include "ceres/block_evaluate_preparer.h"
#include "ceres/block_sparse_matrix.h"
@@ -41,10 +42,7 @@
#include "ceres/program.h"
#include "ceres/residual_block.h"
namespace ceres {
namespace internal {
using std::vector;
namespace ceres::internal {
namespace {
@@ -56,19 +54,27 @@ namespace {
// the first num_eliminate_blocks parameter blocks as indicated by the parameter
// block ordering. The remaining parameter blocks are the F blocks.
//
// In order to simplify handling of block-sparse to CRS conversion, cells
// within a row-block of the non-partitioned matrix are stored in memory
// sequentially in the order of increasing column-block id. In the case of
// partitioned matrices, cells corresponding to the F sub-matrix are stored
// sequentially in the order of increasing column-block id (with cells
// corresponding to the E sub-matrix stored separately).
//
// TODO(keir): Consider if we should use a boolean for each parameter block
// instead of num_eliminate_blocks.
void BuildJacobianLayout(const Program& program,
bool BuildJacobianLayout(const Program& program,
int num_eliminate_blocks,
vector<int*>* jacobian_layout,
vector<int>* jacobian_layout_storage) {
const vector<ResidualBlock*>& residual_blocks = program.residual_blocks();
std::vector<int*>* jacobian_layout,
std::vector<int>* jacobian_layout_storage) {
const std::vector<ResidualBlock*>& residual_blocks =
program.residual_blocks();
// Iterate over all the active residual blocks and determine how many E blocks
// are there. This will determine where the F blocks start in the jacobian
// matrix. Also compute the number of jacobian blocks.
int f_block_pos = 0;
int num_jacobian_blocks = 0;
unsigned int f_block_pos = 0;
unsigned int num_jacobian_blocks = 0;
for (auto* residual_block : residual_blocks) {
const int num_residuals = residual_block->NumResiduals();
const int num_parameter_blocks = residual_block->NumParameterBlocks();
@@ -84,6 +90,11 @@ void BuildJacobianLayout(const Program& program,
}
}
}
if (num_jacobian_blocks > std::numeric_limits<int>::max()) {
LOG(ERROR) << "Overlow error. Too many blocks in the jacobian matrix : "
<< num_jacobian_blocks;
return false;
}
}
// We now know that the E blocks are laid out starting at zero, and the F
@@ -95,65 +106,103 @@ void BuildJacobianLayout(const Program& program,
jacobian_layout_storage->resize(num_jacobian_blocks);
int e_block_pos = 0;
int* jacobian_pos = &(*jacobian_layout_storage)[0];
int* jacobian_pos = jacobian_layout_storage->data();
std::vector<std::pair<int, int>> active_parameter_blocks;
for (int i = 0; i < residual_blocks.size(); ++i) {
const ResidualBlock* residual_block = residual_blocks[i];
const int num_residuals = residual_block->NumResiduals();
const int num_parameter_blocks = residual_block->NumParameterBlocks();
(*jacobian_layout)[i] = jacobian_pos;
// Cells from the F sub-matrix are to be stored sequentially with increasing
// column block id. For each non-constant parameter block, a pair of indices
// (index in the list of active parameter blocks and index in the list of
// all parameter blocks) is computed, and the index pairs are sorted by the
// id of the corresponding column block.
active_parameter_blocks.clear();
active_parameter_blocks.reserve(num_parameter_blocks);
for (int j = 0; j < num_parameter_blocks; ++j) {
ParameterBlock* parameter_block = residual_block->parameter_blocks()[j];
const int parameter_block_index = parameter_block->index();
if (parameter_block->IsConstant()) {
continue;
}
const int k = active_parameter_blocks.size();
active_parameter_blocks.emplace_back(k, j);
}
std::sort(active_parameter_blocks.begin(),
active_parameter_blocks.end(),
[&residual_block](const std::pair<int, int>& a,
const std::pair<int, int>& b) {
return residual_block->parameter_blocks()[a.second]->index() <
residual_block->parameter_blocks()[b.second]->index();
});
// Cell positions for each active parameter block are filled in the order of
// active parameter block indices sorted by column block index. This
// guarantees that cells are laid out sequentially with increasing column
// block indices.
for (const auto& indices : active_parameter_blocks) {
const auto [k, j] = indices;
ParameterBlock* parameter_block = residual_block->parameter_blocks()[j];
const int parameter_block_index = parameter_block->index();
const int jacobian_block_size =
num_residuals * parameter_block->TangentSize();
if (parameter_block_index < num_eliminate_blocks) {
*jacobian_pos = e_block_pos;
jacobian_pos[k] = e_block_pos;
e_block_pos += jacobian_block_size;
} else {
*jacobian_pos = f_block_pos;
jacobian_pos[k] = static_cast<int>(f_block_pos);
f_block_pos += jacobian_block_size;
if (f_block_pos > std::numeric_limits<int>::max()) {
LOG(ERROR)
<< "Overlow error. Too many entries in the Jacobian matrix.";
return false;
}
}
jacobian_pos++;
}
jacobian_pos += active_parameter_blocks.size();
}
return true;
}
} // namespace
BlockJacobianWriter::BlockJacobianWriter(const Evaluator::Options& options,
Program* program)
: program_(program) {
: options_(options), program_(program) {
CHECK_GE(options.num_eliminate_blocks, 0)
<< "num_eliminate_blocks must be greater than 0.";
BuildJacobianLayout(*program,
options.num_eliminate_blocks,
&jacobian_layout_,
&jacobian_layout_storage_);
jacobian_layout_is_valid_ = BuildJacobianLayout(*program,
options.num_eliminate_blocks,
&jacobian_layout_,
&jacobian_layout_storage_);
}
// Create evaluate preparers that point directly into the final jacobian. This
// makes the final Write() a nop.
std::unique_ptr<BlockEvaluatePreparer[]>
BlockJacobianWriter::CreateEvaluatePreparers(int num_threads) {
int max_derivatives_per_residual_block =
BlockJacobianWriter::CreateEvaluatePreparers(unsigned num_threads) {
const int max_derivatives_per_residual_block =
program_->MaxDerivativesPerResidualBlock();
auto preparers = std::make_unique<BlockEvaluatePreparer[]>(num_threads);
for (int i = 0; i < num_threads; i++) {
preparers[i].Init(&jacobian_layout_[0], max_derivatives_per_residual_block);
for (unsigned i = 0; i < num_threads; i++) {
preparers[i].Init(jacobian_layout_.data(),
max_derivatives_per_residual_block);
}
return preparers;
}
std::unique_ptr<SparseMatrix> BlockJacobianWriter::CreateJacobian() const {
if (!jacobian_layout_is_valid_) {
LOG(ERROR) << "Unable to create Jacobian matrix. Too many entries in the "
"Jacobian matrix.";
return nullptr;
}
auto* bs = new CompressedRowBlockStructure;
const vector<ParameterBlock*>& parameter_blocks =
const std::vector<ParameterBlock*>& parameter_blocks =
program_->parameter_blocks();
// Construct the column blocks.
@@ -167,7 +216,8 @@ std::unique_ptr<SparseMatrix> BlockJacobianWriter::CreateJacobian() const {
}
// Construct the cells in each row.
const vector<ResidualBlock*>& residual_blocks = program_->residual_blocks();
const std::vector<ResidualBlock*>& residual_blocks =
program_->residual_blocks();
int row_block_position = 0;
bs->rows.resize(residual_blocks.size());
for (int i = 0; i < residual_blocks.size(); ++i) {
@@ -206,8 +256,8 @@ std::unique_ptr<SparseMatrix> BlockJacobianWriter::CreateJacobian() const {
std::sort(row->cells.begin(), row->cells.end(), CellLessThan);
}
return std::make_unique<BlockSparseMatrix>(bs);
return std::make_unique<BlockSparseMatrix>(
bs, options_.sparse_linear_algebra_library_type == CUDA_SPARSE);
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
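BuildJacobianLayout above guards two distinct 32-bit overflows: the number of jacobian blocks and the running f_block_pos offset. The same pattern in isolation (the function and variable names are illustrative, and a 64-bit accumulator is used for clarity):

#include <cstdint>
#include <limits>
#include <vector>

#include "glog/logging.h"

// Returns false instead of silently wrapping when the accumulated size no
// longer fits the 32-bit indices used by the sparse-matrix storage.
bool AccumulateSizesChecked(const std::vector<int>& block_sizes, int* total) {
  int64_t sum = 0;
  for (const int size : block_sizes) {
    sum += size;
    if (sum > std::numeric_limits<int>::max()) {
      LOG(ERROR) << "Overflow error. Total size does not fit an int: " << sum;
      return false;
    }
  }
  *total = static_cast<int>(sum);
  return true;
}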

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -44,16 +44,26 @@
#include "ceres/evaluator.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class BlockEvaluatePreparer;
class Program;
class SparseMatrix;
// TODO(sameeragarwal): This class needs documemtation.
// TODO(sameeragarwal): This class needs documentation.
class CERES_NO_EXPORT BlockJacobianWriter {
public:
// Pre-computes positions of cells in block-sparse jacobian.
// Two possible memory layouts are implemented:
// - Non-partitioned case
// - Partitioned case (for Schur type linear solver)
//
// In non-partitioned case, cells are stored sequentially in the
// lexicographic order of (row block id, column block id).
//
// In the case of a partitioned matrix, cells of each sub-matrix (E and F) are
// stored sequentially in the lexicographic order of (row block id, column
// block id) and cells from E sub-matrix precede cells from F sub-matrix.
BlockJacobianWriter(const Evaluator::Options& options, Program* program);
// JacobianWriter interface.
@@ -61,7 +71,7 @@ class CERES_NO_EXPORT BlockJacobianWriter {
// Create evaluate preparers that point directly into the final jacobian.
// This makes the final Write() a nop.
std::unique_ptr<BlockEvaluatePreparer[]> CreateEvaluatePreparers(
int num_threads);
unsigned num_threads);
std::unique_ptr<SparseMatrix> CreateJacobian() const;
@@ -75,12 +85,13 @@ class CERES_NO_EXPORT BlockJacobianWriter {
}
private:
Evaluator::Options options_;
Program* program_;
// Stores the position of each residual / parameter jacobian.
//
// The block sparse matrix that this writer writes to is stored as a set of
// contiguos dense blocks, one after each other; see BlockSparseMatrix. The
// contiguous dense blocks, one after each other; see BlockSparseMatrix. The
// "double* values_" member of the block sparse matrix contains all of these
// blocks. Given a pointer to the first element of a block and the size of
// that block, it's possible to write to it.
@@ -122,9 +133,14 @@ class CERES_NO_EXPORT BlockJacobianWriter {
// The pointers in jacobian_layout_ point directly into this vector.
std::vector<int> jacobian_layout_storage_;
// The constructor computes the layout of the Jacobian, and this bool keeps
// track of whether the computation of the layout completed successfully. If
// it is false, then jacobian_layout and jacobian_layout_storage are both in
// an invalid state.
bool jacobian_layout_is_valid_ = false;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_BLOCK_JACOBIAN_WRITER_H_
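To make the two layouts concrete, consider a toy problem (illustrative, not from the source): num_eliminate_blocks = 1, column blocks {e0, f0, f1}, and two residual blocks whose non-constant parameters touch (e0, f1) and (f0, f1) respectively. The E region of the values array then holds residual block 0's e0 cell starting at position zero, and the F region that follows holds, in order, residual block 0's f1 cell, then residual block 1's f0 and f1 cells. Within each sub-matrix the cells thus appear in lexicographic (row block id, column block id) order, each stored as a contiguous row-major dense block.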

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -30,26 +30,21 @@
#include "ceres/block_random_access_dense_matrix.h"
#include <utility>
#include <vector>
#include "ceres/internal/eigen.h"
#include "ceres/parallel_vector_ops.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
BlockRandomAccessDenseMatrix::BlockRandomAccessDenseMatrix(
const std::vector<int>& blocks) {
const int num_blocks = blocks.size();
block_layout_.resize(num_blocks, 0);
num_rows_ = 0;
for (int i = 0; i < num_blocks; ++i) {
block_layout_[i] = num_rows_;
num_rows_ += blocks[i];
}
std::vector<Block> blocks, ContextImpl* context, int num_threads)
: blocks_(std::move(blocks)), context_(context), num_threads_(num_threads) {
const int num_blocks = blocks_.size();
num_rows_ = NumScalarEntries(blocks_);
values_ = std::make_unique<double[]>(num_rows_ * num_rows_);
cell_infos_ = std::make_unique<CellInfo[]>(num_blocks * num_blocks);
for (int i = 0; i < num_blocks * num_blocks; ++i) {
cell_infos_[i].values = values_.get();
@@ -58,30 +53,23 @@ BlockRandomAccessDenseMatrix::BlockRandomAccessDenseMatrix(
SetZero();
}
// Assume that the user does not hold any locks on any cell blocks
// when they are calling SetZero.
BlockRandomAccessDenseMatrix::~BlockRandomAccessDenseMatrix() = default;
CellInfo* BlockRandomAccessDenseMatrix::GetCell(const int row_block_id,
const int col_block_id,
int* row,
int* col,
int* row_stride,
int* col_stride) {
*row = block_layout_[row_block_id];
*col = block_layout_[col_block_id];
*row = blocks_[row_block_id].position;
*col = blocks_[col_block_id].position;
*row_stride = num_rows_;
*col_stride = num_rows_;
return &cell_infos_[row_block_id * block_layout_.size() + col_block_id];
return &cell_infos_[row_block_id * blocks_.size() + col_block_id];
}
// Assume that the user does not hold any locks on any cell blocks
// when they are calling SetZero.
void BlockRandomAccessDenseMatrix::SetZero() {
if (num_rows_) {
VectorRef(values_.get(), num_rows_ * num_rows_).setZero();
}
ParallelSetZero(context_, num_threads_, values_.get(), num_rows_ * num_rows_);
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
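Since GetCell above hands back whole-matrix strides, indexing into a cell is plain row-major arithmetic over the single shared array. A standalone model of that addressing (the function name is illustrative):

#include <vector>

// One dense num_rows x num_rows row-major array backs every cell; the cell
// for block pair (i, j) starts at (blocks[i].position, blocks[j].position)
// and row_stride == col_stride == num_rows.
double& CellEntry(std::vector<double>& values, int num_rows,
                  int block_i_position, int block_j_position, int r, int c) {
  return values[(block_i_position + r) * num_rows + (block_j_position + c)];
}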

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -35,11 +35,12 @@
#include <vector>
#include "ceres/block_random_access_matrix.h"
#include "ceres/block_structure.h"
#include "ceres/context_impl.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// A square block random accessible matrix with the same row and
// column block structure. All cells are stored in the same single
@@ -56,13 +57,11 @@ class CERES_NO_EXPORT BlockRandomAccessDenseMatrix
public:
// blocks is a vector of block sizes. The resulting matrix has
// blocks.size() * blocks.size() cells.
explicit BlockRandomAccessDenseMatrix(const std::vector<int>& blocks);
BlockRandomAccessDenseMatrix(const BlockRandomAccessDenseMatrix&) = delete;
void operator=(const BlockRandomAccessDenseMatrix&) = delete;
explicit BlockRandomAccessDenseMatrix(std::vector<Block> blocks,
ContextImpl* context,
int num_threads);
// The destructor is not thread safe. It assumes that no one is
// modifying any cells when the matrix is being destroyed.
~BlockRandomAccessDenseMatrix() override;
~BlockRandomAccessDenseMatrix() override = default;
// BlockRandomAccessMatrix interface.
CellInfo* GetCell(int row_block_id,
@@ -72,8 +71,6 @@ class CERES_NO_EXPORT BlockRandomAccessDenseMatrix
int* row_stride,
int* col_stride) final;
// This is not a thread safe method, it assumes that no cell is
// locked.
void SetZero() final;
// Since the matrix is square with the same row and column block
@@ -86,14 +83,15 @@ class CERES_NO_EXPORT BlockRandomAccessDenseMatrix
double* mutable_values() { return values_.get(); }
private:
int num_rows_;
std::vector<int> block_layout_;
std::vector<Block> blocks_;
ContextImpl* context_ = nullptr;
int num_threads_ = -1;
int num_rows_ = -1;
std::unique_ptr<double[]> values_;
std::unique_ptr<CellInfo[]> cell_infos_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -37,61 +37,26 @@
#include <vector>
#include "Eigen/Dense"
#include "ceres/compressed_row_sparse_matrix.h"
#include "ceres/internal/export.h"
#include "ceres/parallel_for.h"
#include "ceres/parallel_vector_ops.h"
#include "ceres/stl_util.h"
#include "ceres/triplet_sparse_matrix.h"
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::vector;
// TODO(sameeragarwal): Drop the dependence on TripletSparseMatrix.
namespace ceres::internal {
BlockRandomAccessDiagonalMatrix::BlockRandomAccessDiagonalMatrix(
const vector<int>& blocks)
: blocks_(blocks) {
// Build the row/column layout vector and count the number of scalar
// rows/columns.
int num_cols = 0;
int num_nonzeros = 0;
vector<int> block_positions;
for (int block_size : blocks_) {
block_positions.push_back(num_cols);
num_cols += block_size;
num_nonzeros += block_size * block_size;
const std::vector<Block>& blocks, ContextImpl* context, int num_threads)
: context_(context), num_threads_(num_threads) {
m_ = CompressedRowSparseMatrix::CreateBlockDiagonalMatrix(nullptr, blocks);
double* values = m_->mutable_values();
layout_.reserve(blocks.size());
for (auto& block : blocks) {
layout_.emplace_back(std::make_unique<CellInfo>(values));
values += block.size * block.size;
}
VLOG(1) << "Matrix Size [" << num_cols << "," << num_cols << "] "
<< num_nonzeros;
tsm_ =
std::make_unique<TripletSparseMatrix>(num_cols, num_cols, num_nonzeros);
tsm_->set_num_nonzeros(num_nonzeros);
int* rows = tsm_->mutable_rows();
int* cols = tsm_->mutable_cols();
double* values = tsm_->mutable_values();
int pos = 0;
for (int i = 0; i < blocks_.size(); ++i) {
const int block_size = blocks_[i];
layout_.push_back(new CellInfo(values + pos));
const int block_begin = block_positions[i];
for (int r = 0; r < block_size; ++r) {
for (int c = 0; c < block_size; ++c, ++pos) {
rows[pos] = block_begin + r;
cols[pos] = block_begin + c;
}
}
}
}
// Assume that the user does not hold any locks on any cell blocks
// when they are calling SetZero.
BlockRandomAccessDiagonalMatrix::~BlockRandomAccessDiagonalMatrix() {
STLDeleteContainerPointers(layout_.begin(), layout_.end());
}
CellInfo* BlockRandomAccessDiagonalMatrix::GetCell(int row_block_id,
@@ -103,47 +68,51 @@ CellInfo* BlockRandomAccessDiagonalMatrix::GetCell(int row_block_id,
if (row_block_id != col_block_id) {
return nullptr;
}
const int stride = blocks_[row_block_id];
auto& blocks = m_->row_blocks();
const int stride = blocks[row_block_id].size;
// Each cell is stored contiguously as its own little dense matrix.
*row = 0;
*col = 0;
*row_stride = stride;
*col_stride = stride;
return layout_[row_block_id];
return layout_[row_block_id].get();
}
// Assume that the user does not hold any locks on any cell blocks
// when they are calling SetZero.
void BlockRandomAccessDiagonalMatrix::SetZero() {
if (tsm_->num_nonzeros()) {
VectorRef(tsm_->mutable_values(), tsm_->num_nonzeros()).setZero();
}
ParallelSetZero(
context_, num_threads_, m_->mutable_values(), m_->num_nonzeros());
}
void BlockRandomAccessDiagonalMatrix::Invert() {
double* values = tsm_->mutable_values();
for (int block_size : blocks_) {
MatrixRef block(values, block_size, block_size);
block = block.selfadjointView<Eigen::Upper>().llt().solve(
Matrix::Identity(block_size, block_size));
values += block_size * block_size;
}
auto& blocks = m_->row_blocks();
const int num_blocks = blocks.size();
ParallelFor(context_, 0, num_blocks, num_threads_, [this, blocks](int i) {
auto* cell_info = layout_[i].get();
auto& block = blocks[i];
MatrixRef b(cell_info->values, block.size, block.size);
b = b.selfadjointView<Eigen::Upper>().llt().solve(
Matrix::Identity(block.size, block.size));
});
}
void BlockRandomAccessDiagonalMatrix::RightMultiply(const double* x,
double* y) const {
void BlockRandomAccessDiagonalMatrix::RightMultiplyAndAccumulate(
const double* x, double* y) const {
CHECK(x != nullptr);
CHECK(y != nullptr);
const double* values = tsm_->values();
for (int block_size : blocks_) {
ConstMatrixRef block(values, block_size, block_size);
VectorRef(y, block_size).noalias() += block * ConstVectorRef(x, block_size);
x += block_size;
y += block_size;
values += block_size * block_size;
}
auto& blocks = m_->row_blocks();
const int num_blocks = blocks.size();
ParallelFor(
context_, 0, num_blocks, num_threads_, [this, blocks, x, y](int i) {
auto* cell_info = layout_[i].get();
auto& block = blocks[i];
ConstMatrixRef b(cell_info->values, block.size, block.size);
VectorRef(y + block.position, block.size).noalias() +=
b * ConstVectorRef(x + block.position, block.size);
});
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
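Because the matrix is block diagonal, every method above decomposes into independent per-block work, which is what the lock-free ParallelFor loops exploit:

\[
M = \operatorname{blockdiag}(M_1, \dots, M_k), \qquad
\texttt{Invert()}: \; M_i \leftarrow M_i^{-1} \ (\text{via } M_i = L L^\top), \qquad
\texttt{RightMultiplyAndAccumulate()}: \; y_i \mathrel{+}= M_i\, x_i .
\]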

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,33 +32,30 @@
#define CERES_INTERNAL_BLOCK_RANDOM_ACCESS_DIAGONAL_MATRIX_H_
#include <memory>
#include <set>
#include <utility>
#include <vector>
#include "ceres/block_random_access_matrix.h"
#include "ceres/block_structure.h"
#include "ceres/compressed_row_sparse_matrix.h"
#include "ceres/context_impl.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
#include "ceres/triplet_sparse_matrix.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// A thread safe block diagonal matrix implementation of
// BlockRandomAccessMatrix.
// A BlockRandomAccessMatrix which only stores the block diagonal.
// BlockRandomAccessSparseMatrix can also be used to do this, but this class is
// more efficient in time and in space.
class CERES_NO_EXPORT BlockRandomAccessDiagonalMatrix
: public BlockRandomAccessMatrix {
public:
// blocks is an array of block sizes.
explicit BlockRandomAccessDiagonalMatrix(const std::vector<int>& blocks);
BlockRandomAccessDiagonalMatrix(const BlockRandomAccessDiagonalMatrix&) =
delete;
void operator=(const BlockRandomAccessDiagonalMatrix&) = delete;
// The destructor is not thread safe. It assumes that no one is
// modifying any cells when the matrix is being destroyed.
~BlockRandomAccessDiagonalMatrix() override;
BlockRandomAccessDiagonalMatrix(const std::vector<Block>& blocks,
ContextImpl* context,
int num_threads);
~BlockRandomAccessDiagonalMatrix() override = default;
// BlockRandomAccessMatrix Interface.
CellInfo* GetCell(int row_block_id,
@@ -68,36 +65,30 @@ class CERES_NO_EXPORT BlockRandomAccessDiagonalMatrix
int* row_stride,
int* col_stride) final;
// This is not a thread safe method, it assumes that no cell is
// locked.
// m = 0
void SetZero() final;
// Invert the matrix assuming that each block is positive definite.
// m = m^{-1}
void Invert();
// y += S * x
void RightMultiply(const double* x, double* y) const;
// y += m * x
void RightMultiplyAndAccumulate(const double* x, double* y) const;
// Since the matrix is square, num_rows() == num_cols().
int num_rows() const final { return tsm_->num_rows(); }
int num_cols() const final { return tsm_->num_cols(); }
int num_rows() const final { return m_->num_rows(); }
int num_cols() const final { return m_->num_cols(); }
const TripletSparseMatrix* matrix() const { return tsm_.get(); }
TripletSparseMatrix* mutable_matrix() { return tsm_.get(); }
const CompressedRowSparseMatrix* matrix() const { return m_.get(); }
CompressedRowSparseMatrix* mutable_matrix() { return m_.get(); }
private:
// row/column block sizes.
const std::vector<int> blocks_;
std::vector<CellInfo*> layout_;
// The underlying matrix object which actually stores the cells.
std::unique_ptr<TripletSparseMatrix> tsm_;
friend class BlockRandomAccessDiagonalMatrixTest;
ContextImpl* context_ = nullptr;
const int num_threads_ = 1;
std::unique_ptr<CompressedRowSparseMatrix> m_;
std::vector<std::unique_ptr<CellInfo>> layout_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -30,10 +30,8 @@
#include "ceres/block_random_access_matrix.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
BlockRandomAccessMatrix::~BlockRandomAccessMatrix() = default;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -37,8 +37,7 @@
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// A matrix implementing the BlockRandomAccessMatrix interface is a
// matrix whose rows and columns are divided into blocks. For example
@@ -123,7 +122,6 @@ class CERES_NO_EXPORT BlockRandomAccessMatrix {
virtual int num_cols() const = 0;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_BLOCK_RANDOM_ACCESS_MATRIX_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -37,87 +37,63 @@
#include <vector>
#include "ceres/internal/export.h"
#include "ceres/parallel_vector_ops.h"
#include "ceres/triplet_sparse_matrix.h"
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::make_pair;
using std::pair;
using std::set;
using std::vector;
namespace ceres::internal {
BlockRandomAccessSparseMatrix::BlockRandomAccessSparseMatrix(
const vector<int>& blocks, const set<pair<int, int>>& block_pairs)
: kMaxRowBlocks(10 * 1000 * 1000), blocks_(blocks) {
CHECK_LT(blocks.size(), kMaxRowBlocks);
const std::vector<Block>& blocks,
const std::set<std::pair<int, int>>& block_pairs,
ContextImpl* context,
int num_threads)
: blocks_(blocks), context_(context), num_threads_(num_threads) {
CHECK_LE(blocks.size(), std::numeric_limits<std::int32_t>::max());
// Build the row/column layout vector and count the number of scalar
// rows/columns.
int num_cols = 0;
block_positions_.reserve(blocks_.size());
for (int block_size : blocks_) {
block_positions_.push_back(num_cols);
num_cols += block_size;
const int num_cols = NumScalarEntries(blocks);
const int num_blocks = blocks.size();
std::vector<int> num_cells_at_row(num_blocks);
for (auto& p : block_pairs) {
++num_cells_at_row[p.first];
}
// Count the number of scalar non-zero entries and build the layout
// object for looking into the values array of the
// TripletSparseMatrix.
auto block_structure_ = new CompressedRowBlockStructure;
block_structure_->cols = blocks;
block_structure_->rows.resize(num_blocks);
auto p = block_pairs.begin();
int num_nonzeros = 0;
for (const auto& block_pair : block_pairs) {
const int row_block_size = blocks_[block_pair.first];
const int col_block_size = blocks_[block_pair.second];
num_nonzeros += row_block_size * col_block_size;
}
VLOG(1) << "Matrix Size [" << num_cols << "," << num_cols << "] "
<< num_nonzeros;
tsm_ =
std::make_unique<TripletSparseMatrix>(num_cols, num_cols, num_nonzeros);
tsm_->set_num_nonzeros(num_nonzeros);
int* rows = tsm_->mutable_rows();
int* cols = tsm_->mutable_cols();
double* values = tsm_->mutable_values();
int pos = 0;
for (const auto& block_pair : block_pairs) {
const int row_block_size = blocks_[block_pair.first];
const int col_block_size = blocks_[block_pair.second];
cell_values_.emplace_back(block_pair, values + pos);
layout_[IntPairToLong(block_pair.first, block_pair.second)] =
new CellInfo(values + pos);
pos += row_block_size * col_block_size;
}
// Fill the sparsity pattern of the underlying matrix.
for (const auto& block_pair : block_pairs) {
const int row_block_id = block_pair.first;
const int col_block_id = block_pair.second;
const int row_block_size = blocks_[row_block_id];
const int col_block_size = blocks_[col_block_id];
int pos =
layout_[IntPairToLong(row_block_id, col_block_id)]->values - values;
for (int r = 0; r < row_block_size; ++r) {
for (int c = 0; c < col_block_size; ++c, ++pos) {
rows[pos] = block_positions_[row_block_id] + r;
cols[pos] = block_positions_[col_block_id] + c;
values[pos] = 1.0;
DCHECK_LT(rows[pos], tsm_->num_rows());
DCHECK_LT(cols[pos], tsm_->num_rows());
}
// Pairs of block indices are sorted lexicographically, thus pairs
// corresponding to a single row-block are stored in segments of index pairs
// with constant row-block index and increasing column-block index.
// CompressedRowBlockStructure is created by traversing the block_pairs set.
for (int row_block_id = 0; row_block_id < num_blocks; ++row_block_id) {
auto& row = block_structure_->rows[row_block_id];
row.block = blocks[row_block_id];
row.cells.reserve(num_cells_at_row[row_block_id]);
const int row_block_size = blocks[row_block_id].size;
// Process all index pairs corresponding to the current row block. Because
// index pairs are sorted lexicographically, cells are appended to the
// current row block until the first change in row-block index.
for (; p != block_pairs.end() && row_block_id == p->first; ++p) {
const int col_block_id = p->second;
row.cells.emplace_back(col_block_id, num_nonzeros);
num_nonzeros += row_block_size * blocks[col_block_id].size;
}
}
}
// Assume that the user does not hold any locks on any cell blocks
// when they are calling SetZero.
BlockRandomAccessSparseMatrix::~BlockRandomAccessSparseMatrix() {
for (const auto& entry : layout_) {
delete entry.second;
bsm_ = std::make_unique<BlockSparseMatrix>(block_structure_);
VLOG(1) << "Matrix Size [" << num_cols << "," << num_cols << "] "
<< num_nonzeros;
double* values = bsm_->mutable_values();
for (int row_block_id = 0; row_block_id < num_blocks; ++row_block_id) {
const auto& cells = block_structure_->rows[row_block_id].cells;
for (auto& c : cells) {
const int col_block_id = c.block_id;
double* const data = values + c.position;
layout_[IntPairToInt64(row_block_id, col_block_id)] =
std::make_unique<CellInfo>(data);
}
}
}
@@ -127,8 +103,7 @@ CellInfo* BlockRandomAccessSparseMatrix::GetCell(int row_block_id,
int* col,
int* row_stride,
int* col_stride) {
const LayoutType::iterator it =
layout_.find(IntPairToLong(row_block_id, col_block_id));
const auto it = layout_.find(IntPairToInt64(row_block_id, col_block_id));
if (it == layout_.end()) {
return nullptr;
}
@@ -136,44 +111,49 @@ CellInfo* BlockRandomAccessSparseMatrix::GetCell(int row_block_id,
// Each cell is stored contiguously as its own little dense matrix.
*row = 0;
*col = 0;
*row_stride = blocks_[row_block_id];
*col_stride = blocks_[col_block_id];
return it->second;
*row_stride = blocks_[row_block_id].size;
*col_stride = blocks_[col_block_id].size;
return it->second.get();
}
// Assume that the user does not hold any locks on any cell blocks
// when they are calling SetZero.
void BlockRandomAccessSparseMatrix::SetZero() {
if (tsm_->num_nonzeros()) {
VectorRef(tsm_->mutable_values(), tsm_->num_nonzeros()).setZero();
}
bsm_->SetZero(context_, num_threads_);
}
void BlockRandomAccessSparseMatrix::SymmetricRightMultiply(const double* x,
double* y) const {
for (const auto& cell_position_and_data : cell_values_) {
const int row = cell_position_and_data.first.first;
const int row_block_size = blocks_[row];
const int row_block_pos = block_positions_[row];
void BlockRandomAccessSparseMatrix::SymmetricRightMultiplyAndAccumulate(
const double* x, double* y) const {
const auto bs = bsm_->block_structure();
const auto values = bsm_->values();
const int num_blocks = blocks_.size();
const int col = cell_position_and_data.first.second;
const int col_block_size = blocks_[col];
const int col_block_pos = block_positions_[col];
for (int row_block_id = 0; row_block_id < num_blocks; ++row_block_id) {
const auto& row_block = bs->rows[row_block_id];
const int row_block_size = row_block.block.size;
const int row_block_pos = row_block.block.position;
MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
cell_position_and_data.second,
row_block_size,
col_block_size,
x + col_block_pos,
y + row_block_pos);
for (auto& c : row_block.cells) {
const int col_block_id = c.block_id;
const int col_block_size = blocks_[col_block_id].size;
const int col_block_pos = blocks_[col_block_id].position;
// Since the matrix is symmetric, but only the upper triangular
// part is stored, if the block being accessed is not a diagonal
// block, then use the same block to do the corresponding lower
// triangular multiply also.
if (row != col) {
MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
values + c.position,
row_block_size,
col_block_size,
x + col_block_pos,
y + row_block_pos);
if (col_block_id == row_block_id) {
continue;
}
// Since the matrix is symmetric, but only the upper triangular
// part is stored, if the block being accessed is not a diagonal
// block, then use the same block to do the corresponding lower
// triangular multiply also
MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
cell_position_and_data.second,
values + c.position,
row_block_size,
col_block_size,
x + row_block_pos,
@@ -182,5 +162,4 @@ void BlockRandomAccessSparseMatrix::SymmetricRightMultiply(const double* x,
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
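Since only the upper triangular half of the symmetric matrix is stored, each stored off-diagonal cell above contributes to two segments of the result; per stored cell (i, j) with i <= j the accumulation is:

\[
y_i \mathrel{+}= M_{ij}\, x_j, \qquad y_j \mathrel{+}= M_{ij}^\top\, x_i \quad (i \neq j).
\]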

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -39,17 +39,18 @@
#include <vector>
#include "ceres/block_random_access_matrix.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/block_structure.h"
#include "ceres/context_impl.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
#include "ceres/small_blas.h"
#include "ceres/triplet_sparse_matrix.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// A thread safe square block sparse implementation of
// BlockRandomAccessMatrix. Internally a TripletSparseMatrix is used
// BlockRandomAccessMatrix. Internally a BlockSparseMatrix is used
// for doing the actual storage. This class augments this matrix with
// an unordered_map that allows random read/write access.
class CERES_NO_EXPORT BlockRandomAccessSparseMatrix
@@ -59,14 +60,14 @@ class CERES_NO_EXPORT BlockRandomAccessSparseMatrix
// <row_block_id, col_block_id> pairs to identify the non-zero cells
// of this matrix.
BlockRandomAccessSparseMatrix(
const std::vector<int>& blocks,
const std::set<std::pair<int, int>>& block_pairs);
BlockRandomAccessSparseMatrix(const BlockRandomAccessSparseMatrix&) = delete;
void operator=(const BlockRandomAccessSparseMatrix&) = delete;
const std::vector<Block>& blocks,
const std::set<std::pair<int, int>>& block_pairs,
ContextImpl* context,
int num_threads);
// The destructor is not thread safe. It assumes that no one is
// modifying any cells when the matrix is being destroyed.
~BlockRandomAccessSparseMatrix() override;
~BlockRandomAccessSparseMatrix() override = default;
// BlockRandomAccessMatrix Interface.
CellInfo* GetCell(int row_block_id,
@@ -80,53 +81,49 @@ class CERES_NO_EXPORT BlockRandomAccessSparseMatrix
// locked.
void SetZero() final;
// Assume that the matrix is symmetric and only one half of the
// matrix is stored.
// Assume that the matrix is symmetric and only one half of the matrix is
// stored.
//
// y += S * x
void SymmetricRightMultiply(const double* x, double* y) const;
void SymmetricRightMultiplyAndAccumulate(const double* x, double* y) const;
// Since the matrix is square, num_rows() == num_cols().
int num_rows() const final { return tsm_->num_rows(); }
int num_cols() const final { return tsm_->num_cols(); }
int num_rows() const final { return bsm_->num_rows(); }
int num_cols() const final { return bsm_->num_cols(); }
// Access to the underlying matrix object.
const TripletSparseMatrix* matrix() const { return tsm_.get(); }
TripletSparseMatrix* mutable_matrix() { return tsm_.get(); }
const BlockSparseMatrix* matrix() const { return bsm_.get(); }
BlockSparseMatrix* mutable_matrix() { return bsm_.get(); }
private:
int64_t IntPairToLong(int row, int col) const {
return row * kMaxRowBlocks + col;
int64_t IntPairToInt64(int row, int col) const {
return row * kRowShift + col;
}
void LongToIntPair(int64_t index, int* row, int* col) const {
*row = index / kMaxRowBlocks;
*col = index % kMaxRowBlocks;
void Int64ToIntPair(int64_t index, int* row, int* col) const {
*row = index / kRowShift;
*col = index % kRowShift;
}
const int64_t kMaxRowBlocks;
constexpr static int64_t kRowShift{1ll << 32};
// row/column block sizes.
const std::vector<int> blocks_;
std::vector<int> block_positions_;
const std::vector<Block> blocks_;
ContextImpl* context_ = nullptr;
const int num_threads_ = 1;
// A mapping from <row_block_id, col_block_id> to the position in
// the values array of bsm_ where the block is stored.
using LayoutType = std::unordered_map<long, CellInfo*>;
using LayoutType = std::unordered_map<int64_t, std::unique_ptr<CellInfo>>;
LayoutType layout_;
// In order traversal of contents of the matrix. This allows us to
// implement a matrix-vector which is 20% faster than using the
// iterator in the Layout object instead.
std::vector<std::pair<std::pair<int, int>, double*>> cell_values_;
// The underlying matrix object which actually stores the cells.
std::unique_ptr<TripletSparseMatrix> tsm_;
std::unique_ptr<BlockSparseMatrix> bsm_;
friend class BlockRandomAccessSparseMatrixTest;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"
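The layout keys above pack a (row block, column block) index pair into one 64-bit integer with kRowShift = 2^32. A standalone round-trip check of that encoding (a sketch mirroring IntPairToInt64 and Int64ToIntPair):

#include <cassert>
#include <cstdint>

constexpr int64_t kRowShift = int64_t{1} << 32;

int64_t Pack(int row, int col) { return row * kRowShift + col; }

void Unpack(int64_t key, int* row, int* col) {
  *row = static_cast<int>(key / kRowShift);
  *col = static_cast<int>(key % kRowShift);
}

int main() {
  int row = 0;
  int col = 0;
  Unpack(Pack(123, 456), &row, &col);
  assert(row == 123 && col == 456);  // Holds for all non-negative 32-bit ids.
  return 0;
}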

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -33,23 +33,151 @@
#include <algorithm>
#include <cstddef>
#include <memory>
#include <numeric>
#include <random>
#include <vector>
#include "ceres/block_structure.h"
#include "ceres/crs_matrix.h"
#include "ceres/internal/eigen.h"
#include "ceres/random.h"
#include "ceres/parallel_for.h"
#include "ceres/parallel_vector_ops.h"
#include "ceres/small_blas.h"
#include "ceres/triplet_sparse_matrix.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
#ifndef CERES_NO_CUDA
#include "cuda_runtime.h"
#endif
using std::vector;
namespace ceres::internal {
namespace {
void ComputeCumulativeNumberOfNonZeros(std::vector<CompressedList>& rows) {
if (rows.empty()) {
return;
}
rows[0].cumulative_nnz = rows[0].nnz;
for (int c = 1; c < rows.size(); ++c) {
const int curr_nnz = rows[c].nnz;
rows[c].cumulative_nnz = curr_nnz + rows[c - 1].cumulative_nnz;
}
}
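// For ComputeCumulativeNumberOfNonZeros above: per-row-block nnz values
// {4, 2, 3} yield cumulative_nnz values {4, 6, 9}; the last entry is the
// total number of non-zeros.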
template <bool transpose>
std::unique_ptr<CompressedRowSparseMatrix>
CreateStructureOfCompressedRowSparseMatrix(
const double* values,
int num_rows,
int num_cols,
int num_nonzeros,
const CompressedRowBlockStructure* block_structure) {
auto crs_matrix = std::make_unique<CompressedRowSparseMatrix>(
num_rows, num_cols, num_nonzeros);
auto crs_cols = crs_matrix->mutable_cols();
auto crs_rows = crs_matrix->mutable_rows();
int value_offset = 0;
const int num_row_blocks = block_structure->rows.size();
const auto& cols = block_structure->cols;
*crs_rows++ = 0;
for (int row_block_id = 0; row_block_id < num_row_blocks; ++row_block_id) {
const auto& row_block = block_structure->rows[row_block_id];
// Empty row block: only requires setting row offsets
if (row_block.cells.empty()) {
std::fill(crs_rows, crs_rows + row_block.block.size, value_offset);
crs_rows += row_block.block.size;
continue;
}
int row_nnz = 0;
if constexpr (transpose) {
// The transposed block structure comes with the nnz of each row-block
// filled in.
row_nnz = row_block.nnz / row_block.block.size;
} else {
// The nnz field of the non-transposed block structure is not filled in, and
// it can have a non-sequential structure (consider the case of the jacobian
// for a Schur-complement solver: E and F blocks are stored separately).
for (auto& c : row_block.cells) {
row_nnz += cols[c.block_id].size;
}
}
// Row-wise setup of matrix structure
for (int row = 0; row < row_block.block.size; ++row) {
value_offset += row_nnz;
*crs_rows++ = value_offset;
for (auto& c : row_block.cells) {
const int col_block_size = cols[c.block_id].size;
const int col_position = cols[c.block_id].position;
std::iota(crs_cols, crs_cols + col_block_size, col_position);
crs_cols += col_block_size;
}
}
}
return crs_matrix;
}
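// Worked example (illustrative, not from the source): a single row block of
// size 2 with cells in column blocks of sizes {2, 1} at positions {0, 2}
// gives row_nnz = 3, crs_rows = [0, 3, 6] and crs_cols = [0, 1, 2, 0, 1, 2];
// every scalar row of a row block repeats the same column pattern.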
template <bool transpose>
void UpdateCompressedRowSparseMatrixImpl(
CompressedRowSparseMatrix* crs_matrix,
const double* values,
const CompressedRowBlockStructure* block_structure) {
auto crs_values = crs_matrix->mutable_values();
auto crs_rows = crs_matrix->mutable_rows();
const int num_row_blocks = block_structure->rows.size();
const auto& cols = block_structure->cols;
for (int row_block_id = 0; row_block_id < num_row_blocks; ++row_block_id) {
const auto& row_block = block_structure->rows[row_block_id];
const int row_block_size = row_block.block.size;
const int row_nnz = crs_rows[1] - crs_rows[0];
crs_rows += row_block_size;
if (row_nnz == 0) {
continue;
}
MatrixRef crs_row_block(crs_values, row_block_size, row_nnz);
int col_offset = 0;
for (auto& c : row_block.cells) {
const int col_block_size = cols[c.block_id].size;
auto crs_cell =
crs_row_block.block(0, col_offset, row_block_size, col_block_size);
if constexpr (transpose) {
// The transposed matrix is filled using the transposed block-structure
ConstMatrixRef cell(
values + c.position, col_block_size, row_block_size);
crs_cell = cell.transpose();
} else {
ConstMatrixRef cell(
values + c.position, row_block_size, col_block_size);
crs_cell = cell;
}
col_offset += col_block_size;
}
crs_values += row_nnz * row_block_size;
}
}
void SetBlockStructureOfCompressedRowSparseMatrix(
CompressedRowSparseMatrix* crs_matrix,
CompressedRowBlockStructure* block_structure) {
const int num_row_blocks = block_structure->rows.size();
auto& row_blocks = *crs_matrix->mutable_row_blocks();
row_blocks.resize(num_row_blocks);
for (int i = 0; i < num_row_blocks; ++i) {
row_blocks[i] = block_structure->rows[i].block;
}
auto& col_blocks = *crs_matrix->mutable_col_blocks();
col_blocks = block_structure->cols;
}
} // namespace
BlockSparseMatrix::BlockSparseMatrix(
CompressedRowBlockStructure* block_structure)
: num_rows_(0),
CompressedRowBlockStructure* block_structure, bool use_page_locked_memory)
: use_page_locked_memory_(use_page_locked_memory),
num_rows_(0),
num_cols_(0),
num_nonzeros_(0),
block_structure_(block_structure) {
@@ -66,7 +194,7 @@ BlockSparseMatrix::BlockSparseMatrix(
int row_block_size = block_structure_->rows[i].block.size;
num_rows_ += row_block_size;
const vector<Cell>& cells = block_structure_->rows[i].cells;
const std::vector<Cell>& cells = block_structure_->rows[i].cells;
for (const auto& cell : cells) {
int col_block_id = cell.block_id;
int col_block_size = block_structure_->cols[col_block_id].size;
@@ -79,51 +207,138 @@ BlockSparseMatrix::BlockSparseMatrix(
CHECK_GE(num_nonzeros_, 0);
VLOG(2) << "Allocating values array with " << num_nonzeros_ * sizeof(double)
<< " bytes."; // NOLINT
values_ = std::make_unique<double[]>(num_nonzeros_);
values_ = AllocateValues(num_nonzeros_);
max_num_nonzeros_ = num_nonzeros_;
CHECK(values_ != nullptr);
AddTransposeBlockStructure();
}
void BlockSparseMatrix::SetZero() {
std::fill(values_.get(), values_.get() + num_nonzeros_, 0.0);
}
BlockSparseMatrix::~BlockSparseMatrix() { FreeValues(values_); }
void BlockSparseMatrix::RightMultiply(const double* x, double* y) const {
CHECK(x != nullptr);
CHECK(y != nullptr);
for (int i = 0; i < block_structure_->rows.size(); ++i) {
int row_block_pos = block_structure_->rows[i].block.position;
int row_block_size = block_structure_->rows[i].block.size;
const vector<Cell>& cells = block_structure_->rows[i].cells;
for (const auto& cell : cells) {
int col_block_id = cell.block_id;
int col_block_size = block_structure_->cols[col_block_id].size;
int col_block_pos = block_structure_->cols[col_block_id].position;
MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
values_.get() + cell.position,
row_block_size,
col_block_size,
x + col_block_pos,
y + row_block_pos);
}
void BlockSparseMatrix::AddTransposeBlockStructure() {
if (transpose_block_structure_ == nullptr) {
transpose_block_structure_ = CreateTranspose(*block_structure_);
}
}
void BlockSparseMatrix::LeftMultiply(const double* x, double* y) const {
void BlockSparseMatrix::SetZero() {
std::fill(values_, values_ + num_nonzeros_, 0.0);
}
void BlockSparseMatrix::SetZero(ContextImpl* context, int num_threads) {
ParallelSetZero(context, num_threads, values_, num_nonzeros_);
}
void BlockSparseMatrix::RightMultiplyAndAccumulate(const double* x,
double* y) const {
RightMultiplyAndAccumulate(x, y, nullptr, 1);
}
void BlockSparseMatrix::RightMultiplyAndAccumulate(const double* x,
double* y,
ContextImpl* context,
int num_threads) const {
CHECK(x != nullptr);
CHECK(y != nullptr);
const auto values = values_;
const auto block_structure = block_structure_.get();
const auto num_row_blocks = block_structure->rows.size();
ParallelFor(context,
0,
num_row_blocks,
num_threads,
[values, block_structure, x, y](int row_block_id) {
const int row_block_pos =
block_structure->rows[row_block_id].block.position;
const int row_block_size =
block_structure->rows[row_block_id].block.size;
const auto& cells = block_structure->rows[row_block_id].cells;
for (const auto& cell : cells) {
const int col_block_id = cell.block_id;
const int col_block_size =
block_structure->cols[col_block_id].size;
const int col_block_pos =
block_structure->cols[col_block_id].position;
MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
values + cell.position,
row_block_size,
col_block_size,
x + col_block_pos,
y + row_block_pos);
}
});
}
// TODO(https://github.com/ceres-solver/ceres-solver/issues/933): This method
// might benefit from caching the column-block partition
void BlockSparseMatrix::LeftMultiplyAndAccumulate(const double* x,
double* y,
ContextImpl* context,
int num_threads) const {
// While utilizing the transposed structure allows left-multiplication by a
// dense vector to be performed in parallel, it scatters the access pattern to
// the matrix elements. Thus, multiplication using the transposed structure is
// only useful for parallel execution
CHECK(x != nullptr);
CHECK(y != nullptr);
if (transpose_block_structure_ == nullptr || num_threads == 1) {
LeftMultiplyAndAccumulate(x, y);
return;
}
auto transpose_bs = transpose_block_structure_.get();
const auto values = values_;
const int num_col_blocks = transpose_bs->rows.size();
if (!num_col_blocks) {
return;
}
// Use the non-zero count as the iteration cost for the guided parallel-for loop
ParallelFor(
context,
0,
num_col_blocks,
num_threads,
[values, transpose_bs, x, y](int row_block_id) {
int row_block_pos = transpose_bs->rows[row_block_id].block.position;
int row_block_size = transpose_bs->rows[row_block_id].block.size;
auto& cells = transpose_bs->rows[row_block_id].cells;
for (auto& cell : cells) {
const int col_block_id = cell.block_id;
const int col_block_size = transpose_bs->cols[col_block_id].size;
const int col_block_pos = transpose_bs->cols[col_block_id].position;
MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
values + cell.position,
col_block_size,
row_block_size,
x + col_block_pos,
y + row_block_pos);
}
},
transpose_bs->rows.data(),
[](const CompressedRow& row) { return row.cumulative_nnz; });
}
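Taken together, the two overloads above give callers one entry point for parallel sparse-dense products. A minimal usage sketch, assuming a valid ContextImpl is supplied by the solver (the function and variable names here are hypothetical):

// Sketch only: accumulate y += A x and z += A^T w using num_threads
// workers. `context` must outlive both calls.
void AccumulateProducts(const BlockSparseMatrix& A,
                        const double* x, double* y,
                        const double* w, double* z,
                        ContextImpl* context, int num_threads) {
  // Row-major traversal of the block structure; cache-friendly.
  A.RightMultiplyAndAccumulate(x, y, context, num_threads);
  // Uses the transposed block structure so that column blocks can be
  // processed independently; falls back to the serial path when
  // num_threads == 1 or no transpose structure was built.
  A.LeftMultiplyAndAccumulate(w, z, context, num_threads);
}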
void BlockSparseMatrix::LeftMultiplyAndAccumulate(const double* x,
double* y) const {
CHECK(x != nullptr);
CHECK(y != nullptr);
// Single-threaded left products are always computed using the non-transposed
// block structure, because it has a linear access pattern to the matrix
// elements
for (int i = 0; i < block_structure_->rows.size(); ++i) {
int row_block_pos = block_structure_->rows[i].block.position;
int row_block_size = block_structure_->rows[i].block.size;
const vector<Cell>& cells = block_structure_->rows[i].cells;
const auto& cells = block_structure_->rows[i].cells;
for (const auto& cell : cells) {
int col_block_id = cell.block_id;
int col_block_size = block_structure_->cols[col_block_id].size;
int col_block_pos = block_structure_->cols[col_block_id].position;
MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
values_.get() + cell.position,
values_ + cell.position,
row_block_size,
col_block_size,
x + row_block_pos,
@@ -137,35 +352,144 @@ void BlockSparseMatrix::SquaredColumnNorm(double* x) const {
VectorRef(x, num_cols_).setZero();
for (int i = 0; i < block_structure_->rows.size(); ++i) {
int row_block_size = block_structure_->rows[i].block.size;
const vector<Cell>& cells = block_structure_->rows[i].cells;
auto& cells = block_structure_->rows[i].cells;
for (const auto& cell : cells) {
int col_block_id = cell.block_id;
int col_block_size = block_structure_->cols[col_block_id].size;
int col_block_pos = block_structure_->cols[col_block_id].position;
const MatrixRef m(
values_.get() + cell.position, row_block_size, col_block_size);
values_ + cell.position, row_block_size, col_block_size);
VectorRef(x + col_block_pos, col_block_size) += m.colwise().squaredNorm();
}
}
}
// TODO(https://github.com/ceres-solver/ceres-solver/issues/933): This method
// might benefit from caching the column-block partition
void BlockSparseMatrix::SquaredColumnNorm(double* x,
ContextImpl* context,
int num_threads) const {
if (transpose_block_structure_ == nullptr || num_threads == 1) {
SquaredColumnNorm(x);
return;
}
CHECK(x != nullptr);
ParallelSetZero(context, num_threads, x, num_cols_);
auto transpose_bs = transpose_block_structure_.get();
const auto values = values_;
const int num_col_blocks = transpose_bs->rows.size();
ParallelFor(
context,
0,
num_col_blocks,
num_threads,
[values, transpose_bs, x](int row_block_id) {
const auto& row = transpose_bs->rows[row_block_id];
for (auto& cell : row.cells) {
const auto& col = transpose_bs->cols[cell.block_id];
const MatrixRef m(values + cell.position, col.size, row.block.size);
VectorRef(x + row.block.position, row.block.size) +=
m.colwise().squaredNorm();
}
},
transpose_bs->rows.data(),
[](const CompressedRow& row) { return row.cumulative_nnz; });
}
void BlockSparseMatrix::ScaleColumns(const double* scale) {
CHECK(scale != nullptr);
for (int i = 0; i < block_structure_->rows.size(); ++i) {
int row_block_size = block_structure_->rows[i].block.size;
const vector<Cell>& cells = block_structure_->rows[i].cells;
auto& cells = block_structure_->rows[i].cells;
for (const auto& cell : cells) {
int col_block_id = cell.block_id;
int col_block_size = block_structure_->cols[col_block_id].size;
int col_block_pos = block_structure_->cols[col_block_id].position;
MatrixRef m(
values_.get() + cell.position, row_block_size, col_block_size);
MatrixRef m(values_ + cell.position, row_block_size, col_block_size);
m *= ConstVectorRef(scale + col_block_pos, col_block_size).asDiagonal();
}
}
}
// TODO(https://github.com/ceres-solver/ceres-solver/issues/933): This method
// might benefit from caching the column-block partition
void BlockSparseMatrix::ScaleColumns(const double* scale,
ContextImpl* context,
int num_threads) {
if (transpose_block_structure_ == nullptr || num_threads == 1) {
ScaleColumns(scale);
return;
}
CHECK(scale != nullptr);
auto transpose_bs = transpose_block_structure_.get();
auto values = values_;
const int num_col_blocks = transpose_bs->rows.size();
ParallelFor(
context,
0,
num_col_blocks,
num_threads,
[values, transpose_bs, scale](int row_block_id) {
const auto& row = transpose_bs->rows[row_block_id];
for (auto& cell : row.cells) {
const auto& col = transpose_bs->cols[cell.block_id];
MatrixRef m(values + cell.position, col.size, row.block.size);
m *= ConstVectorRef(scale + row.block.position, row.block.size)
.asDiagonal();
}
},
transpose_bs->rows.data(),
[](const CompressedRow& row) { return row.cumulative_nnz; });
}
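All three parallel methods above pass `transpose_bs->rows.data()` together with a cost functor returning `cumulative_nnz`, so ParallelFor can partition column blocks by the number of non-zeros they touch rather than by block count. A sketch of that partitioning idea under the same assumption (this is not the actual ParallelFor implementation; it needs <algorithm> and <vector>):

// Sketch: split row blocks into num_threads contiguous chunks of
// roughly equal nnz using the cumulative_nnz prefix sums stored on
// each CompressedRow.
std::vector<int> SplitByCumulativeNnz(const std::vector<CompressedRow>& rows,
                                      int num_threads) {
  std::vector<int> boundaries = {0};
  const int total_nnz = rows.empty() ? 0 : rows.back().cumulative_nnz;
  for (int t = 1; t < num_threads; ++t) {
    const int target = total_nnz * t / num_threads;
    // First row block whose cumulative nnz exceeds the target.
    auto it = std::upper_bound(rows.begin(),
                               rows.end(),
                               target,
                               [](int value, const CompressedRow& row) {
                                 return value < row.cumulative_nnz;
                               });
    boundaries.push_back(static_cast<int>(it - rows.begin()));
  }
  boundaries.push_back(static_cast<int>(rows.size()));
  return boundaries;
}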
std::unique_ptr<CompressedRowSparseMatrix>
BlockSparseMatrix::ToCompressedRowSparseMatrixTranspose() const {
auto bs = transpose_block_structure_.get();
auto crs_matrix = CreateStructureOfCompressedRowSparseMatrix<true>(
values(), num_cols_, num_rows_, num_nonzeros_, bs);
SetBlockStructureOfCompressedRowSparseMatrix(crs_matrix.get(), bs);
UpdateCompressedRowSparseMatrixTranspose(crs_matrix.get());
return crs_matrix;
}
std::unique_ptr<CompressedRowSparseMatrix>
BlockSparseMatrix::ToCompressedRowSparseMatrix() const {
auto crs_matrix = CreateStructureOfCompressedRowSparseMatrix<false>(
values(), num_rows_, num_cols_, num_nonzeros_, block_structure_.get());
SetBlockStructureOfCompressedRowSparseMatrix(crs_matrix.get(),
block_structure_.get());
UpdateCompressedRowSparseMatrix(crs_matrix.get());
return crs_matrix;
}
void BlockSparseMatrix::UpdateCompressedRowSparseMatrixTranspose(
CompressedRowSparseMatrix* crs_matrix) const {
CHECK(crs_matrix != nullptr);
CHECK_EQ(crs_matrix->num_rows(), num_cols_);
CHECK_EQ(crs_matrix->num_cols(), num_rows_);
CHECK_EQ(crs_matrix->num_nonzeros(), num_nonzeros_);
UpdateCompressedRowSparseMatrixImpl<true>(
crs_matrix, values(), transpose_block_structure_.get());
}
void BlockSparseMatrix::UpdateCompressedRowSparseMatrix(
CompressedRowSparseMatrix* crs_matrix) const {
CHECK(crs_matrix != nullptr);
CHECK_EQ(crs_matrix->num_rows(), num_rows_);
CHECK_EQ(crs_matrix->num_cols(), num_cols_);
CHECK_EQ(crs_matrix->num_nonzeros(), num_nonzeros_);
UpdateCompressedRowSparseMatrixImpl<false>(
crs_matrix, values(), block_structure_.get());
}
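The Update* methods above exist so that the CRS mirror of a block-sparse Jacobian can be built once and then refreshed in place on every linearization, which is the pattern the CUDA solvers rely on. A hypothetical usage sketch (ReevaluateJacobian and kMaxIterations are illustrative only):

// Build the CRS view once; only the values change between iterations.
auto crs = jacobian.ToCompressedRowSparseMatrix();
for (int i = 0; i < kMaxIterations; ++i) {
  ReevaluateJacobian(&jacobian);  // hypothetical re-linearization step
  // Structure is unchanged, so this is a pure value copy.
  jacobian.UpdateCompressedRowSparseMatrix(crs.get());
  // ... hand crs to a solver that consumes CRS matrices ...
}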
void BlockSparseMatrix::ToDenseMatrix(Matrix* dense_matrix) const {
CHECK(dense_matrix != nullptr);
@@ -176,14 +500,14 @@ void BlockSparseMatrix::ToDenseMatrix(Matrix* dense_matrix) const {
for (int i = 0; i < block_structure_->rows.size(); ++i) {
int row_block_pos = block_structure_->rows[i].block.position;
int row_block_size = block_structure_->rows[i].block.size;
const vector<Cell>& cells = block_structure_->rows[i].cells;
auto& cells = block_structure_->rows[i].cells;
for (const auto& cell : cells) {
int col_block_id = cell.block_id;
int col_block_size = block_structure_->cols[col_block_id].size;
int col_block_pos = block_structure_->cols[col_block_id].position;
int jac_pos = cell.position;
m.block(row_block_pos, col_block_pos, row_block_size, col_block_size) +=
MatrixRef(values_.get() + jac_pos, row_block_size, col_block_size);
MatrixRef(values_ + jac_pos, row_block_size, col_block_size);
}
}
}
@@ -199,7 +523,7 @@ void BlockSparseMatrix::ToTripletSparseMatrix(
for (int i = 0; i < block_structure_->rows.size(); ++i) {
int row_block_pos = block_structure_->rows[i].block.position;
int row_block_size = block_structure_->rows[i].block.size;
const vector<Cell>& cells = block_structure_->rows[i].cells;
const auto& cells = block_structure_->rows[i].cells;
for (const auto& cell : cells) {
int col_block_id = cell.block_id;
int col_block_size = block_structure_->cols[col_block_id].size;
@@ -223,12 +547,19 @@ const CompressedRowBlockStructure* BlockSparseMatrix::block_structure() const {
return block_structure_.get();
}
// Return a pointer to the block structure of the matrix transpose. We
// continue to hold ownership of the object.
const CompressedRowBlockStructure*
BlockSparseMatrix::transpose_block_structure() const {
return transpose_block_structure_.get();
}
void BlockSparseMatrix::ToTextFile(FILE* file) const {
CHECK(file != nullptr);
for (int i = 0; i < block_structure_->rows.size(); ++i) {
const int row_block_pos = block_structure_->rows[i].block.position;
const int row_block_size = block_structure_->rows[i].block.size;
const vector<Cell>& cells = block_structure_->rows[i].cells;
const auto& cells = block_structure_->rows[i].cells;
for (const auto& cell : cells) {
const int col_block_id = cell.block_id;
const int col_block_size = block_structure_->cols[col_block_id].size;
@@ -293,34 +624,51 @@ void BlockSparseMatrix::AppendRows(const BlockSparseMatrix& m) {
for (int i = 0; i < m_bs->rows.size(); ++i) {
const CompressedRow& m_row = m_bs->rows[i];
CompressedRow& row = block_structure_->rows[old_num_row_blocks + i];
const int row_block_id = old_num_row_blocks + i;
CompressedRow& row = block_structure_->rows[row_block_id];
row.block.size = m_row.block.size;
row.block.position = num_rows_;
num_rows_ += m_row.block.size;
row.cells.resize(m_row.cells.size());
if (transpose_block_structure_) {
transpose_block_structure_->cols.emplace_back(row.block);
}
for (int c = 0; c < m_row.cells.size(); ++c) {
const int block_id = m_row.cells[c].block_id;
row.cells[c].block_id = block_id;
row.cells[c].position = num_nonzeros_;
num_nonzeros_ += m_row.block.size * m_bs->cols[block_id].size;
const int cell_nnz = m_row.block.size * m_bs->cols[block_id].size;
if (transpose_block_structure_) {
transpose_block_structure_->rows[block_id].cells.emplace_back(
row_block_id, num_nonzeros_);
transpose_block_structure_->rows[block_id].nnz += cell_nnz;
}
num_nonzeros_ += cell_nnz;
}
}
if (num_nonzeros_ > max_num_nonzeros_) {
std::unique_ptr<double[]> new_values =
std::make_unique<double[]>(num_nonzeros_);
std::copy_n(values_.get(), old_num_nonzeros, new_values.get());
values_ = std::move(new_values);
double* old_values = values_;
values_ = AllocateValues(num_nonzeros_);
std::copy_n(old_values, old_num_nonzeros, values_);
max_num_nonzeros_ = num_nonzeros_;
FreeValues(old_values);
}
std::copy(m.values(),
m.values() + m.num_nonzeros(),
values_.get() + old_num_nonzeros);
std::copy(
m.values(), m.values() + m.num_nonzeros(), values_ + old_num_nonzeros);
if (transpose_block_structure_ == nullptr) {
return;
}
ComputeCumulativeNumberOfNonZeros(transpose_block_structure_->rows);
}
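AppendRows keeps the transpose structure consistent: every appended row block becomes a new column of the transpose, its cells are appended to the corresponding transpose rows, and the cumulative counters are rebuilt. A small sketch of the invariant that ComputeCumulativeNumberOfNonZeros establishes (the checker function is illustrative, not part of Ceres):

// cumulative_nnz must be the prefix sum of nnz over the row blocks.
bool CumulativeNnzIsConsistent(const CompressedRowBlockStructure& bs) {
  int running_nnz = 0;
  for (const auto& row : bs.rows) {
    running_nnz += row.nnz;
    if (row.cumulative_nnz != running_nnz) {
      return false;
    }
  }
  return true;
}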
void BlockSparseMatrix::DeleteRowBlocks(const int delta_row_blocks) {
const int num_row_blocks = block_structure_->rows.size();
const int new_num_row_blocks = num_row_blocks - delta_row_blocks;
int delta_num_nonzeros = 0;
int delta_num_rows = 0;
const std::vector<Block>& column_blocks = block_structure_->cols;
@@ -330,15 +678,40 @@ void BlockSparseMatrix::DeleteRowBlocks(const int delta_row_blocks) {
for (int c = 0; c < row.cells.size(); ++c) {
const Cell& cell = row.cells[c];
delta_num_nonzeros += row.block.size * column_blocks[cell.block_id].size;
if (transpose_block_structure_) {
auto& col_cells = transpose_block_structure_->rows[cell.block_id].cells;
while (!col_cells.empty() &&
col_cells.back().block_id >= new_num_row_blocks) {
const int del_block_id = col_cells.back().block_id;
const int del_block_rows =
block_structure_->rows[del_block_id].block.size;
const int del_block_cols = column_blocks[cell.block_id].size;
const int del_cell_nnz = del_block_rows * del_block_cols;
transpose_block_structure_->rows[cell.block_id].nnz -= del_cell_nnz;
col_cells.pop_back();
}
}
}
}
num_nonzeros_ -= delta_num_nonzeros;
num_rows_ -= delta_num_rows;
block_structure_->rows.resize(num_row_blocks - delta_row_blocks);
block_structure_->rows.resize(new_num_row_blocks);
if (transpose_block_structure_ == nullptr) {
return;
}
for (int i = 0; i < delta_row_blocks; ++i) {
transpose_block_structure_->cols.pop_back();
}
ComputeCumulativeNumberOfNonZeros(transpose_block_structure_->rows);
}
std::unique_ptr<BlockSparseMatrix> BlockSparseMatrix::CreateRandomMatrix(
const BlockSparseMatrix::RandomMatrixOptions& options) {
const BlockSparseMatrix::RandomMatrixOptions& options,
std::mt19937& prng,
bool use_page_locked_memory) {
CHECK_GT(options.num_row_blocks, 0);
CHECK_GT(options.min_row_block_size, 0);
CHECK_GT(options.max_row_block_size, 0);
@@ -346,7 +719,11 @@ std::unique_ptr<BlockSparseMatrix> BlockSparseMatrix::CreateRandomMatrix(
CHECK_GT(options.block_density, 0.0);
CHECK_LE(options.block_density, 1.0);
auto* bs = new CompressedRowBlockStructure();
std::uniform_int_distribution<int> col_distribution(
options.min_col_block_size, options.max_col_block_size);
std::uniform_int_distribution<int> row_distribution(
options.min_row_block_size, options.max_row_block_size);
auto bs = std::make_unique<CompressedRowBlockStructure>();
if (options.col_blocks.empty()) {
CHECK_GT(options.num_col_blocks, 0);
CHECK_GT(options.min_col_block_size, 0);
@@ -356,10 +733,7 @@ std::unique_ptr<BlockSparseMatrix> BlockSparseMatrix::CreateRandomMatrix(
// Generate the col block structure.
int col_block_position = 0;
for (int i = 0; i < options.num_col_blocks; ++i) {
// Generate a random integer in [min_col_block_size, max_col_block_size]
const int delta_block_size =
Uniform(options.max_col_block_size - options.min_col_block_size);
const int col_block_size = options.min_col_block_size + delta_block_size;
const int col_block_size = col_distribution(prng);
bs->cols.emplace_back(col_block_size, col_block_position);
col_block_position += col_block_size;
}
@@ -368,22 +742,21 @@ std::unique_ptr<BlockSparseMatrix> BlockSparseMatrix::CreateRandomMatrix(
}
bool matrix_has_blocks = false;
std::uniform_real_distribution<double> uniform01(0.0, 1.0);
while (!matrix_has_blocks) {
VLOG(1) << "Clearing";
bs->rows.clear();
int row_block_position = 0;
int value_position = 0;
for (int r = 0; r < options.num_row_blocks; ++r) {
const int delta_block_size =
Uniform(options.max_row_block_size - options.min_row_block_size);
const int row_block_size = options.min_row_block_size + delta_block_size;
const int row_block_size = row_distribution(prng);
bs->rows.emplace_back();
CompressedRow& row = bs->rows.back();
row.block.size = row_block_size;
row.block.position = row_block_position;
row_block_position += row_block_size;
for (int c = 0; c < bs->cols.size(); ++c) {
if (RandDouble() > options.block_density) continue;
if (uniform01(prng) > options.block_density) continue;
row.cells.emplace_back();
Cell& cell = row.cells.back();
@@ -395,14 +768,76 @@ std::unique_ptr<BlockSparseMatrix> BlockSparseMatrix::CreateRandomMatrix(
}
}
auto matrix = std::make_unique<BlockSparseMatrix>(bs);
auto matrix =
std::make_unique<BlockSparseMatrix>(bs.release(), use_page_locked_memory);
double* values = matrix->mutable_values();
for (int i = 0; i < matrix->num_nonzeros(); ++i) {
values[i] = RandNormal();
}
std::normal_distribution<double> standard_normal_distribution;
std::generate_n(
values, matrix->num_nonzeros(), [&standard_normal_distribution, &prng] {
return standard_normal_distribution(prng);
});
return matrix;
}
} // namespace internal
} // namespace ceres
std::unique_ptr<CompressedRowBlockStructure> CreateTranspose(
const CompressedRowBlockStructure& bs) {
auto transpose = std::make_unique<CompressedRowBlockStructure>();
transpose->rows.resize(bs.cols.size());
for (int i = 0; i < bs.cols.size(); ++i) {
transpose->rows[i].block = bs.cols[i];
transpose->rows[i].nnz = 0;
}
transpose->cols.resize(bs.rows.size());
for (int i = 0; i < bs.rows.size(); ++i) {
auto& row = bs.rows[i];
transpose->cols[i] = row.block;
const int nrows = row.block.size;
for (auto& cell : row.cells) {
transpose->rows[cell.block_id].cells.emplace_back(i, cell.position);
const int ncols = transpose->rows[cell.block_id].block.size;
transpose->rows[cell.block_id].nnz += nrows * ncols;
}
}
ComputeCumulativeNumberOfNonZeros(transpose->rows);
return transpose;
}
double* BlockSparseMatrix::AllocateValues(int size) {
if (!use_page_locked_memory_) {
return new double[size];
}
#ifndef CERES_NO_CUDA
double* values = nullptr;
CHECK_EQ(cudaSuccess,
cudaHostAlloc(&values, sizeof(double) * size, cudaHostAllocDefault));
return values;
#else
LOG(FATAL) << "Page locked memory requested when CUDA is not available. "
<< "This is a Ceres bug; please contact the developers!";
return nullptr;
#endif
}
void BlockSparseMatrix::FreeValues(double*& values) {
if (!use_page_locked_memory_) {
delete[] values;
values = nullptr;
return;
}
#ifndef CERES_NO_CUDA
CHECK_EQ(cudaSuccess, cudaFreeHost(values));
values = nullptr;
#else
LOG(FATAL) << "Page locked memory requested when CUDA is not available. "
<< "This is a Ceres bug; please contact the developers!";
#endif
}
} // namespace ceres::internal
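The allocation helpers above are what make the `use_page_locked_memory` constructor flag work: pinned (page-locked) host memory lets CUDA copy the values array to the device asynchronously, without an intermediate staging buffer. A hypothetical construction sketch (MakeBlockStructure is illustrative only):

// Request page-locked backing storage when the matrix will be consumed
// by the CUDA solvers (requires a CUDA-enabled build).
CompressedRowBlockStructure* bs = MakeBlockStructure();  // hypothetical
BlockSparseMatrix jacobian(bs, /*use_page_locked_memory=*/true);
// jacobian.mutable_values() now points at cudaHostAlloc'd memory, so
// host-to-device copies of the values can overlap with CPU work.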

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -35,15 +35,17 @@
#define CERES_INTERNAL_BLOCK_SPARSE_MATRIX_H_
#include <memory>
#include <random>
#include "ceres/block_structure.h"
#include "ceres/compressed_row_sparse_matrix.h"
#include "ceres/context_impl.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "ceres/sparse_matrix.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class TripletSparseMatrix;
@@ -63,31 +65,64 @@ class CERES_NO_EXPORT BlockSparseMatrix final : public SparseMatrix {
//
// TODO(sameeragarwal): Add a function which will validate legal
// CompressedRowBlockStructure objects.
explicit BlockSparseMatrix(CompressedRowBlockStructure* block_structure);
explicit BlockSparseMatrix(CompressedRowBlockStructure* block_structure,
bool use_page_locked_memory = false);
~BlockSparseMatrix();
BlockSparseMatrix();
BlockSparseMatrix(const BlockSparseMatrix&) = delete;
void operator=(const BlockSparseMatrix&) = delete;
// Implementation of SparseMatrix interface.
void SetZero() final;
void RightMultiply(const double* x, double* y) const final;
void LeftMultiply(const double* x, double* y) const final;
void SetZero() override final;
void SetZero(ContextImpl* context, int num_threads) override final;
void RightMultiplyAndAccumulate(const double* x, double* y) const final;
void RightMultiplyAndAccumulate(const double* x,
double* y,
ContextImpl* context,
int num_threads) const final;
void LeftMultiplyAndAccumulate(const double* x, double* y) const final;
void LeftMultiplyAndAccumulate(const double* x,
double* y,
ContextImpl* context,
int num_threads) const final;
void SquaredColumnNorm(double* x) const final;
void SquaredColumnNorm(double* x,
ContextImpl* context,
int num_threads) const final;
void ScaleColumns(const double* scale) final;
void ScaleColumns(const double* scale,
ContextImpl* context,
int num_threads) final;
// Convert to CompressedRowSparseMatrix
std::unique_ptr<CompressedRowSparseMatrix> ToCompressedRowSparseMatrix()
const;
// Create CompressedRowSparseMatrix corresponding to transposed matrix
std::unique_ptr<CompressedRowSparseMatrix>
ToCompressedRowSparseMatrixTranspose() const;
// Copy values to CompressedRowSparseMatrix that has compatible structure
void UpdateCompressedRowSparseMatrix(
CompressedRowSparseMatrix* crs_matrix) const;
// Copy values to CompressedRowSparseMatrix that has structure of transposed
// matrix
void UpdateCompressedRowSparseMatrixTranspose(
CompressedRowSparseMatrix* crs_matrix) const;
void ToDenseMatrix(Matrix* dense_matrix) const final;
void ToTextFile(FILE* file) const final;
void AddTransposeBlockStructure();
// clang-format off
int num_rows() const final { return num_rows_; }
int num_cols() const final { return num_cols_; }
int num_nonzeros() const final { return num_nonzeros_; }
const double* values() const final { return values_.get(); }
double* mutable_values() final { return values_.get(); }
const double* values() const final { return values_; }
double* mutable_values() final { return values_; }
// clang-format on
void ToTripletSparseMatrix(TripletSparseMatrix* matrix) const;
const CompressedRowBlockStructure* block_structure() const;
const CompressedRowBlockStructure* transpose_block_structure() const;
// Append the contents of m to the bottom of this matrix. m must
// have the same column blocks structure as this matrix.
@@ -122,15 +157,22 @@ class CERES_NO_EXPORT BlockSparseMatrix final : public SparseMatrix {
// distributed and whose structure is determined by
// RandomMatrixOptions.
static std::unique_ptr<BlockSparseMatrix> CreateRandomMatrix(
const RandomMatrixOptions& options);
const RandomMatrixOptions& options,
std::mt19937& prng,
bool use_page_locked_memory = false);
private:
double* AllocateValues(int size);
void FreeValues(double*& values);
const bool use_page_locked_memory_;
int num_rows_;
int num_cols_;
int num_nonzeros_;
int max_num_nonzeros_;
std::unique_ptr<double[]> values_;
double* values_;
std::unique_ptr<CompressedRowBlockStructure> block_structure_;
std::unique_ptr<CompressedRowBlockStructure> transpose_block_structure_;
};
// A number of algorithms like the SchurEliminator do not need
@@ -158,8 +200,10 @@ class CERES_NO_EXPORT BlockSparseMatrixData {
const double* values_;
};
} // namespace internal
} // namespace ceres
std::unique_ptr<CompressedRowBlockStructure> CreateTranspose(
const CompressedRowBlockStructure& bs);
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -30,8 +30,11 @@
#include "ceres/block_structure.h"
namespace ceres {
namespace internal {
#include <vector>
#include "glog/logging.h"
namespace ceres::internal {
bool CellLessThan(const Cell& lhs, const Cell& rhs) {
if (lhs.block_id == rhs.block_id) {
@@ -40,5 +43,28 @@ bool CellLessThan(const Cell& lhs, const Cell& rhs) {
return (lhs.block_id < rhs.block_id);
}
} // namespace internal
} // namespace ceres
std::vector<Block> Tail(const std::vector<Block>& blocks, int n) {
CHECK_LE(n, blocks.size());
std::vector<Block> tail;
const int num_blocks = blocks.size();
const int start = num_blocks - n;
int position = 0;
tail.reserve(n);
for (int i = start; i < num_blocks; ++i) {
tail.emplace_back(blocks[i].size, position);
position += blocks[i].size;
}
return tail;
}
int SumSquaredSizes(const std::vector<Block>& blocks) {
int sum = 0;
for (const auto& b : blocks) {
sum += b.size * b.size;
}
return sum;
}
} // namespace ceres::internal
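Tail re-bases the positions of the last n blocks to start at zero, and SumSquaredSizes gives the number of scalar entries in a block-diagonal matrix with the given blocks. A small worked example:

std::vector<Block> blocks = {{2, 0}, {3, 2}, {4, 5}};  // sizes 2, 3, 4
std::vector<Block> tail = Tail(blocks, 2);
// tail == {{3, 0}, {4, 3}}: the last two blocks, positions restarted at 0.
int n = SumSquaredSizes(blocks);  // 2*2 + 3*3 + 4*4 = 29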

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -43,6 +43,9 @@
#include "ceres/internal/export.h"
// This file is being included into source files that are compiled with nvcc.
// nvcc shipped with Ubuntu 20.04 does not support some features of C++17,
// including nested namespace definitions.
namespace ceres {
namespace internal {
@@ -50,15 +53,19 @@ using BlockSize = int32_t;
struct CERES_NO_EXPORT Block {
Block() = default;
Block(int size_, int position_) : size(size_), position(position_) {}
Block(int size_, int position_) noexcept : size(size_), position(position_) {}
BlockSize size{-1};
int position{-1}; // Position along the row/column.
};
inline bool operator==(const Block& left, const Block& right) noexcept {
return (left.size == right.size) && (left.position == right.position);
}
struct CERES_NO_EXPORT Cell {
Cell() = default;
Cell(int block_id_, int position_)
Cell(int block_id_, int position_) noexcept
: block_id(block_id_), position(position_) {}
// Column or row block id as the case maybe.
@@ -75,14 +82,95 @@ struct CERES_NO_EXPORT CompressedList {
// Construct a CompressedList with the cells containing num_cells
// entries.
explicit CompressedList(int num_cells) : cells(num_cells) {}
explicit CompressedList(int num_cells) noexcept : cells(num_cells) {}
Block block;
std::vector<Cell> cells;
// Number of non-zeros in cells of this row block
int nnz{-1};
// Number of non-zeros in the cells of this and every preceding row block
// in the block-sparse matrix
int cumulative_nnz{-1};
};
using CompressedRow = CompressedList;
using CompressedColumn = CompressedList;
// CompressedRowBlockStructure specifies the storage structure of a row block
// sparse matrix.
//
// Consider the following matrix A:
// A = [A_11 A_12 ...
// A_21 A_22 ...
// ...
// A_m1 A_m2 ... ]
//
// A row block sparse matrix is a matrix where the following properties hold:
// 1. The number of rows in every block A_ij and A_ik is the same.
// 2. The number of columns in every block A_ij and A_kj is the same.
// 3. The number of rows in A_ij and A_kj may be different (i != k).
// 4. The number of columns in A_ij and A_ik may be different (j != k).
// 5. Any block A_ij may be all 0s, in which case the block is not stored.
//
// The structure of the matrix is stored as follows:
//
// The `rows' array contains the following information for each row block:
// - rows[i].block.size: The number of rows in each block A_ij in the row block.
// - rows[i].block.position: The starting row in the full matrix A of the
// row block i.
// - rows[i].cells[j].block_id: The index into the `cols' array corresponding to
// the non-zero blocks A_ij.
// - rows[i].cells[j].position: The index in the `values' array for the contents
// of block A_ij.
//
// The `cols' array contains the following information for each block:
// - cols[.].size: The number of columns spanned by the block.
// - cols[.].position: The starting column in the full matrix A of the block.
//
//
// Example of a row block sparse matrix:
// block_id: | 0 |1|2 |3 |
// rows[0]: [ 1 2 0 3 4 0 ]
// [ 5 6 0 7 8 0 ]
// rows[1]: [ 0 0 9 0 0 0 ]
//
// This matrix is stored as follows:
//
// There are four column blocks:
// cols[0].size = 2
// cols[0].position = 0
// cols[1].size = 1
// cols[1].position = 2
// cols[2].size = 2
// cols[2].position = 3
// cols[3].size = 1
// cols[3].position = 5
// The first row block spans two rows, starting at row 0:
// rows[0].block.size = 2 // This row block spans two rows.
// rows[0].block.position = 0 // It starts at row 0.
// rows[0] has two cells, at column blocks 0 and 2:
// rows[0].cells[0].block_id = 0 // This cell is in column block 0.
// rows[0].cells[0].position = 0 // See below for an explanation of this.
// rows[0].cells[1].block_id = 2 // This cell is in column block 2.
// rows[0].cells[1].position = 4 // See below for an explanation of this.
//
// The second row block spans one row, starting at row 2:
// rows[1].block.size = 1 // This row block spans one row.
// rows[1].block.position = 2 // It starts at row 2.
// rows[1] has one cell at column block 1:
// rows[1].cells[0].block_id = 1 // This cell is in column block 1.
// rows[1].cells[0].position = 8 // See below for an explanation of this.
//
// The values in each block are stored contiguously in row-major order.
// However, there is no unique way to order the blocks -- it is usually
// optimized to promote cache coherent access, e.g. ordering it so that
// Jacobian blocks of parameters of the same type are stored nearby.
// This is one possible way to store the values of the blocks in a values array:
// values = { 1, 2, 5, 6, 3, 4, 7, 8, 9 }
// | | | | // The three blocks.
// ^ rows[0].cells[0].position = 0
// ^ rows[0].cells[1].position = 4
// ^ rows[1].cells[0].position = 8
struct CERES_NO_EXPORT CompressedRowBlockStructure {
std::vector<Block> cols;
std::vector<CompressedRow> rows;
@@ -93,6 +181,18 @@ struct CERES_NO_EXPORT CompressedColumnBlockStructure {
std::vector<CompressedColumn> cols;
};
inline int NumScalarEntries(const std::vector<Block>& blocks) {
if (blocks.empty()) {
return 0;
}
auto& block = blocks.back();
return block.position + block.size;
}
std::vector<Block> Tail(const std::vector<Block>& blocks, int n);
int SumSquaredSizes(const std::vector<Block>& blocks);
} // namespace internal
} // namespace ceres
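For reference, the example documented in the comment block above can be written out directly; this sketch builds exactly that structure by hand:

// The 3x6 example matrix from the comment, built by hand.
auto bs = std::make_unique<CompressedRowBlockStructure>();
bs->cols = {{2, 0}, {1, 2}, {2, 3}, {1, 5}};  // four column blocks
bs->rows.resize(2);
bs->rows[0].block = {2, 0};            // two rows, starting at row 0
bs->rows[0].cells = {{0, 0}, {2, 4}};  // cells in column blocks 0 and 2
bs->rows[1].block = {1, 2};            // one row, starting at row 2
bs->rows[1].cells = {{1, 8}};          // cell in column block 1
// values = {1, 2, 5, 6, 3, 4, 7, 8, 9} as in the layout shown above.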

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,15 +32,13 @@
#include <algorithm>
#include <iostream> // NO LINT
#include <string>
#include "ceres/program.h"
#include "ceres/stringprintf.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::string;
namespace ceres::internal {
StateUpdatingCallback::StateUpdatingCallback(Program* program,
double* parameters)
@@ -49,7 +47,7 @@ StateUpdatingCallback::StateUpdatingCallback(Program* program,
StateUpdatingCallback::~StateUpdatingCallback() = default;
CallbackReturnType StateUpdatingCallback::operator()(
const IterationSummary& summary) {
const IterationSummary& /*summary*/) {
program_->StateVectorToParameterBlocks(parameters_);
program_->CopyParameterBlockStateToUserState();
return SOLVER_CONTINUE;
@@ -83,7 +81,7 @@ LoggingCallback::~LoggingCallback() = default;
CallbackReturnType LoggingCallback::operator()(
const IterationSummary& summary) {
string output;
std::string output;
if (minimizer_type == LINE_SEARCH) {
output = StringPrintf(
"% 4d: f:% 8e d:% 3.2e g:% 3.2e h:% 3.2e s:% 3.2e e:% 3d it:% 3.2e "
@@ -127,5 +125,4 @@ CallbackReturnType LoggingCallback::operator()(
return SOLVER_CONTINUE;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -36,8 +36,7 @@
#include "ceres/internal/export.h"
#include "ceres/iteration_callback.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class Program;
@@ -84,7 +83,6 @@ class CERES_NO_EXPORT LoggingCallback final : public IterationCallback {
const bool log_to_stdout_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_CALLBACKS_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -33,16 +33,14 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "ceres/graph.h"
#include "ceres/internal/export.h"
#include "ceres/map_util.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::vector;
namespace ceres::internal {
using IntMap = std::unordered_map<int, int>;
using IntSet = std::unordered_set<int>;
@@ -59,15 +57,15 @@ class CERES_NO_EXPORT CanonicalViewsClustering {
// are assigned to a cluster with id = kInvalidClusterId.
void ComputeClustering(const CanonicalViewsClusteringOptions& options,
const WeightedGraph<int>& graph,
vector<int>* centers,
std::vector<int>* centers,
IntMap* membership);
private:
void FindValidViews(IntSet* valid_views) const;
double ComputeClusteringQualityDifference(const int candidate,
const vector<int>& centers) const;
double ComputeClusteringQualityDifference(
int candidate, const std::vector<int>& centers) const;
void UpdateCanonicalViewAssignments(const int canonical_view);
void ComputeClusterMembership(const vector<int>& centers,
void ComputeClusterMembership(const std::vector<int>& centers,
IntMap* membership) const;
CanonicalViewsClusteringOptions options_;
@@ -82,7 +80,7 @@ class CERES_NO_EXPORT CanonicalViewsClustering {
void ComputeCanonicalViewsClustering(
const CanonicalViewsClusteringOptions& options,
const WeightedGraph<int>& graph,
vector<int>* centers,
std::vector<int>* centers,
IntMap* membership) {
time_t start_time = time(nullptr);
CanonicalViewsClustering cv;
@@ -95,7 +93,7 @@ void ComputeCanonicalViewsClustering(
void CanonicalViewsClustering::ComputeClustering(
const CanonicalViewsClusteringOptions& options,
const WeightedGraph<int>& graph,
vector<int>* centers,
std::vector<int>* centers,
IntMap* membership) {
options_ = options;
CHECK(centers != nullptr);
@@ -151,7 +149,7 @@ void CanonicalViewsClustering::FindValidViews(IntSet* valid_views) const {
// Computes the difference in the quality score if 'candidate' were
// added to the set of canonical views.
double CanonicalViewsClustering::ComputeClusteringQualityDifference(
const int candidate, const vector<int>& centers) const {
const int candidate, const std::vector<int>& centers) const {
// View score.
double difference =
options_.view_score_weight * graph_->VertexWeight(candidate);
@@ -198,7 +196,7 @@ void CanonicalViewsClustering::UpdateCanonicalViewAssignments(
// Assign a cluster id to each view.
void CanonicalViewsClustering::ComputeClusterMembership(
const vector<int>& centers, IntMap* membership) const {
const std::vector<int>& centers, IntMap* membership) const {
CHECK(membership != nullptr);
membership->clear();
@@ -222,5 +220,4 @@ void CanonicalViewsClustering::ComputeClusterMembership(
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -48,8 +48,7 @@
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
struct CanonicalViewsClusteringOptions;
@@ -120,8 +119,7 @@ struct CERES_NO_EXPORT CanonicalViewsClusteringOptions {
double view_score_weight = 0.0;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,123 +0,0 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: keir@google.com (Keir Mierle)
#ifndef CERES_INTERNAL_CGNR_LINEAR_OPERATOR_H_
#define CERES_INTERNAL_CGNR_LINEAR_OPERATOR_H_
#include <algorithm>
#include <memory>
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "ceres/linear_operator.h"
namespace ceres {
namespace internal {
class SparseMatrix;
// A linear operator which takes a matrix A and a diagonal vector D and
// performs products of the form
//
// (A^T A + D^T D)x
//
// This is used to implement iterative general sparse linear solving with
// conjugate gradients, where A is the Jacobian and D is a regularizing
// parameter. A brief proof that D^T D is the correct regularizer:
//
// Given a regularized least squares problem:
//
// min ||Ax - b||^2 + ||Dx||^2
// x
//
// First expand into matrix notation:
//
// (Ax - b)^T (Ax - b) + xD^TDx
//
// Then multiply out to get:
//
// = xA^TAx - 2b^T Ax + b^Tb + xD^TDx
//
// Take the derivative:
//
// 0 = 2A^TAx - 2A^T b + 2 D^TDx
// 0 = A^TAx - A^T b + D^TDx
// 0 = (A^TA + D^TD)x - A^T b
//
// Thus, the symmetric system we need to solve for CGNR is
//
// Sx = z
//
// with S = A^TA + D^TD
// and z = A^T b
//
// Note: This class is not thread safe, since it uses some temporary storage.
class CERES_NO_EXPORT CgnrLinearOperator final : public LinearOperator {
public:
CgnrLinearOperator(const LinearOperator& A, const double* D)
: A_(A), D_(D), z_(new double[A.num_rows()]) {}
void RightMultiply(const double* x, double* y) const final {
std::fill(z_.get(), z_.get() + A_.num_rows(), 0.0);
// z = Ax
A_.RightMultiply(x, z_.get());
// y = y + Atz
A_.LeftMultiply(z_.get(), y);
// y = y + DtDx
if (D_ != nullptr) {
int n = A_.num_cols();
VectorRef(y, n).array() +=
ConstVectorRef(D_, n).array().square() * ConstVectorRef(x, n).array();
}
}
void LeftMultiply(const double* x, double* y) const final {
RightMultiply(x, y);
}
int num_rows() const final { return A_.num_cols(); }
int num_cols() const final { return A_.num_cols(); }
private:
const LinearOperator& A_;
const double* D_;
std::unique_ptr<double[]> z_;
};
} // namespace internal
} // namespace ceres
#include "ceres/internal/reenable_warnings.h"
#endif // CERES_INTERNAL_CGNR_LINEAR_OPERATOR_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -34,16 +34,92 @@
#include <utility>
#include "ceres/block_jacobi_preconditioner.h"
#include "ceres/cgnr_linear_operator.h"
#include "ceres/conjugate_gradients_solver.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_vector.h"
#include "ceres/internal/eigen.h"
#include "ceres/linear_solver.h"
#include "ceres/subset_preconditioner.h"
#include "ceres/wall_time.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// A linear operator which takes a matrix A and a diagonal vector D and
// performs products of the form
//
// (A^T A + D^T D)x
//
// This is used to implement iterative general sparse linear solving with
// conjugate gradients, where A is the Jacobian and D is a regularizing
// parameter. A brief proof that D^T D is the correct regularizer:
//
// Given a regularized least squares problem:
//
// min ||Ax - b||^2 + ||Dx||^2
// x
//
// First expand into matrix notation:
//
// (Ax - b)^T (Ax - b) + x^T D^T D x
//
// Then multiply out to get:
//
// = x^T A^T A x - 2 b^T A x + b^T b + x^T D^T D x
//
// Take the derivative:
//
// 0 = 2A^TAx - 2A^T b + 2 D^TDx
// 0 = A^TAx - A^T b + D^TDx
// 0 = (A^TA + D^TD)x - A^T b
//
// Thus, the symmetric system we need to solve for CGNR is
//
// Sx = z
//
// with S = A^TA + D^TD
// and z = A^T b
//
// Note: This class is not thread safe, since it uses some temporary storage.
class CERES_NO_EXPORT CgnrLinearOperator final
: public ConjugateGradientsLinearOperator<Vector> {
public:
CgnrLinearOperator(const LinearOperator& A,
const double* D,
ContextImpl* context,
int num_threads)
: A_(A),
D_(D),
z_(Vector::Zero(A.num_rows())),
context_(context),
num_threads_(num_threads) {}
void RightMultiplyAndAccumulate(const Vector& x, Vector& y) final {
// z = Ax
// y = y + Atz
z_.setZero();
A_.RightMultiplyAndAccumulate(x, z_, context_, num_threads_);
A_.LeftMultiplyAndAccumulate(z_, y, context_, num_threads_);
// y = y + DtDx
if (D_ != nullptr) {
int n = A_.num_cols();
ParallelAssign(
context_,
num_threads_,
y,
y.array() + ConstVectorRef(D_, n).array().square() * x.array());
}
}
private:
const LinearOperator& A_;
const double* D_;
Vector z_;
ContextImpl* context_;
int num_threads_;
};
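The operator above never forms A^T A explicitly; it applies A and then A^T, and folds in D^T D as an element-wise product. A dense Eigen sanity check of the same algebra, independent of the Ceres code path:

// y = (A^T A + D^T D) x, computed the way CgnrLinearOperator does it.
Eigen::VectorXd ApplyNormalEquations(const Eigen::MatrixXd& A,
                                     const Eigen::VectorXd& D,
                                     const Eigen::VectorXd& x) {
  Eigen::VectorXd z = A * x;              // z = A x
  Eigen::VectorXd y = A.transpose() * z;  // y = A^T A x
  // D is diagonal, so D^T D x is an element-wise product.
  y.array() += D.array().square() * x.array();
  return y;
}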
CgnrSolver::CgnrSolver(LinearSolver::Options options)
: options_(std::move(options)) {
@@ -57,7 +133,14 @@ CgnrSolver::CgnrSolver(LinearSolver::Options options)
}
}
CgnrSolver::~CgnrSolver() = default;
CgnrSolver::~CgnrSolver() {
for (int i = 0; i < 4; ++i) {
if (scratch_[i]) {
delete scratch_[i];
scratch_[i] = nullptr;
}
}
}
LinearSolver::Summary CgnrSolver::SolveImpl(
BlockSparseMatrix* A,
@@ -65,48 +148,244 @@ LinearSolver::Summary CgnrSolver::SolveImpl(
const LinearSolver::PerSolveOptions& per_solve_options,
double* x) {
EventLogger event_logger("CgnrSolver::Solve");
// Form z = Atb.
Vector z(A->num_cols());
z.setZero();
A->LeftMultiply(b, z.data());
if (!preconditioner_) {
Preconditioner::Options preconditioner_options;
preconditioner_options.type = options_.preconditioner_type;
preconditioner_options.subset_preconditioner_start_row_block =
options_.subset_preconditioner_start_row_block;
preconditioner_options.sparse_linear_algebra_library_type =
options_.sparse_linear_algebra_library_type;
preconditioner_options.ordering_type = options_.ordering_type;
preconditioner_options.num_threads = options_.num_threads;
preconditioner_options.context = options_.context;
if (options_.preconditioner_type == JACOBI) {
preconditioner_ = std::make_unique<BlockJacobiPreconditioner>(*A);
preconditioner_ = std::make_unique<BlockSparseJacobiPreconditioner>(
preconditioner_options, *A);
} else if (options_.preconditioner_type == SUBSET) {
Preconditioner::Options preconditioner_options;
preconditioner_options.type = SUBSET;
preconditioner_options.subset_preconditioner_start_row_block =
options_.subset_preconditioner_start_row_block;
preconditioner_options.sparse_linear_algebra_library_type =
options_.sparse_linear_algebra_library_type;
preconditioner_options.use_postordering = options_.use_postordering;
preconditioner_options.num_threads = options_.num_threads;
preconditioner_options.context = options_.context;
preconditioner_ =
std::make_unique<SubsetPreconditioner>(preconditioner_options, *A);
} else {
preconditioner_ = std::make_unique<IdentityPreconditioner>(A->num_cols());
}
}
preconditioner_->Update(*A, per_solve_options.D);
if (preconditioner_) {
preconditioner_->Update(*A, per_solve_options.D);
ConjugateGradientsSolverOptions cg_options;
cg_options.min_num_iterations = options_.min_num_iterations;
cg_options.max_num_iterations = options_.max_num_iterations;
cg_options.residual_reset_period = options_.residual_reset_period;
cg_options.q_tolerance = per_solve_options.q_tolerance;
cg_options.r_tolerance = per_solve_options.r_tolerance;
cg_options.context = options_.context;
cg_options.num_threads = options_.num_threads;
// lhs = AtA + DtD
CgnrLinearOperator lhs(
*A, per_solve_options.D, options_.context, options_.num_threads);
// rhs = Atb.
Vector rhs(A->num_cols());
rhs.setZero();
A->LeftMultiplyAndAccumulate(
b, rhs.data(), options_.context, options_.num_threads);
cg_solution_ = Vector::Zero(A->num_cols());
for (int i = 0; i < 4; ++i) {
if (scratch_[i] == nullptr) {
scratch_[i] = new Vector(A->num_cols());
}
}
LinearSolver::PerSolveOptions cg_per_solve_options = per_solve_options;
cg_per_solve_options.preconditioner = preconditioner_.get();
// Solve (AtA + DtD)x = z (= Atb).
VectorRef(x, A->num_cols()).setZero();
CgnrLinearOperator lhs(*A, per_solve_options.D);
event_logger.AddEvent("Setup");
ConjugateGradientsSolver conjugate_gradient_solver(options_);
LinearSolver::Summary summary =
conjugate_gradient_solver.Solve(&lhs, z.data(), cg_per_solve_options, x);
LinearOperatorAdapter preconditioner(*preconditioner_);
auto summary = ConjugateGradientsSolver(
cg_options, lhs, rhs, preconditioner, scratch_, cg_solution_);
VectorRef(x, A->num_cols()) = cg_solution_;
event_logger.AddEvent("Solve");
return summary;
}
} // namespace internal
} // namespace ceres
#ifndef CERES_NO_CUDA
// A linear operator which takes a matrix A and a diagonal vector D and
// performs products of the form
//
// (A^T A + D^T D)x
//
// This is used to implement iterative general sparse linear solving with
// conjugate gradients, where A is the Jacobian and D is a regularizing
// parameter. A brief proof is included with CgnrLinearOperator above.
class CERES_NO_EXPORT CudaCgnrLinearOperator final
: public ConjugateGradientsLinearOperator<CudaVector> {
public:
CudaCgnrLinearOperator(CudaSparseMatrix& A,
const CudaVector& D,
CudaVector* z)
: A_(A), D_(D), z_(z) {}
void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector& y) final {
// z = Ax
z_->SetZero();
A_.RightMultiplyAndAccumulate(x, z_);
// y = y + Atz
// = y + AtAx
A_.LeftMultiplyAndAccumulate(*z_, &y);
// y = y + DtDx
y.DtDxpy(D_, x);
}
private:
CudaSparseMatrix& A_;
const CudaVector& D_;
CudaVector* z_ = nullptr;
};
class CERES_NO_EXPORT CudaIdentityPreconditioner final
: public CudaPreconditioner {
public:
void Update(const CompressedRowSparseMatrix& A, const double* D) final {}
void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector& y) final {
y.Axpby(1.0, x, 1.0);
}
};
// This class wraps the existing CPU Jacobi preconditioner, caches the structure
// of the block diagonal, and for each CGNR solve updates the values on the CPU
// and then copies them over to the GPU.
class CERES_NO_EXPORT CudaJacobiPreconditioner final
: public CudaPreconditioner {
public:
explicit CudaJacobiPreconditioner(Preconditioner::Options options,
const CompressedRowSparseMatrix& A)
: options_(std::move(options)),
cpu_preconditioner_(options_, A),
m_(options_.context, cpu_preconditioner_.matrix()) {}
~CudaJacobiPreconditioner() = default;
void Update(const CompressedRowSparseMatrix& A, const double* D) final {
cpu_preconditioner_.Update(A, D);
m_.CopyValuesFromCpu(cpu_preconditioner_.matrix());
}
void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector& y) final {
m_.RightMultiplyAndAccumulate(x, &y);
}
private:
Preconditioner::Options options_;
BlockCRSJacobiPreconditioner cpu_preconditioner_;
CudaSparseMatrix m_;
};
CudaCgnrSolver::CudaCgnrSolver(LinearSolver::Options options)
: options_(std::move(options)) {}
CudaCgnrSolver::~CudaCgnrSolver() {
for (int i = 0; i < 4; ++i) {
if (scratch_[i]) {
delete scratch_[i];
scratch_[i] = nullptr;
}
}
}
std::unique_ptr<CudaCgnrSolver> CudaCgnrSolver::Create(
LinearSolver::Options options, std::string* error) {
CHECK(error != nullptr);
if (options.preconditioner_type != IDENTITY &&
options.preconditioner_type != JACOBI) {
*error =
"CudaCgnrSolver does not support preconditioner type " +
std::string(PreconditionerTypeToString(options.preconditioner_type)) +
". ";
return nullptr;
}
CHECK(options.context->IsCudaInitialized())
<< "CudaCgnrSolver requires CUDA initialization.";
auto solver = std::make_unique<CudaCgnrSolver>(options);
return solver;
}
void CudaCgnrSolver::CpuToGpuTransfer(const CompressedRowSparseMatrix& A,
const double* b,
const double* D) {
if (A_ == nullptr) {
// Assume structure is not cached, do an initialization and structural copy.
A_ = std::make_unique<CudaSparseMatrix>(options_.context, A);
b_ = std::make_unique<CudaVector>(options_.context, A.num_rows());
x_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
Atb_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
Ax_ = std::make_unique<CudaVector>(options_.context, A.num_rows());
D_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
Preconditioner::Options preconditioner_options;
preconditioner_options.type = options_.preconditioner_type;
preconditioner_options.subset_preconditioner_start_row_block =
options_.subset_preconditioner_start_row_block;
preconditioner_options.sparse_linear_algebra_library_type =
options_.sparse_linear_algebra_library_type;
preconditioner_options.ordering_type = options_.ordering_type;
preconditioner_options.num_threads = options_.num_threads;
preconditioner_options.context = options_.context;
if (options_.preconditioner_type == JACOBI) {
preconditioner_ =
std::make_unique<CudaJacobiPreconditioner>(preconditioner_options, A);
} else {
preconditioner_ = std::make_unique<CudaIdentityPreconditioner>();
}
for (int i = 0; i < 4; ++i) {
scratch_[i] = new CudaVector(options_.context, A.num_cols());
}
} else {
// Assume structure is cached, do a value copy.
A_->CopyValuesFromCpu(A);
}
b_->CopyFromCpu(ConstVectorRef(b, A.num_rows()));
D_->CopyFromCpu(ConstVectorRef(D, A.num_cols()));
}
LinearSolver::Summary CudaCgnrSolver::SolveImpl(
CompressedRowSparseMatrix* A,
const double* b,
const LinearSolver::PerSolveOptions& per_solve_options,
double* x) {
EventLogger event_logger("CudaCgnrSolver::Solve");
LinearSolver::Summary summary;
summary.num_iterations = 0;
summary.termination_type = LinearSolverTerminationType::FATAL_ERROR;
CpuToGpuTransfer(*A, b, per_solve_options.D);
event_logger.AddEvent("CPU to GPU Transfer");
preconditioner_->Update(*A, per_solve_options.D);
event_logger.AddEvent("Preconditioner Update");
// Form z = Atb.
Atb_->SetZero();
A_->LeftMultiplyAndAccumulate(*b_, Atb_.get());
// Solve (AtA + DtD)x = z (= Atb).
x_->SetZero();
CudaCgnrLinearOperator lhs(*A_, *D_, Ax_.get());
event_logger.AddEvent("Setup");
ConjugateGradientsSolverOptions cg_options;
cg_options.min_num_iterations = options_.min_num_iterations;
cg_options.max_num_iterations = options_.max_num_iterations;
cg_options.residual_reset_period = options_.residual_reset_period;
cg_options.q_tolerance = per_solve_options.q_tolerance;
cg_options.r_tolerance = per_solve_options.r_tolerance;
summary = ConjugateGradientsSolver(
cg_options, lhs, *Atb_, *preconditioner_, scratch_, *x_);
x_->CopyTo(x);
event_logger.AddEvent("Solve");
return summary;
}
#endif // CERES_NO_CUDA
} // namespace ceres::internal
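Unlike the CHECK on CUDA initialization, an unsupported preconditioner is reported through the error string, so callers can fall back gracefully. A hypothetical call site (`options` is assumed to carry an initialized CUDA context):

std::string error;
auto cuda_solver = CudaCgnrSolver::Create(options, &error);
if (cuda_solver == nullptr) {
  // Only IDENTITY and JACOBI preconditioning are supported on CUDA.
  LOG(WARNING) << "Falling back to the CPU CGNR solver: " << error;
}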

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -33,11 +33,13 @@
#include <memory>
#include "ceres/conjugate_gradients_solver.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_vector.h"
#include "ceres/internal/export.h"
#include "ceres/linear_solver.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class Preconditioner;
@@ -65,9 +67,50 @@ class CERES_NO_EXPORT CgnrSolver final : public BlockSparseMatrixSolver {
private:
const LinearSolver::Options options_;
std::unique_ptr<Preconditioner> preconditioner_;
Vector cg_solution_;
Vector* scratch_[4] = {nullptr, nullptr, nullptr, nullptr};
};
} // namespace internal
} // namespace ceres
#ifndef CERES_NO_CUDA
class CudaPreconditioner : public ConjugateGradientsLinearOperator<CudaVector> {
public:
virtual void Update(const CompressedRowSparseMatrix& A, const double* D) = 0;
virtual ~CudaPreconditioner() = default;
};
// A Cuda-accelerated version of CgnrSolver.
// This solver assumes that the sparsity structure of A remains constant for its
// lifetime.
class CERES_NO_EXPORT CudaCgnrSolver final
: public CompressedRowSparseMatrixSolver {
public:
explicit CudaCgnrSolver(LinearSolver::Options options);
static std::unique_ptr<CudaCgnrSolver> Create(LinearSolver::Options options,
std::string* error);
~CudaCgnrSolver() override;
Summary SolveImpl(CompressedRowSparseMatrix* A,
const double* b,
const LinearSolver::PerSolveOptions& per_solve_options,
double* x) final;
private:
void CpuToGpuTransfer(const CompressedRowSparseMatrix& A,
const double* b,
const double* D);
LinearSolver::Options options_;
std::unique_ptr<CudaSparseMatrix> A_;
std::unique_ptr<CudaVector> b_;
std::unique_ptr<CudaVector> x_;
std::unique_ptr<CudaVector> Atb_;
std::unique_ptr<CudaVector> Ax_;
std::unique_ptr<CudaVector> D_;
std::unique_ptr<CudaPreconditioner> preconditioner_;
CudaVector* scratch_[4] = {nullptr, nullptr, nullptr, nullptr};
};
#endif // CERES_NO_CUDA
} // namespace ceres::internal
#endif // CERES_INTERNAL_CGNR_SOLVER_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -36,30 +36,21 @@
#include "ceres/internal/export.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
using std::vector;
void CompressedColumnScalarMatrixToBlockMatrix(const int* scalar_rows,
const int* scalar_cols,
const vector<int>& row_blocks,
const vector<int>& col_blocks,
vector<int>* block_rows,
vector<int>* block_cols) {
void CompressedColumnScalarMatrixToBlockMatrix(
const int* scalar_rows,
const int* scalar_cols,
const std::vector<Block>& row_blocks,
const std::vector<Block>& col_blocks,
std::vector<int>* block_rows,
std::vector<int>* block_cols) {
CHECK(block_rows != nullptr);
CHECK(block_cols != nullptr);
block_rows->clear();
block_cols->clear();
const int num_row_blocks = row_blocks.size();
const int num_col_blocks = col_blocks.size();
vector<int> row_block_starts(num_row_blocks);
for (int i = 0, cursor = 0; i < num_row_blocks; ++i) {
row_block_starts[i] = cursor;
cursor += row_blocks[i];
}
// This loop extracts the block sparsity of the scalar sparse matrix
// It does so by iterating over the columns, but only considering
// the columns corresponding to the first element of each column
@@ -71,52 +62,46 @@ void CompressedColumnScalarMatrixToBlockMatrix(const int* scalar_rows,
for (int col_block = 0; col_block < num_col_blocks; ++col_block) {
int column_size = 0;
for (int idx = scalar_cols[c]; idx < scalar_cols[c + 1]; ++idx) {
vector<int>::const_iterator it = std::lower_bound(
row_block_starts.begin(), row_block_starts.end(), scalar_rows[idx]);
// Since we are using lower_bound, it will return the row id
// where the row block starts. For everything but the first row
// of the block, where these values will be the same, we can
// skip, as we only need the first row to detect the presence of
// the block.
auto it = std::lower_bound(row_blocks.begin(),
row_blocks.end(),
scalar_rows[idx],
[](const Block& block, double value) {
return block.position < value;
});
// Since we are using lower_bound, it will return the row id where the row
// block starts. For everything but the first row of the block, where
// these values will be the same, we can skip, as we only need the first
// row to detect the presence of the block.
//
// For rows all but the first row in the last row block,
// lower_bound will return row_block_starts.end(), but those can
// be skipped like the rows in other row blocks too.
if (it == row_block_starts.end() || *it != scalar_rows[idx]) {
// For all rows but the first row in the last row block, lower_bound will
// return row_blocks.end(), but those can be skipped like the rows in
// other row blocks too.
if (it == row_blocks.end() || it->position != scalar_rows[idx]) {
continue;
}
block_rows->push_back(it - row_block_starts.begin());
block_rows->push_back(it - row_blocks.begin());
++column_size;
}
block_cols->push_back(block_cols->back() + column_size);
c += col_blocks[col_block];
c += col_blocks[col_block].size;
}
}
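As a self-contained illustration of the lower_bound pattern above (a sketch, assuming Block carries {size, position} as in block_structure.h):
// With row blocks {size, position} = {2, 0}, {3, 2}, {1, 5}, scalar row 2
// is the first row of the second block and is kept; scalar rows 3 and 4
// are interior rows of that block and are skipped.
std::vector<Block> blocks = {{2, 0}, {3, 2}, {1, 5}};
const int scalar_row = 2;
auto it = std::lower_bound(blocks.begin(),
                           blocks.end(),
                           scalar_row,
                           [](const Block& block, int value) {
                             return block.position < value;
                           });
const bool is_block_start =
    it != blocks.end() && it->position == scalar_row;  // true here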
void BlockOrderingToScalarOrdering(const vector<int>& blocks,
const vector<int>& block_ordering,
vector<int>* scalar_ordering) {
void BlockOrderingToScalarOrdering(const std::vector<Block>& blocks,
const std::vector<int>& block_ordering,
std::vector<int>* scalar_ordering) {
CHECK_EQ(blocks.size(), block_ordering.size());
const int num_blocks = blocks.size();
// block_starts = [0, block1, block1 + block2 ..]
vector<int> block_starts(num_blocks);
for (int i = 0, cursor = 0; i < num_blocks; ++i) {
block_starts[i] = cursor;
cursor += blocks[i];
}
scalar_ordering->resize(block_starts.back() + blocks.back());
scalar_ordering->resize(NumScalarEntries(blocks));
int cursor = 0;
for (int i = 0; i < num_blocks; ++i) {
const int block_id = block_ordering[i];
const int block_size = blocks[block_id];
int block_position = block_starts[block_id];
const int block_size = blocks[block_id].size;
int block_position = blocks[block_id].position;
for (int j = 0; j < block_size; ++j) {
(*scalar_ordering)[cursor++] = block_position++;
}
}
}
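A concrete example of the block-to-scalar mapping implemented above:
// blocks = {{2, 0}, {3, 2}} (sizes 2 and 3 at positions 0 and 2) and
// block_ordering = {1, 0} put the second block's scalars first.
std::vector<Block> blocks = {{2, 0}, {3, 2}};
std::vector<int> block_ordering = {1, 0};
std::vector<int> scalar_ordering;
BlockOrderingToScalarOrdering(blocks, block_ordering, &scalar_ordering);
// scalar_ordering == {2, 3, 4, 0, 1}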
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -34,11 +34,11 @@
#include <algorithm>
#include <vector>
#include "ceres/block_structure.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Extract the block sparsity pattern of the scalar compressed columns
// matrix and return it in compressed column form. The compressed
@@ -53,8 +53,8 @@ namespace internal {
CERES_NO_EXPORT void CompressedColumnScalarMatrixToBlockMatrix(
const int* scalar_rows,
const int* scalar_cols,
const std::vector<int>& row_blocks,
const std::vector<int>& col_blocks,
const std::vector<Block>& row_blocks,
const std::vector<Block>& col_blocks,
std::vector<int>* block_rows,
std::vector<int>* block_cols);
@@ -62,7 +62,7 @@ CERES_NO_EXPORT void CompressedColumnScalarMatrixToBlockMatrix(
// the corresponding "scalar" ordering, where the scalar ordering of
// size sum(blocks).
CERES_NO_EXPORT void BlockOrderingToScalarOrdering(
const std::vector<int>& blocks,
const std::vector<Block>& blocks,
const std::vector<int>& block_ordering,
std::vector<int>* scalar_ordering);
@@ -141,8 +141,7 @@ void SolveRTRWithSparseRHS(IntegerType num_cols,
SolveUpperTriangularInPlace(num_cols, rows, cols, values, solution);
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -44,44 +44,42 @@
#include "ceres/residual_block.h"
#include "ceres/scratch_evaluate_preparer.h"
namespace ceres {
namespace internal {
using std::adjacent_find;
using std::make_pair;
using std::pair;
using std::vector;
namespace ceres::internal {
void CompressedRowJacobianWriter::PopulateJacobianRowAndColumnBlockVectors(
const Program* program, CompressedRowSparseMatrix* jacobian) {
const vector<ParameterBlock*>& parameter_blocks = program->parameter_blocks();
vector<int>& col_blocks = *(jacobian->mutable_col_blocks());
const auto& parameter_blocks = program->parameter_blocks();
auto& col_blocks = *(jacobian->mutable_col_blocks());
col_blocks.resize(parameter_blocks.size());
int col_pos = 0;
for (int i = 0; i < parameter_blocks.size(); ++i) {
col_blocks[i] = parameter_blocks[i]->TangentSize();
col_blocks[i].size = parameter_blocks[i]->TangentSize();
col_blocks[i].position = col_pos;
col_pos += col_blocks[i].size;
}
const vector<ResidualBlock*>& residual_blocks = program->residual_blocks();
vector<int>& row_blocks = *(jacobian->mutable_row_blocks());
const auto& residual_blocks = program->residual_blocks();
auto& row_blocks = *(jacobian->mutable_row_blocks());
row_blocks.resize(residual_blocks.size());
int row_pos = 0;
for (int i = 0; i < residual_blocks.size(); ++i) {
row_blocks[i] = residual_blocks[i]->NumResiduals();
row_blocks[i].size = residual_blocks[i]->NumResiduals();
row_blocks[i].position = row_pos;
row_pos += row_blocks[i].size;
}
}
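To make the new Block bookkeeping concrete, a small worked example of what the loops above produce:
// Parameter blocks with tangent sizes {3, 4} yield
//   col_blocks = {{.size = 3, .position = 0}, {.size = 4, .position = 3}}
// and residual blocks with {2, 2} residuals yield
//   row_blocks = {{.size = 2, .position = 0}, {.size = 2, .position = 2}},
// i.e. each Block now records its offset as well as its size.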
void CompressedRowJacobianWriter::GetOrderedParameterBlocks(
const Program* program,
int residual_id,
vector<pair<int, int>>* evaluated_jacobian_blocks) {
const ResidualBlock* residual_block = program->residual_blocks()[residual_id];
std::vector<std::pair<int, int>>* evaluated_jacobian_blocks) {
auto residual_block = program->residual_blocks()[residual_id];
const int num_parameter_blocks = residual_block->NumParameterBlocks();
for (int j = 0; j < num_parameter_blocks; ++j) {
const ParameterBlock* parameter_block =
residual_block->parameter_blocks()[j];
auto parameter_block = residual_block->parameter_blocks()[j];
if (!parameter_block->IsConstant()) {
evaluated_jacobian_blocks->push_back(
make_pair(parameter_block->index(), j));
std::make_pair(parameter_block->index(), j));
}
}
std::sort(evaluated_jacobian_blocks->begin(),
@@ -90,20 +88,29 @@ void CompressedRowJacobianWriter::GetOrderedParameterBlocks(
std::unique_ptr<SparseMatrix> CompressedRowJacobianWriter::CreateJacobian()
const {
const vector<ResidualBlock*>& residual_blocks = program_->residual_blocks();
const auto& residual_blocks = program_->residual_blocks();
int total_num_residuals = program_->NumResiduals();
int total_num_effective_parameters = program_->NumEffectiveParameters();
const int total_num_residuals = program_->NumResiduals();
const int total_num_effective_parameters = program_->NumEffectiveParameters();
// Count the number of jacobian nonzeros.
int num_jacobian_nonzeros = 0;
//
// We use an unsigned int here so that we can compare it to INT_MAX without
// triggering overflow behaviour.
unsigned int num_jacobian_nonzeros = total_num_effective_parameters;
for (auto* residual_block : residual_blocks) {
const int num_residuals = residual_block->NumResiduals();
const int num_parameter_blocks = residual_block->NumParameterBlocks();
for (int j = 0; j < num_parameter_blocks; ++j) {
ParameterBlock* parameter_block = residual_block->parameter_blocks()[j];
auto parameter_block = residual_block->parameter_blocks()[j];
if (!parameter_block->IsConstant()) {
num_jacobian_nonzeros += num_residuals * parameter_block->TangentSize();
if (num_jacobian_nonzeros > std::numeric_limits<int>::max()) {
LOG(ERROR) << "Unable to create Jacobian matrix: Too many entries in "
"the Jacobian matrix. num_jacobian_nonzeros = "
<< num_jacobian_nonzeros;
return nullptr;
}
}
}
}
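A minimal sketch of the failure mode the unsigned counter guards against; the sizes are illustrative:
// 250000 residual blocks, each with 30 residuals against 300 effective
// parameters, contribute 250000 * 30 * 300 = 2.25e9 nonzeros: past
// INT_MAX (~2.147e9) but still representable in a 32-bit unsigned int,
// so the comparison stays well defined instead of being signed overflow.
unsigned int count = 2250000000u;  // needs <limits> for the check below
const bool too_big =
    count > static_cast<unsigned int>(std::numeric_limits<int>::max());
// too_big == true -> CreateJacobian() returns nullptr.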
@@ -112,14 +119,14 @@ std::unique_ptr<SparseMatrix> CompressedRowJacobianWriter::CreateJacobian()
// Allocate more space than needed to store the jacobian so that when the LM
// algorithm adds the diagonal, no reallocation is necessary. This reduces
// peak memory usage significantly.
std::unique_ptr<CompressedRowSparseMatrix> jacobian =
std::make_unique<CompressedRowSparseMatrix>(
total_num_residuals,
total_num_effective_parameters,
num_jacobian_nonzeros + total_num_effective_parameters);
auto jacobian = std::make_unique<CompressedRowSparseMatrix>(
total_num_residuals,
total_num_effective_parameters,
static_cast<int>(num_jacobian_nonzeros));
// At this stage, the CompressedRowSparseMatrix is an invalid state. But this
// seems to be the only way to construct it without doing a memory copy.
// At this stage, the CompressedRowSparseMatrix is in an invalid state. But
// this seems to be the only way to construct it without doing a memory
// copy.
int* rows = jacobian->mutable_rows();
int* cols = jacobian->mutable_cols();
@@ -131,9 +138,9 @@ std::unique_ptr<SparseMatrix> CompressedRowJacobianWriter::CreateJacobian()
// Count the number of derivatives for a row of this residual block and
// build a list of active parameter block indices.
int num_derivatives = 0;
vector<int> parameter_indices;
std::vector<int> parameter_indices;
for (int j = 0; j < num_parameter_blocks; ++j) {
ParameterBlock* parameter_block = residual_block->parameter_blocks()[j];
auto parameter_block = residual_block->parameter_blocks()[j];
if (!parameter_block->IsConstant()) {
parameter_indices.push_back(parameter_block->index());
num_derivatives += parameter_block->TangentSize();
@@ -141,12 +148,12 @@ std::unique_ptr<SparseMatrix> CompressedRowJacobianWriter::CreateJacobian()
}
// Sort the parameters by their position in the state vector.
sort(parameter_indices.begin(), parameter_indices.end());
std::sort(parameter_indices.begin(), parameter_indices.end());
if (adjacent_find(parameter_indices.begin(), parameter_indices.end()) !=
parameter_indices.end()) {
std::string parameter_block_description;
for (int j = 0; j < num_parameter_blocks; ++j) {
ParameterBlock* parameter_block = residual_block->parameter_blocks()[j];
auto parameter_block = residual_block->parameter_blocks()[j];
parameter_block_description += parameter_block->ToString() + "\n";
}
LOG(FATAL) << "Ceres internal error: "
@@ -168,15 +175,13 @@ std::unique_ptr<SparseMatrix> CompressedRowJacobianWriter::CreateJacobian()
// values are updated.
int col_pos = 0;
for (int parameter_index : parameter_indices) {
ParameterBlock* parameter_block =
program_->parameter_blocks()[parameter_index];
auto parameter_block = program_->parameter_blocks()[parameter_index];
const int parameter_block_size = parameter_block->TangentSize();
for (int r = 0; r < num_residuals; ++r) {
// This is the position in the values array of the jacobian where this
// row of the jacobian block should go.
const int column_block_begin = rows[row_pos + r] + col_pos;
for (int c = 0; c < parameter_block_size; ++c) {
cols[column_block_begin + c] = parameter_block->delta_offset() + c;
}
@@ -185,7 +190,8 @@ std::unique_ptr<SparseMatrix> CompressedRowJacobianWriter::CreateJacobian()
}
row_pos += num_residuals;
}
CHECK_EQ(num_jacobian_nonzeros, rows[total_num_residuals]);
CHECK_EQ(num_jacobian_nonzeros - total_num_effective_parameters,
rows[total_num_residuals]);
PopulateJacobianRowAndColumnBlockVectors(program_, jacobian.get());
@@ -201,11 +207,10 @@ void CompressedRowJacobianWriter::Write(int residual_id,
double* jacobian_values = jacobian->mutable_values();
const int* jacobian_rows = jacobian->rows();
const ResidualBlock* residual_block =
program_->residual_blocks()[residual_id];
auto residual_block = program_->residual_blocks()[residual_id];
const int num_residuals = residual_block->NumResiduals();
vector<pair<int, int>> evaluated_jacobian_blocks;
std::vector<std::pair<int, int>> evaluated_jacobian_blocks;
GetOrderedParameterBlocks(program_, residual_id, &evaluated_jacobian_blocks);
// Where in the current row does the jacobian for a parameter block begin.
@@ -214,7 +219,7 @@ void CompressedRowJacobianWriter::Write(int residual_id,
// Iterate over the jacobian blocks in increasing order of their
// positions in the reduced parameter vector.
for (auto& evaluated_jacobian_block : evaluated_jacobian_blocks) {
const ParameterBlock* parameter_block =
auto parameter_block =
program_->parameter_blocks()[evaluated_jacobian_block.first];
const int argument = evaluated_jacobian_block.second;
const int parameter_block_size = parameter_block->TangentSize();
@@ -238,5 +243,4 @@ void CompressedRowJacobianWriter::Write(int residual_id,
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -41,8 +41,7 @@
#include "ceres/internal/export.h"
#include "ceres/scratch_evaluate_preparer.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class CompressedRowSparseMatrix;
class Program;
@@ -107,7 +106,6 @@ class CERES_NO_EXPORT CompressedRowJacobianWriter {
Program* program_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_COMPRESSED_ROW_JACOBIAN_WRITER_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -31,25 +31,24 @@
#include "ceres/compressed_row_sparse_matrix.h"
#include <algorithm>
#include <functional>
#include <memory>
#include <numeric>
#include <random>
#include <vector>
#include "ceres/context_impl.h"
#include "ceres/crs_matrix.h"
#include "ceres/internal/export.h"
#include "ceres/random.h"
#include "ceres/parallel_for.h"
#include "ceres/triplet_sparse_matrix.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::vector;
namespace ceres::internal {
namespace {
// Helper functor used by the constructor for reordering the contents
// of a TripletSparseMatrix. This comparator assumes thay there are no
// of a TripletSparseMatrix. This comparator assumes that there are no
// duplicates in the pair of arrays rows and cols, i.e., there is no
// indices i and j (not equal to each other) s.t.
//
@@ -119,10 +118,12 @@ void TransposeForCompressedRowSparseStructure(const int num_rows,
transpose_rows[0] = 0;
}
template <class RandomNormalFunctor>
void AddRandomBlock(const int num_rows,
const int num_cols,
const int row_block_begin,
const int col_block_begin,
RandomNormalFunctor&& randn,
std::vector<int>* rows,
std::vector<int>* cols,
std::vector<double>* values) {
@@ -130,19 +131,21 @@ void AddRandomBlock(const int num_rows,
for (int c = 0; c < num_cols; ++c) {
rows->push_back(row_block_begin + r);
cols->push_back(col_block_begin + c);
values->push_back(RandNormal());
values->push_back(randn());
}
}
}
template <class RandomNormalFunctor>
void AddSymmetricRandomBlock(const int num_rows,
const int row_block_begin,
RandomNormalFunctor&& randn,
std::vector<int>* rows,
std::vector<int>* cols,
std::vector<double>* values) {
for (int r = 0; r < num_rows; ++r) {
for (int c = r; c < num_rows; ++c) {
const double v = RandNormal();
const double v = randn();
rows->push_back(row_block_begin + r);
cols->push_back(row_block_begin + c);
values->push_back(v);
@@ -163,7 +166,7 @@ CompressedRowSparseMatrix::CompressedRowSparseMatrix(int num_rows,
int max_num_nonzeros) {
num_rows_ = num_rows;
num_cols_ = num_cols;
storage_type_ = UNSYMMETRIC;
storage_type_ = StorageType::UNSYMMETRIC;
rows_.resize(num_rows + 1, 0);
cols_.resize(max_num_nonzeros, 0);
values_.resize(max_num_nonzeros, 0.0);
@@ -202,7 +205,7 @@ CompressedRowSparseMatrix::FromTripletSparseMatrix(
}
// index is the list of indices into the TripletSparseMatrix input.
vector<int> index(input.num_nonzeros(), 0);
std::vector<int> index(input.num_nonzeros(), 0);
for (int i = 0; i < input.num_nonzeros(); ++i) {
index[i] = i;
}
@@ -217,9 +220,8 @@ CompressedRowSparseMatrix::FromTripletSparseMatrix(
input.num_nonzeros() * sizeof(int) + // NOLINT
input.num_nonzeros() * sizeof(double)); // NOLINT
std::unique_ptr<CompressedRowSparseMatrix> output =
std::make_unique<CompressedRowSparseMatrix>(
num_rows, num_cols, input.num_nonzeros());
auto output = std::make_unique<CompressedRowSparseMatrix>(
num_rows, num_cols, input.num_nonzeros());
if (num_rows == 0) {
// No data to copy.
@@ -255,7 +257,7 @@ CompressedRowSparseMatrix::CompressedRowSparseMatrix(const double* diagonal,
num_rows_ = num_rows;
num_cols_ = num_rows;
storage_type_ = UNSYMMETRIC;
storage_type_ = StorageType::UNSYMMETRIC;
rows_.resize(num_rows + 1);
cols_.resize(num_rows);
values_.resize(num_rows);
@@ -276,22 +278,37 @@ void CompressedRowSparseMatrix::SetZero() {
std::fill(values_.begin(), values_.end(), 0);
}
// TODO(sameeragarwal): Make RightMultiply and LeftMultiply
// block-aware for higher performance.
void CompressedRowSparseMatrix::RightMultiply(const double* x,
double* y) const {
// TODO(sameeragarwal): Make RightMultiplyAndAccumulate and
// LeftMultiplyAndAccumulate block-aware for higher performance.
void CompressedRowSparseMatrix::RightMultiplyAndAccumulate(
const double* x, double* y, ContextImpl* context, int num_threads) const {
if (storage_type_ != StorageType::UNSYMMETRIC) {
RightMultiplyAndAccumulate(x, y);
return;
}
auto values = values_.data();
auto rows = rows_.data();
auto cols = cols_.data();
ParallelFor(
context, 0, num_rows_, num_threads, [values, rows, cols, x, y](int row) {
for (int idx = rows[row]; idx < rows[row + 1]; ++idx) {
const int c = cols[idx];
const double v = values[idx];
y[row] += v * x[c];
}
});
}
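An illustrative call of the new threaded overload; the matrix, the input vector x, the ContextImpl and the thread count are assumptions, and y must be pre-initialized because the product is accumulated into it:
Vector y = Vector::Zero(matrix.num_rows());
matrix.RightMultiplyAndAccumulate(
    x.data(), y.data(), &context, /*num_threads=*/4);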
void CompressedRowSparseMatrix::RightMultiplyAndAccumulate(const double* x,
double* y) const {
CHECK(x != nullptr);
CHECK(y != nullptr);
if (storage_type_ == UNSYMMETRIC) {
for (int r = 0; r < num_rows_; ++r) {
for (int idx = rows_[r]; idx < rows_[r + 1]; ++idx) {
const int c = cols_[idx];
const double v = values_[idx];
y[r] += v * x[c];
}
}
} else if (storage_type_ == UPPER_TRIANGULAR) {
if (storage_type_ == StorageType::UNSYMMETRIC) {
RightMultiplyAndAccumulate(x, y, nullptr, 1);
} else if (storage_type_ == StorageType::UPPER_TRIANGULAR) {
// Because of their block structure, we will have entries that lie
// above (below) the diagonal for lower (upper) triangular matrices,
// so the loops below need to account for this.
@@ -317,7 +334,7 @@ void CompressedRowSparseMatrix::RightMultiply(const double* x,
}
}
}
} else if (storage_type_ == LOWER_TRIANGULAR) {
} else if (storage_type_ == StorageType::LOWER_TRIANGULAR) {
for (int r = 0; r < num_rows_; ++r) {
int idx = rows_[r];
const int idx_end = rows_[r + 1];
@@ -340,19 +357,21 @@ void CompressedRowSparseMatrix::RightMultiply(const double* x,
}
}
void CompressedRowSparseMatrix::LeftMultiply(const double* x, double* y) const {
void CompressedRowSparseMatrix::LeftMultiplyAndAccumulate(const double* x,
double* y) const {
CHECK(x != nullptr);
CHECK(y != nullptr);
if (storage_type_ == UNSYMMETRIC) {
if (storage_type_ == StorageType::UNSYMMETRIC) {
for (int r = 0; r < num_rows_; ++r) {
for (int idx = rows_[r]; idx < rows_[r + 1]; ++idx) {
y[cols_[idx]] += values_[idx] * x[r];
}
}
} else {
// Since the matrix is symmetric, LeftMultiply = RightMultiply.
RightMultiply(x, y);
// Since the matrix is symmetric, LeftMultiplyAndAccumulate =
// RightMultiplyAndAccumulate.
RightMultiplyAndAccumulate(x, y);
}
}
@@ -360,11 +379,11 @@ void CompressedRowSparseMatrix::SquaredColumnNorm(double* x) const {
CHECK(x != nullptr);
std::fill(x, x + num_cols_, 0.0);
if (storage_type_ == UNSYMMETRIC) {
if (storage_type_ == StorageType::UNSYMMETRIC) {
for (int idx = 0; idx < rows_[num_rows_]; ++idx) {
x[cols_[idx]] += values_[idx] * values_[idx];
}
} else if (storage_type_ == UPPER_TRIANGULAR) {
} else if (storage_type_ == StorageType::UPPER_TRIANGULAR) {
// Because of their block structure, we will have entries that lie
// above (below) the diagonal for lower (upper) triangular
// matrices, so the loops below need to account for this.
@@ -390,7 +409,7 @@ void CompressedRowSparseMatrix::SquaredColumnNorm(double* x) const {
}
}
}
} else if (storage_type_ == LOWER_TRIANGULAR) {
} else if (storage_type_ == StorageType::LOWER_TRIANGULAR) {
for (int r = 0; r < num_rows_; ++r) {
int idx = rows_[r];
const int idx_end = rows_[r + 1];
@@ -435,7 +454,7 @@ void CompressedRowSparseMatrix::ToDenseMatrix(Matrix* dense_matrix) const {
void CompressedRowSparseMatrix::DeleteRows(int delta_rows) {
CHECK_GE(delta_rows, 0);
CHECK_LE(delta_rows, num_rows_);
CHECK_EQ(storage_type_, UNSYMMETRIC);
CHECK_EQ(storage_type_, StorageType::UNSYMMETRIC);
num_rows_ -= delta_rows;
rows_.resize(num_rows_ + 1);
@@ -451,7 +470,7 @@ void CompressedRowSparseMatrix::DeleteRows(int delta_rows) {
int num_row_blocks = 0;
int num_rows = 0;
while (num_row_blocks < row_blocks_.size() && num_rows < num_rows_) {
num_rows += row_blocks_[num_row_blocks];
num_rows += row_blocks_[num_row_blocks].size;
++num_row_blocks;
}
@@ -459,7 +478,7 @@ void CompressedRowSparseMatrix::DeleteRows(int delta_rows) {
}
void CompressedRowSparseMatrix::AppendRows(const CompressedRowSparseMatrix& m) {
CHECK_EQ(storage_type_, UNSYMMETRIC);
CHECK_EQ(storage_type_, StorageType::UNSYMMETRIC);
CHECK_EQ(m.num_cols(), num_cols_);
CHECK((row_blocks_.empty() && m.row_blocks().empty()) ||
@@ -539,17 +558,15 @@ void CompressedRowSparseMatrix::SetMaxNumNonZeros(int num_nonzeros) {
std::unique_ptr<CompressedRowSparseMatrix>
CompressedRowSparseMatrix::CreateBlockDiagonalMatrix(
const double* diagonal, const vector<int>& blocks) {
int num_rows = 0;
const double* diagonal, const std::vector<Block>& blocks) {
const int num_rows = NumScalarEntries(blocks);
int num_nonzeros = 0;
for (int block_size : blocks) {
num_rows += block_size;
num_nonzeros += block_size * block_size;
for (auto& block : blocks) {
num_nonzeros += block.size * block.size;
}
std::unique_ptr<CompressedRowSparseMatrix> matrix =
std::make_unique<CompressedRowSparseMatrix>(
num_rows, num_rows, num_nonzeros);
auto matrix = std::make_unique<CompressedRowSparseMatrix>(
num_rows, num_rows, num_nonzeros);
int* rows = matrix->mutable_rows();
int* cols = matrix->mutable_cols();
@@ -558,15 +575,17 @@ CompressedRowSparseMatrix::CreateBlockDiagonalMatrix(
int idx_cursor = 0;
int col_cursor = 0;
for (int block_size : blocks) {
for (int r = 0; r < block_size; ++r) {
for (auto& block : blocks) {
for (int r = 0; r < block.size; ++r) {
*(rows++) = idx_cursor;
values[idx_cursor + r] = diagonal[col_cursor + r];
for (int c = 0; c < block_size; ++c, ++idx_cursor) {
if (diagonal != nullptr) {
values[idx_cursor + r] = diagonal[col_cursor + r];
}
for (int c = 0; c < block.size; ++c, ++idx_cursor) {
*(cols++) = col_cursor + c;
}
}
col_cursor += block_size;
col_cursor += block.size;
}
*rows = idx_cursor;
@@ -580,19 +599,18 @@ CompressedRowSparseMatrix::CreateBlockDiagonalMatrix(
std::unique_ptr<CompressedRowSparseMatrix>
CompressedRowSparseMatrix::Transpose() const {
std::unique_ptr<CompressedRowSparseMatrix> transpose =
std::make_unique<CompressedRowSparseMatrix>(
num_cols_, num_rows_, num_nonzeros());
auto transpose = std::make_unique<CompressedRowSparseMatrix>(
num_cols_, num_rows_, num_nonzeros());
switch (storage_type_) {
case UNSYMMETRIC:
transpose->set_storage_type(UNSYMMETRIC);
case StorageType::UNSYMMETRIC:
transpose->set_storage_type(StorageType::UNSYMMETRIC);
break;
case LOWER_TRIANGULAR:
transpose->set_storage_type(UPPER_TRIANGULAR);
case StorageType::LOWER_TRIANGULAR:
transpose->set_storage_type(StorageType::UPPER_TRIANGULAR);
break;
case UPPER_TRIANGULAR:
transpose->set_storage_type(LOWER_TRIANGULAR);
case StorageType::UPPER_TRIANGULAR:
transpose->set_storage_type(StorageType::LOWER_TRIANGULAR);
break;
default:
LOG(FATAL) << "Unknown storage type: " << storage_type_;
@@ -621,13 +639,14 @@ CompressedRowSparseMatrix::Transpose() const {
std::unique_ptr<CompressedRowSparseMatrix>
CompressedRowSparseMatrix::CreateRandomMatrix(
CompressedRowSparseMatrix::RandomMatrixOptions options) {
CompressedRowSparseMatrix::RandomMatrixOptions options,
std::mt19937& prng) {
CHECK_GT(options.num_row_blocks, 0);
CHECK_GT(options.min_row_block_size, 0);
CHECK_GT(options.max_row_block_size, 0);
CHECK_LE(options.min_row_block_size, options.max_row_block_size);
if (options.storage_type == UNSYMMETRIC) {
if (options.storage_type == StorageType::UNSYMMETRIC) {
CHECK_GT(options.num_col_blocks, 0);
CHECK_GT(options.min_col_block_size, 0);
CHECK_GT(options.max_col_block_size, 0);
@@ -642,33 +661,42 @@ CompressedRowSparseMatrix::CreateRandomMatrix(
CHECK_GT(options.block_density, 0.0);
CHECK_LE(options.block_density, 1.0);
vector<int> row_blocks;
vector<int> col_blocks;
std::vector<Block> row_blocks;
row_blocks.reserve(options.num_row_blocks);
std::vector<Block> col_blocks;
col_blocks.reserve(options.num_col_blocks);
std::uniform_int_distribution<int> col_distribution(
options.min_col_block_size, options.max_col_block_size);
std::uniform_int_distribution<int> row_distribution(
options.min_row_block_size, options.max_row_block_size);
std::uniform_real_distribution<double> uniform01(0.0, 1.0);
std::normal_distribution<double> standard_normal_distribution;
// Generate the row block structure.
int row_pos = 0;
for (int i = 0; i < options.num_row_blocks; ++i) {
// Generate a random integer in [min_row_block_size, max_row_block_size]
const int delta_block_size =
Uniform(options.max_row_block_size - options.min_row_block_size);
row_blocks.push_back(options.min_row_block_size + delta_block_size);
row_blocks.emplace_back(row_distribution(prng), row_pos);
row_pos += row_blocks.back().size;
}
if (options.storage_type == UNSYMMETRIC) {
if (options.storage_type == StorageType::UNSYMMETRIC) {
// Generate the col block structure.
int col_pos = 0;
for (int i = 0; i < options.num_col_blocks; ++i) {
// Generate a random integer in [min_col_block_size, max_col_block_size]
const int delta_block_size =
Uniform(options.max_col_block_size - options.min_col_block_size);
col_blocks.push_back(options.min_col_block_size + delta_block_size);
col_blocks.emplace_back(col_distribution(prng), col_pos);
col_pos += col_blocks.back().size;
}
} else {
// Symmetric matrices (LOWER_TRIANGULAR or UPPER_TRIANGULAR);
col_blocks = row_blocks;
}
vector<int> tsm_rows;
vector<int> tsm_cols;
vector<double> tsm_values;
std::vector<int> tsm_rows;
std::vector<int> tsm_cols;
std::vector<double> tsm_values;
// For ease of construction, we are going to generate the
// CompressedRowSparseMatrix by generating it as a
@@ -687,51 +715,55 @@ CompressedRowSparseMatrix::CreateRandomMatrix(
for (int r = 0; r < options.num_row_blocks; ++r) {
int col_block_begin = 0;
for (int c = 0; c < options.num_col_blocks; ++c) {
if (((options.storage_type == UPPER_TRIANGULAR) && (r > c)) ||
((options.storage_type == LOWER_TRIANGULAR) && (r < c))) {
col_block_begin += col_blocks[c];
if (((options.storage_type == StorageType::UPPER_TRIANGULAR) &&
(r > c)) ||
((options.storage_type == StorageType::LOWER_TRIANGULAR) &&
(r < c))) {
col_block_begin += col_blocks[c].size;
continue;
}
// Randomly determine if this block is present or not.
if (RandDouble() <= options.block_density) {
if (uniform01(prng) <= options.block_density) {
auto randn = [&standard_normal_distribution, &prng] {
return standard_normal_distribution(prng);
};
// If the matrix is symmetric, then we take care to generate
// symmetric diagonal blocks.
if (options.storage_type == UNSYMMETRIC || r != c) {
AddRandomBlock(row_blocks[r],
col_blocks[c],
if (options.storage_type == StorageType::UNSYMMETRIC || r != c) {
AddRandomBlock(row_blocks[r].size,
col_blocks[c].size,
row_block_begin,
col_block_begin,
randn,
&tsm_rows,
&tsm_cols,
&tsm_values);
} else {
AddSymmetricRandomBlock(row_blocks[r],
AddSymmetricRandomBlock(row_blocks[r].size,
row_block_begin,
randn,
&tsm_rows,
&tsm_cols,
&tsm_values);
}
}
col_block_begin += col_blocks[c];
col_block_begin += col_blocks[c].size;
}
row_block_begin += row_blocks[r];
row_block_begin += row_blocks[r].size;
}
}
const int num_rows = std::accumulate(row_blocks.begin(), row_blocks.end(), 0);
const int num_cols = std::accumulate(col_blocks.begin(), col_blocks.end(), 0);
const int num_rows = NumScalarEntries(row_blocks);
const int num_cols = NumScalarEntries(col_blocks);
const bool kDoNotTranspose = false;
std::unique_ptr<CompressedRowSparseMatrix> matrix =
CompressedRowSparseMatrix::FromTripletSparseMatrix(
TripletSparseMatrix(
num_rows, num_cols, tsm_rows, tsm_cols, tsm_values),
kDoNotTranspose);
auto matrix = CompressedRowSparseMatrix::FromTripletSparseMatrix(
TripletSparseMatrix(num_rows, num_cols, tsm_rows, tsm_cols, tsm_values),
kDoNotTranspose);
(*matrix->mutable_row_blocks()) = row_blocks;
(*matrix->mutable_col_blocks()) = col_blocks;
matrix->set_storage_type(options.storage_type);
return matrix;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,8 +32,10 @@
#define CERES_INTERNAL_COMPRESSED_ROW_SPARSE_MATRIX_H_
#include <memory>
#include <random>
#include <vector>
#include "ceres/block_structure.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
#include "ceres/sparse_matrix.h"
@@ -46,11 +48,12 @@ struct CRSMatrix;
namespace internal {
class ContextImpl;
class TripletSparseMatrix;
class CERES_NO_EXPORT CompressedRowSparseMatrix : public SparseMatrix {
public:
enum StorageType {
enum class StorageType {
UNSYMMETRIC,
// Matrix is assumed to be symmetric but only the lower triangular
// part of the matrix is stored.
@@ -100,8 +103,12 @@ class CERES_NO_EXPORT CompressedRowSparseMatrix : public SparseMatrix {
// SparseMatrix interface.
~CompressedRowSparseMatrix() override;
void SetZero() final;
void RightMultiply(const double* x, double* y) const final;
void LeftMultiply(const double* x, double* y) const final;
void RightMultiplyAndAccumulate(const double* x, double* y) const final;
void RightMultiplyAndAccumulate(const double* x,
double* y,
ContextImpl* context,
int num_threads) const final;
void LeftMultiplyAndAccumulate(const double* x, double* y) const final;
void SquaredColumnNorm(double* x) const final;
void ScaleColumns(const double* scale) final;
void ToDenseMatrix(Matrix* dense_matrix) const final;
@@ -109,8 +116,8 @@ class CERES_NO_EXPORT CompressedRowSparseMatrix : public SparseMatrix {
int num_rows() const final { return num_rows_; }
int num_cols() const final { return num_cols_; }
int num_nonzeros() const final { return rows_[num_rows_]; }
const double* values() const final { return &values_[0]; }
double* mutable_values() final { return &values_[0]; }
const double* values() const final { return values_.data(); }
double* mutable_values() final { return values_.data(); }
// Delete the bottom delta_rows.
// num_rows -= delta_rows
@@ -132,28 +139,28 @@ class CERES_NO_EXPORT CompressedRowSparseMatrix : public SparseMatrix {
void set_num_cols(const int num_cols) { num_cols_ = num_cols; }
// Low level access methods that expose the structure of the matrix.
const int* cols() const { return &cols_[0]; }
int* mutable_cols() { return &cols_[0]; }
const int* cols() const { return cols_.data(); }
int* mutable_cols() { return cols_.data(); }
const int* rows() const { return &rows_[0]; }
int* mutable_rows() { return &rows_[0]; }
const int* rows() const { return rows_.data(); }
int* mutable_rows() { return rows_.data(); }
StorageType storage_type() const { return storage_type_; }
void set_storage_type(const StorageType storage_type) {
storage_type_ = storage_type;
}
const std::vector<int>& row_blocks() const { return row_blocks_; }
std::vector<int>* mutable_row_blocks() { return &row_blocks_; }
const std::vector<Block>& row_blocks() const { return row_blocks_; }
std::vector<Block>* mutable_row_blocks() { return &row_blocks_; }
const std::vector<int>& col_blocks() const { return col_blocks_; }
std::vector<int>* mutable_col_blocks() { return &col_blocks_; }
const std::vector<Block>& col_blocks() const { return col_blocks_; }
std::vector<Block>* mutable_col_blocks() { return &col_blocks_; }
// Create a block diagonal CompressedRowSparseMatrix with the given
// block structure. The individual blocks are assumed to be laid out
// contiguously in the diagonal array, one block at a time.
static std::unique_ptr<CompressedRowSparseMatrix> CreateBlockDiagonalMatrix(
const double* diagonal, const std::vector<int>& blocks);
const double* diagonal, const std::vector<Block>& blocks);
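// Example: diagonal = {1, 2, 3} with blocks {{2, 0}, {1, 2}} yields the
// 3x3 block-diagonal matrix
//   [1 0 0]
//   [0 2 0]
//   [0 0 3]
// stored with 2*2 + 1*1 = 5 nonzeros, since the zeros inside the 2x2
// diagonal block are stored explicitly.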
// Options struct to control the generation of random block sparse
// matrices in compressed row sparse format.
@@ -165,7 +172,7 @@ class CERES_NO_EXPORT CompressedRowSparseMatrix : public SparseMatrix {
// given bounds.
//
// Then we walk the block structure of the resulting matrix, and with
// probability block_density detemine whether they are structurally
// probability block_density determine whether they are structurally
// zero or not. If the answer is no, then we generate entries for the
// block which are distributed normally.
struct RandomMatrixOptions {
@@ -176,7 +183,7 @@ class CERES_NO_EXPORT CompressedRowSparseMatrix : public SparseMatrix {
// (lower triangular) part. In this case, num_col_blocks,
// min_col_block_size and max_col_block_size will be ignored and
// assumed to be equal to the corresponding row settings.
StorageType storage_type = UNSYMMETRIC;
StorageType storage_type = StorageType::UNSYMMETRIC;
int num_row_blocks = 0;
int min_row_block_size = 0;
@@ -195,7 +202,7 @@ class CERES_NO_EXPORT CompressedRowSparseMatrix : public SparseMatrix {
// normally distributed and whose structure is determined by
// RandomMatrixOptions.
static std::unique_ptr<CompressedRowSparseMatrix> CreateRandomMatrix(
RandomMatrixOptions options);
RandomMatrixOptions options, std::mt19937& prng);
private:
static std::unique_ptr<CompressedRowSparseMatrix> FromTripletSparseMatrix(
@@ -209,14 +216,31 @@ class CERES_NO_EXPORT CompressedRowSparseMatrix : public SparseMatrix {
StorageType storage_type_;
// If the matrix has an underlying block structure, then it can also
// carry with it row and column block sizes. This is auxilliary and
// carry with it row and column block sizes. This is auxiliary and
// optional information for use by algorithms operating on the
// matrix. The class itself does not make use of this information in
// any way.
std::vector<int> row_blocks_;
std::vector<int> col_blocks_;
std::vector<Block> row_blocks_;
std::vector<Block> col_blocks_;
};
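A usage sketch for the updated CreateRandomMatrix API, which now takes a caller-owned std::mt19937; the option values here are arbitrary:
CompressedRowSparseMatrix::RandomMatrixOptions options;
options.num_row_blocks = 4;
options.min_row_block_size = 1;
options.max_row_block_size = 3;
options.num_col_blocks = 4;
options.min_col_block_size = 1;
options.max_col_block_size = 3;
options.block_density = 0.5;
std::mt19937 prng(42);  // deterministic seed, reproducible structure
std::unique_ptr<CompressedRowSparseMatrix> matrix =
    CompressedRowSparseMatrix::CreateRandomMatrix(options, prng);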
inline std::ostream& operator<<(std::ostream& s,
CompressedRowSparseMatrix::StorageType type) {
switch (type) {
case CompressedRowSparseMatrix::StorageType::UNSYMMETRIC:
s << "UNSYMMETRIC";
break;
case CompressedRowSparseMatrix::StorageType::UPPER_TRIANGULAR:
s << "UPPER_TRIANGULAR";
break;
case CompressedRowSparseMatrix::StorageType::LOWER_TRIANGULAR:
s << "LOWER_TRIANGULAR";
break;
default:
s << "UNKNOWN CompressedRowSparseMatrix::StorageType";
}
return s;
}
} // namespace internal
} // namespace ceres

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -38,8 +38,7 @@
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// A thread-safe multi-producer, multi-consumer queue for queueing items that
// are typically handled asynchronously by multiple threads. The ConcurrentQueue
@@ -152,7 +151,6 @@ class ConcurrentQueue {
bool wait_{true};
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_CONCURRENT_QUEUE_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,253 +0,0 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)
//
// A preconditioned conjugate gradients solver
// (ConjugateGradientsSolver) for positive semidefinite linear
// systems.
//
// We have also augmented the termination criterion used by this
// solver to support not just residual based termination but also
// termination based on decrease in the value of the quadratic model
// that CG optimizes.
#include "ceres/conjugate_gradients_solver.h"
#include <cmath>
#include <cstddef>
#include <utility>
#include "ceres/internal/eigen.h"
#include "ceres/linear_operator.h"
#include "ceres/stringprintf.h"
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace {
bool IsZeroOrInfinity(double x) { return ((x == 0.0) || std::isinf(x)); }
} // namespace
ConjugateGradientsSolver::ConjugateGradientsSolver(
LinearSolver::Options options)
: options_(std::move(options)) {}
LinearSolver::Summary ConjugateGradientsSolver::Solve(
LinearOperator* A,
const double* b,
const LinearSolver::PerSolveOptions& per_solve_options,
double* x) {
CHECK(A != nullptr);
CHECK(x != nullptr);
CHECK(b != nullptr);
CHECK_EQ(A->num_rows(), A->num_cols());
LinearSolver::Summary summary;
summary.termination_type = LINEAR_SOLVER_NO_CONVERGENCE;
summary.message = "Maximum number of iterations reached.";
summary.num_iterations = 0;
const int num_cols = A->num_cols();
VectorRef xref(x, num_cols);
ConstVectorRef bref(b, num_cols);
const double norm_b = bref.norm();
if (norm_b == 0.0) {
xref.setZero();
summary.termination_type = LINEAR_SOLVER_SUCCESS;
summary.message = "Convergence. |b| = 0.";
return summary;
}
Vector r(num_cols);
Vector p(num_cols);
Vector z(num_cols);
Vector tmp(num_cols);
const double tol_r = per_solve_options.r_tolerance * norm_b;
tmp.setZero();
A->RightMultiply(x, tmp.data());
r = bref - tmp;
double norm_r = r.norm();
if (options_.min_num_iterations == 0 && norm_r <= tol_r) {
summary.termination_type = LINEAR_SOLVER_SUCCESS;
summary.message =
StringPrintf("Convergence. |r| = %e <= %e.", norm_r, tol_r);
return summary;
}
double rho = 1.0;
// Initial value of the quadratic model Q = x'Ax - 2 * b'x.
double Q0 = -1.0 * xref.dot(bref + r);
for (summary.num_iterations = 1;; ++summary.num_iterations) {
// Apply preconditioner
if (per_solve_options.preconditioner != nullptr) {
z.setZero();
per_solve_options.preconditioner->RightMultiply(r.data(), z.data());
} else {
z = r;
}
double last_rho = rho;
rho = r.dot(z);
if (IsZeroOrInfinity(rho)) {
summary.termination_type = LINEAR_SOLVER_FAILURE;
summary.message = StringPrintf("Numerical failure. rho = r'z = %e.", rho);
break;
}
if (summary.num_iterations == 1) {
p = z;
} else {
double beta = rho / last_rho;
if (IsZeroOrInfinity(beta)) {
summary.termination_type = LINEAR_SOLVER_FAILURE;
summary.message = StringPrintf(
"Numerical failure. beta = rho_n / rho_{n-1} = %e, "
"rho_n = %e, rho_{n-1} = %e",
beta,
rho,
last_rho);
break;
}
p = z + beta * p;
}
Vector& q = z;
q.setZero();
A->RightMultiply(p.data(), q.data());
const double pq = p.dot(q);
if ((pq <= 0) || std::isinf(pq)) {
summary.termination_type = LINEAR_SOLVER_NO_CONVERGENCE;
summary.message = StringPrintf(
"Matrix is indefinite, no more progress can be made. "
"p'q = %e. |p| = %e, |q| = %e",
pq,
p.norm(),
q.norm());
break;
}
const double alpha = rho / pq;
if (std::isinf(alpha)) {
summary.termination_type = LINEAR_SOLVER_FAILURE;
summary.message = StringPrintf(
"Numerical failure. alpha = rho / pq = %e, rho = %e, pq = %e.",
alpha,
rho,
pq);
break;
}
xref = xref + alpha * p;
// Ideally we would just use the update r = r - alpha*q to keep
// track of the residual vector. However this estimate tends to
// drift over time due to round off errors. Thus every
// residual_reset_period iterations, we calculate the residual as
// r = b - Ax. We do not do this every iteration because this
// requires an additional matrix vector multiply which would
// double the complexity of the CG algorithm.
if (summary.num_iterations % options_.residual_reset_period == 0) {
tmp.setZero();
A->RightMultiply(x, tmp.data());
r = bref - tmp;
} else {
r = r - alpha * q;
}
// Quadratic model based termination.
// Q1 = x'Ax - 2 * b' x.
const double Q1 = -1.0 * xref.dot(bref + r);
// For PSD matrices A, let
//
// Q(x) = x'Ax - 2b'x
//
// be the cost of the quadratic function defined by A and b. Then,
// the solver terminates at iteration i if
//
// i * (Q(x_i) - Q(x_i-1)) / Q(x_i) < q_tolerance.
//
// This termination criterion is more useful when using CG to
// solve the Newton step. This particular convergence test comes
// from Stephen Nash's work on truncated Newton
// methods. References:
//
// 1. Stephen G. Nash & Ariela Sofer, Assessing A Search
// Direction Within A Truncated Newton Method, Operations
// Research Letters 9 (1990), 219-221.
//
// 2. Stephen G. Nash, A Survey of Truncated Newton Methods,
// Journal of Computational and Applied Mathematics,
// 124(1-2), 45-59, 2000.
//
const double zeta = summary.num_iterations * (Q1 - Q0) / Q1;
if (zeta < per_solve_options.q_tolerance &&
summary.num_iterations >= options_.min_num_iterations) {
summary.termination_type = LINEAR_SOLVER_SUCCESS;
summary.message =
StringPrintf("Iteration: %d Convergence: zeta = %e < %e. |r| = %e",
summary.num_iterations,
zeta,
per_solve_options.q_tolerance,
r.norm());
break;
}
Q0 = Q1;
// Residual based termination.
norm_r = r.norm();
if (norm_r <= tol_r &&
summary.num_iterations >= options_.min_num_iterations) {
summary.termination_type = LINEAR_SOLVER_SUCCESS;
summary.message =
StringPrintf("Iteration: %d Convergence. |r| = %e <= %e.",
summary.num_iterations,
norm_r,
tol_r);
break;
}
if (summary.num_iterations >= options_.max_num_iterations) {
break;
}
}
return summary;
}
} // namespace internal
} // namespace ceres

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -34,42 +34,277 @@
#ifndef CERES_INTERNAL_CONJUGATE_GRADIENTS_SOLVER_H_
#define CERES_INTERNAL_CONJUGATE_GRADIENTS_SOLVER_H_
#include <cmath>
#include <cstddef>
#include <utility>
#include "ceres/eigen_vector_ops.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "ceres/linear_operator.h"
#include "ceres/linear_solver.h"
#include "ceres/stringprintf.h"
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class LinearOperator;
// This class implements the now classical Conjugate Gradients
// algorithm of Hestenes & Stiefel for solving positive semidefinite
// linear systems. Optionally it can also use a preconditioner to
// reduce the condition number of the linear system and improve the
// convergence rate. Modern references for Conjugate Gradients are the
// books by Yousef Saad and Trefethen & Bau. This implementation of CG
// has been augmented with additional termination tests that are
// needed for forcing early termination when used as part of an
// inexact Newton solver.
//
// For more details see the documentation for
// LinearSolver::PerSolveOptions::r_tolerance and
// LinearSolver::PerSolveOptions::q_tolerance in linear_solver.h.
class CERES_NO_EXPORT ConjugateGradientsSolver final : public LinearSolver {
// Interface for the linear operator used by ConjugateGradientsSolver.
template <typename DenseVectorType>
class ConjugateGradientsLinearOperator {
public:
explicit ConjugateGradientsSolver(LinearSolver::Options options);
Summary Solve(LinearOperator* A,
const double* b,
const LinearSolver::PerSolveOptions& per_solve_options,
double* x) final;
private:
const LinearSolver::Options options_;
~ConjugateGradientsLinearOperator() = default;
virtual void RightMultiplyAndAccumulate(const DenseVectorType& x,
DenseVectorType& y) = 0;
};
} // namespace internal
} // namespace ceres
// Adapter class that makes LinearOperator appear like an instance of
// ConjugateGradientsLinearOperator.
class LinearOperatorAdapter : public ConjugateGradientsLinearOperator<Vector> {
public:
LinearOperatorAdapter(LinearOperator& linear_operator)
: linear_operator_(linear_operator) {}
void RightMultiplyAndAccumulate(const Vector& x, Vector& y) final {
linear_operator_.RightMultiplyAndAccumulate(x, y);
}
private:
LinearOperator& linear_operator_;
};
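A minimal sketch of a concrete operator: an identity preconditioner, i.e. plain unpreconditioned CG. The class name is hypothetical; the important detail is that RightMultiplyAndAccumulate accumulates into y rather than overwriting it:
class IdentityPreconditioner final
    : public ConjugateGradientsLinearOperator<Vector> {
 public:
  // y += I * x
  void RightMultiplyAndAccumulate(const Vector& x, Vector& y) final {
    y += x;
  }
};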
// Options to control the ConjugateGradientsSolver. For detailed documentation
// for each of these options see linear_solver.h
struct ConjugateGradientsSolverOptions {
int min_num_iterations = 1;
int max_num_iterations = 1;
int residual_reset_period = 10;
double r_tolerance = 0.0;
double q_tolerance = 0.0;
ContextImpl* context = nullptr;
int num_threads = 1;
};
// This function implements the now classical Conjugate Gradients algorithm of
// Hestenes & Stiefel for solving positive semidefinite linear systems.
// Optionally it can also use a preconditioner to reduce the condition number of
// the linear system and improve the convergence rate. Modern references for
// Conjugate Gradients are the books by Yousef Saad and Trefethen & Bau. This
// implementation of CG has been augmented with additional termination tests
// that are needed for forcing early termination when used as part of an inexact
// Newton solver.
//
// This implementation is templated over DenseVectorType and then in turn on
// ConjugateGradientsLinearOperator, which allows us to write an abstract
// implementation of the Conjugate Gradients algorithm without worrying about how
// these objects are implemented or where they are stored. In particular it
// allows us to have a single implementation that works on CPU and GPU based
// matrices and vectors.
//
// scratch must contain pointers to four DenseVector objects of the same size as
// rhs and solution. By asking the user for scratch space, we guarantee that we
// will not perform any allocations inside this function.
template <typename DenseVectorType>
LinearSolver::Summary ConjugateGradientsSolver(
const ConjugateGradientsSolverOptions options,
ConjugateGradientsLinearOperator<DenseVectorType>& lhs,
const DenseVectorType& rhs,
ConjugateGradientsLinearOperator<DenseVectorType>& preconditioner,
DenseVectorType* scratch[4],
DenseVectorType& solution) {
auto IsZeroOrInfinity = [](double x) {
return ((x == 0.0) || std::isinf(x));
};
DenseVectorType& p = *scratch[0];
DenseVectorType& r = *scratch[1];
DenseVectorType& z = *scratch[2];
DenseVectorType& tmp = *scratch[3];
LinearSolver::Summary summary;
summary.termination_type = LinearSolverTerminationType::NO_CONVERGENCE;
summary.message = "Maximum number of iterations reached.";
summary.num_iterations = 0;
const double norm_rhs = Norm(rhs, options.context, options.num_threads);
if (norm_rhs == 0.0) {
SetZero(solution, options.context, options.num_threads);
summary.termination_type = LinearSolverTerminationType::SUCCESS;
summary.message = "Convergence. |b| = 0.";
return summary;
}
const double tol_r = options.r_tolerance * norm_rhs;
SetZero(tmp, options.context, options.num_threads);
lhs.RightMultiplyAndAccumulate(solution, tmp);
// r = rhs - tmp
Axpby(1.0, rhs, -1.0, tmp, r, options.context, options.num_threads);
double norm_r = Norm(r, options.context, options.num_threads);
if (options.min_num_iterations == 0 && norm_r <= tol_r) {
summary.termination_type = LinearSolverTerminationType::SUCCESS;
summary.message =
StringPrintf("Convergence. |r| = %e <= %e.", norm_r, tol_r);
return summary;
}
double rho = 1.0;
// Initial value of the quadratic model Q = x'Ax - 2 * b'x.
// double Q0 = -1.0 * solution.dot(rhs + r);
Axpby(1.0, rhs, 1.0, r, tmp, options.context, options.num_threads);
double Q0 = -Dot(solution, tmp, options.context, options.num_threads);
for (summary.num_iterations = 1;; ++summary.num_iterations) {
SetZero(z, options.context, options.num_threads);
preconditioner.RightMultiplyAndAccumulate(r, z);
const double last_rho = rho;
// rho = r.dot(z);
rho = Dot(r, z, options.context, options.num_threads);
if (IsZeroOrInfinity(rho)) {
summary.termination_type = LinearSolverTerminationType::FAILURE;
summary.message = StringPrintf("Numerical failure. rho = r'z = %e.", rho);
break;
}
if (summary.num_iterations == 1) {
Copy(z, p, options.context, options.num_threads);
} else {
const double beta = rho / last_rho;
if (IsZeroOrInfinity(beta)) {
summary.termination_type = LinearSolverTerminationType::FAILURE;
summary.message = StringPrintf(
"Numerical failure. beta = rho_n / rho_{n-1} = %e, "
"rho_n = %e, rho_{n-1} = %e",
beta,
rho,
last_rho);
break;
}
// p = z + beta * p;
Axpby(1.0, z, beta, p, p, options.context, options.num_threads);
}
DenseVectorType& q = z;
SetZero(q, options.context, options.num_threads);
lhs.RightMultiplyAndAccumulate(p, q);
const double pq = Dot(p, q, options.context, options.num_threads);
if ((pq <= 0) || std::isinf(pq)) {
summary.termination_type = LinearSolverTerminationType::NO_CONVERGENCE;
summary.message = StringPrintf(
"Matrix is indefinite, no more progress can be made. "
"p'q = %e. |p| = %e, |q| = %e",
pq,
Norm(p, options.context, options.num_threads),
Norm(q, options.context, options.num_threads));
break;
}
const double alpha = rho / pq;
if (std::isinf(alpha)) {
summary.termination_type = LinearSolverTerminationType::FAILURE;
summary.message = StringPrintf(
"Numerical failure. alpha = rho / pq = %e, rho = %e, pq = %e.",
alpha,
rho,
pq);
break;
}
// solution = solution + alpha * p;
Axpby(1.0,
solution,
alpha,
p,
solution,
options.context,
options.num_threads);
// Ideally we would just use the update r = r - alpha*q to keep
// track of the residual vector. However this estimate tends to
// drift over time due to round off errors. Thus every
// residual_reset_period iterations, we calculate the residual as
// r = b - Ax. We do not do this every iteration because this
// requires an additional matrix vector multiply which would
// double the complexity of the CG algorithm.
if (summary.num_iterations % options.residual_reset_period == 0) {
SetZero(tmp, options.context, options.num_threads);
lhs.RightMultiplyAndAccumulate(solution, tmp);
Axpby(1.0, rhs, -1.0, tmp, r, options.context, options.num_threads);
// r = rhs - tmp;
} else {
Axpby(1.0, r, -alpha, q, r, options.context, options.num_threads);
// r = r - alpha * q;
}
// Quadratic model based termination.
// Q1 = x'Ax - 2 * b' x.
// const double Q1 = -1.0 * solution.dot(rhs + r);
Axpby(1.0, rhs, 1.0, r, tmp, options.context, options.num_threads);
const double Q1 = -Dot(solution, tmp, options.context, options.num_threads);
// For PSD matrices A, let
//
// Q(x) = x'Ax - 2b'x
//
// be the cost of the quadratic function defined by A and b. Then,
// the solver terminates at iteration i if
//
// i * (Q(x_i) - Q(x_i-1)) / Q(x_i) < q_tolerance.
//
// This termination criterion is more useful when using CG to
// solve the Newton step. This particular convergence test comes
// from Stephen Nash's work on truncated Newton
// methods. References:
//
// 1. Stephen G. Nash & Ariela Sofer, Assessing A Search
// Direction Within A Truncated Newton Method, Operations
// Research Letters 9 (1990), 219-221.
//
// 2. Stephen G. Nash, A Survey of Truncated Newton Methods,
// Journal of Computational and Applied Mathematics,
// 124(1-2), 45-59, 2000.
//
const double zeta = summary.num_iterations * (Q1 - Q0) / Q1;
if (zeta < options.q_tolerance &&
summary.num_iterations >= options.min_num_iterations) {
summary.termination_type = LinearSolverTerminationType::SUCCESS;
summary.message =
StringPrintf("Iteration: %d Convergence: zeta = %e < %e. |r| = %e",
summary.num_iterations,
zeta,
options.q_tolerance,
Norm(r, options.context, options.num_threads));
break;
}
Q0 = Q1;
// Residual based termination.
norm_r = Norm(r, options.context, options.num_threads);
if (norm_r <= tol_r &&
summary.num_iterations >= options.min_num_iterations) {
summary.termination_type = LinearSolverTerminationType::SUCCESS;
summary.message =
StringPrintf("Iteration: %d Convergence. |r| = %e <= %e.",
summary.num_iterations,
norm_r,
tol_r);
break;
}
if (summary.num_iterations >= options.max_num_iterations) {
break;
}
}
return summary;
}
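Finally, a sketch of a CPU-side call of the templated solver above, reusing the hypothetical IdentityPreconditioner from earlier; jacobian_operator, rhs, num_cols and context are assumed to come from the caller, who also supplies the four scratch vectors so the solver itself never allocates:
ConjugateGradientsSolverOptions options;
options.max_num_iterations = 100;
options.r_tolerance = 1e-9;
options.context = &context;  // hypothetical caller-owned ContextImpl
LinearOperatorAdapter lhs(jacobian_operator);  // wraps a LinearOperator
IdentityPreconditioner preconditioner;
Vector solution = Vector::Zero(num_cols);
Vector s0(num_cols), s1(num_cols), s2(num_cols), s3(num_cols);
Vector* scratch[4] = {&s0, &s1, &s2, &s3};
LinearSolver::Summary summary = ConjugateGradientsSolver(
    options, lhs, rhs, preconditioner, scratch, solution);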
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -33,6 +33,8 @@
#include <string>
#include "ceres/internal/config.h"
#include "ceres/stringprintf.h"
#include "ceres/wall_time.h"
#ifndef CERES_NO_CUDA
#include "cublas_v2.h"
@@ -40,69 +42,155 @@
#include "cusolverDn.h"
#endif // CERES_NO_CUDA
namespace ceres {
namespace internal {
namespace ceres::internal {
ContextImpl::ContextImpl() = default;
#ifndef CERES_NO_CUDA
bool ContextImpl::InitCUDA(std::string* message) {
if (cuda_initialized_) {
void ContextImpl::TearDown() {
if (cusolver_handle_ != nullptr) {
cusolverDnDestroy(cusolver_handle_);
cusolver_handle_ = nullptr;
}
if (cublas_handle_ != nullptr) {
cublasDestroy(cublas_handle_);
cublas_handle_ = nullptr;
}
if (cusparse_handle_ != nullptr) {
cusparseDestroy(cusparse_handle_);
cusparse_handle_ = nullptr;
}
for (auto& s : streams_) {
if (s != nullptr) {
cudaStreamDestroy(s);
s = nullptr;
}
}
is_cuda_initialized_ = false;
}
std::string ContextImpl::CudaConfigAsString() const {
return ceres::internal::StringPrintf(
"======================= CUDA Device Properties ======================\n"
"Cuda version : %d.%d\n"
"Device ID : %d\n"
"Device name : %s\n"
"Total GPU memory : %6.f MiB\n"
"GPU memory available : %6.f MiB\n"
"Compute capability : %d.%d\n"
"Warp size : %d\n"
"Max threads per block : %d\n"
"Max threads per dim : %d %d %d\n"
"Max grid size : %d %d %d\n"
"Multiprocessor count : %d\n"
"cudaMallocAsync supported : %s\n"
"====================================================================",
cuda_version_major_,
cuda_version_minor_,
gpu_device_id_in_use_,
gpu_device_properties_.name,
gpu_device_properties_.totalGlobalMem / 1024.0 / 1024.0,
GpuMemoryAvailable() / 1024.0 / 1024.0,
gpu_device_properties_.major,
gpu_device_properties_.minor,
gpu_device_properties_.warpSize,
gpu_device_properties_.maxThreadsPerBlock,
gpu_device_properties_.maxThreadsDim[0],
gpu_device_properties_.maxThreadsDim[1],
gpu_device_properties_.maxThreadsDim[2],
gpu_device_properties_.maxGridSize[0],
gpu_device_properties_.maxGridSize[1],
gpu_device_properties_.maxGridSize[2],
gpu_device_properties_.multiProcessorCount,
// In CUDA 12.0.0+ cudaDeviceProp has field memoryPoolsSupported, but it
// is not available in older versions
is_cuda_memory_pools_supported_ ? "Yes" : "No");
}
size_t ContextImpl::GpuMemoryAvailable() const {
size_t free, total;
cudaMemGetInfo(&free, &total);
return free;
}
bool ContextImpl::InitCuda(std::string* message) {
if (is_cuda_initialized_) {
return true;
}
CHECK_EQ(cudaGetDevice(&gpu_device_id_in_use_), cudaSuccess);
int cuda_version;
CHECK_EQ(cudaRuntimeGetVersion(&cuda_version), cudaSuccess);
cuda_version_major_ = cuda_version / 1000;
cuda_version_minor_ = (cuda_version % 1000) / 10;
CHECK_EQ(
cudaGetDeviceProperties(&gpu_device_properties_, gpu_device_id_in_use_),
cudaSuccess);
#if CUDART_VERSION >= 11020
int is_cuda_memory_pools_supported;
CHECK_EQ(cudaDeviceGetAttribute(&is_cuda_memory_pools_supported,
cudaDevAttrMemoryPoolsSupported,
gpu_device_id_in_use_),
cudaSuccess);
is_cuda_memory_pools_supported_ = is_cuda_memory_pools_supported == 1;
#endif
VLOG(3) << "\n" << CudaConfigAsString();
EventLogger event_logger("InitCuda");
if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
*message = "cuBLAS::cublasCreate failed.";
cublas_handle_ = nullptr;
return false;
}
if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnCreate failed.";
cusolver_handle_ = nullptr;
cublasDestroy(cublas_handle_);
cublas_handle_ = nullptr;
return false;
}
if (cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking) !=
cudaSuccess) {
*message = "CUDA::cudaStreamCreateWithFlags failed.";
cusolverDnDestroy(cusolver_handle_);
cublasDestroy(cublas_handle_);
cusolver_handle_ = nullptr;
cublas_handle_ = nullptr;
stream_ = nullptr;
return false;
}
if (cusolverDnSetStream(cusolver_handle_, stream_) !=
CUSOLVER_STATUS_SUCCESS ||
cublasSetStream(cublas_handle_, stream_) != CUBLAS_STATUS_SUCCESS) {
*message =
"cuSolverDN::cusolverDnSetStream or cuBLAS::cublasSetStream failed.";
cusolverDnDestroy(cusolver_handle_);
cublasDestroy(cublas_handle_);
cudaStreamDestroy(stream_);
cusolver_handle_ = nullptr;
"CUDA initialization failed because cuBLAS::cublasCreate failed.";
cublas_handle_ = nullptr;
stream_ = nullptr;
return false;
}
cuda_initialized_ = true;
event_logger.AddEvent("cublasCreate");
if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) {
*message =
"CUDA initialization failed because cuSolverDN::cusolverDnCreate "
"failed.";
TearDown();
return false;
}
event_logger.AddEvent("cusolverDnCreate");
if (cusparseCreate(&cusparse_handle_) != CUSPARSE_STATUS_SUCCESS) {
*message =
"CUDA initialization failed because cuSPARSE::cusparseCreate failed.";
TearDown();
return false;
}
event_logger.AddEvent("cusparseCreate");
for (auto& s : streams_) {
if (cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking) != cudaSuccess) {
*message =
"CUDA initialization failed because CUDA::cudaStreamCreateWithFlags "
"failed.";
TearDown();
return false;
}
}
event_logger.AddEvent("cudaStreamCreateWithFlags");
if (cusolverDnSetStream(cusolver_handle_, DefaultStream()) !=
CUSOLVER_STATUS_SUCCESS ||
cublasSetStream(cublas_handle_, DefaultStream()) !=
CUBLAS_STATUS_SUCCESS ||
cusparseSetStream(cusparse_handle_, DefaultStream()) !=
CUSPARSE_STATUS_SUCCESS) {
*message = "CUDA initialization failed because SetStream failed.";
TearDown();
return false;
}
event_logger.AddEvent("SetStream");
is_cuda_initialized_ = true;
return true;
}
#endif // CERES_NO_CUDA
ContextImpl::~ContextImpl() {
#ifndef CERES_NO_CUDA
if (cuda_initialized_) {
cusolverDnDestroy(cusolver_handle_);
cublasDestroy(cublas_handle_);
cudaStreamDestroy(stream_);
}
TearDown();
#endif // CERES_NO_CUDA
}
void ContextImpl::EnsureMinimumThreads(int num_threads) {
#ifdef CERES_USE_CXX_THREADS
thread_pool.Resize(num_threads);
#endif // CERES_USE_CXX_THREADS
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
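A hedged sketch of how the initialization path above is driven; the include path is an assumption, and the pattern mirrors the CHECK in CudaBlockSparseCRSViewTest::SetUp later in this commit:

#include <string>

#include <glog/logging.h>

#include "ceres/context_impl.h"  // include path assumed

void InitCudaOrDie() {
  ceres::internal::ContextImpl context;
  std::string message;
  // On failure, `message` carries the human-readable reason assembled above,
  // e.g. "CUDA initialization failed because cuBLAS::cublasCreate failed."
  CHECK(context.InitCuda(&message)) << message;
  // Repeated calls are cheap: is_cuda_initialized_ short-circuits to true.
  VLOG(3) << context.CudaConfigAsString();
}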

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,14 +46,12 @@
#include "cublas_v2.h"
#include "cuda_runtime.h"
#include "cusolverDn.h"
#include "cusparse.h"
#endif // CERES_NO_CUDA
#ifdef CERES_USE_CXX_THREADS
#include "ceres/thread_pool.h"
#endif // CERES_USE_CXX_THREADS
namespace ceres {
namespace internal {
namespace ceres::internal {
class CERES_NO_EXPORT ContextImpl final : public Context {
public:
@@ -67,30 +65,82 @@ class CERES_NO_EXPORT ContextImpl final : public Context {
// defined by the hardware. Otherwise this call is a no-op.
void EnsureMinimumThreads(int num_threads);
#ifdef CERES_USE_CXX_THREADS
ThreadPool thread_pool;
#endif // CERES_USE_CXX_THREADS
#ifndef CERES_NO_CUDA
// Initializes the cuSolverDN context, creates an asynchronous stream, and
// associates the stream with cuSolverDN. Returns true iff initialization was
// successful, else it returns false and a human-readable error message is
// returned.
bool InitCUDA(std::string* message);
// Note on Ceres' use of CUDA Devices on multi-GPU systems:
// 1. On a multi-GPU system, if nothing special is done, the "default" CUDA
// device will be used, which is device 0.
// 2. If the user masks out GPUs using the CUDA_VISIBLE_DEVICES environment
// variable, Ceres will still use device 0 visible to the program, but
// device 0 will be the first GPU indicated in the environment variable.
// 3. If the user explicitly selects a GPU in the host process before calling
// Ceres, Ceres will use that GPU.
// Note on Ceres' use of CUDA Streams:
// Most operations on the GPU are performed using a single stream. In
// those cases DefaultStream() should be used. This ensures that operations
// are stream-ordered, and can overlap with CPU processing with no
// additional effort.
//
// a. Single-stream workloads
// - Only use default stream
// - Return control to the callee without synchronization whenever possible
// - Stream synchronization occurs only after GPU to CPU transfers, and is
// handled by CudaBuffer
//
// b. Multi-stream workloads
// Multi-stream workloads are more restricted, in order to make it harder
// to introduce race conditions.
// - Should always synchronize the default stream on entry
// - Should always synchronize all utilized streams on exit
// - Should not make any assumptions on one of streams_[] being default
//
// With those rules in place
// - All single-stream asynchronous workloads are serialized using default
// stream
// - Multi-stream workloads always wait for single-stream workloads to
// finish and leave no running computations on exit.
// This slightly penalizes multi-stream workloads, but makes it easier to
// avoid race conditions when a multi-stream workload depends on the results
// of any preceding GPU computations.
// Initializes cuBLAS, cuSOLVER, and cuSPARSE contexts, creates an
// asynchronous CUDA stream, and associates the stream with the contexts.
// Returns true iff initialization was successful, else it returns false and a
// human-readable error message is returned.
bool InitCuda(std::string* message);
void TearDown();
inline bool IsCudaInitialized() const { return is_cuda_initialized_; }
// Returns a human-readable string describing the capabilities of the current
// CUDA device. CudaConfigAsString can only be called after InitCuda has been
// called.
std::string CudaConfigAsString() const;
// Returns the number of bytes of available global memory on the current CUDA
// device. If it is called before InitCuda, it returns 0.
size_t GpuMemoryAvailable() const;
// Handle to the cuSOLVER context.
cusolverDnHandle_t cusolver_handle_ = nullptr;
// Handle to cuBLAS context.
cublasHandle_t cublas_handle_ = nullptr;
// CUDA device stream.
cudaStream_t stream_ = nullptr;
// Indicates whether all the CUDA resources have been initialized.
bool cuda_initialized_ = false;
// Default stream.
// Kernel invocations and memory copies on this stream can be left without
// synchronization.
cudaStream_t DefaultStream() { return streams_[0]; }
static constexpr int kNumCudaStreams = 2;
cudaStream_t streams_[kNumCudaStreams] = {0};
cusparseHandle_t cusparse_handle_ = nullptr;
bool is_cuda_initialized_ = false;
int gpu_device_id_in_use_ = -1;
cudaDeviceProp gpu_device_properties_;
bool is_cuda_memory_pools_supported_ = false;
int cuda_version_major_ = 0;
int cuda_version_minor_ = 0;
#endif // CERES_NO_CUDA
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,8 +32,11 @@
#include <algorithm>
#include <iterator>
#include <map>
#include <memory>
#include <numeric>
#include <set>
#include <string>
#include <vector>
#include "ceres/evaluator.h"
@@ -49,15 +52,7 @@
#include "ceres/trust_region_minimizer.h"
#include "ceres/trust_region_strategy.h"
namespace ceres {
namespace internal {
using std::map;
using std::max;
using std::min;
using std::set;
using std::string;
using std::vector;
namespace ceres::internal {
CoordinateDescentMinimizer::CoordinateDescentMinimizer(ContextImpl* context)
: context_(context) {
@@ -70,15 +65,19 @@ bool CoordinateDescentMinimizer::Init(
const Program& program,
const ProblemImpl::ParameterMap& parameter_map,
const ParameterBlockOrdering& ordering,
string* error) {
std::string* /*error*/) {
parameter_blocks_.clear();
independent_set_offsets_.clear();
independent_set_offsets_.push_back(0);
// Serialize the OrderedGroups into a vector of parameter block
// offsets for parallel access.
map<ParameterBlock*, int> parameter_block_index;
map<int, set<double*>> group_to_elements = ordering.group_to_elements();
// TODO(sameeragarwal): Investigate if parameter_block_index should be an
// ordered or an unordered container.
std::map<ParameterBlock*, int> parameter_block_index;
std::map<int, std::set<double*>> group_to_elements =
ordering.group_to_elements();
for (const auto& g_t_e : group_to_elements) {
const auto& elements = g_t_e.second;
for (double* parameter_block : elements) {
@@ -93,7 +92,8 @@ bool CoordinateDescentMinimizer::Init(
// The ordering does not have to contain all parameter blocks, so
// assign zero offsets/empty independent sets to these parameter
// blocks.
const vector<ParameterBlock*>& parameter_blocks = program.parameter_blocks();
const std::vector<ParameterBlock*>& parameter_blocks =
program.parameter_blocks();
for (auto* parameter_block : parameter_blocks) {
if (!ordering.IsMember(parameter_block->mutable_user_state())) {
parameter_blocks_.push_back(parameter_block);
@@ -104,7 +104,8 @@ bool CoordinateDescentMinimizer::Init(
// Compute the set of residual blocks that depend on each parameter
// block.
residual_blocks_.resize(parameter_block_index.size());
const vector<ResidualBlock*>& residual_blocks = program.residual_blocks();
const std::vector<ResidualBlock*>& residual_blocks =
program.residual_blocks();
for (auto* residual_block : residual_blocks) {
const int num_parameter_blocks = residual_block->NumParameterBlocks();
for (int j = 0; j < num_parameter_blocks; ++j) {
@@ -126,7 +127,7 @@ bool CoordinateDescentMinimizer::Init(
void CoordinateDescentMinimizer::Minimize(const Minimizer::Options& options,
double* parameters,
Solver::Summary* summary) {
Solver::Summary* /*summary*/) {
// Set the state and mark all parameter blocks constant.
for (auto* parameter_block : parameter_blocks_) {
parameter_block->SetState(parameters + parameter_block->state_offset());
@@ -135,8 +136,6 @@ void CoordinateDescentMinimizer::Minimize(const Minimizer::Options& options,
std::vector<std::unique_ptr<LinearSolver>> linear_solvers(
options.num_threads);
// std::unique_ptr<LinearSolver*[]> linear_solvers(
// new LinearSolver*[options.num_threads]);
LinearSolver::Options linear_solver_options;
linear_solver_options.type = DENSE_QR;
@@ -155,9 +154,9 @@ void CoordinateDescentMinimizer::Minimize(const Minimizer::Options& options,
}
const int num_inner_iteration_threads =
min(options.num_threads, num_problems);
std::min(options.num_threads, num_problems);
evaluator_options_.num_threads =
max(1, options.num_threads / num_inner_iteration_threads);
std::max(1, options.num_threads / num_inner_iteration_threads);
// The parameter blocks in each independent set can be optimized
// in parallel, since they do not co-occur in any residual block.
@@ -170,9 +169,11 @@ void CoordinateDescentMinimizer::Minimize(const Minimizer::Options& options,
ParameterBlock* parameter_block = parameter_blocks_[j];
const int old_index = parameter_block->index();
const int old_delta_offset = parameter_block->delta_offset();
const int old_state_offset = parameter_block->state_offset();
parameter_block->SetVarying();
parameter_block->set_index(0);
parameter_block->set_delta_offset(0);
parameter_block->set_state_offset(0);
Program inner_program;
inner_program.mutable_parameter_blocks()->push_back(parameter_block);
@@ -189,11 +190,12 @@ void CoordinateDescentMinimizer::Minimize(const Minimizer::Options& options,
Solver::Summary inner_summary;
Solve(&inner_program,
linear_solvers[thread_id].get(),
parameters + parameter_block->state_offset(),
parameters + old_state_offset,
&inner_summary);
parameter_block->set_index(old_index);
parameter_block->set_delta_offset(old_delta_offset);
parameter_block->set_state_offset(old_state_offset);
parameter_block->SetState(parameters +
parameter_block->state_offset());
parameter_block->SetConstant();
@@ -203,10 +205,6 @@ void CoordinateDescentMinimizer::Minimize(const Minimizer::Options& options,
for (auto* parameter_block : parameter_blocks_) {
parameter_block->SetVarying();
}
// for (int i = 0; i < options.num_threads; ++i) {
// delete linear_solvers[i];
//}
}
// Solve the optimization problem for one parameter block.
@@ -218,7 +216,7 @@ void CoordinateDescentMinimizer::Solve(Program* program,
summary->initial_cost = 0.0;
summary->fixed_cost = 0.0;
summary->final_cost = 0.0;
string error;
std::string error;
Minimizer::Options minimizer_options;
minimizer_options.evaluator =
@@ -241,8 +239,10 @@ void CoordinateDescentMinimizer::Solve(Program* program,
bool CoordinateDescentMinimizer::IsOrderingValid(
const Program& program,
const ParameterBlockOrdering& ordering,
string* message) {
const map<int, set<double*>>& group_to_elements =
std::string* message) {
// TODO(sameeragarwal): Investigate if this should be an ordered or an
// unordered group.
const std::map<int, std::set<double*>>& group_to_elements =
ordering.group_to_elements();
// Verify that each group is an independent set
@@ -270,5 +270,4 @@ CoordinateDescentMinimizer::CreateOrdering(const Program& program) {
return ordering;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
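Why independent sets make the parallel loop above safe: two parameter blocks can be optimized simultaneously only if no residual block touches both. A toy greedy coloring of that co-occurrence graph illustrates the idea (purely a sketch; Residual and ColorBlocks are hypothetical names, and Ceres computes its own ordering in CreateOrdering):

#include <vector>

struct Residual { int block_a; int block_b; };  // hypothetical pair residual

// Each color class is one independent set: blocks of equal color share no
// residual and can be optimized in parallel.
std::vector<int> ColorBlocks(int num_blocks,
                             const std::vector<Residual>& residuals) {
  std::vector<std::vector<int>> adjacency(num_blocks);
  for (const Residual& r : residuals) {
    adjacency[r.block_a].push_back(r.block_b);
    adjacency[r.block_b].push_back(r.block_a);
  }
  std::vector<int> color(num_blocks, -1);
  for (int b = 0; b < num_blocks; ++b) {
    std::vector<bool> used(num_blocks + 1, false);
    for (int neighbor : adjacency[b]) {
      if (color[neighbor] >= 0) used[color[neighbor]] = true;
    }
    int c = 0;
    while (used[c]) ++c;  // smallest color unused by any neighbor
    color[b] = c;
  }
  return color;
}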

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -31,6 +31,7 @@
#ifndef CERES_INTERNAL_COORDINATE_DESCENT_MINIMIZER_H_
#define CERES_INTERNAL_COORDINATE_DESCENT_MINIMIZER_H_
#include <memory>
#include <string>
#include <vector>
@@ -40,8 +41,7 @@
#include "ceres/problem_impl.h"
#include "ceres/solver.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class Program;
class LinearSolver;
@@ -103,7 +103,6 @@ class CERES_NO_EXPORT CoordinateDescentMinimizer final : public Minimizer {
ContextImpl* context_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_COORDINATE_DESCENT_MINIMIZER_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -36,8 +36,7 @@
#include "ceres/internal/eigen.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
Corrector::Corrector(const double sq_norm, const double rho[3]) {
CHECK_GE(sq_norm, 0.0);
@@ -88,7 +87,7 @@ Corrector::Corrector(const double sq_norm, const double rho[3]) {
// We now require that the first derivative of the loss function be
// positive only if the second derivative is positive. This is
// because when the second derivative is non-positive, we do not use
// the second order correction suggested by BANS and instead use a
// the second order correction suggested by BAMS and instead use a
// simpler first order strategy which does not use a division by the
// gradient of the loss function.
CHECK_GT(rho[1], 0.0);
@@ -112,7 +111,7 @@ Corrector::Corrector(const double sq_norm, const double rho[3]) {
void Corrector::CorrectResiduals(const int num_rows, double* residuals) {
DCHECK(residuals != nullptr);
// Equation 11 in BANS.
// Equation 11 in BAMS.
VectorRef(residuals, num_rows) *= residual_scaling_;
}
@@ -129,7 +128,7 @@ void Corrector::CorrectJacobian(const int num_rows,
return;
}
// Equation 11 in BANS.
// Equation 11 in BAMS.
//
// J = sqrt(rho) * (J - alpha^2 r * r' J)
//
@@ -155,5 +154,4 @@ void Corrector::CorrectJacobian(const int num_rows,
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
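For cross-checking against the BAMS paper, a dense Eigen restatement of the two corrections, assuming sqrt_rho1, alpha_sq_norm, and residual_scaling have already been computed by the constructor (Ceres applies the same math to sparse Jacobians). The Jacobian must be corrected with the still-uncorrected residuals, hence the ordering:

#include <Eigen/Dense>

void CorrectDense(double sqrt_rho1, double alpha_sq_norm,
                  double residual_scaling,
                  Eigen::VectorXd& residuals, Eigen::MatrixXd& jacobian) {
  // Equation 11 in BAMS: J <- sqrt(rho') * (J - alpha_sq_norm * r r' J).
  jacobian = sqrt_rho1 * (jacobian -
                          alpha_sq_norm * residuals *
                              (residuals.transpose() * jacobian));
  // Then scale the residuals, as CorrectResiduals does above.
  residuals *= residual_scaling;
}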

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@
//
// Class definition for the object that is responsible for applying a
// second order correction to the Gauss-Newton based on the ideas in
// BANS by Triggs et al.
// BAMS by Triggs et al.
#ifndef CERES_INTERNAL_CORRECTOR_H_
#define CERES_INTERNAL_CORRECTOR_H_
@@ -38,8 +38,7 @@
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Corrector is responsible for applying the second order correction
// to the residual and jacobian of a least squares problem based on a
@@ -48,7 +47,7 @@ namespace internal {
// The key idea here is to look at the expressions for the robustified
// gauss newton approximation and then take its square root to get the
// corresponding corrections to the residual and jacobian. For the
// full expressions see Eq. 10 and 11 in BANS by Triggs et al.
// full expressions see Eq. 10 and 11 in BAMS by Triggs et al.
class CERES_NO_EXPORT Corrector {
public:
// The constructor takes the squared norm, the value, the first and
@@ -87,8 +86,7 @@ class CERES_NO_EXPORT Corrector {
double residual_scaling_;
double alpha_sq_norm_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -39,9 +39,6 @@
namespace ceres {
using std::pair;
using std::vector;
Covariance::Covariance(const Covariance::Options& options) {
impl_ = std::make_unique<internal::CovarianceImpl>(options);
}
@@ -49,14 +46,15 @@ Covariance::Covariance(const Covariance::Options& options) {
Covariance::~Covariance() = default;
bool Covariance::Compute(
const vector<pair<const double*, const double*>>& covariance_blocks,
const std::vector<std::pair<const double*, const double*>>&
covariance_blocks,
Problem* problem) {
return impl_->Compute(covariance_blocks, problem->impl_.get());
return impl_->Compute(covariance_blocks, problem->mutable_impl());
}
bool Covariance::Compute(const vector<const double*>& parameter_blocks,
bool Covariance::Compute(const std::vector<const double*>& parameter_blocks,
Problem* problem) {
return impl_->Compute(parameter_blocks, problem->impl_.get());
return impl_->Compute(parameter_blocks, problem->mutable_impl());
}
bool Covariance::GetCovarianceBlock(const double* parameter_block1,
@@ -79,7 +77,7 @@ bool Covariance::GetCovarianceBlockInTangentSpace(
}
bool Covariance::GetCovarianceMatrix(
const vector<const double*>& parameter_blocks,
const std::vector<const double*>& parameter_blocks,
double* covariance_matrix) const {
return impl_->GetCovarianceMatrixInTangentOrAmbientSpace(parameter_blocks,
true, // ambient

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -57,24 +57,12 @@
#include "ceres/wall_time.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::swap;
namespace ceres::internal {
using CovarianceBlocks = std::vector<std::pair<const double*, const double*>>;
CovarianceImpl::CovarianceImpl(const Covariance::Options& options)
: options_(options), is_computed_(false), is_valid_(false) {
#ifdef CERES_NO_THREADS
if (options_.num_threads > 1) {
LOG(WARNING) << "No threading support is compiled into this binary; "
<< "only options.num_threads = 1 is supported. Switching "
<< "to single threaded mode.";
options_.num_threads = 1;
}
#endif
evaluate_options_.num_threads = options_.num_threads;
evaluate_options_.apply_loss_function = options_.apply_loss_function;
}
@@ -176,7 +164,7 @@ bool CovarianceImpl::GetCovarianceBlockInTangentOrAmbientSpace(
const double* parameter_block2 = original_parameter_block2;
const bool transpose = parameter_block1 > parameter_block2;
if (transpose) {
swap(parameter_block1, parameter_block2);
std::swap(parameter_block1, parameter_block2);
}
// Find where in the covariance matrix the block is located.
@@ -190,7 +178,7 @@ bool CovarianceImpl::GetCovarianceBlockInTangentOrAmbientSpace(
const int* cols_begin = cols + rows[row_begin];
// The only part that requires work is walking the compressed column
// vector to determine where the set of columns correspnding to the
// vector to determine where the set of columns corresponding to the
// covariance block begin.
int offset = 0;
while (cols_begin[offset] != col_begin && offset < row_size) {
@@ -322,9 +310,8 @@ bool CovarianceImpl::GetCovarianceMatrixInTangentOrAmbientSpace(
// Assemble the blocks in the covariance matrix.
MatrixRef covariance(covariance_matrix, covariance_size, covariance_size);
const int num_threads = options_.num_threads;
std::unique_ptr<double[]> workspace(
new double[num_threads * max_covariance_block_size *
max_covariance_block_size]);
auto workspace = std::make_unique<double[]>(
num_threads * max_covariance_block_size * max_covariance_block_size);
bool success = true;
@@ -481,14 +468,12 @@ bool CovarianceImpl::ComputeCovarianceSparsity(
// Iterate over the covariance blocks contained in this row block
// and count the number of columns in this row block.
int num_col_blocks = 0;
int num_columns = 0;
for (int j = i; j < covariance_blocks.size(); ++j, ++num_col_blocks) {
const std::pair<const double*, const double*>& block_pair =
covariance_blocks[j];
if (block_pair.first != row_block) {
break;
}
num_columns += problem->ParameterBlockTangentSize(block_pair.second);
}
// Fill out all the compressed rows for this parameter block.
@@ -598,9 +583,9 @@ bool CovarianceImpl::ComputeCovarianceValuesUsingSuiteSparseQR() {
cholmod_jacobian.ncol = num_cols;
cholmod_jacobian.nzmax = num_nonzeros;
cholmod_jacobian.nz = nullptr;
cholmod_jacobian.p = reinterpret_cast<void*>(&transpose_rows[0]);
cholmod_jacobian.i = reinterpret_cast<void*>(&transpose_cols[0]);
cholmod_jacobian.x = reinterpret_cast<void*>(&transpose_values[0]);
cholmod_jacobian.p = reinterpret_cast<void*>(transpose_rows.data());
cholmod_jacobian.i = reinterpret_cast<void*>(transpose_cols.data());
cholmod_jacobian.x = reinterpret_cast<void*>(transpose_values.data());
cholmod_jacobian.z = nullptr;
cholmod_jacobian.stype = 0; // Matrix is not symmetric.
cholmod_jacobian.itype = CHOLMOD_LONG;
@@ -628,13 +613,15 @@ bool CovarianceImpl::ComputeCovarianceValuesUsingSuiteSparseQR() {
// more efficient, both in runtime as well as the quality of
// ordering computed. So, it maybe worth doing that analysis
// separately.
const SuiteSparse_long rank = SuiteSparseQR<double>(SPQR_ORDERING_BESTAMD,
SPQR_DEFAULT_TOL,
cholmod_jacobian.ncol,
&cholmod_jacobian,
&R,
&permutation,
&cc);
const SuiteSparse_long rank = SuiteSparseQR<double>(
SPQR_ORDERING_BESTAMD,
options_.column_pivot_threshold < 0 ? SPQR_DEFAULT_TOL
: options_.column_pivot_threshold,
static_cast<int64_t>(cholmod_jacobian.ncol),
&cholmod_jacobian,
&R,
&permutation,
&cc);
event_logger.AddEvent("Numeric Factorization");
if (R == nullptr) {
LOG(ERROR) << "Something is wrong. SuiteSparseQR returned R = nullptr.";
@@ -678,7 +665,7 @@ bool CovarianceImpl::ComputeCovarianceValuesUsingSuiteSparseQR() {
// Since the covariance matrix is symmetric, the i^th row and column
// are equal.
const int num_threads = options_.num_threads;
std::unique_ptr<double[]> workspace(new double[num_threads * num_cols]);
auto workspace = std::make_unique<double[]>(num_threads * num_cols);
problem_->context()->EnsureMinimumThreads(num_threads);
ParallelFor(
@@ -830,19 +817,23 @@ bool CovarianceImpl::ComputeCovarianceValuesUsingEigenSparseQR() {
jacobian.values.data());
event_logger.AddEvent("ConvertToSparseMatrix");
Eigen::SparseQR<EigenSparseMatrix, Eigen::COLAMDOrdering<int>> qr_solver(
sparse_jacobian);
Eigen::SparseQR<EigenSparseMatrix, Eigen::COLAMDOrdering<int>> qr;
if (options_.column_pivot_threshold > 0) {
qr.setPivotThreshold(options_.column_pivot_threshold);
}
qr.compute(sparse_jacobian);
event_logger.AddEvent("QRDecomposition");
if (qr_solver.info() != Eigen::Success) {
if (qr.info() != Eigen::Success) {
LOG(ERROR) << "Eigen::SparseQR decomposition failed.";
return false;
}
if (qr_solver.rank() < jacobian.num_cols) {
if (qr.rank() < jacobian.num_cols) {
LOG(ERROR) << "Jacobian matrix is rank deficient. "
<< "Number of columns: " << jacobian.num_cols
<< " rank: " << qr_solver.rank();
<< " rank: " << qr.rank();
return false;
}
@@ -852,7 +843,7 @@ bool CovarianceImpl::ComputeCovarianceValuesUsingEigenSparseQR() {
// Compute the inverse column permutation used by QR factorization.
Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic> inverse_permutation =
qr_solver.colsPermutation().inverse();
qr.colsPermutation().inverse();
// The following loop exploits the fact that the i^th column of A^{-1}
// is given by the solution to the linear system
@@ -865,7 +856,7 @@ bool CovarianceImpl::ComputeCovarianceValuesUsingEigenSparseQR() {
// are equal.
const int num_cols = jacobian.num_cols;
const int num_threads = options_.num_threads;
std::unique_ptr<double[]> workspace(new double[num_threads * num_cols]);
auto workspace = std::make_unique<double[]>(num_threads * num_cols);
problem_->context()->EnsureMinimumThreads(num_threads);
ParallelFor(
@@ -875,9 +866,9 @@ bool CovarianceImpl::ComputeCovarianceValuesUsingEigenSparseQR() {
if (row_end != row_begin) {
double* solution = workspace.get() + thread_id * num_cols;
SolveRTRWithSparseRHS<int>(num_cols,
qr_solver.matrixR().innerIndexPtr(),
qr_solver.matrixR().outerIndexPtr(),
&qr_solver.matrixR().data().value(0),
qr.matrixR().innerIndexPtr(),
qr.matrixR().outerIndexPtr(),
&qr.matrixR().data().value(0),
inverse_permutation.indices().coeff(r),
solution);
@@ -895,5 +886,4 @@ bool CovarianceImpl::ComputeCovarianceValuesUsingEigenSparseQR() {
return true;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
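The loop above relies on the i-th column of (J'J)^{-1} being the solution of R'R x = e_i, where J = QR. A dense, unpermuted Eigen sketch of that idea (the real code solves against the sparse R and applies the column permutation on both sides):

#include <Eigen/Dense>

Eigen::MatrixXd CovarianceFromR(const Eigen::MatrixXd& R) {
  const int n = R.cols();
  Eigen::MatrixXd covariance(n, n);
  for (int i = 0; i < n; ++i) {
    const Eigen::VectorXd e_i = Eigen::VectorXd::Unit(n, i);
    // Solve R' y = e_i (forward), then R x = y (backward); x is the i-th
    // column of (R'R)^{-1} = (J'J)^{-1}.
    const Eigen::VectorXd y =
        R.transpose().triangularView<Eigen::Lower>().solve(e_i);
    covariance.col(i) = R.triangularView<Eigen::Upper>().solve(y);
  }
  return covariance;
}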

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -43,8 +43,7 @@
#include "ceres/problem_impl.h"
#include "ceres/suitesparse.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class CompressedRowSparseMatrix;
@@ -96,8 +95,7 @@ class CERES_NO_EXPORT CovarianceImpl {
std::unique_ptr<CompressedRowSparseMatrix> covariance_matrix_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -0,0 +1,103 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#include "ceres/cuda_block_sparse_crs_view.h"
#ifndef CERES_NO_CUDA
#include "ceres/cuda_kernels_bsm_to_crs.h"
namespace ceres::internal {
CudaBlockSparseCRSView::CudaBlockSparseCRSView(const BlockSparseMatrix& bsm,
ContextImpl* context)
: context_(context) {
block_structure_ = std::make_unique<CudaBlockSparseStructure>(
*bsm.block_structure(), context);
CudaBuffer<int32_t> rows(context, bsm.num_rows() + 1);
CudaBuffer<int32_t> cols(context, bsm.num_nonzeros());
FillCRSStructure(block_structure_->num_row_blocks(),
bsm.num_rows(),
block_structure_->first_cell_in_row_block(),
block_structure_->cells(),
block_structure_->row_blocks(),
block_structure_->col_blocks(),
rows.data(),
cols.data(),
context->DefaultStream(),
context->is_cuda_memory_pools_supported_);
is_crs_compatible_ = block_structure_->IsCrsCompatible();
// If the matrix is CRS-compatible, we can drop the block structure and
// don't need streamed_buffer_.
if (is_crs_compatible_) {
VLOG(3) << "Block-sparse matrix is compatible with CRS, discarding "
"block-structure";
block_structure_ = nullptr;
} else {
streamed_buffer_ = std::make_unique<CudaStreamedBuffer<double>>(
context_, kMaxTemporaryArraySize);
}
crs_matrix_ = std::make_unique<CudaSparseMatrix>(
bsm.num_cols(), std::move(rows), std::move(cols), context);
UpdateValues(bsm);
}
void CudaBlockSparseCRSView::UpdateValues(const BlockSparseMatrix& bsm) {
if (is_crs_compatible_) {
// Values of CRS-compatible matrices can be copied as-is
CHECK_EQ(cudaSuccess,
cudaMemcpyAsync(crs_matrix_->mutable_values(),
bsm.values(),
bsm.num_nonzeros() * sizeof(double),
cudaMemcpyHostToDevice,
context_->DefaultStream()));
return;
}
streamed_buffer_->CopyToGpu(
bsm.values(),
bsm.num_nonzeros(),
[bs = block_structure_.get(), crs = crs_matrix_.get()](
const double* values, int num_values, int offset, auto stream) {
PermuteToCRS(offset,
num_values,
bs->num_row_blocks(),
bs->first_cell_in_row_block(),
bs->cells(),
bs->row_blocks(),
bs->col_blocks(),
crs->rows(),
values,
crs->mutable_values(),
stream);
});
}
} // namespace ceres::internal
#endif // CERES_NO_CUDA

View File

@@ -0,0 +1,108 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
//
#ifndef CERES_INTERNAL_CUDA_BLOCK_SPARSE_CRS_VIEW_H_
#define CERES_INTERNAL_CUDA_BLOCK_SPARSE_CRS_VIEW_H_
#include "ceres/internal/config.h"
#ifndef CERES_NO_CUDA
#include <memory>
#include "ceres/block_sparse_matrix.h"
#include "ceres/cuda_block_structure.h"
#include "ceres/cuda_buffer.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_streamed_buffer.h"
namespace ceres::internal {
// We use the cuSPARSE library for SpMV operations. However, it does not
// support a block-sparse format with varying block sizes. Thus, we perform
// the following operations in order to compute products of block-sparse
// matrices and dense vectors on the GPU:
// - Once per block-sparse structure update:
// - Compute CRS structure from block-sparse structure and check if values of
// block-sparse matrix would have the same order as values of CRS matrix
// - Once per block-sparse values update:
// - Update values in CRS matrix with values of block-sparse matrix
//
// Only block-sparse matrices with sequential order of cells are supported.
//
// UpdateValues method updates values:
// - In a single host-to-device copy for matrices with CRS-compatible value
// layout
// - Simultaneously transferring and permuting values using CudaStreamedBuffer
// otherwise
class CERES_NO_EXPORT CudaBlockSparseCRSView {
public:
// Initializes the internal CRS matrix using the structure and values of the
// block-sparse matrix. For block-sparse matrices whose value layout differs
// from CRS, the block-sparse structure is stored as well.
CudaBlockSparseCRSView(const BlockSparseMatrix& bsm, ContextImpl* context);
const CudaSparseMatrix* crs_matrix() const { return crs_matrix_.get(); }
CudaSparseMatrix* mutable_crs_matrix() { return crs_matrix_.get(); }
// Update values of crs_matrix_ using values of block-sparse matrix.
// Assumes that bsm has the same block-sparse structure as matrix that was
// used for construction.
void UpdateValues(const BlockSparseMatrix& bsm);
// Returns true if block-sparse matrix had CRS-compatible value layout
bool IsCrsCompatible() const { return is_crs_compatible_; }
void LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const {
crs_matrix()->LeftMultiplyAndAccumulate(x, y);
}
void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const {
crs_matrix()->RightMultiplyAndAccumulate(x, y);
}
private:
// The value permutation kernel performs a single element-wise operation per
// thread, thus performing the permutation in chunks of 8 MiB (1M doubles) of
// block-sparse values seems reasonable.
static constexpr int kMaxTemporaryArraySize = 1 * 1024 * 1024;
std::unique_ptr<CudaSparseMatrix> crs_matrix_;
// Only created if block-sparse matrix has non-CRS value layout
std::unique_ptr<CudaStreamedBuffer<double>> streamed_buffer_;
// Only stored if block-sparse matrix has non-CRS value layout
std::unique_ptr<CudaBlockSparseStructure> block_structure_;
bool is_crs_compatible_;
ContextImpl* context_;
};
} // namespace ceres::internal
#endif // CERES_NO_CUDA
#endif // CERES_INTERNAL_CUDA_BLOCK_SPARSE_CRS_VIEW_H_
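Putting the class to work, a hedged usage sketch (mirroring CudaBlockSparseCRSViewTest below; the helper name GpuSpMV and include paths are illustrative):

#include "ceres/cuda_block_sparse_crs_view.h"
#include "ceres/cuda_vector.h"

void GpuSpMV(const ceres::internal::BlockSparseMatrix& bsm,
             ceres::internal::ContextImpl* context) {
  // One-time: derive the CRS structure on the GPU from the block structure;
  // the constructor also uploads the initial values.
  ceres::internal::CudaBlockSparseCRSView view(bsm, context);
  ceres::internal::CudaVector x(context, bsm.num_cols());
  ceres::internal::CudaVector y(context, bsm.num_rows());
  // ... fill x, e.g. via x.CopyFromCpu(...) ...
  y.SetZero();
  view.RightMultiplyAndAccumulate(x, &y);  // y += J x on the GPU
  // Per-iteration: refresh values only; copied as-is when CRS-compatible,
  // otherwise streamed through CudaStreamedBuffer and permuted on the fly.
  view.UpdateValues(bsm);
}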

View File

@@ -0,0 +1,164 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#include "ceres/cuda_block_sparse_crs_view.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <numeric>
#ifndef CERES_NO_CUDA
namespace ceres::internal {
class CudaBlockSparseCRSViewTest : public ::testing::Test {
protected:
void SetUp() final {
std::string message;
CHECK(context_.InitCuda(&message))
<< "InitCuda() failed because: " << message;
BlockSparseMatrix::RandomMatrixOptions options;
options.num_row_blocks = 1234;
options.min_row_block_size = 1;
options.max_row_block_size = 10;
options.num_col_blocks = 567;
options.min_col_block_size = 1;
options.max_col_block_size = 10;
options.block_density = 0.2;
std::mt19937 rng;
// Block-sparse matrix with order of values different from CRS
block_sparse_non_crs_compatible_ =
BlockSparseMatrix::CreateRandomMatrix(options, rng, true);
std::iota(block_sparse_non_crs_compatible_->mutable_values(),
block_sparse_non_crs_compatible_->mutable_values() +
block_sparse_non_crs_compatible_->num_nonzeros(),
1);
options.max_row_block_size = 1;
// Block-sparse matrix with CRS order of values (row-blocks are rows)
block_sparse_crs_compatible_rows_ =
BlockSparseMatrix::CreateRandomMatrix(options, rng, true);
std::iota(block_sparse_crs_compatible_rows_->mutable_values(),
block_sparse_crs_compatible_rows_->mutable_values() +
block_sparse_crs_compatible_rows_->num_nonzeros(),
1);
// Block-sparse matrix with CRS order of values (single cell per row-block)
auto bs = std::make_unique<CompressedRowBlockStructure>(
*block_sparse_non_crs_compatible_->block_structure());
int num_nonzeros = 0;
for (auto& r : bs->rows) {
const int num_cells = r.cells.size();
if (num_cells > 1) {
std::uniform_int_distribution<int> uniform_cell(0, num_cells - 1);
const int selected_cell = uniform_cell(rng);
std::swap(r.cells[0], r.cells[selected_cell]);
r.cells.resize(1);
}
const int row_block_size = r.block.size;
for (auto& c : r.cells) {
c.position = num_nonzeros;
const int col_block_size = bs->cols[c.block_id].size;
num_nonzeros += col_block_size * row_block_size;
}
}
block_sparse_crs_compatible_single_cell_ =
std::make_unique<BlockSparseMatrix>(bs.release());
std::iota(block_sparse_crs_compatible_single_cell_->mutable_values(),
block_sparse_crs_compatible_single_cell_->mutable_values() +
block_sparse_crs_compatible_single_cell_->num_nonzeros(),
1);
}
void Compare(const BlockSparseMatrix& bsm, const CudaSparseMatrix& csm) {
ASSERT_EQ(csm.num_cols(), bsm.num_cols());
ASSERT_EQ(csm.num_rows(), bsm.num_rows());
ASSERT_EQ(csm.num_nonzeros(), bsm.num_nonzeros());
const int num_rows = bsm.num_rows();
const int num_cols = bsm.num_cols();
Vector x(num_cols);
Vector y(num_rows);
CudaVector x_cuda(&context_, num_cols);
CudaVector y_cuda(&context_, num_rows);
Vector y_cuda_host(num_rows);
for (int i = 0; i < num_cols; ++i) {
x.setZero();
y.setZero();
y_cuda.SetZero();
x[i] = 1.;
x_cuda.CopyFromCpu(x);
csm.RightMultiplyAndAccumulate(x_cuda, &y_cuda);
bsm.RightMultiplyAndAccumulate(
x.data(), y.data(), &context_, std::thread::hardware_concurrency());
y_cuda.CopyTo(&y_cuda_host);
// There will be at most 1 non-zero product per row, and the integer-valued
// products are exactly representable in doubles, so we expect an exact match.
EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
}
}
std::unique_ptr<BlockSparseMatrix> block_sparse_non_crs_compatible_;
std::unique_ptr<BlockSparseMatrix> block_sparse_crs_compatible_rows_;
std::unique_ptr<BlockSparseMatrix> block_sparse_crs_compatible_single_cell_;
ContextImpl context_;
};
TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValuesNonCompatible) {
auto view =
CudaBlockSparseCRSView(*block_sparse_non_crs_compatible_, &context_);
ASSERT_EQ(view.IsCrsCompatible(), false);
auto matrix = view.crs_matrix();
Compare(*block_sparse_non_crs_compatible_, *matrix);
}
TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValuesCompatibleRows) {
auto view =
CudaBlockSparseCRSView(*block_sparse_crs_compatible_rows_, &context_);
ASSERT_EQ(view.IsCrsCompatible(), true);
auto matrix = view.crs_matrix();
Compare(*block_sparse_crs_compatible_rows_, *matrix);
}
TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValuesCompatibleSingleCell) {
auto view = CudaBlockSparseCRSView(*block_sparse_crs_compatible_single_cell_,
&context_);
ASSERT_EQ(view.IsCrsCompatible(), true);
auto matrix = view.crs_matrix();
Compare(*block_sparse_crs_compatible_single_cell_, *matrix);
}
} // namespace ceres::internal
#endif // CERES_NO_CUDA

View File

@@ -0,0 +1,234 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#include "ceres/cuda_block_structure.h"
#ifndef CERES_NO_CUDA
namespace ceres::internal {
namespace {
// Dimension of a sorted array of blocks
inline int Dimension(const std::vector<Block>& blocks) {
if (blocks.empty()) {
return 0;
}
const auto& last = blocks.back();
return last.size + last.position;
}
} // namespace
CudaBlockSparseStructure::CudaBlockSparseStructure(
const CompressedRowBlockStructure& block_structure, ContextImpl* context)
: CudaBlockSparseStructure(block_structure, 0, context) {}
CudaBlockSparseStructure::CudaBlockSparseStructure(
const CompressedRowBlockStructure& block_structure,
const int num_col_blocks_e,
ContextImpl* context)
: first_cell_in_row_block_(context),
value_offset_row_block_f_(context),
cells_(context),
row_blocks_(context),
col_blocks_(context) {
// Row blocks extracted from CompressedRowBlockStructure::rows
std::vector<Block> row_blocks;
// Column blocks can be reused as-is
const auto& col_blocks = block_structure.cols;
// Row block offset is an index of the first cell corresponding to row block
std::vector<int> first_cell_in_row_block;
// Offset of the first value in the first non-empty row-block of F sub-matrix
std::vector<int> value_offset_row_block_f;
// Flat array of all cells from all row-blocks
std::vector<Cell> cells;
int f_values_offset = -1;
num_nonzeros_e_ = 0;
is_crs_compatible_ = true;
num_row_blocks_ = block_structure.rows.size();
num_col_blocks_ = col_blocks.size();
row_blocks.reserve(num_row_blocks_);
first_cell_in_row_block.reserve(num_row_blocks_ + 1);
value_offset_row_block_f.reserve(num_row_blocks_ + 1);
num_nonzeros_ = 0;
// Block-sparse matrices arising from block-jacobian writer are expected to
// have sequential layout (for partitioned matrices - it is expected that both
// E and F sub-matrices have sequential layout).
bool sequential_layout = true;
int row_block_id = 0;
num_row_blocks_e_ = 0;
for (; row_block_id < num_row_blocks_; ++row_block_id) {
const auto& r = block_structure.rows[row_block_id];
const int row_block_size = r.block.size;
const int num_cells = r.cells.size();
if (num_col_blocks_e == 0 || r.cells.size() == 0 ||
r.cells[0].block_id >= num_col_blocks_e) {
break;
}
num_row_blocks_e_ = row_block_id + 1;
// In the E sub-matrix there is exactly one E cell per row.
// Since E cells are stored separately from F cells, CRS-compatibility of
// the F sub-matrix only breaks if there are more than 2 cells in a row
// (that is, more than 1 cell in the F sub-matrix).
if (num_cells > 2 && row_block_size > 1) {
is_crs_compatible_ = false;
}
row_blocks.emplace_back(r.block);
first_cell_in_row_block.push_back(cells.size());
for (int cell_id = 0; cell_id < num_cells; ++cell_id) {
const auto& c = r.cells[cell_id];
const int col_block_size = col_blocks[c.block_id].size;
const int cell_size = col_block_size * row_block_size;
cells.push_back(c);
if (cell_id == 0) {
DCHECK(c.position == num_nonzeros_e_);
num_nonzeros_e_ += cell_size;
} else {
if (f_values_offset == -1) {
num_nonzeros_ = c.position;
f_values_offset = c.position;
}
sequential_layout &= c.position == num_nonzeros_;
num_nonzeros_ += cell_size;
if (cell_id == 1) {
// Correct value_offset_row_block_f for empty row-blocks of F
// preceding this one
for (auto it = value_offset_row_block_f.rbegin();
it != value_offset_row_block_f.rend();
++it) {
if (*it != -1) break;
*it = c.position;
}
value_offset_row_block_f.push_back(c.position);
}
}
}
if (num_cells == 1) {
value_offset_row_block_f.push_back(-1);
}
}
for (; row_block_id < num_row_blocks_; ++row_block_id) {
const auto& r = block_structure.rows[row_block_id];
const int row_block_size = r.block.size;
const int num_cells = r.cells.size();
// After num_row_blocks_e_ row-blocks, there should be no cells in the E
// sub-matrix. Thus CRS-compatibility of the F sub-matrix breaks if there is
// more than one cell in a row-block.
if (num_cells > 1 && row_block_size > 1) {
is_crs_compatible_ = false;
}
row_blocks.emplace_back(r.block);
first_cell_in_row_block.push_back(cells.size());
if (r.cells.empty()) {
value_offset_row_block_f.push_back(-1);
} else {
for (auto it = value_offset_row_block_f.rbegin();
it != value_offset_row_block_f.rend();
++it) {
if (*it != -1) break;
*it = r.cells[0].position;
}
value_offset_row_block_f.push_back(r.cells[0].position);
}
for (const auto& c : r.cells) {
const int col_block_size = col_blocks[c.block_id].size;
const int cell_size = col_block_size * row_block_size;
cells.push_back(c);
DCHECK(c.block_id >= num_col_blocks_e);
if (f_values_offset == -1) {
num_nonzeros_ = c.position;
f_values_offset = c.position;
}
sequential_layout &= c.position == num_nonzeros_;
num_nonzeros_ += cell_size;
}
}
if (f_values_offset == -1) {
f_values_offset = num_nonzeros_e_;
num_nonzeros_ = num_nonzeros_e_;
}
// Fill non-zero offsets for the last rows of F submatrix
for (auto it = value_offset_row_block_f.rbegin();
it != value_offset_row_block_f.rend();
++it) {
if (*it != -1) break;
*it = num_nonzeros_;
}
value_offset_row_block_f.push_back(num_nonzeros_);
CHECK_EQ(num_nonzeros_e_, f_values_offset);
first_cell_in_row_block.push_back(cells.size());
num_cells_ = cells.size();
num_rows_ = Dimension(row_blocks);
num_cols_ = Dimension(col_blocks);
CHECK(sequential_layout);
if (VLOG_IS_ON(3)) {
const size_t first_cell_in_row_block_size =
first_cell_in_row_block.size() * sizeof(int);
const size_t cells_size = cells.size() * sizeof(Cell);
const size_t row_blocks_size = row_blocks.size() * sizeof(Block);
const size_t col_blocks_size = col_blocks.size() * sizeof(Block);
const size_t total_size = first_cell_in_row_block_size + cells_size +
col_blocks_size + row_blocks_size;
const double ratio =
(100. * total_size) / (num_nonzeros_ * (sizeof(int) + sizeof(double)) +
num_rows_ * sizeof(int));
VLOG(3) << "\nCudaBlockSparseStructure:\n"
"\tRow block offsets: "
<< first_cell_in_row_block_size
<< " bytes\n"
"\tColumn blocks: "
<< col_blocks_size
<< " bytes\n"
"\tRow blocks: "
<< row_blocks_size
<< " bytes\n"
"\tCells: "
<< cells_size << " bytes\n\tTotal: " << total_size
<< " bytes of GPU memory (" << ratio << "% of CRS matrix size)";
}
first_cell_in_row_block_.CopyFromCpuVector(first_cell_in_row_block);
cells_.CopyFromCpuVector(cells);
row_blocks_.CopyFromCpuVector(row_blocks);
col_blocks_.CopyFromCpuVector(col_blocks);
if (num_col_blocks_e || num_row_blocks_e_) {
value_offset_row_block_f_.CopyFromCpuVector(value_offset_row_block_f);
}
}
} // namespace ceres::internal
#endif // CERES_NO_CUDA

View File

@@ -0,0 +1,120 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#ifndef CERES_INTERNAL_CUDA_BLOCK_STRUCTURE_H_
#define CERES_INTERNAL_CUDA_BLOCK_STRUCTURE_H_
#include "ceres/internal/config.h"
#ifndef CERES_NO_CUDA
#include "ceres/block_structure.h"
#include "ceres/cuda_buffer.h"
namespace ceres::internal {
class CudaBlockStructureTest;
// This class stores a read-only block-sparse structure in gpu memory.
// Invariants are the same as those of CompressedRowBlockStructure.
// In order to simplify allocation and copying of data to the GPU, cells from
// all row-blocks are stored sequentially in a single array. The array
// first_cell_in_row_block of size num_row_blocks + 1 identifies the range of
// cells corresponding to each row-block. Cells corresponding to the i-th
// row-block are stored in the sub-array cells[first_cell_in_row_block[i]; ...
// first_cell_in_row_block[i + 1] - 1], and their order is preserved.
class CERES_NO_EXPORT CudaBlockSparseStructure {
public:
// CompressedRowBlockStructure contains a vector of CompressedLists, with
// each CompressedList containing a vector of Cells. We precompute a flat
// array of cells on the CPU and transfer it to the GPU.
CudaBlockSparseStructure(const CompressedRowBlockStructure& block_structure,
ContextImpl* context);
// In the case of partitioned matrices, the number of non-zeros in E and the
// layout of F are computed as well.
CudaBlockSparseStructure(const CompressedRowBlockStructure& block_structure,
const int num_col_blocks_e,
ContextImpl* context);
int num_rows() const { return num_rows_; }
int num_cols() const { return num_cols_; }
int num_cells() const { return num_cells_; }
int num_nonzeros() const { return num_nonzeros_; }
// When the partitioned-matrix constructor was used, returns the number of
// non-zeros in the E sub-matrix.
int num_nonzeros_e() const { return num_nonzeros_e_; }
int num_row_blocks() const { return num_row_blocks_; }
int num_row_blocks_e() const { return num_row_blocks_e_; }
int num_col_blocks() const { return num_col_blocks_; }
// Returns true if values from the block-sparse matrix (the F sub-matrix in
// the partitioned case) can be copied to the CRS matrix as-is. This is
// possible if each row-block is already stored in CRS order, i.e. it either:
// - consists of a single row, or
// - contains a single cell
bool IsCrsCompatible() const { return is_crs_compatible_; }
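// Illustrative example (editorial, not part of the upstream sources): a
// row-block of height 1 stores its values as cell0-row0, cell1-row0, ...,
// which is exactly CRS order, and a row-block with a single cell stores that
// cell's rows contiguously, which is CRS order as well. A row-block of height
// 2 with two cells, however, stores each cell contiguously while CRS
// interleaves the cells row by row, so a plain copy would be incorrect.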
// Device pointer to array of num_row_blocks + 1 indices of the first cell of
// row block
const int* first_cell_in_row_block() const {
return first_cell_in_row_block_.data();
}
// Device pointer to array of num_row_blocks + 1 indices of the first value in
// this or subsequent row-blocks of submatrix F
const int* value_offset_row_block_f() const {
return value_offset_row_block_f_.data();
}
// Device pointer to array of num_cells cells, sorted by row-block
const Cell* cells() const { return cells_.data(); }
// Device pointer to array of row blocks
const Block* row_blocks() const { return row_blocks_.data(); }
// Device pointer to array of column blocks
const Block* col_blocks() const { return col_blocks_.data(); }
private:
int num_rows_;
int num_cols_;
int num_cells_;
int num_nonzeros_;
int num_nonzeros_e_;
int num_row_blocks_;
int num_row_blocks_e_;
int num_col_blocks_;
bool is_crs_compatible_;
CudaBuffer<int> first_cell_in_row_block_;
CudaBuffer<int> value_offset_row_block_f_;
CudaBuffer<Cell> cells_;
CudaBuffer<Block> row_blocks_;
CudaBuffer<Block> col_blocks_;
friend class CudaBlockStructureTest;
};
} // namespace ceres::internal
#endif // CERES_NO_CUDA
#endif  // CERES_INTERNAL_CUDA_BLOCK_STRUCTURE_H_
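
The flattening contract described in the class comment can be pictured with a
short host-side sketch. This is an editorial illustration only, not upstream
code; `bs` stands for an arbitrary CompressedRowBlockStructure:

std::vector<int> first_cell_in_row_block = {0};
std::vector<Cell> flat_cells;
for (const auto& row_block : bs.rows) {
  // Cells of every row-block are appended in order, preserving their layout.
  flat_cells.insert(
      flat_cells.end(), row_block.cells.begin(), row_block.cells.end());
  first_cell_in_row_block.push_back(static_cast<int>(flat_cells.size()));
}
// These two arrays are what the constructor uploads into the CudaBuffers.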

View File

@@ -0,0 +1,144 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#include "ceres/internal/config.h"
#ifndef CERES_NO_CUDA
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <numeric>
#include "ceres/block_sparse_matrix.h"
#include "ceres/cuda_block_structure.h"
namespace ceres::internal {
class CudaBlockStructureTest : public ::testing::Test {
protected:
void SetUp() final {
std::string message;
CHECK(context_.InitCuda(&message))
<< "InitCuda() failed because: " << message;
BlockSparseMatrix::RandomMatrixOptions options;
options.num_row_blocks = 1234;
options.min_row_block_size = 1;
options.max_row_block_size = 10;
options.num_col_blocks = 567;
options.min_col_block_size = 1;
options.max_col_block_size = 10;
options.block_density = 0.2;
std::mt19937 rng;
A_ = BlockSparseMatrix::CreateRandomMatrix(options, rng);
std::iota(
A_->mutable_values(), A_->mutable_values() + A_->num_nonzeros(), 1);
}
std::vector<Cell> GetCells(const CudaBlockSparseStructure& structure) {
const auto& cuda_buffer = structure.cells_;
std::vector<Cell> cells(cuda_buffer.size());
cuda_buffer.CopyToCpu(cells.data(), cells.size());
return cells;
}
std::vector<Block> GetRowBlocks(const CudaBlockSparseStructure& structure) {
const auto& cuda_buffer = structure.row_blocks_;
std::vector<Block> blocks(cuda_buffer.size());
cuda_buffer.CopyToCpu(blocks.data(), blocks.size());
return blocks;
}
std::vector<Block> GetColBlocks(const CudaBlockSparseStructure& structure) {
const auto& cuda_buffer = structure.col_blocks_;
std::vector<Block> blocks(cuda_buffer.size());
cuda_buffer.CopyToCpu(blocks.data(), blocks.size());
return blocks;
}
std::vector<int> GetRowBlockOffsets(
const CudaBlockSparseStructure& structure) {
const auto& cuda_buffer = structure.first_cell_in_row_block_;
std::vector<int> first_cell_in_row_block(cuda_buffer.size());
cuda_buffer.CopyToCpu(first_cell_in_row_block.data(),
first_cell_in_row_block.size());
return first_cell_in_row_block;
}
std::unique_ptr<BlockSparseMatrix> A_;
ContextImpl context_;
};
TEST_F(CudaBlockStructureTest, StructureIdentity) {
auto block_structure = A_->block_structure();
const int num_row_blocks = block_structure->rows.size();
const int num_col_blocks = block_structure->cols.size();
CudaBlockSparseStructure cuda_block_structure(*block_structure, &context_);
ASSERT_EQ(cuda_block_structure.num_rows(), A_->num_rows());
ASSERT_EQ(cuda_block_structure.num_cols(), A_->num_cols());
ASSERT_EQ(cuda_block_structure.num_nonzeros(), A_->num_nonzeros());
ASSERT_EQ(cuda_block_structure.num_row_blocks(), num_row_blocks);
ASSERT_EQ(cuda_block_structure.num_col_blocks(), num_col_blocks);
std::vector<Block> blocks = GetColBlocks(cuda_block_structure);
ASSERT_EQ(blocks.size(), num_col_blocks);
for (int i = 0; i < num_col_blocks; ++i) {
EXPECT_EQ(block_structure->cols[i].position, blocks[i].position);
EXPECT_EQ(block_structure->cols[i].size, blocks[i].size);
}
std::vector<Cell> cells = GetCells(cuda_block_structure);
std::vector<int> first_cell_in_row_block =
GetRowBlockOffsets(cuda_block_structure);
blocks = GetRowBlocks(cuda_block_structure);
ASSERT_EQ(blocks.size(), num_row_blocks);
ASSERT_EQ(first_cell_in_row_block.size(), num_row_blocks + 1);
ASSERT_EQ(first_cell_in_row_block.back(), cells.size());
for (int i = 0; i < num_row_blocks; ++i) {
const int num_cells = block_structure->rows[i].cells.size();
EXPECT_EQ(blocks[i].position, block_structure->rows[i].block.position);
EXPECT_EQ(blocks[i].size, block_structure->rows[i].block.size);
const int first_cell = first_cell_in_row_block[i];
const int last_cell = first_cell_in_row_block[i + 1];
ASSERT_EQ(last_cell - first_cell, num_cells);
for (int j = 0; j < num_cells; ++j) {
EXPECT_EQ(cells[first_cell + j].block_id,
block_structure->rows[i].cells[j].block_id);
EXPECT_EQ(cells[first_cell + j].position,
block_structure->rows[i].cells[j].position);
}
}
}
} // namespace ceres::internal
#endif // CERES_NO_CUDA

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -31,6 +31,7 @@
#ifndef CERES_INTERNAL_CUDA_BUFFER_H_
#define CERES_INTERNAL_CUDA_BUFFER_H_
#include "ceres/context_impl.h"
#include "ceres/internal/config.h"
#ifndef CERES_NO_CUDA
@@ -40,17 +41,27 @@
#include "cuda_runtime.h"
#include "glog/logging.h"
namespace ceres::internal {
// An encapsulated buffer to maintain GPU memory, and handle transfers between
// GPU and system memory. It is the responsibility of the user to ensure that
// the appropriate GPU device is selected before each subroutine is called. This
// is particularly important when using multiple GPU devices on different CPU
// threads, since active Cuda devices are determined by the cuda runtime on a
// per-thread basis. Note that unless otherwise specified, all methods use the
// default stream, and are synchronous.
// per-thread basis.
template <typename T>
class CudaBuffer {
public:
CudaBuffer() = default;
explicit CudaBuffer(ContextImpl* context) : context_(context) {}
CudaBuffer(ContextImpl* context, int size) : context_(context) {
Reserve(size);
}
CudaBuffer(CudaBuffer&& other)
: data_(other.data_), size_(other.size_), context_(other.context_) {
other.data_ = nullptr;
other.size_ = 0;
}
CudaBuffer(const CudaBuffer&) = delete;
CudaBuffer& operator=(const CudaBuffer&) = delete;
@@ -67,41 +78,95 @@ class CudaBuffer {
if (data_ != nullptr) {
CHECK_EQ(cudaFree(data_), cudaSuccess);
}
CHECK_EQ(cudaMalloc(&data_, size * sizeof(T)), cudaSuccess);
CHECK_EQ(cudaMalloc(&data_, size * sizeof(T)), cudaSuccess)
<< "Failed to allocate " << size * sizeof(T)
<< " bytes of GPU memory";
size_ = size;
}
}
// Perform an asynchronous copy from CPU memory to GPU memory using the stream
// provided.
void CopyToGpuAsync(const T* data, const size_t size, cudaStream_t stream) {
// Perform an asynchronous copy from CPU memory to GPU memory managed by this
// CudaBuffer instance using the stream provided.
void CopyFromCpu(const T* data, const size_t size) {
Reserve(size);
CHECK_EQ(cudaMemcpyAsync(
data_, data, size * sizeof(T), cudaMemcpyHostToDevice, stream),
CHECK_EQ(cudaMemcpyAsync(data_,
data,
size * sizeof(T),
cudaMemcpyHostToDevice,
context_->DefaultStream()),
cudaSuccess);
}
// Copy data from the GPU to CPU memory. This is necessarily synchronous since
// any potential GPU kernels that may be writing to the buffer must finish
// before the transfer happens.
void CopyToHost(T* data, const size_t size) {
// Perform an asynchronous copy from a vector in CPU memory to GPU memory
// managed by this CudaBuffer instance.
void CopyFromCpuVector(const std::vector<T>& data) {
Reserve(data.size());
CHECK_EQ(cudaMemcpyAsync(data_,
data.data(),
data.size() * sizeof(T),
cudaMemcpyHostToDevice,
context_->DefaultStream()),
cudaSuccess);
}
// Perform an asynchronous copy from another GPU memory array to the GPU
// memory managed by this CudaBuffer instance using the stream provided.
void CopyFromGPUArray(const T* data, const size_t size) {
Reserve(size);
CHECK_EQ(cudaMemcpyAsync(data_,
data,
size * sizeof(T),
cudaMemcpyDeviceToDevice,
context_->DefaultStream()),
cudaSuccess);
}
// Copy data from the GPU memory managed by this CudaBuffer instance to CPU
// memory. It is the caller's responsibility to ensure that the CPU memory
// pointer is valid, i.e. it is not null, and that it points to memory of
// at least this->size() size. This method ensures all previously dispatched
// GPU operations on the specified stream have completed before copying the
// data to CPU memory.
void CopyToCpu(T* data, const size_t size) const {
CHECK(data_ != nullptr);
CHECK_EQ(cudaMemcpy(data, data_, size * sizeof(T), cudaMemcpyDeviceToHost),
CHECK_EQ(cudaMemcpyAsync(data,
data_,
size * sizeof(T),
cudaMemcpyDeviceToHost,
context_->DefaultStream()),
cudaSuccess);
CHECK_EQ(cudaStreamSynchronize(context_->DefaultStream()), cudaSuccess);
}
// Copy N items from another GPU memory array to the GPU memory managed by
// this CudaBuffer instance, growing this buffer's size if needed. This copy
// is asynchronous, and operates on the context's default stream.
void CopyNItemsFrom(int n, const CudaBuffer<T>& other) {
Reserve(n);
CHECK(other.data_ != nullptr);
CHECK(data_ != nullptr);
CHECK_EQ(cudaMemcpyAsync(data_,
other.data_,
n * sizeof(T),
cudaMemcpyDeviceToDevice,
context_->DefaultStream()),
cudaSuccess);
}
void CopyToGpu(const std::vector<T>& data) {
CopyToGpu(data.data(), data.size());
}
// Return a pointer to the GPU memory managed by this CudaBuffer instance.
T* data() { return data_; }
const T* data() const { return data_; }
// Return the number of items of type T that can fit in the GPU memory
// allocated so far by this CudaBuffer instance.
size_t size() const { return size_; }
private:
T* data_ = nullptr;
size_t size_ = 0;
ContextImpl* context_ = nullptr;
};
} // namespace ceres::internal
#endif // CERES_NO_CUDA
#endif  // CERES_INTERNAL_CUDA_BUFFER_H_
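
Taken together, the copy helpers support a simple round trip. The following is
an editorial sketch, not upstream code, and assumes an already-initialized
ContextImpl named `context`:

std::vector<double> host = {1.0, 2.0, 3.0};
CudaBuffer<double> buffer(&context);
buffer.CopyFromCpuVector(host);  // asynchronous H2D on the default stream
std::vector<double> back(buffer.size());
// CopyToCpu synchronizes the default stream before returning, so `back` is
// safe to read immediately afterwards.
buffer.CopyToCpu(back.data(), back.size());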

View File

@@ -0,0 +1,332 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
#include <string>
#include "ceres/dense_cholesky.h"
#include "ceres/internal/config.h"
#include "ceres/internal/eigen.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
namespace ceres::internal {
#ifndef CERES_NO_CUDA
TEST(CUDADenseCholesky, InvalidOptionOnCreate) {
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
std::string error;
EXPECT_TRUE(context.InitCuda(&error)) << error;
auto dense_cuda_solver = CUDADenseCholesky::Create(options);
EXPECT_EQ(dense_cuda_solver, nullptr);
}
// Tests the CUDA Cholesky solver with a simple 4x4 matrix.
TEST(CUDADenseCholesky, Cholesky4x4Matrix) {
Eigen::Matrix4d A;
// clang-format off
A << 4, 12, -16, 0,
12, 37, -43, 0,
-16, -43, 98, 0,
0, 0, 0, 1;
// clang-format on
Vector b = Eigen::Vector4d::Ones();
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
std::string error;
EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseCholesky::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
std::string error_string;
ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(), A.data(), &error_string),
LinearSolverTerminationType::SUCCESS);
Eigen::Vector4d x = Eigen::Vector4d::Zero();
ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string),
LinearSolverTerminationType::SUCCESS);
static const double kEpsilon = std::numeric_limits<double>::epsilon() * 10;
const Eigen::Vector4d x_expected(113.75 / 3.0, -31.0 / 3.0, 5.0 / 3.0, 1.0);
EXPECT_NEAR((x[0] - x_expected[0]) / x_expected[0], 0.0, kEpsilon);
EXPECT_NEAR((x[1] - x_expected[1]) / x_expected[1], 0.0, kEpsilon);
EXPECT_NEAR((x[2] - x_expected[2]) / x_expected[2], 0.0, kEpsilon);
EXPECT_NEAR((x[3] - x_expected[3]) / x_expected[3], 0.0, kEpsilon);
}
TEST(CUDADenseCholesky, SingularMatrix) {
Eigen::Matrix3d A;
// clang-format off
A << 1, 0, 0,
0, 1, 0,
0, 0, 0;
// clang-format on
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
std::string error;
EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseCholesky::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
std::string error_string;
ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(), A.data(), &error_string),
LinearSolverTerminationType::FAILURE);
}
TEST(CUDADenseCholesky, NegativeMatrix) {
Eigen::Matrix3d A;
// clang-format off
A << 1, 0, 0,
0, 1, 0,
0, 0, -1;
// clang-format on
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
std::string error;
EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseCholesky::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
std::string error_string;
ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(), A.data(), &error_string),
LinearSolverTerminationType::FAILURE);
}
TEST(CUDADenseCholesky, MustFactorizeBeforeSolve) {
const Eigen::Vector3d b = Eigen::Vector3d::Ones();
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
std::string error;
EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseCholesky::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
std::string error_string;
ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string),
LinearSolverTerminationType::FATAL_ERROR);
}
TEST(CUDADenseCholesky, Randomized1600x1600Tests) {
const int kNumCols = 1600;
using LhsType = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>;
using RhsType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
using SolutionType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
std::string error;
EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = ceres::CUDA;
std::unique_ptr<DenseCholesky> dense_cholesky =
CUDADenseCholesky::Create(options);
const int kNumTrials = 20;
for (int i = 0; i < kNumTrials; ++i) {
LhsType lhs = LhsType::Random(kNumCols, kNumCols);
lhs = lhs.transpose() * lhs;
lhs += 1e-3 * LhsType::Identity(kNumCols, kNumCols);
SolutionType x_expected = SolutionType::Random(kNumCols);
RhsType rhs = lhs * x_expected;
SolutionType x_computed = SolutionType::Zero(kNumCols);
// Sanity check the random matrix sizes.
EXPECT_EQ(lhs.rows(), kNumCols);
EXPECT_EQ(lhs.cols(), kNumCols);
EXPECT_EQ(rhs.rows(), kNumCols);
EXPECT_EQ(rhs.cols(), 1);
EXPECT_EQ(x_expected.rows(), kNumCols);
EXPECT_EQ(x_expected.cols(), 1);
EXPECT_EQ(x_computed.rows(), kNumCols);
EXPECT_EQ(x_computed.cols(), 1);
LinearSolver::Summary summary;
summary.termination_type = dense_cholesky->FactorAndSolve(
kNumCols, lhs.data(), rhs.data(), x_computed.data(), &summary.message);
ASSERT_EQ(summary.termination_type, LinearSolverTerminationType::SUCCESS);
static const double kEpsilon = std::numeric_limits<double>::epsilon() * 3e5;
ASSERT_NEAR(
(x_computed - x_expected).norm() / x_expected.norm(), 0.0, kEpsilon);
}
}
TEST(CUDADenseCholeskyMixedPrecision, InvalidOptionsOnCreate) {
{
// Did not ask for CUDA, and did not ask for mixed precision.
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
std::string error;
EXPECT_TRUE(context.InitCuda(&error)) << error;
auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
ASSERT_EQ(solver, nullptr);
}
{
// Asked for CUDA, but did not ask for mixed precision.
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
std::string error;
EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = ceres::CUDA;
auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
ASSERT_EQ(solver, nullptr);
}
}
// Tests the CUDA Cholesky solver with a simple 4x4 matrix.
TEST(CUDADenseCholeskyMixedPrecision, Cholesky4x4Matrix1Step) {
Eigen::Matrix4d A;
// clang-format off
// A common Cholesky decomposition test matrix, see:
// https://en.wikipedia.org/w/index.php?title=Cholesky_decomposition&oldid=1080607368#Example
A << 4, 12, -16, 0,
12, 37, -43, 0,
-16, -43, 98, 0,
0, 0, 0, 1;
// clang-format on
const Eigen::Vector4d b = Eigen::Vector4d::Ones();
LinearSolver::Options options;
options.max_num_refinement_iterations = 0;
ContextImpl context;
options.context = &context;
std::string error;
EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
options.use_mixed_precision_solves = true;
auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
ASSERT_NE(solver, nullptr);
std::string error_string;
ASSERT_EQ(solver->Factorize(A.cols(), A.data(), &error_string),
LinearSolverTerminationType::SUCCESS);
Eigen::Vector4d x = Eigen::Vector4d::Zero();
ASSERT_EQ(solver->Solve(b.data(), x.data(), &error_string),
LinearSolverTerminationType::SUCCESS);
// A single step of the mixed precision solver will be equivalent to solving
// in low precision (FP32). Hence the tolerance is defined w.r.t. FP32 epsilon
// instead of FP64 epsilon.
static const double kEpsilon = std::numeric_limits<float>::epsilon() * 10;
const Eigen::Vector4d x_expected(113.75 / 3.0, -31.0 / 3.0, 5.0 / 3.0, 1.0);
EXPECT_NEAR((x[0] - x_expected[0]) / x_expected[0], 0.0, kEpsilon);
EXPECT_NEAR((x[1] - x_expected[1]) / x_expected[1], 0.0, kEpsilon);
EXPECT_NEAR((x[2] - x_expected[2]) / x_expected[2], 0.0, kEpsilon);
EXPECT_NEAR((x[3] - x_expected[3]) / x_expected[3], 0.0, kEpsilon);
}
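// Editorial note: mixed-precision refinement follows the classic scheme
// (sketch only, not Ceres-specific code):
//   r = b - A * x;   // residual computed in FP64
//   solve A * dx = r in FP32, reusing the cached factorization;
//   x += dx;         // correction accumulated in FP64
// Each pass shrinks the error by roughly eps_fp32 * cond(A), which is why the
// 4-step test below can use a tolerance near FP64 epsilon while the 1-step
// test above is limited to FP32 epsilon.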
// Tests the CUDA Cholesky solver with a simple 4x4 matrix.
TEST(CUDADenseCholeskyMixedPrecision, Cholesky4x4Matrix4Steps) {
Eigen::Matrix4d A;
// clang-format off
A << 4, 12, -16, 0,
12, 37, -43, 0,
-16, -43, 98, 0,
0, 0, 0, 1;
// clang-format on
const Eigen::Vector4d b = Eigen::Vector4d::Ones();
LinearSolver::Options options;
options.max_num_refinement_iterations = 3;
ContextImpl context;
options.context = &context;
std::string error;
EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
options.use_mixed_precision_solves = true;
auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
ASSERT_NE(solver, nullptr);
std::string error_string;
ASSERT_EQ(solver->Factorize(A.cols(), A.data(), &error_string),
LinearSolverTerminationType::SUCCESS);
Eigen::Vector4d x = Eigen::Vector4d::Zero();
ASSERT_EQ(solver->Solve(b.data(), x.data(), &error_string),
LinearSolverTerminationType::SUCCESS);
// The error does not reduce beyond four iterations, and stagnates at this
// level of precision.
static const double kEpsilon = std::numeric_limits<double>::epsilon() * 100;
const Eigen::Vector4d x_expected(113.75 / 3.0, -31.0 / 3.0, 5.0 / 3.0, 1.0);
EXPECT_NEAR((x[0] - x_expected[0]) / x_expected[0], 0.0, kEpsilon);
EXPECT_NEAR((x[1] - x_expected[1]) / x_expected[1], 0.0, kEpsilon);
EXPECT_NEAR((x[2] - x_expected[2]) / x_expected[2], 0.0, kEpsilon);
EXPECT_NEAR((x[3] - x_expected[3]) / x_expected[3], 0.0, kEpsilon);
}
TEST(CUDADenseCholeskyMixedPrecision, Randomized1600x1600Tests) {
const int kNumCols = 1600;
using LhsType = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>;
using RhsType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
using SolutionType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
std::string error;
EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = ceres::CUDA;
options.use_mixed_precision_solves = true;
options.max_num_refinement_iterations = 20;
std::unique_ptr<CUDADenseCholeskyMixedPrecision> dense_cholesky =
CUDADenseCholeskyMixedPrecision::Create(options);
const int kNumTrials = 20;
for (int i = 0; i < kNumTrials; ++i) {
LhsType lhs = LhsType::Random(kNumCols, kNumCols);
lhs = lhs.transpose() * lhs;
lhs += 1e-3 * LhsType::Identity(kNumCols, kNumCols);
SolutionType x_expected = SolutionType::Random(kNumCols);
RhsType rhs = lhs * x_expected;
SolutionType x_computed = SolutionType::Zero(kNumCols);
// Sanity check the random matrix sizes.
EXPECT_EQ(lhs.rows(), kNumCols);
EXPECT_EQ(lhs.cols(), kNumCols);
EXPECT_EQ(rhs.rows(), kNumCols);
EXPECT_EQ(rhs.cols(), 1);
EXPECT_EQ(x_expected.rows(), kNumCols);
EXPECT_EQ(x_expected.cols(), 1);
EXPECT_EQ(x_computed.rows(), kNumCols);
EXPECT_EQ(x_computed.cols(), 1);
LinearSolver::Summary summary;
summary.termination_type = dense_cholesky->FactorAndSolve(
kNumCols, lhs.data(), rhs.data(), x_computed.data(), &summary.message);
ASSERT_EQ(summary.termination_type, LinearSolverTerminationType::SUCCESS);
static const double kEpsilon = std::numeric_limits<double>::epsilon() * 1e6;
ASSERT_NEAR(
(x_computed - x_expected).norm() / x_expected.norm(), 0.0, kEpsilon);
}
}
#endif // CERES_NO_CUDA
} // namespace ceres::internal

View File

@@ -0,0 +1,177 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
#include <string>
#include "ceres/dense_qr.h"
#include "ceres/internal/eigen.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
namespace ceres::internal {
#ifndef CERES_NO_CUDA
TEST(CUDADenseQR, InvalidOptionOnCreate) {
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
std::string error;
EXPECT_TRUE(context.InitCuda(&error)) << error;
auto dense_cuda_solver = CUDADenseQR::Create(options);
EXPECT_EQ(dense_cuda_solver, nullptr);
}
// Tests the CUDA QR solver with a simple 4x4 matrix.
TEST(CUDADenseQR, QR4x4Matrix) {
Eigen::Matrix4d A;
// clang-format off
A << 4, 12, -16, 0,
12, 37, -43, 0,
-16, -43, 98, 0,
0, 0, 0, 1;
// clang-format on
const Eigen::Vector4d b = Eigen::Vector4d::Ones();
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
std::string error;
EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseQR::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
std::string error_string;
ASSERT_EQ(
dense_cuda_solver->Factorize(A.rows(), A.cols(), A.data(), &error_string),
LinearSolverTerminationType::SUCCESS);
Eigen::Vector4d x = Eigen::Vector4d::Zero();
ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string),
LinearSolverTerminationType::SUCCESS);
// Empirically observed accuracy of cuSolverDN's QR solver.
const double kEpsilon = std::numeric_limits<double>::epsilon() * 1500;
const Eigen::Vector4d x_expected(113.75 / 3.0, -31.0 / 3.0, 5.0 / 3.0, 1.0);
EXPECT_NEAR((x - x_expected).norm() / x_expected.norm(), 0.0, kEpsilon);
}
// Tests the CUDA QR solver with a simple 4x2 matrix.
TEST(CUDADenseQR, QR4x2Matrix) {
Eigen::Matrix<double, 4, 2> A;
// clang-format off
A << 4, 12,
12, 37,
-16, -43,
0, 0;
// clang-format on
const std::vector<double> b(4, 1.0);
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
std::string error;
EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseQR::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
std::string error_string;
ASSERT_EQ(
dense_cuda_solver->Factorize(A.rows(), A.cols(), A.data(), &error_string),
LinearSolverTerminationType::SUCCESS);
std::vector<double> x(2, 0);
ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string),
LinearSolverTerminationType::SUCCESS);
// Empirically observed accuracy of cuSolverDN's QR solver.
const double kEpsilon = std::numeric_limits<double>::epsilon() * 10;
// Solution values computed with Octave.
const Eigen::Vector2d x_expected(-1.143410852713177, 0.4031007751937981);
EXPECT_NEAR((x[0] - x_expected[0]) / x_expected[0], 0.0, kEpsilon);
EXPECT_NEAR((x[1] - x_expected[1]) / x_expected[1], 0.0, kEpsilon);
}
TEST(CUDADenseQR, MustFactorizeBeforeSolve) {
const Eigen::Vector3d b = Eigen::Vector3d::Ones();
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
std::string error;
EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseQR::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
std::string error_string;
ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string),
LinearSolverTerminationType::FATAL_ERROR);
}
TEST(CUDADenseQR, Randomized1600x100Tests) {
const int kNumRows = 1600;
const int kNumCols = 100;
using LhsType = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>;
using RhsType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
using SolutionType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
std::string error;
EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = ceres::CUDA;
std::unique_ptr<DenseQR> dense_qr = CUDADenseQR::Create(options);
const int kNumTrials = 20;
for (int i = 0; i < kNumTrials; ++i) {
LhsType lhs = LhsType::Random(kNumRows, kNumCols);
SolutionType x_expected = SolutionType::Random(kNumCols);
RhsType rhs = lhs * x_expected;
SolutionType x_computed = SolutionType::Zero(kNumCols);
// Sanity check the random matrix sizes.
EXPECT_EQ(lhs.rows(), kNumRows);
EXPECT_EQ(lhs.cols(), kNumCols);
EXPECT_EQ(rhs.rows(), kNumRows);
EXPECT_EQ(rhs.cols(), 1);
EXPECT_EQ(x_expected.rows(), kNumCols);
EXPECT_EQ(x_expected.cols(), 1);
EXPECT_EQ(x_computed.rows(), kNumCols);
EXPECT_EQ(x_computed.cols(), 1);
LinearSolver::Summary summary;
summary.termination_type = dense_qr->FactorAndSolve(kNumRows,
kNumCols,
lhs.data(),
rhs.data(),
x_computed.data(),
&summary.message);
ASSERT_EQ(summary.termination_type, LinearSolverTerminationType::SUCCESS);
ASSERT_NEAR((x_computed - x_expected).norm() / x_expected.norm(),
0.0,
std::numeric_limits<double>::epsilon() * 400);
}
}
#endif // CERES_NO_CUDA
} // namespace ceres::internal

View File

@@ -0,0 +1,477 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#include "ceres/cuda_kernels_bsm_to_crs.h"
#include <cuda_runtime.h>
#include <thrust/execution_policy.h>
#include <thrust/scan.h>
#include "ceres/block_structure.h"
#include "ceres/cuda_kernels_utils.h"
namespace ceres {
namespace internal {
namespace {
inline auto ThrustCudaStreamExecutionPolicy(cudaStream_t stream) {
// par_nosync execution policy was added in Thrust 1.16
// https://github.com/NVIDIA/thrust/blob/main/CHANGELOG.md#thrust-1160
#if THRUST_VERSION < 101700
return thrust::cuda::par.on(stream);
#else
return thrust::cuda::par_nosync.on(stream);
#endif
}
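// Editorial note: THRUST_VERSION encodes major * 100000 + minor * 100 +
// subminor, so the guard above keeps the synchronous policy for anything
// older than Thrust 1.17.0 (101700).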
void* CudaMalloc(size_t size,
cudaStream_t stream,
bool memory_pools_supported) {
void* data = nullptr;
// Stream-ordered allocation API is available since CUDA 11.2, but might not
// be implemented by a particular device
#if CUDART_VERSION < 11020
#warning \
"Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.2+"
cudaMalloc(&data, size);
#else
if (memory_pools_supported) {
cudaMallocAsync(&data, size, stream);
} else {
cudaMalloc(&data, size);
}
#endif
return data;
}
void CudaFree(void* data, cudaStream_t stream, bool memory_pools_supported) {
// Stream-ordered allocation API is available since CUDA 11.2, but might not
// be implemented by a particular device
#if CUDART_VERSION < 11020
#warning \
"Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.2+"
cudaFree(data);
#else
if (memory_pools_supported) {
cudaFreeAsync(data, stream);
} else {
cudaFree(data);
}
#endif
}
template <typename T>
T* CudaAllocate(size_t num_elements,
cudaStream_t stream,
bool memory_pools_supported) {
T* data = static_cast<T*>(
CudaMalloc(num_elements * sizeof(T), stream, memory_pools_supported));
return data;
}
} // namespace
// Fill row block id and nnz for each row using block-sparse structure
// represented by a set of flat arrays.
// Inputs:
// - num_row_blocks: number of row-blocks in block-sparse structure
// - first_cell_in_row_block: index of the first cell of the row-block; size:
// num_row_blocks + 1
// - cells: cells of block-sparse structure as a continuous array
// - row_blocks: row blocks of block-sparse structure stored sequentially
// - col_blocks: column blocks of block-sparse structure stored sequentially
// Outputs:
// - rows: rows[i + 1] will contain the number of non-zeros in the i-th row,
// rows[0] will be set to 0; rows are filled with a shift by one element so
// that an inclusive scan afterwards yields the row-index array of the CRS
// matrix
// - row_block_ids: row_block_ids[i] will be set to index of row-block that
// contains i-th row.
// Computation is performed row-block-wise
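// Illustrative example (editorial, not part of the upstream sources): two
// row-blocks covering rows {0} and {1, 2} with 3 and 2 non-zeros per row
// produce rows = [0, 3, 2, 2]; the inclusive scan performed afterwards turns
// this into the CRS row offsets [0, 3, 5, 7].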
template <bool partitioned = false>
__global__ void RowBlockIdAndNNZ(
const int num_row_blocks,
const int num_col_blocks_e,
const int num_row_blocks_e,
const int* __restrict__ first_cell_in_row_block,
const Cell* __restrict__ cells,
const Block* __restrict__ row_blocks,
const Block* __restrict__ col_blocks,
int* __restrict__ rows_e,
int* __restrict__ rows_f,
int* __restrict__ row_block_ids) {
const int row_block_id = blockIdx.x * blockDim.x + threadIdx.x;
if (row_block_id > num_row_blocks) {
// No synchronization is performed in this kernel, thus it is safe to return
return;
}
if (row_block_id == num_row_blocks) {
// one extra thread sets the first element
rows_f[0] = 0;
if constexpr (partitioned) {
rows_e[0] = 0;
}
return;
}
const auto& row_block = row_blocks[row_block_id];
auto first_cell = cells + first_cell_in_row_block[row_block_id];
const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
int row_nnz_e = 0;
if (partitioned && row_block_id < num_row_blocks_e) {
// First cell is a cell from E
row_nnz_e = col_blocks[first_cell->block_id].size;
++first_cell;
}
int row_nnz_f = 0;
for (auto cell = first_cell; cell < last_cell; ++cell) {
row_nnz_f += col_blocks[cell->block_id].size;
}
const int first_row = row_block.position;
const int last_row = first_row + row_block.size;
for (int i = first_row; i < last_row; ++i) {
if constexpr (partitioned) {
rows_e[i + 1] = row_nnz_e;
}
rows_f[i + 1] = row_nnz_f;
row_block_ids[i] = row_block_id;
}
}
// Row-wise creation of CRS structure
// Inputs:
// - num_rows: number of rows in matrix
// - first_cell_in_row_block: index of the first cell of the row-block; size:
// num_row_blocks + 1
// - cells: cells of block-sparse structure as a continuous array
// - row_blocks: row blocks of block-sparse structure stored sequentially
// - col_blocks: column blocks of block-sparse structure stored sequentially
// - row_block_ids: index of row-block that corresponds to row
// - rows: row-index array of CRS structure
// Outputs:
// - cols: column-index array of CRS structure
// Computation is performed row-wise
template <bool partitioned>
__global__ void ComputeColumns(const int num_rows,
const int num_row_blocks_e,
const int num_col_blocks_e,
const int* __restrict__ first_cell_in_row_block,
const Cell* __restrict__ cells,
const Block* __restrict__ row_blocks,
const Block* __restrict__ col_blocks,
const int* __restrict__ row_block_ids,
const int* __restrict__ rows_e,
int* __restrict__ cols_e,
const int* __restrict__ rows_f,
int* __restrict__ cols_f) {
const int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= num_rows) {
// No synchronization is performed in this kernel, thus it is safe to return
return;
}
const int row_block_id = row_block_ids[row];
// position in crs matrix
auto first_cell = cells + first_cell_in_row_block[row_block_id];
const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
const int num_cols_e = col_blocks[num_col_blocks_e].position;
// For each cell of the row-block only the current row is filled
if (partitioned && row_block_id < num_row_blocks_e) {
// The first cell is cell from E
const auto& col_block = col_blocks[first_cell->block_id];
const int col_block_size = col_block.size;
int column_idx = col_block.position;
int crs_position_e = rows_e[row];
// Fill column indices for the current row's elements within this cell
for (int i = 0; i < col_block_size; ++i, ++crs_position_e) {
cols_e[crs_position_e] = column_idx++;
}
++first_cell;
}
int crs_position_f = rows_f[row];
for (auto cell = first_cell; cell < last_cell; ++cell) {
const auto& col_block = col_blocks[cell->block_id];
const int col_block_size = col_block.size;
int column_idx = col_block.position - num_cols_e;
// Fill column indices for the current row's elements within this cell
for (int i = 0; i < col_block_size; ++i, ++crs_position_f) {
cols_f[crs_position_f] = column_idx++;
}
}
}
void FillCRSStructure(const int num_row_blocks,
const int num_rows,
const int* first_cell_in_row_block,
const Cell* cells,
const Block* row_blocks,
const Block* col_blocks,
int* rows,
int* cols,
cudaStream_t stream,
bool memory_pools_supported) {
// Set number of non-zeros per row in rows array and row to row-block map in
// row_block_ids array
int* row_block_ids =
CudaAllocate<int>(num_rows, stream, memory_pools_supported);
const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
RowBlockIdAndNNZ<false><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
num_row_blocks,
0,
0,
first_cell_in_row_block,
cells,
row_blocks,
col_blocks,
nullptr,
rows,
row_block_ids);
// Finalize the row-index array of the CRS structure by computing a prefix sum
thrust::inclusive_scan(
ThrustCudaStreamExecutionPolicy(stream), rows, rows + num_rows + 1, rows);
// Fill cols array of CRS structure
const int num_blocks_rowwise = NumBlocksInGrid(num_rows);
ComputeColumns<false><<<num_blocks_rowwise, kCudaBlockSize, 0, stream>>>(
num_rows,
0,
0,
first_cell_in_row_block,
cells,
row_blocks,
col_blocks,
row_block_ids,
nullptr,
nullptr,
rows,
cols);
CudaFree(row_block_ids, stream, memory_pools_supported);
}
void FillCRSStructurePartitioned(const int num_row_blocks,
const int num_rows,
const int num_row_blocks_e,
const int num_col_blocks_e,
const int num_nonzeros_e,
const int* first_cell_in_row_block,
const Cell* cells,
const Block* row_blocks,
const Block* col_blocks,
int* rows_e,
int* cols_e,
int* rows_f,
int* cols_f,
cudaStream_t stream,
bool memory_pools_supported) {
// Set number of non-zeros per row in rows array and row to row-block map in
// row_block_ids array
int* row_block_ids =
CudaAllocate<int>(num_rows, stream, memory_pools_supported);
const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
RowBlockIdAndNNZ<true><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
num_row_blocks,
num_col_blocks_e,
num_row_blocks_e,
first_cell_in_row_block,
cells,
row_blocks,
col_blocks,
rows_e,
rows_f,
row_block_ids);
// Finalize the row-index array of the CRS structure by computing a prefix sum
thrust::inclusive_scan(ThrustCudaStreamExecutionPolicy(stream),
rows_e,
rows_e + num_rows + 1,
rows_e);
thrust::inclusive_scan(ThrustCudaStreamExecutionPolicy(stream),
rows_f,
rows_f + num_rows + 1,
rows_f);
// Fill cols array of CRS structure
const int num_blocks_rowwise = NumBlocksInGrid(num_rows);
ComputeColumns<true><<<num_blocks_rowwise, kCudaBlockSize, 0, stream>>>(
num_rows,
num_row_blocks_e,
num_col_blocks_e,
first_cell_in_row_block,
cells,
row_blocks,
col_blocks,
row_block_ids,
rows_e,
cols_e,
rows_f,
cols_f);
CudaFree(row_block_ids, stream, memory_pools_supported);
}
template <typename T, typename Predicate>
__device__ int PartitionPoint(const T* data,
int first,
int last,
Predicate&& predicate) {
if (!predicate(data[first])) {
return first;
}
while (last - first > 1) {
const auto midpoint = first + (last - first) / 2;
if (predicate(data[midpoint])) {
first = midpoint;
} else {
last = midpoint;
}
}
return last;
}
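// Illustrative example (editorial, not part of the upstream sources): with
// data = {0, 2, 5, 9} and the predicate "value <= 4", PartitionPoint returns
// index 2, the first element failing the predicate; the caller below then
// subtracts one to obtain the row-block whose value range contains the query.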
// Element-wise reordering of block-sparse values
// - first_cell_in_row_block - position of the first cell of row-block
// - block_sparse_values - segment of block-sparse values starting from
// block_sparse_offset, containing num_values
template <bool partitioned>
__global__ void PermuteToCrsKernel(
const int block_sparse_offset,
const int num_values,
const int num_row_blocks,
const int num_row_blocks_e,
const int* __restrict__ first_cell_in_row_block,
const int* __restrict__ value_offset_row_block_f,
const Cell* __restrict__ cells,
const Block* __restrict__ row_blocks,
const Block* __restrict__ col_blocks,
const int* __restrict__ crs_rows,
const double* __restrict__ block_sparse_values,
double* __restrict__ crs_values) {
const int value_id = blockIdx.x * blockDim.x + threadIdx.x;
if (value_id >= num_values) {
return;
}
const int block_sparse_value_id = value_id + block_sparse_offset;
// Find the corresponding row-block with a binary search
const int row_block_id =
(partitioned
? PartitionPoint(value_offset_row_block_f,
0,
num_row_blocks,
[block_sparse_value_id] __device__(
const int row_block_offset) {
return row_block_offset <= block_sparse_value_id;
})
: PartitionPoint(first_cell_in_row_block,
0,
num_row_blocks,
[cells, block_sparse_value_id] __device__(
const int row_block_offset) {
return cells[row_block_offset].position <=
block_sparse_value_id;
})) -
1;
// Find cell and calculate offset within the row with a linear scan
const auto& row_block = row_blocks[row_block_id];
auto first_cell = cells + first_cell_in_row_block[row_block_id];
const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
const int row_block_size = row_block.size;
int num_cols_before = 0;
if (partitioned && row_block_id < num_row_blocks_e) {
++first_cell;
}
for (const Cell* cell = first_cell; cell < last_cell; ++cell) {
const auto& col_block = col_blocks[cell->block_id];
const int col_block_size = col_block.size;
const int cell_size = row_block_size * col_block_size;
if (cell->position + cell_size > block_sparse_value_id) {
const int pos_in_cell = block_sparse_value_id - cell->position;
const int row_in_cell = pos_in_cell / col_block_size;
const int col_in_cell = pos_in_cell % col_block_size;
const int row = row_in_cell + row_block.position;
crs_values[crs_rows[row] + num_cols_before + col_in_cell] =
block_sparse_values[value_id];
break;
}
num_cols_before += col_block_size;
}
}
void PermuteToCRS(const int block_sparse_offset,
const int num_values,
const int num_row_blocks,
const int* first_cell_in_row_block,
const Cell* cells,
const Block* row_blocks,
const Block* col_blocks,
const int* crs_rows,
const double* block_sparse_values,
double* crs_values,
cudaStream_t stream) {
const int num_blocks_valuewise = NumBlocksInGrid(num_values);
PermuteToCrsKernel<false>
<<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>(
block_sparse_offset,
num_values,
num_row_blocks,
0,
first_cell_in_row_block,
nullptr,
cells,
row_blocks,
col_blocks,
crs_rows,
block_sparse_values,
crs_values);
}
void PermuteToCRSPartitionedF(const int block_sparse_offset,
const int num_values,
const int num_row_blocks,
const int num_row_blocks_e,
const int* first_cell_in_row_block,
const int* value_offset_row_block_f,
const Cell* cells,
const Block* row_blocks,
const Block* col_blocks,
const int* crs_rows,
const double* block_sparse_values,
double* crs_values,
cudaStream_t stream) {
const int num_blocks_valuewise = NumBlocksInGrid(num_values);
PermuteToCrsKernel<true><<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>(
block_sparse_offset,
num_values,
num_row_blocks,
num_row_blocks_e,
first_cell_in_row_block,
value_offset_row_block_f,
cells,
row_blocks,
col_blocks,
crs_rows,
block_sparse_values,
crs_values);
}
} // namespace internal
} // namespace ceres

View File

@@ -0,0 +1,113 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#ifndef CERES_INTERNAL_CUDA_KERNELS_BSM_TO_CRS_H_
#define CERES_INTERNAL_CUDA_KERNELS_BSM_TO_CRS_H_
#include "ceres/internal/config.h"
#ifndef CERES_NO_CUDA
#include "cuda_runtime.h"
namespace ceres {
namespace internal {
struct Block;
struct Cell;
// Compute structure of CRS matrix using block-sparse structure.
// Arrays corresponding to CRS matrix are to be allocated by caller
void FillCRSStructure(const int num_row_blocks,
const int num_rows,
const int* first_cell_in_row_block,
const Cell* cells,
const Block* row_blocks,
const Block* col_blocks,
int* rows,
int* cols,
cudaStream_t stream,
bool memory_pools_supported);
// Compute structure of partitioned CRS matrix using block-sparse structure.
// Arrays corresponding to CRS matrices are to be allocated by caller
void FillCRSStructurePartitioned(const int num_row_blocks,
const int num_rows,
const int num_row_blocks_e,
const int num_col_blocks_e,
const int num_nonzeros_e,
const int* first_cell_in_row_block,
const Cell* cells,
const Block* row_blocks,
const Block* col_blocks,
int* rows_e,
int* cols_e,
int* rows_f,
int* cols_f,
cudaStream_t stream,
bool memory_pools_supported);
// Permute segment of values from block-sparse matrix with sequential layout to
// CRS order. Segment starts at block_sparse_offset and has length of num_values
void PermuteToCRS(const int block_sparse_offset,
const int num_values,
const int num_row_blocks,
const int* first_cell_in_row_block,
const Cell* cells,
const Block* row_blocks,
const Block* col_blocks,
const int* crs_rows,
const double* block_sparse_values,
double* crs_values,
cudaStream_t stream);
// Permute segment of values from F sub-matrix of block-sparse partitioned
// matrix with sequential layout to CRS order. Segment starts at
// block_sparse_offset (including the offset induced by values of E submatrix)
// and has length of num_values
void PermuteToCRSPartitionedF(const int block_sparse_offset,
const int num_values,
const int num_row_blocks,
const int num_row_blocks_e,
const int* first_cell_in_row_block,
const int* value_offset_row_block_f,
const Cell* cells,
const Block* row_blocks,
const Block* col_blocks,
const int* crs_rows,
const double* block_sparse_values,
double* crs_values,
cudaStream_t stream);
} // namespace internal
} // namespace ceres
#endif // CERES_NO_CUDA
#endif // CERES_INTERNAL_CUDA_KERNELS_BSM_TO_CRS_H_
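
For orientation, the two non-partitioned entry points compose as follows. This
is an editorial sketch rather than a verbatim call site; all pointers are
assumed to be device memory prepared by CudaBlockSparseStructure:

FillCRSStructure(num_row_blocks, num_rows, first_cell_in_row_block, cells,
                 row_blocks, col_blocks, crs_rows, crs_cols, stream,
                 /*memory_pools_supported=*/true);
PermuteToCRS(/*block_sparse_offset=*/0, num_nonzeros, num_row_blocks,
             first_cell_in_row_block, cells, row_blocks, col_blocks, crs_rows,
             block_sparse_values, crs_values, stream);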

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -26,53 +26,31 @@
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: alexs.mac@gmail.com (Alex Stewart)
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
// This include must come before any #ifndef check on Ceres compile options.
#include "ceres/internal/config.h"
#ifdef CERES_NO_THREADS
#include "ceres/parallel_for.h"
#include "glog/logging.h"
#ifndef CERES_INTERNAL_CUDA_KERNELS_UTILS_H_
#define CERES_INTERNAL_CUDA_KERNELS_UTILS_H_
namespace ceres {
namespace internal {
int MaxNumThreadsAvailable() { return 1; }
// Parallel execution on CUDA device requires splitting job into blocks of a
// fixed size. We use block-size of kCudaBlockSize for all kernels that do not
// require any specific block size. As the CUDA Toolkit documentation says,
// "although arbitrary in this case, is a common choice". This is determined by
// the warp size, max block size, and multiprocessor sizes of recent GPUs. For
// complex kernels with significant register usage and unusual memory patterns,
// the occupancy calculator API might provide better performance. See "Occupancy
// Calculator" under the CUDA toolkit documentation.
constexpr int kCudaBlockSize = 256;
void ParallelFor(ContextImpl* context,
int start,
int end,
int num_threads,
const std::function<void(int)>& function) {
CHECK_GT(num_threads, 0);
CHECK(context != nullptr);
if (end <= start) {
return;
}
for (int i = start; i < end; ++i) {
function(i);
}
// Compute number of blocks of kCudaBlockSize that span over 1-d grid with
// dimension size. Note that 1-d grid dimension is limited by 2^31-1 in CUDA,
// thus a signed int is used as an argument.
inline int NumBlocksInGrid(int size) {
return (size + kCudaBlockSize - 1) / kCudaBlockSize;
}
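// Illustrative example (editorial, not part of the upstream sources): with
// kCudaBlockSize = 256, NumBlocksInGrid(1000) returns 4 and the launched grid
// covers indices [0, 1024); kernels guard against the overshoot with an
// explicit bounds check on the computed thread index.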
void ParallelFor(ContextImpl* context,
int start,
int end,
int num_threads,
const std::function<void(int thread_id, int i)>& function) {
CHECK_GT(num_threads, 0);
CHECK(context != nullptr);
if (end <= start) {
return;
}
const int thread_id = 0;
for (int i = start; i < end; ++i) {
function(thread_id, i);
}
}
} // namespace internal
} // namespace ceres
#endif // CERES_NO_THREADS
#endif // CERES_INTERNAL_CUDA_KERNELS_UTILS_H_

View File

@@ -0,0 +1,123 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
#include "ceres/cuda_kernels_vector_ops.h"
#include <cuda_runtime.h>
#include "ceres/cuda_kernels_utils.h"
namespace ceres {
namespace internal {
template <typename SrcType, typename DstType>
__global__ void TypeConversionKernel(const SrcType* __restrict__ input,
DstType* __restrict__ output,
const int size) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < size) {
output[i] = static_cast<DstType>(input[i]);
}
}
void CudaFP64ToFP32(const double* input,
float* output,
const int size,
cudaStream_t stream) {
const int num_blocks = NumBlocksInGrid(size);
TypeConversionKernel<double, float>
<<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
}
void CudaFP32ToFP64(const float* input,
double* output,
const int size,
cudaStream_t stream) {
const int num_blocks = NumBlocksInGrid(size);
TypeConversionKernel<float, double>
<<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
}
template <typename T>
__global__ void SetZeroKernel(T* __restrict__ output, const int size) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < size) {
output[i] = T(0.0);
}
}
void CudaSetZeroFP32(float* output, const int size, cudaStream_t stream) {
const int num_blocks = NumBlocksInGrid(size);
SetZeroKernel<float><<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
}
void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream) {
const int num_blocks = NumBlocksInGrid(size);
SetZeroKernel<double>
<<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
}
template <typename SrcType, typename DstType>
__global__ void XPlusEqualsYKernel(DstType* __restrict__ x,
const SrcType* __restrict__ y,
const int size) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < size) {
x[i] = x[i] + DstType(y[i]);
}
}
void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream) {
const int num_blocks = NumBlocksInGrid(size);
XPlusEqualsYKernel<float, double>
<<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size);
}
__global__ void CudaDtDxpyKernel(double* __restrict__ y,
const double* D,
const double* __restrict__ x,
const int size) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < size) {
y[i] = y[i] + D[i] * D[i] * x[i];
}
}
void CudaDtDxpy(double* y,
const double* D,
const double* x,
const int size,
cudaStream_t stream) {
const int num_blocks = NumBlocksInGrid(size);
CudaDtDxpyKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(y, D, x, size);
}
} // namespace internal
} // namespace ceres
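A minimal host-side sketch of how these conversion wrappers compose into an FP64 -> FP32 -> FP64 round trip (illustrative only, not part of the patch; assumes a CUDA build of Ceres and elides error checking):

#include <cstdio>
#include <vector>

#include <cuda_runtime.h>

#include "ceres/cuda_kernels_vector_ops.h"

int main() {
  const int size = 1024;
  std::vector<double> host(size, 1.5);
  double* fp64 = nullptr;
  float* fp32 = nullptr;
  cudaMalloc(&fp64, size * sizeof(double));
  cudaMalloc(&fp32, size * sizeof(float));
  cudaMemcpy(fp64, host.data(), size * sizeof(double),
             cudaMemcpyHostToDevice);
  // Demote to FP32 and promote back, both on the default (null) stream.
  ceres::internal::CudaFP64ToFP32(fp64, fp32, size, /*stream=*/nullptr);
  ceres::internal::CudaFP32ToFP64(fp32, fp64, size, /*stream=*/nullptr);
  cudaMemcpy(host.data(), fp64, size * sizeof(double),
             cudaMemcpyDeviceToHost);
  std::printf("host[0] after round trip = %f\n", host[0]);  // expect 1.5
  cudaFree(fp32);
  cudaFree(fp64);
  return 0;
}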

View File

@@ -0,0 +1,83 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
#ifndef CERES_INTERNAL_CUDA_KERNELS_VECTOR_OPS_H_
#define CERES_INTERNAL_CUDA_KERNELS_VECTOR_OPS_H_
#include "ceres/internal/config.h"
#ifndef CERES_NO_CUDA
#include "cuda_runtime.h"
namespace ceres {
namespace internal {
class Block;
class Cell;
// Convert an array of double (FP64) values to float (FP32). Both arrays must
// already be in GPU memory.
void CudaFP64ToFP32(const double* input,
float* output,
const int size,
cudaStream_t stream);
// Convert an array of float (FP32) values to double (FP64). Both arrays must
// already be in GPU memory.
void CudaFP32ToFP64(const float* input,
double* output,
const int size,
cudaStream_t stream);
// Set all elements of the array to the FP32 value 0. The array must be in GPU
// memory.
void CudaSetZeroFP32(float* output, const int size, cudaStream_t stream);
// Set all elements of the array to the FP64 value 0. The array must be in GPU
// memory.
void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream);
// Compute x = x + double(y). The input array is float (FP32), the output
// array is double (FP64). Both arrays must already be in GPU memory.
void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream);
// Compute y[i] = y[i] + D[i]^2 * x[i]. All arrays must already be in GPU
// memory.
void CudaDtDxpy(double* y,
const double* D,
const double* x,
const int size,
cudaStream_t stream);
} // namespace internal
} // namespace ceres
#endif // CERES_NO_CUDA
#endif // CERES_INTERNAL_CUDA_KERNELS_VECTOR_OPS_H_

View File

@@ -0,0 +1,198 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
#include "ceres/cuda_kernels_vector_ops.h"
#include <float.h>
#include <math.h>
#include <limits>
#include <string>
#include <vector>
#include "ceres/context_impl.h"
#include "ceres/cuda_buffer.h"
#include "ceres/internal/config.h"
#include "ceres/internal/eigen.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
namespace ceres {
namespace internal {
#ifndef CERES_NO_CUDA
TEST(CudaFP64ToFP32, SimpleConversions) {
ContextImpl context;
std::string cuda_error;
EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
CudaBuffer<double> fp64_gpu(&context);
fp64_gpu.CopyFromCpuVector(fp64_cpu);
CudaBuffer<float> fp32_gpu(&context);
fp32_gpu.Reserve(fp64_cpu.size());
CudaFP64ToFP32(fp64_gpu.data(),
fp32_gpu.data(),
fp64_cpu.size(),
context.DefaultStream());
std::vector<float> fp32_cpu(fp64_cpu.size());
fp32_gpu.CopyToCpu(fp32_cpu.data(), fp32_cpu.size());
for (int i = 0; i < fp32_cpu.size(); ++i) {
EXPECT_EQ(fp32_cpu[i], static_cast<float>(fp64_cpu[i]));
}
}
TEST(CudaFP64ToFP32, NumericallyExtremeValues) {
ContextImpl context;
std::string cuda_error;
EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
std::vector<double> fp64_cpu = {
DBL_MIN, 10.0 * DBL_MIN, DBL_MAX, 0.1 * DBL_MAX};
// First just make sure that the compiler has represented these values
// accurately as fp64.
EXPECT_GT(fp64_cpu[0], 0.0);
EXPECT_GT(fp64_cpu[1], 0.0);
EXPECT_TRUE(std::isfinite(fp64_cpu[2]));
EXPECT_TRUE(std::isfinite(fp64_cpu[3]));
CudaBuffer<double> fp64_gpu(&context);
fp64_gpu.CopyFromCpuVector(fp64_cpu);
CudaBuffer<float> fp32_gpu(&context);
fp32_gpu.Reserve(fp64_cpu.size());
CudaFP64ToFP32(fp64_gpu.data(),
fp32_gpu.data(),
fp64_cpu.size(),
context.DefaultStream());
std::vector<float> fp32_cpu(fp64_cpu.size());
fp32_gpu.CopyToCpu(fp32_cpu.data(), fp32_cpu.size());
EXPECT_EQ(fp32_cpu[0], 0.0f);
EXPECT_EQ(fp32_cpu[1], 0.0f);
EXPECT_EQ(fp32_cpu[2], std::numeric_limits<float>::infinity());
EXPECT_EQ(fp32_cpu[3], std::numeric_limits<float>::infinity());
}
TEST(CudaFP32ToFP64, SimpleConversions) {
ContextImpl context;
std::string cuda_error;
EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
CudaBuffer<float> fp32_gpu(&context);
fp32_gpu.CopyFromCpuVector(fp32_cpu);
CudaBuffer<double> fp64_gpu(&context);
fp64_gpu.Reserve(fp32_cpu.size());
CudaFP32ToFP64(fp32_gpu.data(),
fp64_gpu.data(),
fp32_cpu.size(),
context.DefaultStream());
std::vector<double> fp64_cpu(fp32_cpu.size());
fp64_gpu.CopyToCpu(fp64_cpu.data(), fp64_cpu.size());
for (int i = 0; i < fp64_cpu.size(); ++i) {
EXPECT_EQ(fp64_cpu[i], static_cast<double>(fp32_cpu[i]));
}
}
TEST(CudaSetZeroFP32, NonZeroInput) {
ContextImpl context;
std::string cuda_error;
EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
CudaBuffer<float> fp32_gpu(&context);
fp32_gpu.CopyFromCpuVector(fp32_cpu);
CudaSetZeroFP32(fp32_gpu.data(), fp32_cpu.size(), context.DefaultStream());
std::vector<float> fp32_cpu_zero(fp32_cpu.size());
fp32_gpu.CopyToCpu(fp32_cpu_zero.data(), fp32_cpu_zero.size());
for (int i = 0; i < fp32_cpu_zero.size(); ++i) {
EXPECT_EQ(fp32_cpu_zero[i], 0.0f);
}
}
TEST(CudaSetZeroFP64, NonZeroInput) {
ContextImpl context;
std::string cuda_error;
EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
CudaBuffer<double> fp64_gpu(&context);
fp64_gpu.CopyFromCpuVector(fp64_cpu);
CudaSetZeroFP64(fp64_gpu.data(), fp64_cpu.size(), context.DefaultStream());
std::vector<double> fp64_cpu_zero(fp64_cpu.size());
fp64_gpu.CopyToCpu(fp64_cpu_zero.data(), fp64_cpu_zero.size());
for (int i = 0; i < fp64_cpu_zero.size(); ++i) {
EXPECT_EQ(fp64_cpu_zero[i], 0.0);
}
}
TEST(CudaDsxpy, DoubleValues) {
ContextImpl context;
std::string cuda_error;
EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
std::vector<float> fp32_cpu_a = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
std::vector<double> fp64_cpu_b = {
1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
CudaBuffer<float> fp32_gpu_a(&context);
fp32_gpu_a.CopyFromCpuVector(fp32_cpu_a);
CudaBuffer<double> fp64_gpu_b(&context);
fp64_gpu_b.CopyFromCpuVector(fp64_cpu_b);
CudaDsxpy(fp64_gpu_b.data(),
fp32_gpu_a.data(),
fp32_gpu_a.size(),
context.DefaultStream());
fp64_gpu_b.CopyToCpu(fp64_cpu_b.data(), fp64_cpu_b.size());
for (int i = 0; i < fp64_cpu_b.size(); ++i) {
EXPECT_DOUBLE_EQ(fp64_cpu_b[i], 2.0 * fp32_cpu_a[i]);
}
}
TEST(CudaDtDxpy, ComputeFourItems) {
ContextImpl context;
std::string cuda_error;
EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
std::vector<double> x_cpu = {1, 2, 3, 4};
std::vector<double> y_cpu = {4, 3, 2, 1};
std::vector<double> d_cpu = {10, 20, 30, 40};
CudaBuffer<double> x_gpu(&context);
x_gpu.CopyFromCpuVector(x_cpu);
CudaBuffer<double> y_gpu(&context);
y_gpu.CopyFromCpuVector(y_cpu);
CudaBuffer<double> d_gpu(&context);
d_gpu.CopyFromCpuVector(d_cpu);
CudaDtDxpy(y_gpu.data(),
d_gpu.data(),
x_gpu.data(),
y_gpu.size(),
context.DefaultStream());
y_gpu.CopyToCpu(y_cpu.data(), y_cpu.size());
EXPECT_DOUBLE_EQ(y_cpu[0], 4.0 + 10.0 * 10.0 * 1.0);
EXPECT_DOUBLE_EQ(y_cpu[1], 3.0 + 20.0 * 20.0 * 2.0);
EXPECT_DOUBLE_EQ(y_cpu[2], 2.0 + 30.0 * 30.0 * 3.0);
EXPECT_DOUBLE_EQ(y_cpu[3], 1.0 + 40.0 * 40.0 * 4.0);
}
#endif // CERES_NO_CUDA
} // namespace internal
} // namespace ceres

View File

@@ -0,0 +1,152 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
#ifndef CERES_NO_CUDA
#include "ceres/cuda_block_structure.h"
#include "ceres/cuda_kernels_bsm_to_crs.h"
namespace ceres::internal {
CudaPartitionedBlockSparseCRSView::CudaPartitionedBlockSparseCRSView(
const BlockSparseMatrix& bsm,
const int num_col_blocks_e,
ContextImpl* context)
    : context_(context) {
const auto& bs = *bsm.block_structure();
block_structure_ =
std::make_unique<CudaBlockSparseStructure>(bs, num_col_blocks_e, context);
// Determine the number of non-zeros in the left submatrix.
// Row-blocks are at least one row high, so we can use a temporary array of
// num_rows elements for ComputeNonZerosInColumnBlockSubMatrix and later
// reuse it for FillCRSStructurePartitioned.
const int num_rows = bsm.num_rows();
const int num_nonzeros_e = block_structure_->num_nonzeros_e();
const int num_nonzeros_f = bsm.num_nonzeros() - num_nonzeros_e;
const int num_cols_e = num_col_blocks_e < bs.cols.size()
? bs.cols[num_col_blocks_e].position
: bsm.num_cols();
const int num_cols_f = bsm.num_cols() - num_cols_e;
CudaBuffer<int32_t> rows_e(context, num_rows + 1);
CudaBuffer<int32_t> cols_e(context, num_nonzeros_e);
CudaBuffer<int32_t> rows_f(context, num_rows + 1);
CudaBuffer<int32_t> cols_f(context, num_nonzeros_f);
num_row_blocks_e_ = block_structure_->num_row_blocks_e();
FillCRSStructurePartitioned(block_structure_->num_row_blocks(),
num_rows,
num_row_blocks_e_,
num_col_blocks_e,
num_nonzeros_e,
block_structure_->first_cell_in_row_block(),
block_structure_->cells(),
block_structure_->row_blocks(),
block_structure_->col_blocks(),
rows_e.data(),
cols_e.data(),
rows_f.data(),
cols_f.data(),
context->DefaultStream(),
context->is_cuda_memory_pools_supported_);
f_is_crs_compatible_ = block_structure_->IsCrsCompatible();
if (f_is_crs_compatible_) {
block_structure_ = nullptr;
} else {
streamed_buffer_ = std::make_unique<CudaStreamedBuffer<double>>(
context, kMaxTemporaryArraySize);
}
matrix_e_ = std::make_unique<CudaSparseMatrix>(
num_cols_e, std::move(rows_e), std::move(cols_e), context);
matrix_f_ = std::make_unique<CudaSparseMatrix>(
num_cols_f, std::move(rows_f), std::move(cols_f), context);
CHECK_EQ(bsm.num_nonzeros(),
matrix_e_->num_nonzeros() + matrix_f_->num_nonzeros());
UpdateValues(bsm);
}
void CudaPartitionedBlockSparseCRSView::UpdateValues(
const BlockSparseMatrix& bsm) {
if (f_is_crs_compatible_) {
CHECK_EQ(cudaSuccess,
cudaMemcpyAsync(matrix_e_->mutable_values(),
bsm.values(),
matrix_e_->num_nonzeros() * sizeof(double),
cudaMemcpyHostToDevice,
context_->DefaultStream()));
CHECK_EQ(cudaSuccess,
cudaMemcpyAsync(matrix_f_->mutable_values(),
bsm.values() + matrix_e_->num_nonzeros(),
matrix_f_->num_nonzeros() * sizeof(double),
cudaMemcpyHostToDevice,
context_->DefaultStream()));
return;
}
streamed_buffer_->CopyToGpu(
bsm.values(),
bsm.num_nonzeros(),
[block_structure = block_structure_.get(),
num_nonzeros_e = matrix_e_->num_nonzeros(),
num_row_blocks_e = num_row_blocks_e_,
values_f = matrix_f_->mutable_values(),
rows_f = matrix_f_->rows()](
const double* values, int num_values, int offset, auto stream) {
PermuteToCRSPartitionedF(num_nonzeros_e + offset,
num_values,
block_structure->num_row_blocks(),
num_row_blocks_e,
block_structure->first_cell_in_row_block(),
block_structure->value_offset_row_block_f(),
block_structure->cells(),
block_structure->row_blocks(),
block_structure->col_blocks(),
rows_f,
values,
values_f,
stream);
});
CHECK_EQ(cudaSuccess,
cudaMemcpyAsync(matrix_e_->mutable_values(),
bsm.values(),
matrix_e_->num_nonzeros() * sizeof(double),
cudaMemcpyHostToDevice,
context_->DefaultStream()));
}
} // namespace ceres::internal
#endif // CERES_NO_CUDA

View File

@@ -0,0 +1,111 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
//
#ifndef CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
#define CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
#include "ceres/internal/config.h"
#ifndef CERES_NO_CUDA
#include <memory>
#include "ceres/block_sparse_matrix.h"
#include "ceres/cuda_block_structure.h"
#include "ceres/cuda_buffer.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_streamed_buffer.h"
namespace ceres::internal {
// We use the cuSPARSE library for SpMV operations. However, it supports
// neither the block-sparse format with varying block sizes nor
// submatrix-vector products. Thus, we perform the following operations in
// order to compute products of partitioned block-sparse matrices and dense
// vectors on gpu:
// - Once per block-sparse structure update:
// - Compute CRS structures of left and right submatrices from block-sparse
// structure
// - Check if values of the F sub-matrix can be copied without permutation
// - Once per block-sparse values update:
// - Copy values of E sub-matrix
// - Permute or copy values of F sub-matrix
//
// It is assumed that cells of the block-sparse matrix are laid out
// sequentially in both sub-matrices, that there is exactly one E sub-matrix
// cell per row-block in the first num_row_blocks_e_ row blocks, and that the
// E sub-matrix has no cells below the first num_row_blocks_e_ row blocks.
//
// This class avoids storing both CRS and block-sparse values in GPU memory.
// Instead, block-sparse values are transferred to gpu memory as a disjoint
// set of small contiguous segments, with simultaneous permutation of the
// values into the correct order using the block structure.
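//
// A tiny illustrative example (not from the patch): with column blocks
// [c0 | c1 c2] and num_col_blocks_e = 1, a block-sparse matrix with cells
//   row-block 0: (c0), (c1)
//   row-block 1: (c0), (c2)
//   row-block 2: (c2)
// is partitioned into E (the single c0 cell in each of the first two
// row-blocks, so num_row_blocks_e_ = 2) and F (the c1/c2 cells, including
// all of row-block 2).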
class CERES_NO_EXPORT CudaPartitionedBlockSparseCRSView {
public:
// Initializes internal CRS matrix and block-sparse structure on GPU side
// values. The following objects are stored in gpu memory for the whole
// lifetime of the object
// - matrix_e_: left CRS submatrix
// - matrix_f_: right CRS submatrix
// - block_structure_: copy of block-sparse structure on GPU
// - streamed_buffer_: helper for value updating
CudaPartitionedBlockSparseCRSView(const BlockSparseMatrix& bsm,
const int num_col_blocks_e,
ContextImpl* context);
// Update values of CRS submatrices using values of block-sparse matrix.
// Assumes that bsm has the same block-sparse structure as matrix that was
// used for construction.
void UpdateValues(const BlockSparseMatrix& bsm);
const CudaSparseMatrix* matrix_e() const { return matrix_e_.get(); }
const CudaSparseMatrix* matrix_f() const { return matrix_f_.get(); }
CudaSparseMatrix* mutable_matrix_e() { return matrix_e_.get(); }
CudaSparseMatrix* mutable_matrix_f() { return matrix_f_.get(); }
private:
// The value permutation kernel performs a single element-wise operation per
// thread, so performing the permutation in blocks of 8 megabytes of
// block-sparse values seems reasonable.
static constexpr int kMaxTemporaryArraySize = 1 * 1024 * 1024;
std::unique_ptr<CudaSparseMatrix> matrix_e_;
std::unique_ptr<CudaSparseMatrix> matrix_f_;
std::unique_ptr<CudaStreamedBuffer<double>> streamed_buffer_;
std::unique_ptr<CudaBlockSparseStructure> block_structure_;
bool f_is_crs_compatible_;
int num_row_blocks_e_;
ContextImpl* context_;
};
} // namespace ceres::internal
#endif // CERES_NO_CUDA
#endif // CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
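For orientation, the two submatrices compose back into the full product as follows: a sketch (not part of the patch) using only the const accessors declared above, with x assumed to be already split into its E and F parts on the GPU:

#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
#include "ceres/cuda_vector.h"

namespace ceres::internal {

// y += A x, where x = [x_e; x_f] matches the E/F column partition.
void PartitionedRightMultiplyAndAccumulate(
    const CudaPartitionedBlockSparseCRSView& view,
    const CudaVector& x_e,
    const CudaVector& x_f,
    CudaVector* y) {
  view.matrix_e()->RightMultiplyAndAccumulate(x_e, y);
  view.matrix_f()->RightMultiplyAndAccumulate(x_f, y);
}

}  // namespace ceres::internal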

View File

@@ -0,0 +1,279 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
#ifndef CERES_NO_CUDA
namespace ceres::internal {
namespace {
struct RandomPartitionedMatrixOptions {
int num_row_blocks_e;
int num_row_blocks_f;
int num_col_blocks_e;
int num_col_blocks_f;
int min_row_block_size;
int max_row_block_size;
int min_col_block_size;
int max_col_block_size;
double empty_f_probability;
double cell_probability_f;
int max_cells_f;
};
std::unique_ptr<BlockSparseMatrix> CreateRandomPartitionedMatrix(
const RandomPartitionedMatrixOptions& options, std::mt19937& rng) {
const int num_row_blocks =
std::max(options.num_row_blocks_e, options.num_row_blocks_f);
const int num_col_blocks =
options.num_col_blocks_e + options.num_col_blocks_f;
CompressedRowBlockStructure* block_structure =
new CompressedRowBlockStructure;
block_structure->cols.reserve(num_col_blocks);
block_structure->rows.reserve(num_row_blocks);
// Create column blocks
std::uniform_int_distribution<int> col_size(options.min_col_block_size,
options.max_col_block_size);
int num_cols = 0;
for (int i = 0; i < num_col_blocks; ++i) {
const int size = col_size(rng);
block_structure->cols.emplace_back(size, num_cols);
num_cols += size;
}
// Prepare column-block indices of E cells
std::vector<int> e_col_block_idx;
e_col_block_idx.reserve(options.num_row_blocks_e);
std::uniform_int_distribution<int> col_e(0, options.num_col_blocks_e - 1);
for (int i = 0; i < options.num_row_blocks_e; ++i) {
e_col_block_idx.emplace_back(col_e(rng));
}
std::sort(e_col_block_idx.begin(), e_col_block_idx.end());
// Prepare cell structure
std::uniform_int_distribution<int> row_size(options.min_row_block_size,
options.max_row_block_size);
std::uniform_real_distribution<double> uniform;
int num_rows = 0;
for (int i = 0; i < num_row_blocks; ++i) {
const int size = row_size(rng);
block_structure->rows.emplace_back();
auto& row = block_structure->rows.back();
row.block.size = size;
row.block.position = num_rows;
num_rows += size;
if (i < options.num_row_blocks_e) {
row.cells.emplace_back(e_col_block_idx[i], -1);
if (uniform(rng) < options.empty_f_probability) {
continue;
}
}
if (i >= options.num_row_blocks_f) continue;
const int cells_before = row.cells.size();
for (int j = options.num_col_blocks_e; j < num_col_blocks; ++j) {
if (uniform(rng) > options.cell_probability_f) {
continue;
}
row.cells.emplace_back(j, -1);
}
if (row.cells.size() > cells_before + options.max_cells_f) {
std::shuffle(row.cells.begin() + cells_before, row.cells.end(), rng);
row.cells.resize(cells_before + options.max_cells_f);
std::sort(
row.cells.begin(), row.cells.end(), [](const auto& a, const auto& b) {
return a.block_id < b.block_id;
});
}
}
// Fill positions in E sub-matrix
int num_nonzeros = 0;
for (int i = 0; i < options.num_row_blocks_e; ++i) {
CHECK_GE(block_structure->rows[i].cells.size(), 1);
block_structure->rows[i].cells[0].position = num_nonzeros;
const int col_block_size =
block_structure->cols[block_structure->rows[i].cells[0].block_id].size;
const int row_block_size = block_structure->rows[i].block.size;
num_nonzeros += row_block_size * col_block_size;
CHECK_GE(num_nonzeros, 0);
}
// Fill positions in F sub-matrix
for (int i = 0; i < options.num_row_blocks_f; ++i) {
const int row_block_size = block_structure->rows[i].block.size;
for (auto& cell : block_structure->rows[i].cells) {
if (cell.position >= 0) continue;
cell.position = num_nonzeros;
const int col_block_size = block_structure->cols[cell.block_id].size;
num_nonzeros += row_block_size * col_block_size;
CHECK_GE(num_nonzeros, 0);
}
}
// Populate values
auto bsm = std::make_unique<BlockSparseMatrix>(block_structure, true);
for (int i = 0; i < num_nonzeros; ++i) {
bsm->mutable_values()[i] = i + 1;
}
return bsm;
}
} // namespace
class CudaPartitionedBlockSparseCRSViewTest : public ::testing::Test {
static constexpr int kNumColBlocksE = 456;
protected:
void SetUp() final {
std::string message;
CHECK(context_.InitCuda(&message))
<< "InitCuda() failed because: " << message;
RandomPartitionedMatrixOptions options;
options.num_row_blocks_f = 123;
options.num_row_blocks_e = 456;
options.num_col_blocks_f = 123;
options.num_col_blocks_e = kNumColBlocksE;
options.min_row_block_size = 1;
options.max_row_block_size = 4;
options.min_col_block_size = 1;
options.max_col_block_size = 4;
options.empty_f_probability = .1;
options.cell_probability_f = .2;
options.max_cells_f = options.num_col_blocks_f;
std::mt19937 rng;
short_f_ = CreateRandomPartitionedMatrix(options, rng);
options.num_row_blocks_e = 123;
options.num_row_blocks_f = 456;
short_e_ = CreateRandomPartitionedMatrix(options, rng);
options.max_cells_f = 1;
options.num_row_blocks_e = options.num_row_blocks_f;
f_crs_compatible_ = CreateRandomPartitionedMatrix(options, rng);
}
void TestMatrix(const BlockSparseMatrix& A_) {
CudaPartitionedBlockSparseCRSView view(A_, kNumColBlocksE, &context_);
const int num_rows = A_.num_rows();
const int num_cols = A_.num_cols();
const auto& bs = *A_.block_structure();
const int num_cols_e = bs.cols[kNumColBlocksE].position;
const int num_cols_f = num_cols - num_cols_e;
auto matrix_e = view.matrix_e();
auto matrix_f = view.matrix_f();
ASSERT_EQ(matrix_e->num_cols(), num_cols_e);
ASSERT_EQ(matrix_e->num_rows(), num_rows);
ASSERT_EQ(matrix_f->num_cols(), num_cols_f);
ASSERT_EQ(matrix_f->num_rows(), num_rows);
Vector x(num_cols);
Vector x_left(num_cols_e);
Vector x_right(num_cols_f);
Vector y(num_rows);
CudaVector x_cuda(&context_, num_cols);
CudaVector x_left_cuda(&context_, num_cols_e);
CudaVector x_right_cuda(&context_, num_cols_f);
CudaVector y_cuda(&context_, num_rows);
Vector y_cuda_host(num_rows);
for (int i = 0; i < num_cols_e; ++i) {
x.setZero();
x_left.setZero();
y.setZero();
y_cuda.SetZero();
x[i] = 1.;
x_left[i] = 1.;
x_left_cuda.CopyFromCpu(x_left);
A_.RightMultiplyAndAccumulate(
x.data(), y.data(), &context_, std::thread::hardware_concurrency());
matrix_e->RightMultiplyAndAccumulate(x_left_cuda, &y_cuda);
y_cuda.CopyTo(&y_cuda_host);
// There will be up to 1 non-zero product per row, thus we expect an exact
// match on 32-bit integer indices
EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
}
for (int i = num_cols_e; i < num_cols; ++i) {
x.setZero();
x_right.setZero();
y.setZero();
y_cuda.SetZero();
x[i] = 1.;
x_right[i - num_cols_e] = 1.;
x_right_cuda.CopyFromCpu(x_right);
A_.RightMultiplyAndAccumulate(
x.data(), y.data(), &context_, std::thread::hardware_concurrency());
matrix_f->RightMultiplyAndAccumulate(x_right_cuda, &y_cuda);
y_cuda.CopyTo(&y_cuda_host);
// There will be up to 1 non-zero product per row, thus we expect an exact
// match on 32-bit integer indices
EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
}
}
// The E sub-matrix might have fewer row-blocks with cells than the F
// sub-matrix. This test matrix checks that this case is handled properly.
std::unique_ptr<BlockSparseMatrix> short_e_;
// In the case of a non-CRS-compatible F matrix, permuting values from block
// order to CRS order involves a binary search over the row-blocks of F.
// Having many row-blocks with no F cells is an edge case for this algorithm.
std::unique_ptr<BlockSparseMatrix> short_f_;
// With the F matrix being CRS-compatible, updating the values of the
// partitioned matrix view reduces to two host->device memcpys and uses a
// separate code path.
std::unique_ptr<BlockSparseMatrix> f_crs_compatible_;
ContextImpl context_;
};
TEST_F(CudaPartitionedBlockSparseCRSViewTest, CreateUpdateValuesShortE) {
TestMatrix(*short_e_);
}
TEST_F(CudaPartitionedBlockSparseCRSViewTest, CreateUpdateValuesShortF) {
TestMatrix(*short_f_);
}
TEST_F(CudaPartitionedBlockSparseCRSViewTest,
CreateUpdateValuesCrsCompatibleF) {
TestMatrix(*f_crs_compatible_);
}
} // namespace ceres::internal
#endif // CERES_NO_CUDA

View File

@@ -0,0 +1,226 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
//
// A CUDA sparse matrix linear operator.
// This include must come before any #ifndef check on Ceres compile options.
// clang-format off
#include "ceres/internal/config.h"
// clang-format on
#include "ceres/cuda_sparse_matrix.h"
#include <math.h>
#include <memory>
#include "ceres/block_sparse_matrix.h"
#include "ceres/compressed_row_sparse_matrix.h"
#include "ceres/context_impl.h"
#include "ceres/crs_matrix.h"
#include "ceres/internal/export.h"
#include "ceres/types.h"
#include "ceres/wall_time.h"
#ifndef CERES_NO_CUDA
#include "ceres/cuda_buffer.h"
#include "ceres/cuda_kernels_vector_ops.h"
#include "ceres/cuda_vector.h"
#include "cuda_runtime_api.h"
#include "cusparse.h"
namespace ceres::internal {
namespace {
// Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of
// CUSPARSE_SPMV_ALG_DEFAULT.
#if CUDART_VERSION >= 11021
const auto kSpMVAlgorithm = CUSPARSE_SPMV_ALG_DEFAULT;
#else // CUDART_VERSION >= 11021
const auto kSpMVAlgorithm = CUSPARSE_MV_ALG_DEFAULT;
#endif // CUDART_VERSION >= 11021
size_t GetTempBufferSizeForOp(const cusparseHandle_t& handle,
const cusparseOperation_t op,
const cusparseDnVecDescr_t& x,
const cusparseDnVecDescr_t& y,
const cusparseSpMatDescr_t& A) {
size_t buffer_size;
const double alpha = 1.0;
const double beta = 1.0;
CHECK_NE(A, nullptr);
CHECK_EQ(cusparseSpMV_bufferSize(handle,
op,
&alpha,
A,
x,
&beta,
y,
CUDA_R_64F,
kSpMVAlgorithm,
&buffer_size),
CUSPARSE_STATUS_SUCCESS);
return buffer_size;
}
size_t GetTempBufferSize(const cusparseHandle_t& handle,
const cusparseDnVecDescr_t& left,
const cusparseDnVecDescr_t& right,
const cusparseSpMatDescr_t& A) {
CHECK_NE(A, nullptr);
return std::max(GetTempBufferSizeForOp(
handle, CUSPARSE_OPERATION_NON_TRANSPOSE, right, left, A),
GetTempBufferSizeForOp(
handle, CUSPARSE_OPERATION_TRANSPOSE, left, right, A));
}
} // namespace
CudaSparseMatrix::CudaSparseMatrix(int num_cols,
CudaBuffer<int32_t>&& rows,
CudaBuffer<int32_t>&& cols,
ContextImpl* context)
: num_rows_(rows.size() - 1),
num_cols_(num_cols),
num_nonzeros_(cols.size()),
context_(context),
rows_(std::move(rows)),
cols_(std::move(cols)),
values_(context, num_nonzeros_),
spmv_buffer_(context) {
Initialize();
}
CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context,
const CompressedRowSparseMatrix& crs_matrix)
: num_rows_(crs_matrix.num_rows()),
num_cols_(crs_matrix.num_cols()),
num_nonzeros_(crs_matrix.num_nonzeros()),
context_(context),
rows_(context, num_rows_ + 1),
cols_(context, num_nonzeros_),
values_(context, num_nonzeros_),
spmv_buffer_(context) {
rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1);
cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_);
values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
Initialize();
}
CudaSparseMatrix::~CudaSparseMatrix() {
CHECK_EQ(cusparseDestroySpMat(descr_), CUSPARSE_STATUS_SUCCESS);
descr_ = nullptr;
CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_left_));
CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_right_));
}
void CudaSparseMatrix::CopyValuesFromCpu(
const CompressedRowSparseMatrix& crs_matrix) {
// There is no quick and easy way to verify that the structure is unchanged,
// but at least we can check that the size of the matrix and the number of
// nonzeros is unchanged.
CHECK_EQ(num_rows_, crs_matrix.num_rows());
CHECK_EQ(num_cols_, crs_matrix.num_cols());
CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros());
values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
}
void CudaSparseMatrix::Initialize() {
CHECK(context_->IsCudaInitialized());
CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
cusparseCreateCsr(&descr_,
num_rows_,
num_cols_,
num_nonzeros_,
rows_.data(),
cols_.data(),
values_.data(),
CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO,
CUDA_R_64F));
// Note: values_.data() is used as a non-null pointer to device memory.
// When there are no non-zero values, the data pointer of the values_ array
// will be nullptr; but in this case left/right products are trivial and the
// temporary buffer (and vector descriptors) are not required.
if (!num_nonzeros_) return;
CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
cusparseCreateDnVec(
&descr_vec_left_, num_rows_, values_.data(), CUDA_R_64F));
CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
cusparseCreateDnVec(
&descr_vec_right_, num_cols_, values_.data(), CUDA_R_64F));
size_t buffer_size = GetTempBufferSize(
context_->cusparse_handle_, descr_vec_left_, descr_vec_right_, descr_);
spmv_buffer_.Reserve(buffer_size);
}
void CudaSparseMatrix::SpMv(cusparseOperation_t op,
const cusparseDnVecDescr_t& x,
const cusparseDnVecDescr_t& y) const {
const double alpha = 1.0;
const double beta = 1.0;
CHECK_EQ(cusparseSpMV(context_->cusparse_handle_,
op,
&alpha,
descr_,
x,
&beta,
y,
CUDA_R_64F,
kSpMVAlgorithm,
spmv_buffer_.data()),
CUSPARSE_STATUS_SUCCESS);
}
void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x,
CudaVector* y) const {
DCHECK(GetTempBufferSize(
context_->cusparse_handle_, y->descr(), x.descr(), descr_) <=
spmv_buffer_.size());
SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x.descr(), y->descr());
}
void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x,
CudaVector* y) const {
// TODO(Joydeep Biswas): We should consider storing a transposed copy of the
// matrix by converting CSR to CSC. From the cuSPARSE documentation:
// "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA
// != CUSPARSE_OPERATION_NON_TRANSPOSE"
DCHECK(GetTempBufferSize(
context_->cusparse_handle_, x.descr(), y->descr(), descr_) <=
spmv_buffer_.size());
SpMv(CUSPARSE_OPERATION_TRANSPOSE, x.descr(), y->descr());
}
} // namespace ceres::internal
#endif // CERES_NO_CUDA

View File

@@ -0,0 +1,143 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
//
// A CUDA sparse matrix linear operator.
#ifndef CERES_INTERNAL_CUDA_SPARSE_MATRIX_H_
#define CERES_INTERNAL_CUDA_SPARSE_MATRIX_H_
// This include must come before any #ifndef check on Ceres compile options.
// clang-format off
#include "ceres/internal/config.h"
// clang-format on
#include <cstdint>
#include <memory>
#include <string>
#include "ceres/compressed_row_sparse_matrix.h"
#include "ceres/context_impl.h"
#include "ceres/internal/export.h"
#include "ceres/types.h"
#ifndef CERES_NO_CUDA
#include "ceres/cuda_buffer.h"
#include "ceres/cuda_vector.h"
#include "cusparse.h"
namespace ceres::internal {
// A sparse matrix hosted on the GPU in compressed row sparse format, with
// CUDA-accelerated operations.
// The user of the class must ensure that ContextImpl::InitCuda() has already
// been successfully called before using this class.
class CERES_NO_EXPORT CudaSparseMatrix {
public:
// Create a GPU copy of the matrix provided.
CudaSparseMatrix(ContextImpl* context,
const CompressedRowSparseMatrix& crs_matrix);
// Create matrix from existing row and column index buffers.
// Values are left uninitialized.
CudaSparseMatrix(int num_cols,
CudaBuffer<int32_t>&& rows,
CudaBuffer<int32_t>&& cols,
ContextImpl* context);
~CudaSparseMatrix();
// Left/right products use an internal buffer and are not thread-safe.
// y = y + Ax;
void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const;
// y = y + A'x;
void LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const;
int num_rows() const { return num_rows_; }
int num_cols() const { return num_cols_; }
int num_nonzeros() const { return num_nonzeros_; }
const int32_t* rows() const { return rows_.data(); }
const int32_t* cols() const { return cols_.data(); }
const double* values() const { return values_.data(); }
int32_t* mutable_rows() { return rows_.data(); }
int32_t* mutable_cols() { return cols_.data(); }
double* mutable_values() { return values_.data(); }
// If subsequent uses of this matrix involve only numerical changes and no
// structural changes, then this method can be used to copy the updated
// non-zero values -- the row and column index arrays are kept the same. It
// is the caller's responsibility to ensure that the sparsity structure of the
// matrix is unchanged.
void CopyValuesFromCpu(const CompressedRowSparseMatrix& crs_matrix);
const cusparseSpMatDescr_t& descr() const { return descr_; }
private:
// Disable copy and assignment.
CudaSparseMatrix(const CudaSparseMatrix&) = delete;
CudaSparseMatrix& operator=(const CudaSparseMatrix&) = delete;
// Allocate temporary buffer for left/right products, create cuSPARSE
// descriptors
void Initialize();
// y = y + op(M)x. op must be either CUSPARSE_OPERATION_NON_TRANSPOSE or
// CUSPARSE_OPERATION_TRANSPOSE.
void SpMv(cusparseOperation_t op,
const cusparseDnVecDescr_t& x,
const cusparseDnVecDescr_t& y) const;
int num_rows_ = 0;
int num_cols_ = 0;
int num_nonzeros_ = 0;
ContextImpl* context_ = nullptr;
// CSR row indices.
CudaBuffer<int32_t> rows_;
// CSR column indices.
CudaBuffer<int32_t> cols_;
// CSR values.
CudaBuffer<double> values_;
// CuSparse object that describes this matrix.
cusparseSpMatDescr_t descr_ = nullptr;
// Dense vector descriptors for pointer interface
cusparseDnVecDescr_t descr_vec_left_ = nullptr;
cusparseDnVecDescr_t descr_vec_right_ = nullptr;
mutable CudaBuffer<uint8_t> spmv_buffer_;
};
} // namespace ceres::internal
#endif // CERES_NO_CUDA
#endif // CERES_INTERNAL_CUDA_SPARSE_MATRIX_H_
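To make the interface above concrete, a minimal usage sketch (illustrative, not part of the patch): it assumes a ContextImpl on which InitCuda() has already succeeded and uses only methods declared in this header and in cuda_vector.h:

#include "ceres/compressed_row_sparse_matrix.h"
#include "ceres/context_impl.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_vector.h"
#include "ceres/internal/eigen.h"

namespace ceres::internal {

// Computes y = A x on the GPU for a CPU-resident CRS matrix.
Vector GpuSpMv(const CompressedRowSparseMatrix& A_crs,
               const Vector& x_cpu,
               ContextImpl* context) {
  CudaSparseMatrix A_gpu(context, A_crs);  // copies structure and values
  CudaVector x_gpu(context, A_gpu.num_cols());
  CudaVector y_gpu(context, A_gpu.num_rows());
  x_gpu.CopyFromCpu(x_cpu);
  y_gpu.SetZero();
  // Accumulating product: y += A x. Since y was zeroed above, y == A x.
  A_gpu.RightMultiplyAndAccumulate(x_gpu, &y_gpu);
  Vector y_cpu;
  y_gpu.CopyTo(&y_cpu);
  return y_cpu;
}

}  // namespace ceres::internal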

View File

@@ -0,0 +1,286 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
#include "ceres/cuda_sparse_matrix.h"
#include <string>
#include "ceres/block_sparse_matrix.h"
#include "ceres/casts.h"
#include "ceres/cuda_vector.h"
#include "ceres/internal/config.h"
#include "ceres/internal/eigen.h"
#include "ceres/linear_least_squares_problems.h"
#include "ceres/triplet_sparse_matrix.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
namespace ceres {
namespace internal {
#ifndef CERES_NO_CUDA
class CudaSparseMatrixTest : public ::testing::Test {
protected:
void SetUp() final {
std::string message;
CHECK(context_.InitCuda(&message))
<< "InitCuda() failed because: " << message;
std::unique_ptr<LinearLeastSquaresProblem> problem =
CreateLinearLeastSquaresProblemFromId(2);
CHECK(problem != nullptr);
A_.reset(down_cast<BlockSparseMatrix*>(problem->A.release()));
CHECK(A_ != nullptr);
CHECK(problem->b != nullptr);
CHECK(problem->x != nullptr);
b_.resize(A_->num_rows());
for (int i = 0; i < A_->num_rows(); ++i) {
b_[i] = problem->b[i];
}
x_.resize(A_->num_cols());
for (int i = 0; i < A_->num_cols(); ++i) {
x_[i] = problem->x[i];
}
CHECK_EQ(A_->num_rows(), b_.rows());
CHECK_EQ(A_->num_cols(), x_.rows());
}
std::unique_ptr<BlockSparseMatrix> A_;
Vector x_;
Vector b_;
ContextImpl context_;
};
TEST_F(CudaSparseMatrixTest, RightMultiplyAndAccumulate) {
std::string message;
auto A_crs = A_->ToCompressedRowSparseMatrix();
CudaSparseMatrix A_gpu(&context_, *A_crs);
CudaVector x_gpu(&context_, A_gpu.num_cols());
CudaVector res_gpu(&context_, A_gpu.num_rows());
x_gpu.CopyFromCpu(x_);
const Vector minus_b = -b_;
// res = -b
res_gpu.CopyFromCpu(minus_b);
// res += A * x
A_gpu.RightMultiplyAndAccumulate(x_gpu, &res_gpu);
Vector res;
res_gpu.CopyTo(&res);
Vector res_expected = minus_b;
A_->RightMultiplyAndAccumulate(x_.data(), res_expected.data());
EXPECT_LE((res - res_expected).norm(),
std::numeric_limits<double>::epsilon() * 1e3);
}
TEST(CudaSparseMatrix, CopyValuesFromCpu) {
// A1:
// [ 1 1 0 0
// 0 1 1 0]
// A2:
// [ 1 2 0 0
// 0 3 4 0]
// b: [1 2 3 4]'
// A1 * b = [3 5]'
// A2 * b = [5 18]'
TripletSparseMatrix A1(2, 4, {0, 0, 1, 1}, {0, 1, 1, 2}, {1, 1, 1, 1});
TripletSparseMatrix A2(2, 4, {0, 0, 1, 1}, {0, 1, 1, 2}, {1, 2, 3, 4});
Vector b(4);
b << 1, 2, 3, 4;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
auto A1_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A1);
CudaSparseMatrix A_gpu(&context, *A1_crs);
CudaVector b_gpu(&context, A1.num_cols());
CudaVector x_gpu(&context, A1.num_rows());
b_gpu.CopyFromCpu(b);
x_gpu.SetZero();
Vector x_expected(2);
x_expected << 3, 5;
A_gpu.RightMultiplyAndAccumulate(b_gpu, &x_gpu);
Vector x_computed;
x_gpu.CopyTo(&x_computed);
EXPECT_EQ(x_computed, x_expected);
auto A2_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A2);
A_gpu.CopyValuesFromCpu(*A2_crs);
x_gpu.SetZero();
x_expected << 5, 18;
A_gpu.RightMultiplyAndAccumulate(b_gpu, &x_gpu);
x_gpu.CopyTo(&x_computed);
EXPECT_EQ(x_computed, x_expected);
}
TEST(CudaSparseMatrix, RightMultiplyAndAccumulate) {
// A:
// [ 1 2 0 0
// 0 3 4 0]
// b: [1 2 3 4]'
// A * b = [5 18]'
TripletSparseMatrix A(2, 4, {0, 0, 1, 1}, {0, 1, 1, 2}, {1, 2, 3, 4});
Vector b(4);
b << 1, 2, 3, 4;
Vector x_expected(2);
x_expected << 5, 18;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A);
CudaSparseMatrix A_gpu(&context, *A_crs);
CudaVector b_gpu(&context, A.num_cols());
CudaVector x_gpu(&context, A.num_rows());
b_gpu.CopyFromCpu(b);
x_gpu.SetZero();
A_gpu.RightMultiplyAndAccumulate(b_gpu, &x_gpu);
Vector x_computed;
x_gpu.CopyTo(&x_computed);
EXPECT_EQ(x_computed, x_expected);
}
TEST(CudaSparseMatrix, LeftMultiplyAndAccumulate) {
// A:
// [ 1 2 0 0
// 0 3 4 0]
// b: [1 2]'
// A'* b = [1 8 8 0]'
TripletSparseMatrix A(2, 4, {0, 0, 1, 1}, {0, 1, 1, 2}, {1, 2, 3, 4});
Vector b(2);
b << 1, 2;
Vector x_expected(4);
x_expected << 1, 8, 8, 0;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A);
CudaSparseMatrix A_gpu(&context, *A_crs);
CudaVector b_gpu(&context, A.num_rows());
CudaVector x_gpu(&context, A.num_cols());
b_gpu.CopyFromCpu(b);
x_gpu.SetZero();
A_gpu.LeftMultiplyAndAccumulate(b_gpu, &x_gpu);
Vector x_computed;
x_gpu.CopyTo(&x_computed);
EXPECT_EQ(x_computed, x_expected);
}
// If there are numerical errors due to synchronization issues, they will
// show up when testing with large matrices, since each operation will take
// significant time, hopefully revealing any such problems.
TEST(CudaSparseMatrix, LargeMultiplyAndAccumulate) {
// Create a large NxN matrix A that has the following structure:
// In row i, only columns i and i+1 are non-zero.
// A_{i, i} = A_{i, i+1} = 1.
// There will be 2 * N - 1 non-zero elements in A.
// X = [1:N]
// Right multiply test:
// b = A * X
// Left multiply test:
// b = A' * X
const int N = 10 * 1000 * 1000;
const int num_non_zeros = 2 * N - 1;
std::vector<int> row_indices(num_non_zeros);
std::vector<int> col_indices(num_non_zeros);
std::vector<double> values(num_non_zeros);
for (int i = 0; i < N; ++i) {
row_indices[2 * i] = i;
col_indices[2 * i] = i;
values[2 * i] = 1.0;
if (i + 1 < N) {
col_indices[2 * i + 1] = i + 1;
row_indices[2 * i + 1] = i;
values[2 * i + 1] = 1;
}
}
TripletSparseMatrix A(N, N, row_indices, col_indices, values);
Vector x(N);
for (int i = 0; i < N; ++i) {
x[i] = i + 1;
}
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A);
CudaSparseMatrix A_gpu(&context, *A_crs);
CudaVector b_gpu(&context, N);
CudaVector x_gpu(&context, N);
x_gpu.CopyFromCpu(x);
// First check RightMultiply.
{
b_gpu.SetZero();
A_gpu.RightMultiplyAndAccumulate(x_gpu, &b_gpu);
Vector b_computed;
b_gpu.CopyTo(&b_computed);
for (int i = 0; i < N; ++i) {
if (i + 1 < N) {
EXPECT_EQ(b_computed[i], 2 * (i + 1) + 1);
} else {
EXPECT_EQ(b_computed[i], i + 1);
}
}
}
// Next check LeftMultiply.
{
b_gpu.SetZero();
A_gpu.LeftMultiplyAndAccumulate(x_gpu, &b_gpu);
Vector b_computed;
b_gpu.CopyTo(&b_computed);
for (int i = 0; i < N; ++i) {
if (i > 0) {
EXPECT_EQ(b_computed[i], 2 * (i + 1) - 1);
} else {
EXPECT_EQ(b_computed[i], i + 1);
}
}
}
}
#endif // CERES_NO_CUDA
} // namespace internal
} // namespace ceres

View File

@@ -0,0 +1,335 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#ifndef CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
#define CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
#include "ceres/internal/config.h"
#ifndef CERES_NO_CUDA
#include "ceres/cuda_buffer.h"
namespace ceres::internal {
// Most contemporary CUDA devices are capable of simultaneous code execution and
// host-to-device transfer. This class copies batches of data to GPU memory and
// executes processing of copied data in parallel (asynchronously).
// Data is copied to a fixed-size buffer on the GPU (holding at most
// max_buffer_size values), and this memory is re-used once the previous
// batch of values has been processed by the user-provided callback.
// The host-to-device copy uses a temporary buffer if required. Each batch of
// values has a size of kValuesPerBatch, except possibly the last one.
template <typename T>
class CERES_NO_EXPORT CudaStreamedBuffer {
public:
// If the hardware supports only one concurrent host-to-device copy, or if a
// single host-to-device copy is able to reach peak bandwidth, two streams
// are sufficient to reach maximum efficiency:
// - If transferring a batch of values takes more time than processing it on
// the gpu, then at every moment one of the streams will be transferring
// data and the other stream will be either processing data or idle; the
// whole process will be bounded by the host-to-device copy.
// - If transferring a batch of values takes less time than processing it on
// the gpu, then at every moment one of the streams will be processing data
// and the other stream will be either performing computations or
// transferring data, and the whole process will be bounded by computations.
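// For intuition, with illustrative (not measured) numbers: if copying one
// batch takes 10 ms and processing it takes 4 ms, the two-stream pipeline
// settles into a copy-bound steady state and N batches take roughly
// N * 10 ms + 4 ms, versus N * 14 ms for a fully serial copy-then-process
// loop.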
static constexpr int kNumBatches = 2;
// max_buffer_size is the maximal size (in elements of type T) of the array
// to be pre-allocated in gpu memory. The size of the array determines the
// size of the batch of values for simultaneous copying and processing. It
// should be large enough to allow highly-parallel execution of user
// kernels; making it too large increases latency.
CudaStreamedBuffer(ContextImpl* context, const int max_buffer_size)
: kValuesPerBatch(max_buffer_size / kNumBatches),
context_(context),
values_gpu_(context, kValuesPerBatch * kNumBatches) {
static_assert(ContextImpl::kNumCudaStreams >= kNumBatches);
CHECK_GE(max_buffer_size, kNumBatches);
// Pre-allocate a buffer of page-locked memory for transfers from regular
// cpu memory. Because we will only be writing into that buffer from the
// cpu, the memory is allocated with the cudaHostAllocWriteCombined flag.
CHECK_EQ(cudaSuccess,
cudaHostAlloc(&values_cpu_pinned_,
sizeof(T) * kValuesPerBatch * kNumBatches,
cudaHostAllocWriteCombined));
for (auto& e : copy_finished_) {
CHECK_EQ(cudaSuccess,
cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
}
}
CudaStreamedBuffer(const CudaStreamedBuffer&) = delete;
~CudaStreamedBuffer() {
CHECK_EQ(cudaSuccess, cudaFreeHost(values_cpu_pinned_));
for (auto& e : copy_finished_) {
CHECK_EQ(cudaSuccess, cudaEventDestroy(e));
}
}
// Transfers num_values from the host-memory pointer `from`, calling
// callback(device_pointer, size_of_batch, offset_of_batch, stream_to_use)
// after scheduling the transfer of each batch of data. The user-provided
// callback should process the data at device_pointer only in the
// stream_to_use stream (device_pointer will be re-used in the next
// callback invocation with the same stream).
//
// The two diagrams below describe operation in two possible scenarios,
// depending on whether the input data is stored in page-locked memory. In
// this example we will have max_buffer_size = 2 * K, num_values = N * K and
// a callback
// scheduling a single asynchronous launch of
// Kernel<<..., stream_to_use>>(device_pointer,
// size_of_batch,
// offset_of_batch)
//
// a. Copying from page-locked memory
// In this case no copy on the host-side is necessary, and this method just
// schedules a bunch of interleaved memory copies and callback invocations:
//
// cudaStreamSynchronize(context->DefaultStream());
// - Iteration #0:
// - cudaMemcpyAsync(values_gpu_, from, K * sizeof(T), H->D, stream_0)
// - callback(values_gpu_, K, 0, stream_0)
// - Iteration #1:
// - cudaMemcpyAsync(values_gpu_ + K, from + K, K * sizeof(T), H->D,
// stream_1)
// - callback(values_gpu_ + K, K, K, stream_1)
// - Iteration #2:
// - cudaMemcpyAsync(values_gpu_, from + 2 * K, K * sizeof(T), H->D,
// stream_0)
// - callback(values_gpu_, K, 2 * K, stream_0)
// - Iteration #3:
// - cudaMemcpyAsync(values_gpu_ + K, from + 3 * K, K * sizeof(T), H->D,
// stream_1)
// - callback(values_gpu_ + K, K, 3 * K, stream_1)
// ...
// - Iteration #i:
// - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, from + i * K, K *
// sizeof(T), H->D, stream_(i % 2))
// - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
// ...
// cudaStreamSynchronize(stream_0)
// cudaStreamSynchronize(stream_1)
//
// This sequence of calls results in the following activity on the GPU
// (assuming that the kernel invoked by the callback takes less time than
// the host-to-device copy):
// +-------------------+-------------------+
// | Stream #0         | Stream #1         |
// +-------------------+-------------------+
// | Copy host->device |                   |
// |                   |                   |
// |                   |                   |
// +-------------------+-------------------+
// | Kernel            | Copy host->device |
// +-------------------+                   |
// |                   |                   |
// +-------------------+-------------------+
// | Copy host->device | Kernel            |
// |                   +-------------------+
// |                   |                   |
// +-------------------+-------------------+
// | Kernel            | Copy host->device |
// |                  ...                  |
// +---------------------------------------+
//
// b. Copying from regular memory
// In this case a copy from regular memory to page-locked memory is required
// in order to get asynchronous operation. Because the pinned memory on the
// host side is re-used, additional synchronization is required. On each
// iteration the following actions are performed:
// - Wait until the previous copy operation in the stream has completed
// - Copy a batch of values from the input array into pinned memory
// - Asynchronously launch the host-to-device copy
// - Set up an event for synchronization on copy completion
// - Invoke the callback (which launches a kernel asynchronously)
//
// Invocations are performed with the following arguments:
// cudaStreamSynchronize(context->DefaultStream());
// - Iteration #0:
// - cudaEventSynchronize(copy_finished_0)
// - std::copy_n(from, K, values_cpu_pinned_)
// - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T), H->D,
// stream_0)
// - cudaEventRecord(copy_finished_0, stream_0)
// - callback(values_gpu_, K, 0, stream_0)
// - Iteration #1:
// - cudaEventSynchronize(copy_finished_1)
// - std::copy_n(from + K, K, values_cpu_pinned_ + K)
// - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K, K *
// sizeof(T), H->D, stream_1)
// - cudaEventRecord(copy_finished_1, stream_1)
// - callback(values_gpu_ + K, K, K, stream_1)
// - Iteration #2:
// - cudaEventSynchronize(copy_finished_0)
// - std::copy_n(from + 2 * K, K, values_cpu_pinned_)
// - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T), H->D,
// stream_0)
// - cudaEventRecord(copy_finished_0, stream_0)
// - callback(values_gpu_, K, 2 * K, stream_0)
// - Iteration #3:
// - cudaEventSynchronize(copy_finished_1)
// - std::copy_n(from + 3 * K, K, values_cpu_pinned_ + K)
// - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K, K *
// sizeof(T), H->D, stream_1)
// - cudaEventRecord(copy_finished_1, stream_1)
// - callback(values_gpu_ + K, K, 3 * K, stream_1)
// ...
// - Iteration #i:
// - cudaEventSynchronize(copy_finished_(i % 2))
// - std::copy_n(from + i * K, K, values_cpu_pinned_ + (i % 2) * K)
// - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, values_cpu_pinned_ + (i %
// 2) * K, K * sizeof(T), H->D, stream_(i % 2))
// - cudaEventRecord(copy_finished_(i % 2), stream_(i % 2))
// - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
// ...
// cudaStreamSynchronize(stream_0)
// cudaStreamSynchronize(stream_1)
//
// This sequence of calls results in the following activity on the CPU and
// GPU (assuming that the kernel invoked by the callback takes less time
// than the host-to-device copy and the copy within CPU memory, and that the
// copy within CPU memory is faster than the host-to-device copy):
// +----------------------------+-------------------+-------------------+
// | CPU                        | Stream #0         | Stream #1         |
// +----------------------------+-------------------+-------------------+
// | Copy to pinned memory      |                   |                   |
// |                            |                   |                   |
// +----------------------------+-------------------|                   |
// | Copy to pinned memory      | Copy host->device |                   |
// |                            |                   |                   |
// +----------------------------+                   |                   |
// | Waiting previous h->d copy |                   |                   |
// +----------------------------+-------------------+-------------------+
// | Copy to pinned memory      | Kernel            | Copy host->device |
// |                            +-------------------+                   |
// +----------------------------+                   |                   |
// | Waiting previous h->d copy |                   |                   |
// +----------------------------+-------------------+-------------------+
// | Copy to pinned memory      | Copy host->device | Kernel            |
// |                            |                   +-------------------+
// |                            ...                 ...                 |
// +----------------------------+---------------------------------------+
//
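// A minimal usage sketch (illustrative only; SquareKernel and its launch
// configuration are assumptions, not part of this class):
//
//   CudaStreamedBuffer<double> buffer(&context, 1024);
//   buffer.CopyToGpu(host_values,
//                    num_values,
//                    [](double* d, int size, int offset,
//                       cudaStream_t stream) {
//                      SquareKernel<<<(size + 255) / 256, 256, 0, stream>>>(
//                          d, size, offset);
//                    });
//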
template <typename Fun>
void CopyToGpu(const T* from, const int num_values, Fun&& callback) {
// This synchronization is not required in some cases, but we perform it in
// order to avoid the situation where the user callback depends on data that
// is still being computed in the default stream.
CHECK_EQ(cudaSuccess, cudaStreamSynchronize(context_->DefaultStream()));
// If the pointer to the input data does not correspond to page-locked
// memory, the host-to-device memory copy might be executed synchronously
// (with a copy to pinned memory happening inside the driver). In that case
// we copy via a pre-allocated array of page-locked memory.
const bool copy_to_pinned_memory = MemoryTypeResultsInSynchronousCopy(from);
T* batch_values_gpu[kNumBatches];
T* batch_values_cpu[kNumBatches];
auto streams = context_->streams_;
for (int i = 0; i < kNumBatches; ++i) {
batch_values_gpu[i] = values_gpu_.data() + kValuesPerBatch * i;
batch_values_cpu[i] = values_cpu_pinned_ + kValuesPerBatch * i;
}
int batch_id = 0;
for (int offset = 0; offset < num_values; offset += kValuesPerBatch) {
const int num_values_batch =
std::min(num_values - offset, kValuesPerBatch);
const T* batch_from = from + offset;
T* batch_to = batch_values_gpu[batch_id];
auto stream = streams[batch_id];
auto copy_finished = copy_finished_[batch_id];
if (copy_to_pinned_memory) {
// Copying values to the temporary buffer may start only after the previous
// copy from the temporary buffer to the device has completed.
CHECK_EQ(cudaSuccess, cudaEventSynchronize(copy_finished));
std::copy_n(batch_from, num_values_batch, batch_values_cpu[batch_id]);
batch_from = batch_values_cpu[batch_id];
}
CHECK_EQ(cudaSuccess,
cudaMemcpyAsync(batch_to,
batch_from,
sizeof(T) * num_values_batch,
cudaMemcpyHostToDevice,
stream));
if (copy_to_pinned_memory) {
// The next copy to the temporary buffer can start as soon as the
// asynchronous copy has completed (and might start before the kernels
// launched in the stream by the user-supplied callback have completed).
// No explicit synchronization is required when copying data from
// page-locked memory, because the memory copy and the user kernel
// execution on the corresponding part of the values_gpu_ array are
// serialized by the stream.
CHECK_EQ(cudaSuccess, cudaEventRecord(copy_finished, stream));
}
callback(batch_to, num_values_batch, offset, stream);
batch_id = (batch_id + 1) % kNumBatches;
}
// Explicitly synchronize on all CUDA streams that were utilized.
for (int i = 0; i < kNumBatches; ++i) {
CHECK_EQ(cudaSuccess, cudaStreamSynchronize(streams[i]));
}
}
private:
// All host-to-device copies need to be completely asynchronous, which
// requires the source memory to be allocated in page-locked memory.
static bool MemoryTypeResultsInSynchronousCopy(const void* ptr) {
cudaPointerAttributes attributes;
auto status = cudaPointerGetAttributes(&attributes, ptr);
#if CUDART_VERSION < 11000
// In CUDA versions prior to 11, a call to cudaPointerGetAttributes with a
// host pointer returns cudaErrorInvalidValue.
if (status == cudaErrorInvalidValue) {
return true;
}
#endif
CHECK_EQ(status, cudaSuccess);
// This class only supports CPU memory as a source.
CHECK_NE(attributes.type, cudaMemoryTypeDevice);
// If the host memory was allocated (or registered) with the CUDA API, or is
// managed memory, then the call to cudaMemcpyAsync will be asynchronous. In
// the case of managed memory it might be slightly better to perform a
// single call of the user-provided callback (and hope that page migration
// will provide a similar throughput with zero effort on our side).
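// For example (illustrative): memory obtained from malloc() or operator
// new is cudaMemoryTypeUnregistered and takes the staging path, while
// memory from cudaHostAlloc() or cudaMallocManaged() can be copied
// asynchronously as-is.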
return attributes.type == cudaMemoryTypeUnregistered;
}
const int kValuesPerBatch;
ContextImpl* context_ = nullptr;
CudaBuffer<T> values_gpu_;
T* values_cpu_pinned_ = nullptr;
cudaEvent_t copy_finished_[kNumBatches] = {nullptr};
};
} // namespace ceres::internal
#endif // CERES_NO_CUDA
#endif // CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_


@@ -0,0 +1,169 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#include "ceres/internal/config.h"
#ifndef CERES_NO_CUDA
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <numeric>
#include "ceres/cuda_streamed_buffer.h"
namespace ceres::internal {
TEST(CudaStreamedBufferTest, IntegerCopy) {
// Offsets and sizes of batches supplied to callback
std::vector<std::pair<int, int>> batches;
const int kMaxTemporaryArraySize = 16;
const int kInputSize = kMaxTemporaryArraySize * 7 + 3;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
std::vector<int> inputs(kInputSize);
std::vector<int> outputs(kInputSize, -1);
std::iota(inputs.begin(), inputs.end(), 0);
CudaStreamedBuffer<int> streamed_buffer(&context, kMaxTemporaryArraySize);
streamed_buffer.CopyToGpu(inputs.data(),
kInputSize,
[&outputs, &batches](const int* device_pointer,
int size,
int offset,
cudaStream_t stream) {
batches.emplace_back(offset, size);
CHECK_EQ(cudaSuccess,
cudaMemcpyAsync(outputs.data() + offset,
device_pointer,
sizeof(int) * size,
cudaMemcpyDeviceToHost,
stream));
});
// All operations in all streams should be completed when CopyToGpu returns
// control to the caller.
for (int i = 0; i < ContextImpl::kNumCudaStreams; ++i) {
CHECK_EQ(cudaSuccess, cudaStreamQuery(context.streams_[i]));
}
// Check if every element was visited
for (int i = 0; i < kInputSize; ++i) {
CHECK_EQ(outputs[i], i);
}
// Check that there is no overlap between batches
std::sort(batches.begin(), batches.end());
const int num_batches = batches.size();
for (int i = 0; i < num_batches; ++i) {
const auto [begin, size] = batches[i];
const int end = begin + size;
CHECK_GE(begin, 0);
CHECK_LT(begin, kInputSize);
CHECK_GT(size, 0);
CHECK_LE(end, kInputSize);
if (i + 1 == num_batches) continue;
CHECK_EQ(end, batches[i + 1].first);
}
}
TEST(CudaStreamedBufferTest, IntegerNoCopy) {
// Offsets and sizes of batches supplied to callback
std::vector<std::pair<int, int>> batches;
const int kMaxTemporaryArraySize = 16;
const int kInputSize = kMaxTemporaryArraySize * 7 + 3;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
int* inputs;
int* outputs;
CHECK_EQ(cudaSuccess,
cudaHostAlloc(
&inputs, sizeof(int) * kInputSize, cudaHostAllocWriteCombined));
CHECK_EQ(
cudaSuccess,
cudaHostAlloc(&outputs, sizeof(int) * kInputSize, cudaHostAllocDefault));
std::fill(outputs, outputs + kInputSize, -1);
std::iota(inputs, inputs + kInputSize, 0);
CudaStreamedBuffer<int> streamed_buffer(&context, kMaxTemporaryArraySize);
streamed_buffer.CopyToGpu(inputs,
kInputSize,
[outputs, &batches](const int* device_pointer,
int size,
int offset,
cudaStream_t stream) {
batches.emplace_back(offset, size);
CHECK_EQ(cudaSuccess,
cudaMemcpyAsync(outputs + offset,
device_pointer,
sizeof(int) * size,
cudaMemcpyDeviceToHost,
stream));
});
// All operations in all streams should be completed when CopyToGpu returns
// control to the caller.
for (int i = 0; i < ContextImpl::kNumCudaStreams; ++i) {
CHECK_EQ(cudaSuccess, cudaStreamQuery(context.streams_[i]));
}
// Check if every element was visited
for (int i = 0; i < kInputSize; ++i) {
CHECK_EQ(outputs[i], i);
}
// Check that there is no overlap between batches
std::sort(batches.begin(), batches.end());
const int num_batches = batches.size();
for (int i = 0; i < num_batches; ++i) {
const auto [begin, size] = batches[i];
const int end = begin + size;
CHECK_GE(begin, 0);
CHECK_LT(begin, kInputSize);
CHECK_GT(size, 0);
CHECK_LE(end, kInputSize);
if (i + 1 == num_batches) continue;
CHECK_EQ(end, batches[i + 1].first);
}
CHECK_EQ(cudaSuccess, cudaFreeHost(inputs));
CHECK_EQ(cudaSuccess, cudaFreeHost(outputs));
}
} // namespace ceres::internal
#endif // CERES_NO_CUDA


@@ -0,0 +1,185 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
//
// A simple CUDA vector class.
// This include must come before any #ifndef check on Ceres compile options.
// clang-format off
#include "ceres/internal/config.h"
// clang-format on
#include <math.h>
#include "ceres/context_impl.h"
#include "ceres/internal/export.h"
#include "ceres/types.h"
#ifndef CERES_NO_CUDA
#include "ceres/cuda_buffer.h"
#include "ceres/cuda_kernels_vector_ops.h"
#include "ceres/cuda_vector.h"
#include "cublas_v2.h"
namespace ceres::internal {
CudaVector::CudaVector(ContextImpl* context, int size)
: context_(context), data_(context, size) {
DCHECK_NE(context, nullptr);
DCHECK(context->IsCudaInitialized());
Resize(size);
}
CudaVector::CudaVector(CudaVector&& other)
: num_rows_(other.num_rows_),
context_(other.context_),
data_(std::move(other.data_)),
descr_(other.descr_) {
other.num_rows_ = 0;
other.descr_ = nullptr;
}
CudaVector& CudaVector::operator=(const CudaVector& other) {
if (this != &other) {
Resize(other.num_rows());
data_.CopyFromGPUArray(other.data_.data(), num_rows_);
}
return *this;
}
void CudaVector::DestroyDescriptor() {
if (descr_ != nullptr) {
CHECK_EQ(cusparseDestroyDnVec(descr_), CUSPARSE_STATUS_SUCCESS);
descr_ = nullptr;
}
}
CudaVector::~CudaVector() { DestroyDescriptor(); }
void CudaVector::Resize(int size) {
data_.Reserve(size);
num_rows_ = size;
DestroyDescriptor();
CHECK_EQ(cusparseCreateDnVec(&descr_, num_rows_, data_.data(), CUDA_R_64F),
CUSPARSE_STATUS_SUCCESS);
}
double CudaVector::Dot(const CudaVector& x) const {
double result = 0;
CHECK_EQ(cublasDdot(context_->cublas_handle_,
num_rows_,
data_.data(),
1,
x.data(),
1,
&result),
CUBLAS_STATUS_SUCCESS)
<< "CuBLAS cublasDdot failed.";
return result;
}
double CudaVector::Norm() const {
double result = 0;
CHECK_EQ(cublasDnrm2(
context_->cublas_handle_, num_rows_, data_.data(), 1, &result),
CUBLAS_STATUS_SUCCESS)
<< "CuBLAS cublasDnrm2 failed.";
return result;
}
void CudaVector::CopyFromCpu(const double* x) {
data_.CopyFromCpu(x, num_rows_);
}
void CudaVector::CopyFromCpu(const Vector& x) {
if (x.rows() != num_rows_) {
Resize(x.rows());
}
CopyFromCpu(x.data());
}
void CudaVector::CopyTo(Vector* x) const {
CHECK(x != nullptr);
x->resize(num_rows_);
data_.CopyToCpu(x->data(), num_rows_);
}
void CudaVector::CopyTo(double* x) const {
CHECK(x != nullptr);
data_.CopyToCpu(x, num_rows_);
}
void CudaVector::SetZero() {
// Allow empty vector to be zeroed
if (num_rows_ == 0) return;
CHECK(data_.data() != nullptr);
CudaSetZeroFP64(data_.data(), num_rows_, context_->DefaultStream());
}
void CudaVector::Axpby(double a, const CudaVector& x, double b) {
if (&x == this) {
Scale(a + b);
return;
}
CHECK_EQ(num_rows_, x.num_rows_);
if (b != 1.0) {
// First scale y by b.
CHECK_EQ(
cublasDscal(context_->cublas_handle_, num_rows_, &b, data_.data(), 1),
CUBLAS_STATUS_SUCCESS)
<< "CuBLAS cublasDscal failed.";
}
// Then add a * x to y.
CHECK_EQ(cublasDaxpy(context_->cublas_handle_,
num_rows_,
&a,
x.data(),
1,
data_.data(),
1),
CUBLAS_STATUS_SUCCESS)
<< "CuBLAS cublasDaxpy failed.";
}
void CudaVector::DtDxpy(const CudaVector& D, const CudaVector& x) {
CudaDtDxpy(
data_.data(), D.data(), x.data(), num_rows_, context_->DefaultStream());
}
void CudaVector::Scale(double s) {
CHECK_EQ(
cublasDscal(context_->cublas_handle_, num_rows_, &s, data_.data(), 1),
CUBLAS_STATUS_SUCCESS)
<< "CuBLAS cublasDscal failed.";
}
} // namespace ceres::internal
#endif // CERES_NO_CUDA


@@ -0,0 +1,193 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
//
// A simple CUDA vector class.
#ifndef CERES_INTERNAL_CUDA_VECTOR_H_
#define CERES_INTERNAL_CUDA_VECTOR_H_
// This include must come before any #ifndef check on Ceres compile options.
// clang-format off
#include "ceres/internal/config.h"
// clang-format on
#include <math.h>
#include <memory>
#include <string>
#include "ceres/context_impl.h"
#include "ceres/internal/export.h"
#include "ceres/types.h"
#ifndef CERES_NO_CUDA
#include "ceres/cuda_buffer.h"
#include "ceres/cuda_kernels_vector_ops.h"
#include "ceres/internal/eigen.h"
#include "cublas_v2.h"
#include "cusparse.h"
namespace ceres::internal {
// An Nx1 vector, denoted y, hosted on the GPU, with CUDA-accelerated
// operations.
class CERES_NO_EXPORT CudaVector {
public:
// Creates a pre-allocated vector of size N. The caller must ensure that
// InitCuda() has already been successfully called on context before
// invoking this constructor.
CudaVector(ContextImpl* context, int size);
CudaVector(CudaVector&& other);
~CudaVector();
void Resize(int size);
// Perform a deep copy of the vector.
CudaVector& operator=(const CudaVector&);
// Return the inner product x' * y.
double Dot(const CudaVector& x) const;
// Return the L2 norm of the vector (||y||_2).
double Norm() const;
// Set all elements to zero.
void SetZero();
// Copy from Eigen vector.
void CopyFromCpu(const Vector& x);
// Copy from CPU memory array.
void CopyFromCpu(const double* x);
// Copy to Eigen vector.
void CopyTo(Vector* x) const;
// Copy to CPU memory array. It is the caller's responsibility to ensure
// that the array is large enough.
void CopyTo(double* x) const;
// y = a * x + b * y.
void Axpby(double a, const CudaVector& x, double b);
// y = diag(d)' * diag(d) * x + y.
void DtDxpy(const CudaVector& D, const CudaVector& x);
// y = s * y.
void Scale(double s);
int num_rows() const { return num_rows_; }
int num_cols() const { return 1; }
const double* data() const { return data_.data(); }
double* mutable_data() { return data_.data(); }
const cusparseDnVecDescr_t& descr() const { return descr_; }
private:
CudaVector(const CudaVector&) = delete;
void DestroyDescriptor();
int num_rows_ = 0;
ContextImpl* context_ = nullptr;
CudaBuffer<double> data_;
// CuSparse object that describes this dense vector.
cusparseDnVecDescr_t descr_ = nullptr;
};
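// A minimal usage sketch (illustrative only; mirrors the unit tests):
//
//   ContextImpl context;
//   std::string message;
//   CHECK(context.InitCuda(&message)) << message;
//   CudaVector x(&context, 3);
//   Vector x_cpu(3);
//   x_cpu << 1, 2, 3;
//   x.CopyFromCpu(x_cpu);
//   const double norm = x.Norm();  // sqrt(1 + 4 + 9)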
// BLAS level-1 operations on CUDA vectors. These functions are needed as an
// abstraction layer so that we can use different versions of a vector-style
// object in the conjugate gradients linear solver.
// The context and num_threads arguments are not used by the CUDA
// implementation; the context embedded in the CudaVector is used instead.
inline double Norm(const CudaVector& x,
ContextImpl* context = nullptr,
int num_threads = 1) {
(void)context;
(void)num_threads;
return x.Norm();
}
inline void SetZero(CudaVector& x,
ContextImpl* context = nullptr,
int num_threads = 1) {
(void)context;
(void)num_threads;
x.SetZero();
}
inline void Axpby(double a,
const CudaVector& x,
double b,
const CudaVector& y,
CudaVector& z,
ContextImpl* context = nullptr,
int num_threads = 1) {
(void)context;
(void)num_threads;
if (&x == &y && &y == &z) {
// z = (a + b) * z;
z.Scale(a + b);
} else if (&x == &z) {
// x is aliased to z.
// z = x
// = b * y + a * x;
z.Axpby(b, y, a);
} else if (&y == &z) {
// y is aliased to z.
// z = y = a * x + b * y;
z.Axpby(a, x, b);
} else {
// General case: all inputs and outputs are distinct.
z = y;
z.Axpby(a, x, b);
}
}
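// For example (names illustrative), Axpby(2.0, x, 3.0, y, z) computes
// z = 2 * x + 3 * y, dispatching on the aliasing of x, y and z as above.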
inline double Dot(const CudaVector& x,
const CudaVector& y,
ContextImpl* context = nullptr,
int num_threads = 1) {
(void)context;
(void)num_threads;
return x.Dot(y);
}
inline void Copy(const CudaVector& from,
CudaVector& to,
ContextImpl* context = nullptr,
int num_threads = 1) {
(void)context;
(void)num_threads;
to = from;
}
} // namespace ceres::internal
#endif // CERES_NO_CUDA
#endif // CERES_INTERNAL_CUDA_VECTOR_H_


@@ -0,0 +1,423 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
#include "ceres/cuda_vector.h"
#include <string>
#include "ceres/internal/config.h"
#include "ceres/internal/eigen.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
namespace ceres {
namespace internal {
#ifndef CERES_NO_CUDA
TEST(CudaVector, Creation) {
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x(&context, 1000);
EXPECT_EQ(x.num_rows(), 1000);
EXPECT_NE(x.data(), nullptr);
}
TEST(CudaVector, CopyVector) {
Vector x(3);
x << 1, 2, 3;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector y(&context, 10);
y.CopyFromCpu(x);
EXPECT_EQ(y.num_rows(), 3);
Vector z(3);
z << 0, 0, 0;
y.CopyTo(&z);
EXPECT_EQ(x, z);
}
TEST(CudaVector, Move) {
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector y(&context, 10);
const auto y_data = y.data();
const auto y_descr = y.descr();
EXPECT_EQ(y.num_rows(), 10);
CudaVector z(std::move(y));
EXPECT_EQ(y.data(), nullptr);
EXPECT_EQ(y.descr(), nullptr);
EXPECT_EQ(y.num_rows(), 0);
EXPECT_EQ(z.data(), y_data);
EXPECT_EQ(z.descr(), y_descr);
}
TEST(CudaVector, DeepCopy) {
Vector x(3);
x << 1, 2, 3;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 3);
x_gpu.CopyFromCpu(x);
CudaVector y_gpu(&context, 3);
y_gpu.SetZero();
EXPECT_EQ(y_gpu.Norm(), 0.0);
y_gpu = x_gpu;
Vector y(3);
y << 0, 0, 0;
y_gpu.CopyTo(&y);
EXPECT_EQ(x, y);
}
TEST(CudaVector, Dot) {
Vector x(3);
Vector y(3);
x << 1, 2, 3;
y << 100, 10, 1;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 10);
CudaVector y_gpu(&context, 10);
x_gpu.CopyFromCpu(x);
y_gpu.CopyFromCpu(y);
EXPECT_EQ(x_gpu.Dot(y_gpu), 123.0);
EXPECT_EQ(Dot(x_gpu, y_gpu), 123.0);
}
TEST(CudaVector, Norm) {
Vector x(3);
x << 1, 2, 3;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 10);
x_gpu.CopyFromCpu(x);
EXPECT_NEAR(x_gpu.Norm(),
sqrt(1.0 + 4.0 + 9.0),
std::numeric_limits<double>::epsilon());
EXPECT_NEAR(Norm(x_gpu),
sqrt(1.0 + 4.0 + 9.0),
std::numeric_limits<double>::epsilon());
}
TEST(CudaVector, SetZero) {
Vector x(4);
x << 1, 1, 1, 1;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 10);
x_gpu.CopyFromCpu(x);
EXPECT_NEAR(x_gpu.Norm(), 2.0, std::numeric_limits<double>::epsilon());
x_gpu.SetZero();
EXPECT_NEAR(x_gpu.Norm(), 0.0, std::numeric_limits<double>::epsilon());
x_gpu.CopyFromCpu(x);
EXPECT_NEAR(x_gpu.Norm(), 2.0, std::numeric_limits<double>::epsilon());
SetZero(x_gpu);
EXPECT_NEAR(x_gpu.Norm(), 0.0, std::numeric_limits<double>::epsilon());
}
TEST(CudaVector, Resize) {
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 10);
EXPECT_EQ(x_gpu.num_rows(), 10);
x_gpu.Resize(4);
EXPECT_EQ(x_gpu.num_rows(), 4);
}
TEST(CudaVector, Axpy) {
Vector x(4);
Vector y(4);
x << 1, 1, 1, 1;
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
y_gpu.CopyFromCpu(y);
x_gpu.Axpby(2.0, y_gpu, 1.0);
Vector result;
Vector expected(4);
expected << 201, 21, 3, 1;
x_gpu.CopyTo(&result);
EXPECT_EQ(result, expected);
}
TEST(CudaVector, AxpbyBEquals1) {
Vector x(4);
Vector y(4);
x << 1, 1, 1, 1;
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
y_gpu.CopyFromCpu(y);
x_gpu.Axpby(2.0, y_gpu, 1.0);
Vector result;
Vector expected(4);
expected << 201, 21, 3, 1;
x_gpu.CopyTo(&result);
EXPECT_EQ(result, expected);
}
TEST(CudaVector, AxpbyMemberFunctionBNotEqual1) {
Vector x(4);
Vector y(4);
x << 1, 1, 1, 1;
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
y_gpu.CopyFromCpu(y);
x_gpu.Axpby(2.0, y_gpu, 3.0);
Vector result;
Vector expected(4);
expected << 203, 23, 5, 3;
x_gpu.CopyTo(&result);
EXPECT_EQ(result, expected);
}
TEST(CudaVector, AxpbyMemberFunctionBEqual1) {
Vector x(4);
Vector y(4);
x << 1, 1, 1, 1;
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
y_gpu.CopyFromCpu(y);
x_gpu.Axpby(2.0, y_gpu, 1.0);
Vector result;
Vector expected(4);
expected << 201, 21, 3, 1;
x_gpu.CopyTo(&result);
EXPECT_EQ(result, expected);
}
TEST(CudaVector, AxpbyMemberXAliasesY) {
Vector x(4);
x << 100, 10, 1, 0;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
y_gpu.SetZero();
x_gpu.Axpby(2.0, x_gpu, 1.0);
Vector result;
Vector expected(4);
expected << 300, 30, 3, 0;
x_gpu.CopyTo(&result);
EXPECT_EQ(result, expected);
}
TEST(CudaVector, AxpbyNonMemberMethodNoAliases) {
Vector x(4);
Vector y(4);
x << 1, 1, 1, 1;
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
CudaVector z_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
y_gpu.CopyFromCpu(y);
z_gpu.Resize(4);
z_gpu.SetZero();
Axpby(2.0, x_gpu, 3.0, y_gpu, z_gpu);
Vector result;
Vector expected(4);
expected << 302, 32, 5, 2;
z_gpu.CopyTo(&result);
EXPECT_EQ(result, expected);
}
TEST(CudaVector, AxpbyNonMemberMethodXAliasesY) {
Vector x(4);
x << 100, 10, 1, 0;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector z_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
z_gpu.SetZero();
Axpby(2.0, x_gpu, 3.0, x_gpu, z_gpu);
Vector result;
Vector expected(4);
expected << 500, 50, 5, 0;
z_gpu.CopyTo(&result);
EXPECT_EQ(result, expected);
}
TEST(CudaVector, AxpbyNonMemberMethodXAliasesZ) {
Vector x(4);
Vector y(4);
x << 1, 1, 1, 1;
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 10);
CudaVector y_gpu(&context, 10);
x_gpu.CopyFromCpu(x);
y_gpu.CopyFromCpu(y);
Axpby(2.0, x_gpu, 3.0, y_gpu, x_gpu);
Vector result;
Vector expected(4);
expected << 302, 32, 5, 2;
x_gpu.CopyTo(&result);
EXPECT_EQ(result, expected);
}
TEST(CudaVector, AxpbyNonMemberMethodYAliasesZ) {
Vector x(4);
Vector y(4);
x << 1, 1, 1, 1;
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
y_gpu.CopyFromCpu(y);
Axpby(2.0, x_gpu, 3.0, y_gpu, y_gpu);
Vector result;
Vector expected(4);
expected << 302, 32, 5, 2;
y_gpu.CopyTo(&result);
EXPECT_EQ(result, expected);
}
TEST(CudaVector, AxpbyNonMemberMethodXAliasesYAliasesZ) {
Vector x(4);
x << 100, 10, 1, 0;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 10);
x_gpu.CopyFromCpu(x);
Axpby(2.0, x_gpu, 3.0, x_gpu, x_gpu);
Vector result;
Vector expected(4);
expected << 500, 50, 5, 0;
x_gpu.CopyTo(&result);
EXPECT_EQ(result, expected);
}
TEST(CudaVector, DtDxpy) {
Vector x(4);
Vector y(4);
Vector D(4);
x << 1, 2, 3, 4;
y << 100, 10, 1, 0;
D << 4, 3, 2, 1;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
CudaVector D_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
y_gpu.CopyFromCpu(y);
D_gpu.CopyFromCpu(D);
y_gpu.DtDxpy(D_gpu, x_gpu);
Vector result;
Vector expected(4);
expected << 116, 28, 13, 4;
y_gpu.CopyTo(&result);
EXPECT_EQ(result, expected);
}
TEST(CudaVector, Scale) {
Vector x(4);
x << 1, 2, 3, 4;
ContextImpl context;
std::string message;
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
x_gpu.Scale(-3.0);
Vector result;
Vector expected(4);
expected << -3.0, -6.0, -9.0, -12.0;
x_gpu.CopyTo(&result);
EXPECT_EQ(result, expected);
}
#endif // CERES_NO_CUDA
} // namespace internal
} // namespace ceres


@@ -1,284 +0,0 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: strandmark@google.com (Petter Strandmark)
// This include must come before any #ifndef check on Ceres compile options.
#include "ceres/internal/config.h"
#ifndef CERES_NO_CXSPARSE
#include <memory>
#include <string>
#include <vector>
#include "ceres/compressed_col_sparse_matrix_utils.h"
#include "ceres/compressed_row_sparse_matrix.h"
#include "ceres/cxsparse.h"
#include "ceres/triplet_sparse_matrix.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::vector;
CXSparse::CXSparse() : scratch_(nullptr), scratch_size_(0) {}
CXSparse::~CXSparse() {
if (scratch_size_ > 0) {
cs_di_free(scratch_);
}
}
csn* CXSparse::Cholesky(cs_di* A, cs_dis* symbolic_factor) {
return cs_di_chol(A, symbolic_factor);
}
void CXSparse::Solve(cs_dis* symbolic_factor, csn* numeric_factor, double* b) {
// Make sure we have enough scratch space available.
const int num_cols = numeric_factor->L->n;
if (scratch_size_ < num_cols) {
if (scratch_size_ > 0) {
cs_di_free(scratch_);
}
scratch_ =
reinterpret_cast<CS_ENTRY*>(cs_di_malloc(num_cols, sizeof(CS_ENTRY)));
scratch_size_ = num_cols;
}
// When the Cholesky factorization succeeded, these methods are
// guaranteed to succeed as well. In the comments below, "x"
// refers to the scratch space.
//
// Set x = P * b.
CHECK(cs_di_ipvec(symbolic_factor->pinv, b, scratch_, num_cols));
// Set x = L \ x.
CHECK(cs_di_lsolve(numeric_factor->L, scratch_));
// Set x = L' \ x.
CHECK(cs_di_ltsolve(numeric_factor->L, scratch_));
// Set b = P' * x.
CHECK(cs_di_pvec(symbolic_factor->pinv, scratch_, b, num_cols));
}
bool CXSparse::SolveCholesky(cs_di* lhs, double* rhs_and_solution) {
return cs_cholsol(1, lhs, rhs_and_solution);
}
cs_dis* CXSparse::AnalyzeCholesky(cs_di* A) {
// order = 1 for Cholesky factor.
return cs_schol(1, A);
}
cs_dis* CXSparse::AnalyzeCholeskyWithNaturalOrdering(cs_di* A) {
// order = 0 for Natural ordering.
return cs_schol(0, A);
}
cs_dis* CXSparse::BlockAnalyzeCholesky(cs_di* A,
const vector<int>& row_blocks,
const vector<int>& col_blocks) {
const int num_row_blocks = row_blocks.size();
const int num_col_blocks = col_blocks.size();
vector<int> block_rows;
vector<int> block_cols;
CompressedColumnScalarMatrixToBlockMatrix(
A->i, A->p, row_blocks, col_blocks, &block_rows, &block_cols);
cs_di block_matrix;
block_matrix.m = num_row_blocks;
block_matrix.n = num_col_blocks;
block_matrix.nz = -1;
block_matrix.nzmax = block_rows.size();
block_matrix.p = &block_cols[0];
block_matrix.i = &block_rows[0];
block_matrix.x = nullptr;
int* ordering = cs_amd(1, &block_matrix);
vector<int> block_ordering(num_row_blocks, -1);
std::copy(ordering, ordering + num_row_blocks, &block_ordering[0]);
cs_free(ordering);
vector<int> scalar_ordering;
BlockOrderingToScalarOrdering(row_blocks, block_ordering, &scalar_ordering);
auto* symbolic_factor =
reinterpret_cast<cs_dis*>(cs_calloc(1, sizeof(cs_dis)));
symbolic_factor->pinv = cs_pinv(&scalar_ordering[0], A->n);
cs* permuted_A = cs_symperm(A, symbolic_factor->pinv, 0);
symbolic_factor->parent = cs_etree(permuted_A, 0);
int* postordering = cs_post(symbolic_factor->parent, A->n);
int* column_counts =
cs_counts(permuted_A, symbolic_factor->parent, postordering, 0);
cs_free(postordering);
cs_spfree(permuted_A);
symbolic_factor->cp = static_cast<int*>(cs_malloc(A->n + 1, sizeof(int)));
symbolic_factor->lnz = cs_cumsum(symbolic_factor->cp, column_counts, A->n);
symbolic_factor->unz = symbolic_factor->lnz;
cs_free(column_counts);
if (symbolic_factor->lnz < 0) {
cs_sfree(symbolic_factor);
symbolic_factor = nullptr;
}
return symbolic_factor;
}
cs_di CXSparse::CreateSparseMatrixTransposeView(CompressedRowSparseMatrix* A) {
cs_di At;
At.m = A->num_cols();
At.n = A->num_rows();
At.nz = -1;
At.nzmax = A->num_nonzeros();
At.p = A->mutable_rows();
At.i = A->mutable_cols();
At.x = A->mutable_values();
return At;
}
cs_di* CXSparse::CreateSparseMatrix(TripletSparseMatrix* tsm) {
cs_di_sparse tsm_wrapper;
tsm_wrapper.nzmax = tsm->num_nonzeros();
tsm_wrapper.nz = tsm->num_nonzeros();
tsm_wrapper.m = tsm->num_rows();
tsm_wrapper.n = tsm->num_cols();
tsm_wrapper.p = tsm->mutable_cols();
tsm_wrapper.i = tsm->mutable_rows();
tsm_wrapper.x = tsm->mutable_values();
return cs_compress(&tsm_wrapper);
}
void CXSparse::ApproximateMinimumDegreeOrdering(cs_di* A, int* ordering) {
int* cs_ordering = cs_amd(1, A);
std::copy(cs_ordering, cs_ordering + A->m, ordering);
cs_free(cs_ordering);
}
cs_di* CXSparse::TransposeMatrix(cs_di* A) { return cs_di_transpose(A, 1); }
cs_di* CXSparse::MatrixMatrixMultiply(cs_di* A, cs_di* B) {
return cs_di_multiply(A, B);
}
void CXSparse::Free(cs_di* sparse_matrix) { cs_di_spfree(sparse_matrix); }
void CXSparse::Free(cs_dis* symbolic_factor) { cs_di_sfree(symbolic_factor); }
void CXSparse::Free(csn* numeric_factor) { cs_di_nfree(numeric_factor); }
std::unique_ptr<SparseCholesky> CXSparseCholesky::Create(
const OrderingType ordering_type) {
return std::unique_ptr<SparseCholesky>(new CXSparseCholesky(ordering_type));
}
CompressedRowSparseMatrix::StorageType CXSparseCholesky::StorageType() const {
return CompressedRowSparseMatrix::LOWER_TRIANGULAR;
}
CXSparseCholesky::CXSparseCholesky(const OrderingType ordering_type)
: ordering_type_(ordering_type),
symbolic_factor_(nullptr),
numeric_factor_(nullptr) {}
CXSparseCholesky::~CXSparseCholesky() {
FreeSymbolicFactorization();
FreeNumericFactorization();
}
LinearSolverTerminationType CXSparseCholesky::Factorize(
CompressedRowSparseMatrix* lhs, std::string* message) {
CHECK_EQ(lhs->storage_type(), StorageType());
if (lhs == nullptr) {
*message = "Failure: Input lhs is nullptr.";
return LINEAR_SOLVER_FATAL_ERROR;
}
cs_di cs_lhs = cs_.CreateSparseMatrixTransposeView(lhs);
if (symbolic_factor_ == nullptr) {
if (ordering_type_ == NATURAL) {
symbolic_factor_ = cs_.AnalyzeCholeskyWithNaturalOrdering(&cs_lhs);
} else {
if (!lhs->col_blocks().empty() && !(lhs->row_blocks().empty())) {
symbolic_factor_ = cs_.BlockAnalyzeCholesky(
&cs_lhs, lhs->col_blocks(), lhs->row_blocks());
} else {
symbolic_factor_ = cs_.AnalyzeCholesky(&cs_lhs);
}
}
if (symbolic_factor_ == nullptr) {
*message = "CXSparse Failure : Symbolic factorization failed.";
return LINEAR_SOLVER_FATAL_ERROR;
}
}
FreeNumericFactorization();
numeric_factor_ = cs_.Cholesky(&cs_lhs, symbolic_factor_);
if (numeric_factor_ == nullptr) {
*message = "CXSparse Failure : Numeric factorization failed.";
return LINEAR_SOLVER_FAILURE;
}
return LINEAR_SOLVER_SUCCESS;
}
LinearSolverTerminationType CXSparseCholesky::Solve(const double* rhs,
double* solution,
std::string* message) {
CHECK(numeric_factor_ != nullptr)
<< "Solve called without a call to Factorize first.";
const int num_cols = numeric_factor_->L->n;
memcpy(solution, rhs, num_cols * sizeof(*solution));
cs_.Solve(symbolic_factor_, numeric_factor_, solution);
return LINEAR_SOLVER_SUCCESS;
}
void CXSparseCholesky::FreeSymbolicFactorization() {
if (symbolic_factor_ != nullptr) {
cs_.Free(symbolic_factor_);
symbolic_factor_ = nullptr;
}
}
void CXSparseCholesky::FreeNumericFactorization() {
if (numeric_factor_ != nullptr) {
cs_.Free(numeric_factor_);
numeric_factor_ = nullptr;
}
}
} // namespace internal
} // namespace ceres
#endif // CERES_NO_CXSPARSE


@@ -1,182 +0,0 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: strandmark@google.com (Petter Strandmark)
#ifndef CERES_INTERNAL_CXSPARSE_H_
#define CERES_INTERNAL_CXSPARSE_H_
// This include must come before any #ifndef check on Ceres compile options.
#include "ceres/internal/config.h"
#ifndef CERES_NO_CXSPARSE
#include <memory>
#include <string>
#include <vector>
#include "ceres/internal/disable_warnings.h"
#include "ceres/linear_solver.h"
#include "ceres/sparse_cholesky.h"
#include "cs.h"
namespace ceres {
namespace internal {
class CompressedRowSparseMatrix;
class TripletSparseMatrix;
// This object provides access to solving linear systems using Cholesky
// factorization with a known symbolic factorization. This feature does not
// explicitly exist in CXSparse. The methods in the class are non-static
// because the class manages internal scratch space.
class CERES_NO_EXPORT CXSparse {
public:
CXSparse();
~CXSparse();
// Solve the system lhs * solution = rhs in place by using an
// approximate minimum degree fill reducing ordering.
bool SolveCholesky(cs_di* lhs, double* rhs_and_solution);
// Solves a linear system given its symbolic and numeric factorization.
void Solve(cs_dis* symbolic_factor,
csn* numeric_factor,
double* rhs_and_solution);
// Compute the numeric Cholesky factorization of A, given its
// symbolic factorization.
//
// Caller owns the result.
csn* Cholesky(cs_di* A, cs_dis* symbolic_factor);
// Creates a compressed-column (cs_di) view of the transpose of A. No memory
// is allocated or copied; the returned structure points into the argument's
// storage.
cs_di CreateSparseMatrixTransposeView(CompressedRowSparseMatrix* A);
// Creates a new matrix from a triplet form. Deallocate the returned matrix
// with Free. May return nullptr if the compression or allocation fails.
cs_di* CreateSparseMatrix(TripletSparseMatrix* A);
// B = A'
//
// The returned matrix should be deallocated with Free when not used
// anymore.
cs_di* TransposeMatrix(cs_di* A);
// C = A * B
//
// The returned matrix should be deallocated with Free when not used
// anymore.
cs_di* MatrixMatrixMultiply(cs_di* A, cs_di* B);
// Computes a symbolic factorization of A that can be used in SolveCholesky.
//
// The returned matrix should be deallocated with Free when not used anymore.
cs_dis* AnalyzeCholesky(cs_di* A);
// Computes a symbolic factorization of A that can be used in
// SolveCholesky, but does not compute a fill-reducing ordering.
//
// The returned matrix should be deallocated with Free when not used anymore.
cs_dis* AnalyzeCholeskyWithNaturalOrdering(cs_di* A);
// Computes a symbolic factorization of A that can be used in
// SolveCholesky. The difference from AnalyzeCholesky is that this
// function first detects the block sparsity of the matrix using
// information about the row and column blocks and uses this block
// sparse matrix to find a fill-reducing ordering. This ordering is
// then used to find a symbolic factorization. This can result in a
// significant performance improvement over AnalyzeCholesky on block
// sparse matrices.
//
// The returned matrix should be deallocated with Free when not used
// anymore.
cs_dis* BlockAnalyzeCholesky(cs_di* A,
const std::vector<int>& row_blocks,
const std::vector<int>& col_blocks);
// Computes a fill-reducing approximate minimum degree ordering of
// the matrix A. ordering should be non-nullptr and should point to
// enough memory to hold the ordering for the rows of A.
void ApproximateMinimumDegreeOrdering(cs_di* A, int* ordering);
void Free(cs_di* sparse_matrix);
void Free(cs_dis* symbolic_factorization);
void Free(csn* numeric_factorization);
private:
// Cached scratch space
CS_ENTRY* scratch_;
int scratch_size_;
};
// An implementation of SparseCholesky interface using the CXSparse
// library.
class CERES_NO_EXPORT CXSparseCholesky final : public SparseCholesky {
public:
// Factory
static std::unique_ptr<SparseCholesky> Create(OrderingType ordering_type);
// SparseCholesky interface.
~CXSparseCholesky() override;
CompressedRowSparseMatrix::StorageType StorageType() const final;
LinearSolverTerminationType Factorize(CompressedRowSparseMatrix* lhs,
std::string* message) final;
LinearSolverTerminationType Solve(const double* rhs,
double* solution,
std::string* message) final;
private:
explicit CXSparseCholesky(const OrderingType ordering_type);
void FreeSymbolicFactorization();
void FreeNumericFactorization();
const OrderingType ordering_type_;
CXSparse cs_;
cs_dis* symbolic_factor_;
csn* numeric_factor_;
};
} // namespace internal
} // namespace ceres
#include "ceres/internal/reenable_warnings.h"
#else
typedef void cs_dis;
class CXSparse {
public:
void Free(void* arg) {}
};
#endif // CERES_NO_CXSPARSE
#endif // CERES_INTERNAL_CXSPARSE_H_


@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -33,12 +33,15 @@
#include <algorithm>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "ceres/internal/config.h"
#include "ceres/iterative_refiner.h"
#ifndef CERES_NO_CUDA
#include "ceres/context_impl.h"
#include "ceres/cuda_kernels_vector_ops.h"
#include "cuda_runtime.h"
#include "cusolverDn.h"
#endif // CERES_NO_CUDA
@@ -57,10 +60,21 @@ extern "C" void dpotrs_(const char* uplo,
double* b,
const int* ldb,
int* info);
extern "C" void spotrf_(
const char* uplo, const int* n, float* a, const int* lda, int* info);
extern "C" void spotrs_(const char* uplo,
const int* n,
const int* nrhs,
const float* a,
const int* lda,
float* b,
const int* ldb,
int* info);
#endif
namespace ceres {
namespace internal {
namespace ceres::internal {
DenseCholesky::~DenseCholesky() = default;
@@ -70,12 +84,22 @@ std::unique_ptr<DenseCholesky> DenseCholesky::Create(
switch (options.dense_linear_algebra_library_type) {
case EIGEN:
dense_cholesky = std::make_unique<EigenDenseCholesky>();
// Eigen mixed precision solver not yet implemented.
if (options.use_mixed_precision_solves) {
dense_cholesky = std::make_unique<FloatEigenDenseCholesky>();
} else {
dense_cholesky = std::make_unique<EigenDenseCholesky>();
}
break;
case LAPACK:
#ifndef CERES_NO_LAPACK
dense_cholesky = std::make_unique<LAPACKDenseCholesky>();
// LAPACK mixed precision solver not yet implemented.
if (options.use_mixed_precision_solves) {
dense_cholesky = std::make_unique<FloatLAPACKDenseCholesky>();
} else {
dense_cholesky = std::make_unique<LAPACKDenseCholesky>();
}
break;
#else
LOG(FATAL) << "Ceres was compiled without support for LAPACK.";
@@ -83,7 +107,11 @@ std::unique_ptr<DenseCholesky> DenseCholesky::Create(
case CUDA:
#ifndef CERES_NO_CUDA
dense_cholesky = CUDADenseCholesky::Create(options);
if (options.use_mixed_precision_solves) {
dense_cholesky = CUDADenseCholeskyMixedPrecision::Create(options);
} else {
dense_cholesky = CUDADenseCholesky::Create(options);
}
break;
#else
LOG(FATAL) << "Ceres was compiled without support for CUDA.";
@@ -94,6 +122,14 @@ std::unique_ptr<DenseCholesky> DenseCholesky::Create(
<< DenseLinearAlgebraLibraryTypeToString(
options.dense_linear_algebra_library_type);
}
if (options.max_num_refinement_iterations > 0) {
auto refiner = std::make_unique<DenseIterativeRefiner>(
options.max_num_refinement_iterations);
dense_cholesky = std::make_unique<RefinedDenseCholesky>(
std::move(dense_cholesky), std::move(refiner));
}
return dense_cholesky;
}
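// A minimal sketch of driving this factory (assuming, as in upstream Ceres,
// that it takes a LinearSolver::Options; values illustrative):
//
//   LinearSolver::Options options;
//   options.dense_linear_algebra_library_type = EIGEN;
//   options.use_mixed_precision_solves = true;
//   options.max_num_refinement_iterations = 3;
//   auto cholesky = DenseCholesky::Create(options);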
@@ -105,7 +141,7 @@ LinearSolverTerminationType DenseCholesky::FactorAndSolve(
std::string* message) {
LinearSolverTerminationType termination_type =
Factorize(num_cols, lhs, message);
if (termination_type == LINEAR_SOLVER_SUCCESS) {
if (termination_type == LinearSolverTerminationType::SUCCESS) {
termination_type = Solve(rhs, solution, message);
}
return termination_type;
@@ -117,11 +153,11 @@ LinearSolverTerminationType EigenDenseCholesky::Factorize(
llt_ = std::make_unique<LLTType>(m);
if (llt_->info() != Eigen::Success) {
*message = "Eigen failure. Unable to perform dense Cholesky factorization.";
return LINEAR_SOLVER_FAILURE;
return LinearSolverTerminationType::FAILURE;
}
*message = "Success.";
return LINEAR_SOLVER_SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
LinearSolverTerminationType EigenDenseCholesky::Solve(const double* rhs,
@@ -129,13 +165,41 @@ LinearSolverTerminationType EigenDenseCholesky::Solve(const double* rhs,
std::string* message) {
if (llt_->info() != Eigen::Success) {
*message = "Eigen failure. Unable to perform dense Cholesky factorization.";
return LINEAR_SOLVER_FAILURE;
return LinearSolverTerminationType::FAILURE;
}
VectorRef(solution, llt_->cols()) =
llt_->solve(ConstVectorRef(rhs, llt_->cols()));
*message = "Success.";
return LINEAR_SOLVER_SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
LinearSolverTerminationType FloatEigenDenseCholesky::Factorize(
int num_cols, double* lhs, std::string* message) {
// TODO(sameeragarwal): Check if this causes a double allocation.
lhs_ = Eigen::Map<Eigen::MatrixXd>(lhs, num_cols, num_cols).cast<float>();
llt_ = std::make_unique<LLTType>(lhs_);
if (llt_->info() != Eigen::Success) {
*message = "Eigen failure. Unable to perform dense Cholesky factorization.";
return LinearSolverTerminationType::FAILURE;
}
*message = "Success.";
return LinearSolverTerminationType::SUCCESS;
}
LinearSolverTerminationType FloatEigenDenseCholesky::Solve(
const double* rhs, double* solution, std::string* message) {
if (llt_->info() != Eigen::Success) {
*message = "Eigen failure. Unable to perform dense Cholesky factorization.";
return LinearSolverTerminationType::FAILURE;
}
rhs_ = ConstVectorRef(rhs, llt_->cols()).cast<float>();
solution_ = llt_->solve(rhs_);
VectorRef(solution, llt_->cols()) = solution_.cast<double>();
*message = "Success.";
return LinearSolverTerminationType::SUCCESS;
}
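// Summary of the mixed-precision flow above: the lhs is cast to float once
// and factorized in single precision; each solve casts the rhs to float and
// the solution back to double. Lost accuracy can be recovered by the
// DenseIterativeRefiner wrapper installed in Create() when
// max_num_refinement_iterations > 0.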
#ifndef CERES_NO_LAPACK
@@ -149,19 +213,19 @@ LinearSolverTerminationType LAPACKDenseCholesky::Factorize(
dpotrf_(&uplo, &num_cols_, lhs_, &num_cols_, &info);
if (info < 0) {
termination_type_ = LINEAR_SOLVER_FATAL_ERROR;
termination_type_ = LinearSolverTerminationType::FATAL_ERROR;
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it. "
<< "LAPACK::dpotrf fatal error. "
<< "Argument: " << -info << " is invalid.";
} else if (info > 0) {
termination_type_ = LINEAR_SOLVER_FAILURE;
termination_type_ = LinearSolverTerminationType::FAILURE;
*message = StringPrintf(
"LAPACK::dpotrf numerical failure. "
"The leading minor of order %d is not positive definite.",
info);
} else {
termination_type_ = LINEAR_SOLVER_SUCCESS;
termination_type_ = LinearSolverTerminationType::SUCCESS;
*message = "Success.";
}
return termination_type_;
@@ -174,12 +238,12 @@ LinearSolverTerminationType LAPACKDenseCholesky::Solve(const double* rhs,
const int nrhs = 1;
int info = 0;
std::copy_n(rhs, num_cols_, solution);
VectorRef(solution, num_cols_) = ConstVectorRef(rhs, num_cols_);
dpotrs_(
&uplo, &num_cols_, &nrhs, lhs_, &num_cols_, solution, &num_cols_, &info);
if (info < 0) {
termination_type_ = LINEAR_SOLVER_FATAL_ERROR;
termination_type_ = LinearSolverTerminationType::FATAL_ERROR;
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it. "
<< "LAPACK::dpotrs fatal error. "
@@ -187,35 +251,118 @@ LinearSolverTerminationType LAPACKDenseCholesky::Solve(const double* rhs,
}
*message = "Success";
termination_type_ = LINEAR_SOLVER_SUCCESS;
termination_type_ = LinearSolverTerminationType::SUCCESS;
return termination_type_;
}
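
The LAPACK backend is the textbook two-call SPD solve: dpotrf_ factorizes in place and dpotrs_ back-substitutes, with info < 0 flagging a bad argument and info > 0 a non-positive-definite leading minor. A bare-bones sketch, assuming a Fortran-convention LAPACK (trailing-underscore symbols) is linked:

#include <iostream>

// Fortran LAPACK entry points (column-major, in-place).
extern "C" void dpotrf_(const char* uplo, const int* n, double* a,
                        const int* lda, int* info);
extern "C" void dpotrs_(const char* uplo, const int* n, const int* nrhs,
                        const double* a, const int* lda, double* b,
                        const int* ldb, int* info);

int main() {
  const char uplo = 'L';
  int n = 2, nrhs = 1, info = 0;
  // Column-major SPD matrix {{4, 1}, {1, 3}} and rhs {1, 2}.
  double a[4] = {4.0, 1.0, 1.0, 3.0};
  double b[2] = {1.0, 2.0};

  dpotrf_(&uplo, &n, a, &n, &info);  // a now holds the Cholesky factor L.
  if (info != 0) {                   // > 0: not SPD; < 0: invalid argument.
    std::cerr << "dpotrf failed: " << info << "\n";
    return 1;
  }
  dpotrs_(&uplo, &n, &nrhs, a, &n, b, &n, &info);  // b now holds the solution.
  std::cout << b[0] << " " << b[1] << "\n";        // Expect 1/11 and 7/11.
  return 0;
}
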
LinearSolverTerminationType FloatLAPACKDenseCholesky::Factorize(
int num_cols, double* lhs, std::string* message) {
num_cols_ = num_cols;
lhs_ = Eigen::Map<Eigen::MatrixXd>(lhs, num_cols, num_cols).cast<float>();
const char uplo = 'L';
int info = 0;
spotrf_(&uplo, &num_cols_, lhs_.data(), &num_cols_, &info);
if (info < 0) {
termination_type_ = LinearSolverTerminationType::FATAL_ERROR;
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it. "
<< "LAPACK::spotrf fatal error. "
<< "Argument: " << -info << " is invalid.";
} else if (info > 0) {
termination_type_ = LinearSolverTerminationType::FAILURE;
*message = StringPrintf(
"LAPACK::spotrf numerical failure. "
"The leading minor of order %d is not positive definite.",
info);
} else {
termination_type_ = LinearSolverTerminationType::SUCCESS;
*message = "Success.";
}
return termination_type_;
}
LinearSolverTerminationType FloatLAPACKDenseCholesky::Solve(
const double* rhs, double* solution, std::string* message) {
const char uplo = 'L';
const int nrhs = 1;
int info = 0;
rhs_and_solution_ = ConstVectorRef(rhs, num_cols_).cast<float>();
spotrs_(&uplo,
&num_cols_,
&nrhs,
lhs_.data(),
&num_cols_,
rhs_and_solution_.data(),
&num_cols_,
&info);
if (info < 0) {
termination_type_ = LinearSolverTerminationType::FATAL_ERROR;
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it. "
<< "LAPACK::dpotrs fatal error. "
<< "Argument: " << -info << " is invalid.";
}
*message = "Success";
termination_type_ = LinearSolverTerminationType::SUCCESS;
VectorRef(solution, num_cols_) =
rhs_and_solution_.head(num_cols_).cast<double>();
return termination_type_;
}
#endif // CERES_NO_LAPACK
RefinedDenseCholesky::RefinedDenseCholesky(
std::unique_ptr<DenseCholesky> dense_cholesky,
std::unique_ptr<DenseIterativeRefiner> iterative_refiner)
: dense_cholesky_(std::move(dense_cholesky)),
iterative_refiner_(std::move(iterative_refiner)) {}
RefinedDenseCholesky::~RefinedDenseCholesky() = default;
LinearSolverTerminationType RefinedDenseCholesky::Factorize(
const int num_cols, double* lhs, std::string* message) {
lhs_ = lhs;
num_cols_ = num_cols;
return dense_cholesky_->Factorize(num_cols, lhs, message);
}
LinearSolverTerminationType RefinedDenseCholesky::Solve(const double* rhs,
double* solution,
std::string* message) {
CHECK(lhs_ != nullptr);
auto termination_type = dense_cholesky_->Solve(rhs, solution, message);
if (termination_type != LinearSolverTerminationType::SUCCESS) {
return termination_type;
}
iterative_refiner_->Refine(
num_cols_, lhs_, rhs, dense_cholesky_.get(), solution);
return LinearSolverTerminationType::SUCCESS;
}
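
RefinedDenseCholesky::Solve is classical iterative refinement: compute x once, then repeatedly form r = b - A*x and add the correction A^-1 * r obtained from the cached factorization. A compact Eigen sketch of the same loop (a stand-in, not the actual DenseIterativeRefiner):

#include <iostream>
#include "Eigen/Dense"

int main() {
  Eigen::MatrixXd A(2, 2);
  A << 4.0, 1.0,
       1.0, 3.0;
  Eigen::VectorXd b(2);
  b << 1.0, 2.0;

  Eigen::LLT<Eigen::MatrixXd> llt(A);  // Factorize once, reuse below.
  Eigen::VectorXd x = llt.solve(b);    // Initial solution.

  const int max_num_refinement_iterations = 3;
  for (int i = 0; i < max_num_refinement_iterations; ++i) {
    Eigen::VectorXd r = b - A * x;  // Residual in working precision.
    x += llt.solve(r);              // Correction via the cached factor.
  }
  std::cout << "residual norm: " << (b - A * x).norm() << "\n";
  return 0;
}
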
#ifndef CERES_NO_CUDA
bool CUDADenseCholesky::Init(ContextImpl* context, std::string* message) {
if (!context->InitCUDA(message)) {
return false;
}
cusolver_handle_ = context->cusolver_handle_;
stream_ = context->stream_;
error_.Reserve(1);
*message = "CUDADenseCholesky::Init Success.";
return true;
}
CUDADenseCholesky::CUDADenseCholesky(ContextImpl* context)
: context_(context),
lhs_{context},
rhs_{context},
device_workspace_{context},
error_(context, 1) {}
LinearSolverTerminationType CUDADenseCholesky::Factorize(int num_cols,
double* lhs,
std::string* message) {
factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
factorize_result_ = LinearSolverTerminationType::FATAL_ERROR;
lhs_.Reserve(num_cols * num_cols);
num_cols_ = num_cols;
lhs_.CopyToGpuAsync(lhs, num_cols * num_cols, stream_);
lhs_.CopyFromCpu(lhs, num_cols * num_cols);
int device_workspace_size = 0;
if (cusolverDnDpotrf_bufferSize(cusolver_handle_,
if (cusolverDnDpotrf_bufferSize(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols,
lhs_.data(),
@@ -223,10 +370,10 @@ LinearSolverTerminationType CUDADenseCholesky::Factorize(int num_cols,
&device_workspace_size) !=
CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnDpotrf_bufferSize failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
}
device_workspace_.Reserve(device_workspace_size);
if (cusolverDnDpotrf(cusolver_handle_,
if (cusolverDnDpotrf(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols,
lhs_.data(),
@@ -235,15 +382,10 @@ LinearSolverTerminationType CUDADenseCholesky::Factorize(int num_cols,
device_workspace_.size(),
error_.data()) != CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnDpotrf failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
}
if (cudaDeviceSynchronize() != cudaSuccess ||
cudaStreamSynchronize(stream_) != cudaSuccess) {
*message = "Cuda device synchronization failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
error_.CopyToHost(&error, 1);
error_.CopyToCpu(&error, 1);
if (error < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
<< "please report it. "
@@ -251,29 +393,29 @@ LinearSolverTerminationType CUDADenseCholesky::Factorize(int num_cols,
<< "Argument: " << -error << " is invalid.";
// The following line is unreachable, but return failure just to be
// pedantic, since the compiler does not know that.
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
} else if (error > 0) {
*message = StringPrintf(
"cuSolverDN::cusolverDnDpotrf numerical failure. "
"The leading minor of order %d is not positive definite.",
error);
factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FAILURE;
return LinearSolverTerminationType::LINEAR_SOLVER_FAILURE;
factorize_result_ = LinearSolverTerminationType::FAILURE;
return LinearSolverTerminationType::FAILURE;
}
*message = "Success";
factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
factorize_result_ = LinearSolverTerminationType::SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
LinearSolverTerminationType CUDADenseCholesky::Solve(const double* rhs,
double* solution,
std::string* message) {
if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) {
*message = "Factorize did not complete succesfully previously.";
if (factorize_result_ != LinearSolverTerminationType::SUCCESS) {
*message = "Factorize did not complete successfully previously.";
return factorize_result_;
}
rhs_.CopyToGpuAsync(rhs, num_cols_, stream_);
if (cusolverDnDpotrs(cusolver_handle_,
rhs_.CopyFromCpu(rhs, num_cols_);
if (cusolverDnDpotrs(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
1,
@@ -283,45 +425,221 @@ LinearSolverTerminationType CUDADenseCholesky::Solve(const double* rhs,
num_cols_,
error_.data()) != CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnDpotrs failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
}
if (cudaDeviceSynchronize() != cudaSuccess ||
cudaStreamSynchronize(stream_) != cudaSuccess) {
*message = "Cuda device synchronization failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
error_.CopyToHost(&error, 1);
error_.CopyToCpu(&error, 1);
if (error != 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it."
<< "cuSolverDN::cusolverDnDpotrs fatal error. "
<< "Argument: " << -error << " is invalid.";
}
rhs_.CopyToHost(solution, num_cols_);
rhs_.CopyToCpu(solution, num_cols_);
*message = "Success";
return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
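
The CUDA backend follows the standard cuSOLVER recipe: query the workspace size, factorize with cusolverDnDpotrf, then back-substitute with cusolverDnDpotrs, checking the device-side info value in between. A trimmed host-side sketch, assuming CUDA and cuSOLVER are available; the SolveSPD helper is illustrative only, not Ceres API:

#include <cstdio>
#include <cuda_runtime.h>
#include <cusolverDn.h>

// Solves the SPD system a_host * x = b_host (n x n, column-major) on the GPU.
// Error handling is trimmed down to the essential devInfo check.
void SolveSPD(const double* a_host, const double* b_host, double* x_host,
              int n) {
  cusolverDnHandle_t handle = nullptr;
  cusolverDnCreate(&handle);

  double *a = nullptr, *b = nullptr, *workspace = nullptr;
  int* dev_info = nullptr;
  cudaMalloc(&a, sizeof(double) * n * n);
  cudaMalloc(&b, sizeof(double) * n);
  cudaMalloc(&dev_info, sizeof(int));
  cudaMemcpy(a, a_host, sizeof(double) * n * n, cudaMemcpyHostToDevice);
  cudaMemcpy(b, b_host, sizeof(double) * n, cudaMemcpyHostToDevice);

  // 1. Query the workspace size needed by the factorization.
  int lwork = 0;
  cusolverDnDpotrf_bufferSize(handle, CUBLAS_FILL_MODE_LOWER, n, a, n, &lwork);
  cudaMalloc(&workspace, sizeof(double) * lwork);

  // 2. In-place Cholesky factorization; the status lands in dev_info.
  cusolverDnDpotrf(handle, CUBLAS_FILL_MODE_LOWER, n, a, n, workspace, lwork,
                   dev_info);
  int info = 0;
  cudaMemcpy(&info, dev_info, sizeof(int), cudaMemcpyDeviceToHost);
  if (info != 0) {  // > 0: not positive definite; < 0: invalid argument.
    std::fprintf(stderr, "dpotrf failed: %d\n", info);
  } else {
    // 3. Back-substitution with one right-hand side; b is overwritten by x.
    cusolverDnDpotrs(handle, CUBLAS_FILL_MODE_LOWER, n, 1, a, n, b, n,
                     dev_info);
    cudaMemcpy(x_host, b, sizeof(double) * n, cudaMemcpyDeviceToHost);
  }

  cudaFree(a);
  cudaFree(b);
  cudaFree(workspace);
  cudaFree(dev_info);
  cusolverDnDestroy(handle);
}
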
std::unique_ptr<CUDADenseCholesky> CUDADenseCholesky::Create(
const LinearSolver::Options& options) {
if (options.dense_linear_algebra_library_type != CUDA) {
// The user called the wrong factory method.
if (options.dense_linear_algebra_library_type != CUDA ||
options.context == nullptr || !options.context->IsCudaInitialized()) {
return nullptr;
}
auto cuda_dense_cholesky =
std::unique_ptr<CUDADenseCholesky>(new CUDADenseCholesky());
std::string cuda_error;
if (cuda_dense_cholesky->Init(options.context, &cuda_error)) {
return cuda_dense_cholesky;
return std::unique_ptr<CUDADenseCholesky>(
new CUDADenseCholesky(options.context));
}
std::unique_ptr<CUDADenseCholeskyMixedPrecision>
CUDADenseCholeskyMixedPrecision::Create(const LinearSolver::Options& options) {
if (options.dense_linear_algebra_library_type != CUDA ||
!options.use_mixed_precision_solves || options.context == nullptr ||
!options.context->IsCudaInitialized()) {
return nullptr;
}
// Initialization failed, destroy the object (done automatically) and return a
// nullptr.
LOG(ERROR) << "CUDADenseCholesky::Init failed: " << cuda_error;
return nullptr;
return std::unique_ptr<CUDADenseCholeskyMixedPrecision>(
new CUDADenseCholeskyMixedPrecision(
options.context, options.max_num_refinement_iterations));
}
LinearSolverTerminationType
CUDADenseCholeskyMixedPrecision::CudaCholeskyFactorize(std::string* message) {
int device_workspace_size = 0;
if (cusolverDnSpotrf_bufferSize(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
lhs_fp32_.data(),
num_cols_,
&device_workspace_size) !=
CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnSpotrf_bufferSize failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
device_workspace_.Reserve(device_workspace_size);
if (cusolverDnSpotrf(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
lhs_fp32_.data(),
num_cols_,
device_workspace_.data(),
device_workspace_.size(),
error_.data()) != CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnSpotrf failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
error_.CopyToCpu(&error, 1);
if (error < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
<< "please report it. "
<< "cuSolverDN::cusolverDnSpotrf fatal error. "
<< "Argument: " << -error << " is invalid.";
// The following line is unreachable, but return failure just to be
// pedantic, since the compiler does not know that.
return LinearSolverTerminationType::FATAL_ERROR;
}
if (error > 0) {
*message = StringPrintf(
"cuSolverDN::cusolverDnSpotrf numerical failure. "
"The leading minor of order %d is not positive definite.",
error);
factorize_result_ = LinearSolverTerminationType::FAILURE;
return LinearSolverTerminationType::FAILURE;
}
*message = "Success";
return LinearSolverTerminationType::SUCCESS;
}
LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::CudaCholeskySolve(
std::string* message) {
CHECK_EQ(cudaMemcpyAsync(correction_fp32_.data(),
residual_fp32_.data(),
num_cols_ * sizeof(float),
cudaMemcpyDeviceToDevice,
context_->DefaultStream()),
cudaSuccess);
if (cusolverDnSpotrs(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
1,
lhs_fp32_.data(),
num_cols_,
correction_fp32_.data(),
num_cols_,
error_.data()) != CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnDpotrs failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
error_.CopyToCpu(&error, 1);
if (error != 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it."
<< "cuSolverDN::cusolverDnDpotrs fatal error. "
<< "Argument: " << -error << " is invalid.";
}
*message = "Success";
return LinearSolverTerminationType::SUCCESS;
}
CUDADenseCholeskyMixedPrecision::CUDADenseCholeskyMixedPrecision(
ContextImpl* context, int max_num_refinement_iterations)
: context_(context),
lhs_fp64_{context},
rhs_fp64_{context},
lhs_fp32_{context},
device_workspace_{context},
error_(context, 1),
x_fp64_{context},
correction_fp32_{context},
residual_fp32_{context},
residual_fp64_{context},
max_num_refinement_iterations_(max_num_refinement_iterations) {}
LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::Factorize(
int num_cols, double* lhs, std::string* message) {
num_cols_ = num_cols;
// Copy fp64 version of lhs to GPU.
lhs_fp64_.Reserve(num_cols * num_cols);
lhs_fp64_.CopyFromCpu(lhs, num_cols * num_cols);
// Create an fp32 copy of lhs, lhs_fp32.
lhs_fp32_.Reserve(num_cols * num_cols);
CudaFP64ToFP32(lhs_fp64_.data(),
lhs_fp32_.data(),
num_cols * num_cols,
context_->DefaultStream());
// Factorize lhs_fp32.
factorize_result_ = CudaCholeskyFactorize(message);
return factorize_result_;
}
LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::Solve(
const double* rhs, double* solution, std::string* message) {
// If factorization failed, return failure.
if (factorize_result_ != LinearSolverTerminationType::SUCCESS) {
*message = "Factorize did not complete successfully previously.";
return factorize_result_;
}
// Reserve memory for all arrays.
rhs_fp64_.Reserve(num_cols_);
x_fp64_.Reserve(num_cols_);
correction_fp32_.Reserve(num_cols_);
residual_fp32_.Reserve(num_cols_);
residual_fp64_.Reserve(num_cols_);
// Initialize x = 0.
CudaSetZeroFP64(x_fp64_.data(), num_cols_, context_->DefaultStream());
// Initialize residual = rhs.
rhs_fp64_.CopyFromCpu(rhs, num_cols_);
residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_);
for (int i = 0; i <= max_num_refinement_iterations_; ++i) {
// Cast residual from fp64 to fp32.
CudaFP64ToFP32(residual_fp64_.data(),
residual_fp32_.data(),
num_cols_,
context_->DefaultStream());
// [fp32] c = lhs^-1 * residual.
auto result = CudaCholeskySolve(message);
if (result != LinearSolverTerminationType::SUCCESS) {
return result;
}
// [fp64] x += c.
CudaDsxpy(x_fp64_.data(),
correction_fp32_.data(),
num_cols_,
context_->DefaultStream());
if (i < max_num_refinement_iterations_) {
// [fp64] residual = rhs - lhs * x
// This is done in two steps:
// 1. [fp64] residual = rhs
residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_);
// 2. [fp64] residual = residual - lhs * x
double alpha = -1.0;
double beta = 1.0;
cublasDsymv(context_->cublas_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
&alpha,
lhs_fp64_.data(),
num_cols_,
x_fp64_.data(),
1,
&beta,
residual_fp64_.data(),
1);
}
}
x_fp64_.CopyToCpu(solution, num_cols_);
*message = "Success.";
return LinearSolverTerminationType::SUCCESS;
}
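
Stripped of the CUDA plumbing, the mixed-precision scheme above is: factorize a fp32 copy of A once, then iterate r = b - A*x in fp64, solve A*c = r in fp32, and accumulate x += c in fp64. An Eigen sketch of the same loop as a CPU stand-in (the real implementation runs the casts and solves on the GPU):

#include <iostream>
#include "Eigen/Dense"

int main() {
  Eigen::MatrixXd A(2, 2);
  A << 4.0, 1.0,
       1.0, 3.0;
  Eigen::VectorXd b(2);
  b << 1.0, 2.0;

  // Factorize a single-precision copy of A once.
  Eigen::MatrixXf A32 = A.cast<float>();
  Eigen::LLT<Eigen::MatrixXf> llt32(A32);

  Eigen::VectorXd x = Eigen::VectorXd::Zero(2);  // x = 0.
  Eigen::VectorXd r = b;                         // residual = rhs.
  const int max_num_refinement_iterations = 5;
  for (int i = 0; i <= max_num_refinement_iterations; ++i) {
    // [fp32] c = A^-1 * r, then [fp64] x += c.
    Eigen::VectorXf c = llt32.solve(r.cast<float>());
    x += c.cast<double>();
    // [fp64] residual = rhs - A * x.
    r = b - A * x;
  }
  std::cout << "residual norm: " << (b - A * x).norm() << "\n";
  return 0;
}
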
#endif // CERES_NO_CUDA
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,6 +40,7 @@
#include <vector>
#include "Eigen/Dense"
#include "ceres/context_impl.h"
#include "ceres/cuda_buffer.h"
#include "ceres/linear_solver.h"
#include "glog/logging.h"
@@ -49,8 +50,7 @@
#include "cusolverDn.h"
#endif // CERES_NO_CUDA
namespace ceres {
namespace internal {
namespace ceres::internal {
// An interface that abstracts away the internal details of various dense linear
// algebra libraries and offers a simple API for solving dense symmetric
@@ -88,7 +88,7 @@ class CERES_NO_EXPORT DenseCholesky {
std::string* message) = 0;
// Convenience method which combines a call to Factorize and Solve. Solve is
// only called if Factorize returns LINEAR_SOLVER_SUCCESS.
// only called if Factorize returns LinearSolverTerminationType::SUCCESS.
//
// The input matrix lhs may be modified by the implementation to store the
// factorization, irrespective of whether the method succeeds or not. It is
@@ -115,6 +115,23 @@ class CERES_NO_EXPORT EigenDenseCholesky final : public DenseCholesky {
std::unique_ptr<LLTType> llt_;
};
class CERES_NO_EXPORT FloatEigenDenseCholesky final : public DenseCholesky {
public:
LinearSolverTerminationType Factorize(int num_cols,
double* lhs,
std::string* message) override;
LinearSolverTerminationType Solve(const double* rhs,
double* solution,
std::string* message) override;
private:
Eigen::MatrixXf lhs_;
Eigen::VectorXf rhs_;
Eigen::VectorXf solution_;
using LLTType = Eigen::LLT<Eigen::MatrixXf, Eigen::Lower>;
std::unique_ptr<LLTType> llt_;
};
#ifndef CERES_NO_LAPACK
class CERES_NO_EXPORT LAPACKDenseCholesky final : public DenseCholesky {
public:
@@ -128,10 +145,53 @@ class CERES_NO_EXPORT LAPACKDenseCholesky final : public DenseCholesky {
private:
double* lhs_ = nullptr;
int num_cols_ = -1;
LinearSolverTerminationType termination_type_ = LINEAR_SOLVER_FATAL_ERROR;
LinearSolverTerminationType termination_type_ =
LinearSolverTerminationType::FATAL_ERROR;
};
class CERES_NO_EXPORT FloatLAPACKDenseCholesky final : public DenseCholesky {
public:
LinearSolverTerminationType Factorize(int num_cols,
double* lhs,
std::string* message) override;
LinearSolverTerminationType Solve(const double* rhs,
double* solution,
std::string* message) override;
private:
Eigen::MatrixXf lhs_;
Eigen::VectorXf rhs_and_solution_;
int num_cols_ = -1;
LinearSolverTerminationType termination_type_ =
LinearSolverTerminationType::FATAL_ERROR;
};
#endif // CERES_NO_LAPACK
class DenseIterativeRefiner;
// Computes an initial solution using the given instance of
// DenseCholesky, and then refines it using the DenseIterativeRefiner.
class CERES_NO_EXPORT RefinedDenseCholesky final : public DenseCholesky {
public:
RefinedDenseCholesky(
std::unique_ptr<DenseCholesky> dense_cholesky,
std::unique_ptr<DenseIterativeRefiner> iterative_refiner);
~RefinedDenseCholesky() override;
LinearSolverTerminationType Factorize(int num_cols,
double* lhs,
std::string* message) override;
LinearSolverTerminationType Solve(const double* rhs,
double* solution,
std::string* message) override;
private:
std::unique_ptr<DenseCholesky> dense_cholesky_;
std::unique_ptr<DenseIterativeRefiner> iterative_refiner_;
double* lhs_ = nullptr;
int num_cols_;
};
#ifndef CERES_NO_CUDA
// CUDA implementation of DenseCholesky using the cuSolverDN library using the
// 32-bit legacy interface for maximum compatibility.
@@ -149,16 +209,9 @@ class CERES_NO_EXPORT CUDADenseCholesky final : public DenseCholesky {
std::string* message) override;
private:
CUDADenseCholesky() = default;
// Picks up the cuSolverDN and cuStream handles from the context. If
// the context is unable to initialize CUDA, returns false with a
// human-readable message indicating the reason.
bool Init(ContextImpl* context, std::string* message);
explicit CUDADenseCholesky(ContextImpl* context);
// Handle to the cuSOLVER context.
cusolverDnHandle_t cusolver_handle_ = nullptr;
// CUDA device stream.
cudaStream_t stream_ = nullptr;
ContextImpl* context_ = nullptr;
// Number of columns in the A matrix, to be cached between calls to *Factorize
// and *Solve.
size_t num_cols_ = 0;
@@ -171,13 +224,85 @@ class CERES_NO_EXPORT CUDADenseCholesky final : public DenseCholesky {
// Required for error handling with cuSOLVER.
CudaBuffer<int> error_;
// Cache the result of Factorize to ensure that when Solve is called, the
// factiorization of lhs is valid.
LinearSolverTerminationType factorize_result_ = LINEAR_SOLVER_FATAL_ERROR;
// factorization of lhs is valid.
LinearSolverTerminationType factorize_result_ =
LinearSolverTerminationType::FATAL_ERROR;
};
// A mixed-precision iterative refinement dense Cholesky solver using FP32 CUDA
// Dense Cholesky for inner iterations, and FP64 outer refinements.
// This class implements a modified version of the "Classical iterative
// refinement" (Algorithm 4.1) from the following paper:
// Haidar, Azzam, Harun Bayraktar, Stanimire Tomov, Jack Dongarra, and Nicholas
// J. Higham. "Mixed-precision iterative refinement using tensor cores on GPUs
// to accelerate solution of linear systems." Proceedings of the Royal Society A
// 476, no. 2243 (2020): 20200110.
//
// The three key modifications from Algorithm 4.1 in the paper are:
// 1. We use Cholesky factorization instead of LU factorization since our A is
// symmetric positive definite.
// 2. During the solution update, the up-cast and accumulation is performed in
// one step with a custom kernel.
class CERES_NO_EXPORT CUDADenseCholeskyMixedPrecision final
: public DenseCholesky {
public:
static std::unique_ptr<CUDADenseCholeskyMixedPrecision> Create(
const LinearSolver::Options& options);
CUDADenseCholeskyMixedPrecision(const CUDADenseCholeskyMixedPrecision&) =
delete;
CUDADenseCholeskyMixedPrecision& operator=(
const CUDADenseCholeskyMixedPrecision&) = delete;
LinearSolverTerminationType Factorize(int num_cols,
double* lhs,
std::string* message) override;
LinearSolverTerminationType Solve(const double* rhs,
double* solution,
std::string* message) override;
private:
CUDADenseCholeskyMixedPrecision(ContextImpl* context,
int max_num_refinement_iterations);
// Helper function to wrap Cuda boilerplate needed to call Spotrf.
LinearSolverTerminationType CudaCholeskyFactorize(std::string* message);
// Helper function to wrap Cuda boilerplate needed to call Spotrs.
LinearSolverTerminationType CudaCholeskySolve(std::string* message);
// Picks up the cuSolverDN and cuStream handles from the context in the
// options, and the number of refinement iterations from the options. If
// the context is unable to initialize CUDA, returns false with a
// human-readable message indicating the reason.
bool Init(const LinearSolver::Options& options, std::string* message);
ContextImpl* context_ = nullptr;
// Number of columns in the A matrix, to be cached between calls to *Factorize
// and *Solve.
size_t num_cols_ = 0;
CudaBuffer<double> lhs_fp64_;
CudaBuffer<double> rhs_fp64_;
CudaBuffer<float> lhs_fp32_;
// Scratch space for cuSOLVER on the GPU.
CudaBuffer<float> device_workspace_;
// Required for error handling with cuSOLVER.
CudaBuffer<int> error_;
// Solution to lhs * x = rhs.
CudaBuffer<double> x_fp64_;
// Incremental correction to x.
CudaBuffer<float> correction_fp32_;
// Residual for the iterative refinement.
CudaBuffer<float> residual_fp32_;
CudaBuffer<double> residual_fp64_;
// Number of inner refinement iterations to perform.
int max_num_refinement_iterations_ = 0;
// Cache the result of Factorize to ensure that when Solve is called, the
// factorization of lhs is valid.
LinearSolverTerminationType factorize_result_ =
LinearSolverTerminationType::FATAL_ERROR;
};
#endif // CERES_NO_CUDA
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_DENSE_CHOLESKY_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -75,8 +75,8 @@ class CERES_NO_EXPORT DenseJacobianWriter {
DenseSparseMatrix* dense_jacobian = down_cast<DenseSparseMatrix*>(jacobian);
const ResidualBlock* residual_block =
program_->residual_blocks()[residual_id];
int num_parameter_blocks = residual_block->NumParameterBlocks();
int num_residuals = residual_block->NumResiduals();
const int num_parameter_blocks = residual_block->NumParameterBlocks();
const int num_residuals = residual_block->NumResiduals();
// Now copy the jacobians for each parameter into the dense jacobian matrix.
for (int j = 0; j < num_parameter_blocks; ++j) {

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -39,8 +39,7 @@
#include "ceres/types.h"
#include "ceres/wall_time.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
DenseNormalCholeskySolver::DenseNormalCholeskySolver(
LinearSolver::Options options)
@@ -87,5 +86,4 @@ LinearSolver::Summary DenseNormalCholeskySolver::SolveImpl(
return summary;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -41,8 +41,7 @@
#include "ceres/internal/export.h"
#include "ceres/linear_solver.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class DenseSparseMatrix;
@@ -94,8 +93,7 @@ class CERES_NO_EXPORT DenseNormalCholeskySolver
std::unique_ptr<DenseCholesky> cholesky_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -33,6 +33,7 @@
#include <algorithm>
#include <memory>
#include <string>
#ifndef CERES_NO_CUDA
#include "ceres/context_impl.h"
#include "cublas_v2.h"
@@ -98,7 +99,7 @@ extern "C" void dormqr_(const char* side, const char* trans, const int* m,
// a is a column major lda x n.
// b is a column major matrix of ldb x nrhs
//
// info = 0 succesful.
// info = 0 successful.
// = -i < 0 i^th argument is an illegal value.
// = i > 0, i^th diagonal element of A is zero.
extern "C" void dtrtrs_(const char* uplo, const char* trans, const char* diag,
@@ -108,8 +109,7 @@ extern "C" void dtrtrs_(const char* uplo, const char* trans, const char* diag,
#endif
namespace ceres {
namespace internal {
namespace ceres::internal {
DenseQR::~DenseQR() = default;
@@ -153,7 +153,7 @@ LinearSolverTerminationType DenseQR::FactorAndSolve(int num_rows,
std::string* message) {
LinearSolverTerminationType termination_type =
Factorize(num_rows, num_cols, lhs, message);
if (termination_type == LINEAR_SOLVER_SUCCESS) {
if (termination_type == LinearSolverTerminationType::SUCCESS) {
termination_type = Solve(rhs, solution, message);
}
return termination_type;
@@ -166,7 +166,7 @@ LinearSolverTerminationType EigenDenseQR::Factorize(int num_rows,
Eigen::Map<ColMajorMatrix> m(lhs, num_rows, num_cols);
qr_ = std::make_unique<QRType>(m);
*message = "Success.";
return LINEAR_SOLVER_SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
LinearSolverTerminationType EigenDenseQR::Solve(const double* rhs,
@@ -175,7 +175,7 @@ LinearSolverTerminationType EigenDenseQR::Solve(const double* rhs,
VectorRef(solution, qr_->cols()) =
qr_->solve(ConstVectorRef(rhs, qr_->rows()));
*message = "Success.";
return LINEAR_SOLVER_SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
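
EigenDenseQR wraps Eigen's Householder QR: for the least-squares problem min ||Ax - b||, factor A = QR and take x = R^-1 Q^T b, which qr.solve() performs in one call. A self-contained sketch, assuming Eigen is available:

#include <iostream>
#include "Eigen/Dense"

int main() {
  // Overdetermined system: 3 equations, 2 unknowns.
  Eigen::MatrixXd A(3, 2);
  A << 1.0, 1.0,
       1.0, 2.0,
       1.0, 3.0;
  Eigen::VectorXd b(3);
  b << 6.0, 0.0, 0.0;

  // Householder QR least-squares solve: x minimizes ||A x - b||.
  Eigen::HouseholderQR<Eigen::MatrixXd> qr(A);
  Eigen::VectorXd x = qr.solve(b);
  std::cout << x.transpose() << "\n";  // Expect 8 -3.
  return 0;
}
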
#ifndef CERES_NO_LAPACK
@@ -237,7 +237,7 @@ LinearSolverTerminationType LAPACKDenseQR::Factorize(int num_rows,
<< "Argument: " << -info << " is invalid.";
}
termination_type_ = LINEAR_SOLVER_SUCCESS;
termination_type_ = LinearSolverTerminationType::SUCCESS;
*message = "Success.";
return termination_type_;
}
@@ -245,7 +245,7 @@ LinearSolverTerminationType LAPACKDenseQR::Factorize(int num_rows,
LinearSolverTerminationType LAPACKDenseQR::Solve(const double* rhs,
double* solution,
std::string* message) {
if (termination_type_ != LINEAR_SOLVER_SUCCESS) {
if (termination_type_ != LinearSolverTerminationType::SUCCESS) {
*message = "QR factorization failed and solve called.";
return termination_type_;
}
@@ -298,10 +298,10 @@ LinearSolverTerminationType LAPACKDenseQR::Solve(const double* rhs,
*message =
"QR factorization failure. The factorization is not full rank. R has "
"zeros on the diagonal.";
termination_type_ = LINEAR_SOLVER_FAILURE;
termination_type_ = LinearSolverTerminationType::FAILURE;
} else {
std::copy_n(q_transpose_rhs_.data(), num_cols_, solution);
termination_type_ = LINEAR_SOLVER_SUCCESS;
termination_type_ = LinearSolverTerminationType::SUCCESS;
}
return termination_type_;
@@ -311,30 +311,26 @@ LinearSolverTerminationType LAPACKDenseQR::Solve(const double* rhs,
#ifndef CERES_NO_CUDA
bool CUDADenseQR::Init(ContextImpl* context, std::string* message) {
if (!context->InitCUDA(message)) {
return false;
}
cublas_handle_ = context->cublas_handle_;
cusolver_handle_ = context->cusolver_handle_;
stream_ = context->stream_;
error_.Reserve(1);
*message = "CUDADenseQR::Init Success.";
return true;
}
CUDADenseQR::CUDADenseQR(ContextImpl* context)
: context_(context),
lhs_{context},
rhs_{context},
tau_{context},
device_workspace_{context},
error_(context, 1) {}
LinearSolverTerminationType CUDADenseQR::Factorize(int num_rows,
int num_cols,
double* lhs,
std::string* message) {
factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
factorize_result_ = LinearSolverTerminationType::FATAL_ERROR;
lhs_.Reserve(num_rows * num_cols);
tau_.Reserve(std::min(num_rows, num_cols));
num_rows_ = num_rows;
num_cols_ = num_cols;
lhs_.CopyToGpuAsync(lhs, num_rows * num_cols, stream_);
lhs_.CopyFromCpu(lhs, num_rows * num_cols);
int device_workspace_size = 0;
if (cusolverDnDgeqrf_bufferSize(cusolver_handle_,
if (cusolverDnDgeqrf_bufferSize(context_->cusolver_handle_,
num_rows,
num_cols,
lhs_.data(),
@@ -342,10 +338,10 @@ LinearSolverTerminationType CUDADenseQR::Factorize(int num_rows,
&device_workspace_size) !=
CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnDgeqrf_bufferSize failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
}
device_workspace_.Reserve(device_workspace_size);
if (cusolverDnDgeqrf(cusolver_handle_,
if (cusolverDnDgeqrf(context_->cusolver_handle_,
num_rows,
num_cols,
lhs_.data(),
@@ -355,15 +351,10 @@ LinearSolverTerminationType CUDADenseQR::Factorize(int num_rows,
device_workspace_.size(),
error_.data()) != CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnDgeqrf failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
}
if (cudaDeviceSynchronize() != cudaSuccess ||
cudaStreamSynchronize(stream_) != cudaSuccess) {
*message = "Cuda device synchronization failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
error_.CopyToHost(&error, 1);
error_.CopyToCpu(&error, 1);
if (error < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
<< "please report it. "
@@ -371,24 +362,24 @@ LinearSolverTerminationType CUDADenseQR::Factorize(int num_rows,
<< "Argument: " << -error << " is invalid.";
// The following line is unreachable, but return failure just to be
// pedantic, since the compiler does not know that.
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
}
*message = "Success";
factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
factorize_result_ = LinearSolverTerminationType::SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
LinearSolverTerminationType CUDADenseQR::Solve(const double* rhs,
double* solution,
std::string* message) {
if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) {
*message = "Factorize did not complete succesfully previously.";
if (factorize_result_ != LinearSolverTerminationType::SUCCESS) {
*message = "Factorize did not complete successfully previously.";
return factorize_result_;
}
rhs_.CopyToGpuAsync(rhs, num_rows_, stream_);
rhs_.CopyFromCpu(rhs, num_rows_);
int device_workspace_size = 0;
if (cusolverDnDormqr_bufferSize(cusolver_handle_,
if (cusolverDnDormqr_bufferSize(context_->cusolver_handle_,
CUBLAS_SIDE_LEFT,
CUBLAS_OP_T,
num_rows_,
@@ -402,12 +393,12 @@ LinearSolverTerminationType CUDADenseQR::Solve(const double* rhs,
&device_workspace_size) !=
CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnDormqr_bufferSize failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
}
device_workspace_.Reserve(device_workspace_size);
// Compute rhs = Q^T * rhs, assuming that lhs has already been factorized.
// The result of factorization would have stored Q in a packed form in lhs_.
if (cusolverDnDormqr(cusolver_handle_,
if (cusolverDnDormqr(context_->cusolver_handle_,
CUBLAS_SIDE_LEFT,
CUBLAS_OP_T,
num_rows_,
@@ -422,10 +413,10 @@ LinearSolverTerminationType CUDADenseQR::Solve(const double* rhs,
device_workspace_.size(),
error_.data()) != CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnDormqr failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
error_.CopyToHost(&error, 1);
error_.CopyToCpu(&error, 1);
if (error < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it."
@@ -434,7 +425,7 @@ LinearSolverTerminationType CUDADenseQR::Solve(const double* rhs,
}
// Compute the solution vector as x = R \ (Q^T * rhs). Since the previous step
// replaced rhs by (Q^T * rhs), this is just x = R \ rhs.
if (cublasDtrsv(cublas_handle_,
if (cublasDtrsv(context_->cublas_handle_,
CUBLAS_FILL_MODE_UPPER,
CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT,
@@ -444,38 +435,22 @@ LinearSolverTerminationType CUDADenseQR::Solve(const double* rhs,
rhs_.data(),
1) != CUBLAS_STATUS_SUCCESS) {
*message = "cuBLAS::cublasDtrsv failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
}
if (cudaDeviceSynchronize() != cudaSuccess ||
cudaStreamSynchronize(stream_) != cudaSuccess) {
*message = "Cuda device synchronization failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
}
rhs_.CopyToHost(solution, num_cols_);
rhs_.CopyToCpu(solution, num_cols_);
*message = "Success";
return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
std::unique_ptr<CUDADenseQR> CUDADenseQR::Create(
const LinearSolver::Options& options) {
if (options.dense_linear_algebra_library_type != CUDA) {
// The user called the wrong factory method.
if (options.dense_linear_algebra_library_type != CUDA ||
options.context == nullptr || !options.context->IsCudaInitialized()) {
return nullptr;
}
auto cuda_dense_qr = std::unique_ptr<CUDADenseQR>(new CUDADenseQR());
std::string cuda_error;
if (cuda_dense_qr->Init(options.context, &cuda_error)) {
return cuda_dense_qr;
}
// Initialization failed, destroy the object (done automatically) and return a
// nullptr.
LOG(ERROR) << "CUDADenseQR::Init failed: " << cuda_error;
return nullptr;
return std::unique_ptr<CUDADenseQR>(new CUDADenseQR(options.context));
}
CUDADenseQR::CUDADenseQR() = default;
#endif // CERES_NO_CUDA
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,6 +40,7 @@
#include <vector>
#include "Eigen/Dense"
#include "ceres/context_impl.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
@@ -54,8 +55,7 @@
#include "cusolverDn.h"
#endif // CERES_NO_CUDA
namespace ceres {
namespace internal {
namespace ceres::internal {
// An interface that abstracts away the internal details of various dense linear
// algebra libraries and offers a simple API for solving dense linear systems
@@ -92,7 +92,7 @@ class CERES_NO_EXPORT DenseQR {
std::string* message) = 0;
// Convenience method which combines a call to Factorize and Solve. Solve is
// only called if Factorize returns LINEAR_SOLVER_SUCCESS.
// only called if Factorize returns LinearSolverTerminationType::SUCCESS.
//
// The input matrix lhs may be modified by the implementation to store the
// factorization, irrespective of whether the method succeeds or not. It is
@@ -136,7 +136,8 @@ class CERES_NO_EXPORT LAPACKDenseQR final : public DenseQR {
double* lhs_ = nullptr;
int num_rows_;
int num_cols_;
LinearSolverTerminationType termination_type_ = LINEAR_SOLVER_FATAL_ERROR;
LinearSolverTerminationType termination_type_ =
LinearSolverTerminationType::FATAL_ERROR;
Vector work_;
Vector tau_;
Vector q_transpose_rhs_;
@@ -164,18 +165,9 @@ class CERES_NO_EXPORT CUDADenseQR final : public DenseQR {
std::string* message) override;
private:
CUDADenseQR();
// Picks up the cuSolverDN, cuBLAS, and cuStream handles from the context. If
// the context is unable to initialize CUDA, returns false with a
// human-readable message indicating the reason.
bool Init(ContextImpl* context, std::string* message);
explicit CUDADenseQR(ContextImpl* context);
// Handle to the cuSOLVER context.
cusolverDnHandle_t cusolver_handle_ = nullptr;
// Handle to cuBLAS context.
cublasHandle_t cublas_handle_ = nullptr;
// CUDA device stream.
cudaStream_t stream_ = nullptr;
ContextImpl* context_ = nullptr;
// Number of rows in the A matrix, to be cached between calls to *Factorize
// and *Solve.
size_t num_rows_ = 0;
@@ -194,13 +186,13 @@ class CERES_NO_EXPORT CUDADenseQR final : public DenseQR {
CudaBuffer<int> error_;
// Cache the result of Factorize to ensure that when Solve is called, the
// factorization of lhs is valid.
LinearSolverTerminationType factorize_result_ = LINEAR_SOLVER_FATAL_ERROR;
LinearSolverTerminationType factorize_result_ =
LinearSolverTerminationType::FATAL_ERROR;
};
#endif // CERES_NO_CUDA
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,8 +40,7 @@
#include "ceres/types.h"
#include "ceres/wall_time.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
DenseQRSolver::DenseQRSolver(const LinearSolver::Options& options)
: options_(options), dense_qr_(DenseQR::Create(options)) {}
@@ -81,5 +80,4 @@ LinearSolver::Summary DenseQRSolver::SolveImpl(
return summary;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,8 +40,7 @@
#include "ceres/internal/export.h"
#include "ceres/linear_solver.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class DenseSparseMatrix;
@@ -112,8 +111,7 @@ class CERES_NO_EXPORT DenseQRSolver final : public DenseSparseMatrixSolver {
std::unique_ptr<DenseQR> dense_qr_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -38,8 +38,7 @@
#include "ceres/triplet_sparse_matrix.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
DenseSparseMatrix::DenseSparseMatrix(int num_rows, int num_cols)
: m_(Matrix(num_rows, num_cols)) {}
@@ -60,17 +59,31 @@ DenseSparseMatrix::DenseSparseMatrix(Matrix m) : m_(std::move(m)) {}
void DenseSparseMatrix::SetZero() { m_.setZero(); }
void DenseSparseMatrix::RightMultiply(const double* x, double* y) const {
VectorRef(y, num_rows()) += matrix() * ConstVectorRef(x, num_cols());
void DenseSparseMatrix::RightMultiplyAndAccumulate(const double* x,
double* y) const {
VectorRef(y, num_rows()).noalias() += m_ * ConstVectorRef(x, num_cols());
}
void DenseSparseMatrix::LeftMultiply(const double* x, double* y) const {
VectorRef(y, num_cols()) +=
matrix().transpose() * ConstVectorRef(x, num_rows());
void DenseSparseMatrix::LeftMultiplyAndAccumulate(const double* x,
double* y) const {
VectorRef(y, num_cols()).noalias() +=
m_.transpose() * ConstVectorRef(x, num_rows());
}
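
The renamed multiply routines accumulate into y instead of overwriting it, and .noalias() promises Eigen that the destination does not alias the operands, so the product can be evaluated straight into y without a temporary. A tiny sketch of the same pattern:

#include <iostream>
#include "Eigen/Dense"

int main() {
  Eigen::MatrixXd m(2, 2);
  m << 1.0, 2.0,
       3.0, 4.0;
  Eigen::VectorXd x(2), y(2);
  x << 1.0, 1.0;
  y << 10.0, 10.0;

  // y += m * x without a temporary: safe because y does not appear
  // on the right-hand side of the product.
  y.noalias() += m * x;
  std::cout << y.transpose() << "\n";  // 13 17
  return 0;
}
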
void DenseSparseMatrix::SquaredColumnNorm(double* x) const {
VectorRef(x, num_cols()) = m_.colwise().squaredNorm();
// This implementation is 3x faster than the naive version
// x = m_.colwise().square().sum(), likely because m_
// is a row major matrix.
const int num_rows = m_.rows();
const int num_cols = m_.cols();
std::fill_n(x, num_cols, 0.0);
const double* m = m_.data();
for (int i = 0; i < num_rows; ++i) {
for (int j = 0; j < num_cols; ++j, ++m) {
x[j] += (*m) * (*m);
}
}
}
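
The hand-rolled loop wins because Ceres stores m_ row-major: advancing a single data pointer touches memory strictly sequentially while still accumulating per-column sums. A standalone sketch of the access pattern, assuming Eigen (the 3x figure above is workload-dependent, so treat it as indicative):

#include <iostream>
#include <vector>
#include "Eigen/Dense"

using RowMajorMatrix =
    Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;

int main() {
  RowMajorMatrix m = RowMajorMatrix::Random(1000, 50);

  // Sequential traversal of row-major storage, accumulating column sums.
  std::vector<double> x(m.cols(), 0.0);
  const double* p = m.data();
  for (int i = 0; i < m.rows(); ++i) {
    for (int j = 0; j < m.cols(); ++j, ++p) {
      x[j] += (*p) * (*p);
    }
  }

  // Reference: Eigen's columnwise squared norms should match exactly.
  Eigen::RowVectorXd ref = m.colwise().squaredNorm();
  std::cout << (Eigen::Map<Eigen::RowVectorXd>(x.data(), m.cols()) - ref).norm()
            << "\n";
  return 0;
}
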
void DenseSparseMatrix::ScaleColumns(const double* scale) {
@@ -100,5 +113,4 @@ void DenseSparseMatrix::ToTextFile(FILE* file) const {
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -39,8 +39,7 @@
#include "ceres/sparse_matrix.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class TripletSparseMatrix;
@@ -54,8 +53,8 @@ class CERES_NO_EXPORT DenseSparseMatrix final : public SparseMatrix {
// SparseMatrix interface.
void SetZero() final;
void RightMultiply(const double* x, double* y) const final;
void LeftMultiply(const double* x, double* y) const final;
void RightMultiplyAndAccumulate(const double* x, double* y) const final;
void LeftMultiplyAndAccumulate(const double* x, double* y) const final;
void SquaredColumnNorm(double* x) const final;
void ScaleColumns(const double* scale) final;
void ToDenseMatrix(Matrix* dense_matrix) const final;
@@ -73,8 +72,7 @@ class CERES_NO_EXPORT DenseSparseMatrix final : public SparseMatrix {
Matrix m_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -33,8 +33,7 @@
#include "ceres/internal/eigen.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
void DetectStructure(const CompressedRowBlockStructure& bs,
const int num_eliminate_blocks,
@@ -119,5 +118,4 @@ void DetectStructure(const CompressedRowBlockStructure& bs,
// clang-format on
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -35,8 +35,7 @@
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Detect static blocks in the problem sparsity. For rows containing
// e_blocks, we are interested in detecting if the size of the row
@@ -63,8 +62,7 @@ void CERES_NO_EXPORT DetectStructure(const CompressedRowBlockStructure& bs,
int* e_block_size,
int* f_block_size);
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -44,8 +44,7 @@
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
namespace {
const double kMaxMu = 1.0;
const double kMinMu = 1e-8;
@@ -101,7 +100,7 @@ TrustRegionStrategy::Summary DoglegStrategy::ComputeStep(
}
TrustRegionStrategy::Summary summary;
summary.num_iterations = 0;
summary.termination_type = LINEAR_SOLVER_SUCCESS;
summary.termination_type = LinearSolverTerminationType::SUCCESS;
return summary;
}
@@ -138,11 +137,13 @@ TrustRegionStrategy::Summary DoglegStrategy::ComputeStep(
summary.num_iterations = linear_solver_summary.num_iterations;
summary.termination_type = linear_solver_summary.termination_type;
if (linear_solver_summary.termination_type == LINEAR_SOLVER_FATAL_ERROR) {
if (linear_solver_summary.termination_type ==
LinearSolverTerminationType::FATAL_ERROR) {
return summary;
}
if (linear_solver_summary.termination_type != LINEAR_SOLVER_FAILURE) {
if (linear_solver_summary.termination_type !=
LinearSolverTerminationType::FAILURE) {
switch (dogleg_type_) {
// Interpolate the Cauchy point and the Gauss-Newton step.
case TRADITIONAL_DOGLEG:
@@ -153,7 +154,7 @@ TrustRegionStrategy::Summary DoglegStrategy::ComputeStep(
// Cauchy point and the (Gauss-)Newton step.
case SUBSPACE_DOGLEG:
if (!ComputeSubspaceModel(jacobian)) {
summary.termination_type = LINEAR_SOLVER_FAILURE;
summary.termination_type = LinearSolverTerminationType::FAILURE;
break;
}
ComputeSubspaceDoglegStep(step);
@@ -174,7 +175,7 @@ TrustRegionStrategy::Summary DoglegStrategy::ComputeStep(
void DoglegStrategy::ComputeGradient(SparseMatrix* jacobian,
const double* residuals) {
gradient_.setZero();
jacobian->LeftMultiply(residuals, gradient_.data());
jacobian->LeftMultiplyAndAccumulate(residuals, gradient_.data());
gradient_.array() /= diagonal_.array();
}
@@ -187,7 +188,7 @@ void DoglegStrategy::ComputeCauchyPoint(SparseMatrix* jacobian) {
// The Jacobian is scaled implicitly by computing J * (D^-1 * (D^-1 * g))
// instead of (J * D^-1) * (D^-1 * g).
Vector scaled_gradient = (gradient_.array() / diagonal_.array()).matrix();
jacobian->RightMultiply(scaled_gradient.data(), Jg.data());
jacobian->RightMultiplyAndAccumulate(scaled_gradient.data(), Jg.data());
alpha_ = gradient_.squaredNorm() / Jg.squaredNorm();
}
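
For reference, the alpha_ computed here is the exact minimizer of the Gauss-Newton model along the steepest-descent direction g = J^T r, up to the diagonal scaling the code folds into g and J; a one-line derivation in LaTeX:

\[
m(-\alpha g) \;=\; \tfrac{1}{2}\,\lVert r - \alpha J g\rVert^{2}
\;=\; \tfrac{1}{2}\,\alpha^{2}\lVert J g\rVert^{2} - \alpha\,\lVert g\rVert^{2}
      + \tfrac{1}{2}\,\lVert r\rVert^{2},
\qquad \text{since } r^{\top} J g = \lVert g\rVert^{2},
\]
\[
\frac{dm}{d\alpha} = 0
\;\Longrightarrow\;
\alpha^{*} = \frac{\lVert g\rVert^{2}}{\lVert J g\rVert^{2}}.
\]
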
@@ -518,7 +519,7 @@ LinearSolver::Summary DoglegStrategy::ComputeGaussNewtonStep(
const double* residuals) {
const int n = jacobian->num_cols();
LinearSolver::Summary linear_solver_summary;
linear_solver_summary.termination_type = LINEAR_SOLVER_FAILURE;
linear_solver_summary.termination_type = LinearSolverTerminationType::FAILURE;
// The Jacobian matrix is often quite poorly conditioned. Thus it is
// necessary to add a diagonal matrix at the bottom to prevent the
@@ -531,7 +532,7 @@ LinearSolver::Summary DoglegStrategy::ComputeGaussNewtonStep(
// If the solve fails, the multiplier to the diagonal is increased
// up to max_mu_ by a factor of mu_increase_factor_ every time. If
// the linear solver is still not successful, the strategy returns
// with LINEAR_SOLVER_FAILURE.
// with LinearSolverTerminationType::FAILURE.
//
// Next time when a new Gauss-Newton step is requested, the
// multiplier starts out from the last successful solve.
@@ -582,21 +583,25 @@ LinearSolver::Summary DoglegStrategy::ComputeGaussNewtonStep(
}
}
if (linear_solver_summary.termination_type == LINEAR_SOLVER_FATAL_ERROR) {
if (linear_solver_summary.termination_type ==
LinearSolverTerminationType::FATAL_ERROR) {
return linear_solver_summary;
}
if (linear_solver_summary.termination_type == LINEAR_SOLVER_FAILURE ||
if (linear_solver_summary.termination_type ==
LinearSolverTerminationType::FAILURE ||
!IsArrayValid(n, gauss_newton_step_.data())) {
mu_ *= mu_increase_factor_;
VLOG(2) << "Increasing mu " << mu_;
linear_solver_summary.termination_type = LINEAR_SOLVER_FAILURE;
linear_solver_summary.termination_type =
LinearSolverTerminationType::FAILURE;
continue;
}
break;
}
if (linear_solver_summary.termination_type != LINEAR_SOLVER_FAILURE) {
if (linear_solver_summary.termination_type !=
LinearSolverTerminationType::FAILURE) {
// The scaled Gauss-Newton step is D * GN:
//
// - (D^-1 J^T J D^-1)^-1 (D^-1 g)
@@ -627,7 +632,7 @@ void DoglegStrategy::StepAccepted(double step_quality) {
reuse_ = false;
}
void DoglegStrategy::StepRejected(double step_quality) {
void DoglegStrategy::StepRejected(double /*step_quality*/) {
radius_ *= 0.5;
reuse_ = true;
}
@@ -701,14 +706,13 @@ bool DoglegStrategy::ComputeSubspaceModel(SparseMatrix* jacobian) {
Vector tmp;
tmp = (subspace_basis_.col(0).array() / diagonal_.array()).matrix();
jacobian->RightMultiply(tmp.data(), Jb.row(0).data());
jacobian->RightMultiplyAndAccumulate(tmp.data(), Jb.row(0).data());
tmp = (subspace_basis_.col(1).array() / diagonal_.array()).matrix();
jacobian->RightMultiply(tmp.data(), Jb.row(1).data());
jacobian->RightMultiplyAndAccumulate(tmp.data(), Jb.row(1).data());
subspace_B_ = Jb * Jb.transpose();
return true;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -36,8 +36,7 @@
#include "ceres/linear_solver.h"
#include "ceres/trust_region_strategy.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Dogleg step computation and trust region sizing strategy based on
// on "Methods for Nonlinear Least Squares" by K. Madsen, H.B. Nielsen
@@ -159,8 +158,7 @@ class CERES_NO_EXPORT DoglegStrategy final : public TrustRegionStrategy {
Matrix2d subspace_B_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,14 @@
//
// Author: richie.stebbing@gmail.com (Richard Stebbing)
#ifndef CERES_INTERNAL_DYNAMIC_COMPRESED_ROW_FINALIZER_H_
#define CERES_INTERNAL_DYNAMIC_COMPRESED_ROW_FINALIZER_H_
#ifndef CERES_INTERNAL_DYNAMIC_COMPRESSED_ROW_FINALIZER_H_
#define CERES_INTERNAL_DYNAMIC_COMPRESSED_ROW_FINALIZER_H_
#include "ceres/casts.h"
#include "ceres/dynamic_compressed_row_sparse_matrix.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
struct CERES_NO_EXPORT DynamicCompressedRowJacobianFinalizer {
void operator()(SparseMatrix* base_jacobian, int num_parameters) {
@@ -46,7 +45,6 @@ struct CERES_NO_EXPORT DynamicCompressedRowJacobianFinalizer {
}
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_DYNAMIC_COMPRESED_ROW_FINALISER_H_
#endif // CERES_INTERNAL_DYNAMIC_COMPRESSED_ROW_FINALIZER_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -31,6 +31,8 @@
#include "ceres/dynamic_compressed_row_jacobian_writer.h"
#include <memory>
#include <utility>
#include <vector>
#include "ceres/casts.h"
#include "ceres/compressed_row_jacobian_writer.h"
@@ -39,11 +41,7 @@
#include "ceres/program.h"
#include "ceres/residual_block.h"
namespace ceres {
namespace internal {
using std::pair;
using std::vector;
namespace ceres::internal {
std::unique_ptr<ScratchEvaluatePreparer[]>
DynamicCompressedRowJacobianWriter::CreateEvaluatePreparers(int num_threads) {
@@ -69,7 +67,7 @@ void DynamicCompressedRowJacobianWriter::Write(int residual_id,
program_->residual_blocks()[residual_id];
const int num_residuals = residual_block->NumResiduals();
vector<pair<int, int>> evaluated_jacobian_blocks;
std::vector<std::pair<int, int>> evaluated_jacobian_blocks;
CompressedRowJacobianWriter::GetOrderedParameterBlocks(
program_, residual_id, &evaluated_jacobian_blocks);
@@ -100,5 +98,4 @@ void DynamicCompressedRowJacobianWriter::Write(int residual_id,
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,8 +40,7 @@
#include "ceres/internal/export.h"
#include "ceres/scratch_evaluate_preparer.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class Program;
class SparseMatrix;
@@ -68,7 +67,7 @@ class CERES_NO_EXPORT DynamicCompressedRowJacobianWriter {
// Write only the non-zero jacobian entries for a residual block
// (specified by `residual_id`) into `base_jacobian`, starting at the row
// specifed by `residual_offset`.
// specified by `residual_offset`.
//
// This method is thread-safe over residual blocks (each `residual_id`).
void Write(int residual_id,
@@ -80,7 +79,6 @@ class CERES_NO_EXPORT DynamicCompressedRowJacobianWriter {
Program* program_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_DYNAMIC_COMPRESSED_ROW_JACOBIAN_WRITER_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,8 +32,7 @@
#include <cstring>
namespace ceres {
namespace internal {
namespace ceres::internal {
DynamicCompressedRowSparseMatrix::DynamicCompressedRowSparseMatrix(
int num_rows, int num_cols, int initial_max_num_nonzeros)
@@ -99,5 +98,4 @@ void DynamicCompressedRowSparseMatrix::Finalize(int num_additional_elements) {
<< "the number of jacobian nonzeros. Please contact the developers!";
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -47,13 +47,12 @@
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class CERES_NO_EXPORT DynamicCompressedRowSparseMatrix final
: public CompressedRowSparseMatrix {
public:
// Set the number of rows and columns for the underlyig
// Set the number of rows and columns for the underlying
// `CompressedRowSparseMatrix` and set the initial number of maximum non-zero
// entries. Note that following the insertion of entries, when `Finalize`
// is called the number of non-zeros is determined and all internal
@@ -74,7 +73,7 @@ class CERES_NO_EXPORT DynamicCompressedRowSparseMatrix final
// Insert an entry at a given row and column position. This method is
// thread-safe across rows i.e. different threads can insert values
// simultaneously into different rows. It should be emphasised that this
// simultaneously into different rows. It should be emphasized that this
// method always inserts a new entry and does not check for existing
// entries at the specified row and column position. Duplicate entries
// for a given row and column position will result in undefined
@@ -98,8 +97,7 @@ class CERES_NO_EXPORT DynamicCompressedRowSparseMatrix final
std::vector<std::vector<double>> dynamic_values_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -39,7 +39,6 @@
#include "Eigen/SparseCore"
#include "ceres/compressed_row_sparse_matrix.h"
#include "ceres/cxsparse.h"
#include "ceres/internal/config.h"
#include "ceres/internal/eigen.h"
#include "ceres/linear_solver.h"
@@ -52,8 +51,7 @@
#include "Eigen/SparseCholesky"
#endif
namespace ceres {
namespace internal {
namespace ceres::internal {
DynamicSparseNormalCholeskySolver::DynamicSparseNormalCholeskySolver(
LinearSolver::Options options)
@@ -66,7 +64,7 @@ LinearSolver::Summary DynamicSparseNormalCholeskySolver::SolveImpl(
double* x) {
const int num_cols = A->num_cols();
VectorRef(x, num_cols).setZero();
A->LeftMultiply(b, x);
A->LeftMultiplyAndAccumulate(b, x);
if (per_solve_options.D != nullptr) {
// Temporarily append a diagonal block to the A matrix, but undo
@@ -87,9 +85,6 @@ LinearSolver::Summary DynamicSparseNormalCholeskySolver::SolveImpl(
case SUITE_SPARSE:
summary = SolveImplUsingSuiteSparse(A, x);
break;
case CX_SPARSE:
summary = SolveImplUsingCXSparse(A, x);
break;
case EIGEN_SPARSE:
summary = SolveImplUsingEigen(A, x);
break;
@@ -113,7 +108,7 @@ LinearSolver::Summary DynamicSparseNormalCholeskySolver::SolveImplUsingEigen(
LinearSolver::Summary summary;
summary.num_iterations = 0;
summary.termination_type = LINEAR_SOLVER_FATAL_ERROR;
summary.termination_type = LinearSolverTerminationType::FATAL_ERROR;
summary.message =
"SPARSE_NORMAL_CHOLESKY cannot be used with EIGEN_SPARSE "
"because Ceres was not built with support for "
@@ -138,7 +133,7 @@ LinearSolver::Summary DynamicSparseNormalCholeskySolver::SolveImplUsingEigen(
LinearSolver::Summary summary;
summary.num_iterations = 1;
summary.termination_type = LINEAR_SOLVER_SUCCESS;
summary.termination_type = LinearSolverTerminationType::SUCCESS;
summary.message = "Success.";
solver.analyzePattern(lhs);
@@ -150,7 +145,7 @@ LinearSolver::Summary DynamicSparseNormalCholeskySolver::SolveImplUsingEigen(
event_logger.AddEvent("Analyze");
if (solver.info() != Eigen::Success) {
summary.termination_type = LINEAR_SOLVER_FATAL_ERROR;
summary.termination_type = LinearSolverTerminationType::FATAL_ERROR;
summary.message = "Eigen failure. Unable to find symbolic factorization.";
return summary;
}
@@ -158,7 +153,7 @@ LinearSolver::Summary DynamicSparseNormalCholeskySolver::SolveImplUsingEigen(
solver.factorize(lhs);
event_logger.AddEvent("Factorize");
if (solver.info() != Eigen::Success) {
summary.termination_type = LINEAR_SOLVER_FAILURE;
summary.termination_type = LinearSolverTerminationType::FAILURE;
summary.message = "Eigen failure. Unable to find numeric factorization.";
return summary;
}
@@ -167,7 +162,7 @@ LinearSolver::Summary DynamicSparseNormalCholeskySolver::SolveImplUsingEigen(
VectorRef(rhs_and_solution, lhs.cols()) = solver.solve(rhs);
event_logger.AddEvent("Solve");
if (solver.info() != Eigen::Success) {
summary.termination_type = LINEAR_SOLVER_FAILURE;
summary.termination_type = LinearSolverTerminationType::FAILURE;
summary.message = "Eigen failure. Unable to do triangular solve.";
return summary;
}
@@ -176,66 +171,16 @@ LinearSolver::Summary DynamicSparseNormalCholeskySolver::SolveImplUsingEigen(
#endif // CERES_USE_EIGEN_SPARSE
}
LinearSolver::Summary DynamicSparseNormalCholeskySolver::SolveImplUsingCXSparse(
CompressedRowSparseMatrix* A, double* rhs_and_solution) {
#ifdef CERES_NO_CXSPARSE
LinearSolver::Summary summary;
summary.num_iterations = 0;
summary.termination_type = LINEAR_SOLVER_FATAL_ERROR;
summary.message =
"SPARSE_NORMAL_CHOLESKY cannot be used with CX_SPARSE "
"because Ceres was not built with support for CXSparse. "
"This requires enabling building with -DCXSPARSE=ON.";
return summary;
#else
EventLogger event_logger(
"DynamicSparseNormalCholeskySolver::CXSparse::Solve");
LinearSolver::Summary summary;
summary.num_iterations = 1;
summary.termination_type = LINEAR_SOLVER_SUCCESS;
summary.message = "Success.";
CXSparse cxsparse;
// Wrap the augmented Jacobian in a compressed sparse column matrix.
cs_di a_transpose = cxsparse.CreateSparseMatrixTransposeView(A);
// Compute the normal equations. J'J delta = J'f and solve them
// using a sparse Cholesky factorization. Notice that when compared
// to SuiteSparse we have to explicitly compute the transpose of Jt,
// and then the normal equations before they can be
// factorized. CHOLMOD/SuiteSparse on the other hand can just work
// off of Jt to compute the Cholesky factorization of the normal
// equations.
cs_di* a = cxsparse.TransposeMatrix(&a_transpose);
cs_di* lhs = cxsparse.MatrixMatrixMultiply(&a_transpose, a);
cxsparse.Free(a);
event_logger.AddEvent("NormalEquations");
if (!cxsparse.SolveCholesky(lhs, rhs_and_solution)) {
summary.termination_type = LINEAR_SOLVER_FAILURE;
summary.message = "CXSparse::SolveCholesky failed";
}
event_logger.AddEvent("Solve");
cxsparse.Free(lhs);
event_logger.AddEvent("TearDown");
return summary;
#endif
}
LinearSolver::Summary
DynamicSparseNormalCholeskySolver::SolveImplUsingSuiteSparse(
CompressedRowSparseMatrix* A, double* rhs_and_solution) {
#ifdef CERES_NO_SUITESPARSE
(void)A;
(void)rhs_and_solution;
LinearSolver::Summary summary;
summary.num_iterations = 0;
summary.termination_type = LINEAR_SOLVER_FATAL_ERROR;
summary.termination_type = LinearSolverTerminationType::FATAL_ERROR;
summary.message =
"SPARSE_NORMAL_CHOLESKY cannot be used with SUITE_SPARSE "
"because Ceres was not built with support for SuiteSparse. "
@@ -247,7 +192,7 @@ DynamicSparseNormalCholeskySolver::SolveImplUsingSuiteSparse(
EventLogger event_logger(
"DynamicSparseNormalCholeskySolver::SuiteSparse::Solve");
LinearSolver::Summary summary;
summary.termination_type = LINEAR_SOLVER_SUCCESS;
summary.termination_type = LinearSolverTerminationType::SUCCESS;
summary.num_iterations = 1;
summary.message = "Success.";
@@ -255,16 +200,17 @@ DynamicSparseNormalCholeskySolver::SolveImplUsingSuiteSparse(
const int num_cols = A->num_cols();
cholmod_sparse lhs = ss.CreateSparseMatrixTransposeView(A);
event_logger.AddEvent("Setup");
cholmod_factor* factor = ss.AnalyzeCholesky(&lhs, &summary.message);
cholmod_factor* factor =
ss.AnalyzeCholesky(&lhs, options_.ordering_type, &summary.message);
event_logger.AddEvent("Analysis");
if (factor == nullptr) {
summary.termination_type = LINEAR_SOLVER_FATAL_ERROR;
summary.termination_type = LinearSolverTerminationType::FATAL_ERROR;
return summary;
}
summary.termination_type = ss.Cholesky(&lhs, factor, &summary.message);
if (summary.termination_type == LINEAR_SOLVER_SUCCESS) {
if (summary.termination_type == LinearSolverTerminationType::SUCCESS) {
cholmod_dense cholmod_rhs =
ss.CreateDenseVectorView(rhs_and_solution, num_cols);
cholmod_dense* solution = ss.Solve(factor, &cholmod_rhs, &summary.message);
@@ -274,7 +220,7 @@ DynamicSparseNormalCholeskySolver::SolveImplUsingSuiteSparse(
rhs_and_solution, solution->x, num_cols * sizeof(*rhs_and_solution));
ss.Free(solution);
} else {
summary.termination_type = LINEAR_SOLVER_FAILURE;
summary.termination_type = LinearSolverTerminationType::FAILURE;
}
}
@@ -285,5 +231,4 @@ DynamicSparseNormalCholeskySolver::SolveImplUsingSuiteSparse(
#endif
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
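
For reference, a short sketch of the system the solver above assembles and factorizes, read directly off the code (J is the Jacobian A, f the right-hand side b, and D the optional per-solve regularization diagonal):

    (J^\top J + D^\top D) \, \Delta x = J^\top f

The initial LeftMultiplyAndAccumulate call computes the right-hand side J^\top f, and temporarily appending D to A before the Cholesky factorization supplies the D^\top D term.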

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -42,8 +42,7 @@
#include "ceres/internal/export.h"
#include "ceres/linear_solver.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class CompressedRowSparseMatrix;
@@ -77,7 +76,6 @@ class CERES_NO_EXPORT DynamicSparseNormalCholeskySolver
const LinearSolver::Options options_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_DYNAMIC_SPARSE_NORMAL_CHOLESKY_SOLVER_H_

View File

@@ -0,0 +1,105 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)
#ifndef CERES_INTERNAL_EIGEN_VECTOR_OPS_H_
#define CERES_INTERNAL_EIGEN_VECTOR_OPS_H_
#include <numeric>
#include "ceres/internal/eigen.h"
#include "ceres/internal/fixed_array.h"
#include "ceres/parallel_for.h"
#include "ceres/parallel_vector_ops.h"
namespace ceres::internal {
// Blas1 operations on Eigen vectors. These functions are needed as an
// abstraction layer so that we can use different versions of a vector-style
// object in the conjugate gradients linear solver.
template <typename Derived>
inline double Norm(const Eigen::DenseBase<Derived>& x,
ContextImpl* context,
int num_threads) {
FixedArray<double> norms(num_threads, 0.);
ParallelFor(
context,
0,
x.rows(),
num_threads,
[&x, &norms](int thread_id, std::tuple<int, int> range) {
auto [start, end] = range;
norms[thread_id] += x.segment(start, end - start).squaredNorm();
},
kMinBlockSizeParallelVectorOps);
return std::sqrt(std::accumulate(norms.begin(), norms.end(), 0.));
}
inline void SetZero(Vector& x, ContextImpl* context, int num_threads) {
ParallelSetZero(context, num_threads, x);
}
inline void Axpby(double a,
const Vector& x,
double b,
const Vector& y,
Vector& z,
ContextImpl* context,
int num_threads) {
ParallelAssign(context, num_threads, z, a * x + b * y);
}
template <typename VectorLikeX, typename VectorLikeY>
inline double Dot(const VectorLikeX& x,
const VectorLikeY& y,
ContextImpl* context,
int num_threads) {
FixedArray<double> dots(num_threads, 0.);
ParallelFor(
context,
0,
x.rows(),
num_threads,
[&x, &y, &dots](int thread_id, std::tuple<int, int> range) {
auto [start, end] = range;
const int block_size = end - start;
const auto& x_block = x.segment(start, block_size);
const auto& y_block = y.segment(start, block_size);
dots[thread_id] += x_block.dot(y_block);
},
kMinBlockSizeParallelVectorOps);
return std::accumulate(dots.begin(), dots.end(), 0.);
}
inline void Copy(const Vector& from,
Vector& to,
ContextImpl* context,
int num_threads) {
ParallelAssign(context, num_threads, to, from);
}
} // namespace ceres::internal
#endif // CERES_INTERNAL_EIGEN_VECTOR_OPS_H_
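
A standalone sketch of the blocked-reduction pattern that Dot() and Norm() above implement: each worker accumulates a partial result over its own contiguous range, and the per-thread partials are combined at the end. Plain std::thread stands in for ceres::internal::ParallelFor here, so this illustrates the idea rather than the library code itself.

#include <numeric>
#include <thread>
#include <vector>

double BlockedDot(const std::vector<double>& x,
                  const std::vector<double>& y,
                  int num_threads) {
  const int n = static_cast<int>(x.size());
  std::vector<double> partials(num_threads, 0.0);  // one slot per thread
  std::vector<std::thread> workers;
  for (int t = 0; t < num_threads; ++t) {
    workers.emplace_back([&, t] {
      // Each worker owns the contiguous slice [start, end) of the vectors.
      const int start = t * n / num_threads;
      const int end = (t + 1) * n / num_threads;
      for (int i = start; i < end; ++i) partials[t] += x[i] * y[i];
    });
  }
  for (auto& w : workers) w.join();
  // Norm(x) is then simply sqrt(BlockedDot(x, x, num_threads)).
  return std::accumulate(partials.begin(), partials.end(), 0.0);
}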

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -36,22 +36,25 @@
#include <sstream>
#ifndef CERES_NO_EIGEN_METIS
#include <iostream> // This is needed because MetisSupport depends on iostream.
#include "Eigen/MetisSupport"
#endif
#include "Eigen/SparseCholesky"
#include "Eigen/SparseCore"
#include "ceres/compressed_row_sparse_matrix.h"
#include "ceres/linear_solver.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// TODO(sameeragarwal): Use enable_if to clean up the implementations
// for when Scalar == double.
template <typename Solver>
class EigenSparseCholeskyTemplate final : public SparseCholesky {
public:
EigenSparseCholeskyTemplate() = default;
CompressedRowSparseMatrix::StorageType StorageType() const final {
return CompressedRowSparseMatrix::LOWER_TRIANGULAR;
return CompressedRowSparseMatrix::StorageType::LOWER_TRIANGULAR;
}
LinearSolverTerminationType Factorize(
@@ -68,7 +71,7 @@ class EigenSparseCholeskyTemplate final : public SparseCholesky {
if (solver_.info() != Eigen::Success) {
*message = "Eigen failure. Unable to find symbolic factorization.";
return LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
}
analyzed_ = true;
@@ -77,9 +80,9 @@ class EigenSparseCholeskyTemplate final : public SparseCholesky {
solver_.factorize(lhs);
if (solver_.info() != Eigen::Success) {
*message = "Eigen failure. Unable to find numeric factorization.";
return LINEAR_SOLVER_FAILURE;
return LinearSolverTerminationType::FAILURE;
}
return LINEAR_SOLVER_SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
LinearSolverTerminationType Solve(const double* rhs_ptr,
@@ -87,23 +90,23 @@ class EigenSparseCholeskyTemplate final : public SparseCholesky {
std::string* message) override {
CHECK(analyzed_) << "Solve called without a call to Factorize first.";
scalar_rhs_ = ConstVectorRef(rhs_ptr, solver_.cols())
.template cast<typename Solver::Scalar>();
// The two casts are needed if the Scalar in this class is not
// double. For code simplicity we are going to assume that Eigen
// is smart enough to figure out that casting a double Vector to a
// double Vector is a straight copy. If this turns into a
// performance bottleneck (unlikely), we can revisit this.
scalar_solution_ = solver_.solve(scalar_rhs_);
VectorRef(solution_ptr, solver_.cols()) =
scalar_solution_.template cast<double>();
// Avoid copying when the scalar type is double
if constexpr (std::is_same_v<typename Solver::Scalar, double>) {
ConstVectorRef scalar_rhs(rhs_ptr, solver_.cols());
VectorRef(solution_ptr, solver_.cols()) = solver_.solve(scalar_rhs);
} else {
auto scalar_rhs = ConstVectorRef(rhs_ptr, solver_.cols())
.template cast<typename Solver::Scalar>();
auto scalar_solution = solver_.solve(scalar_rhs);
VectorRef(solution_ptr, solver_.cols()) =
scalar_solution.template cast<double>();
}
if (solver_.info() != Eigen::Success) {
*message = "Eigen failure. Unable to do triangular solve.";
return LINEAR_SOLVER_FAILURE;
return LinearSolverTerminationType::FAILURE;
}
return LINEAR_SOLVER_SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
LinearSolverTerminationType Factorize(CompressedRowSparseMatrix* lhs,
@@ -111,9 +114,8 @@ class EigenSparseCholeskyTemplate final : public SparseCholesky {
CHECK_EQ(lhs->storage_type(), StorageType());
typename Solver::Scalar* values_ptr = nullptr;
if (std::is_same<typename Solver::Scalar, double>::value) {
values_ptr =
reinterpret_cast<typename Solver::Scalar*>(lhs->mutable_values());
if constexpr (std::is_same_v<typename Solver::Scalar, double>) {
values_ptr = lhs->mutable_values();
} else {
// In the case where the scalar used in this class is not
// double. In that case, make a copy of the values array in the
@@ -123,19 +125,20 @@ class EigenSparseCholeskyTemplate final : public SparseCholesky {
values_ptr = values_.data();
}
Eigen::Map<Eigen::SparseMatrix<typename Solver::Scalar, Eigen::ColMajor>>
Eigen::Map<
const Eigen::SparseMatrix<typename Solver::Scalar, Eigen::ColMajor>>
eigen_lhs(lhs->num_rows(),
lhs->num_rows(),
lhs->num_nonzeros(),
lhs->mutable_rows(),
lhs->mutable_cols(),
lhs->rows(),
lhs->cols(),
values_ptr);
return Factorize(eigen_lhs, message);
}
private:
Eigen::Matrix<typename Solver::Scalar, Eigen::Dynamic, 1> values_,
scalar_rhs_, scalar_solution_;
Eigen::Matrix<typename Solver::Scalar, Eigen::Dynamic, 1> values_;
bool analyzed_{false};
Solver solver_;
};
@@ -150,11 +153,22 @@ std::unique_ptr<SparseCholesky> EigenSparseCholesky::Create(
Eigen::Upper,
Eigen::NaturalOrdering<int>>;
if (ordering_type == AMD) {
if (ordering_type == OrderingType::AMD) {
return std::make_unique<EigenSparseCholeskyTemplate<WithAMDOrdering>>();
} else {
return std::make_unique<EigenSparseCholeskyTemplate<WithNaturalOrdering>>();
} else if (ordering_type == OrderingType::NESDIS) {
#ifndef CERES_NO_EIGEN_METIS
using WithMetisOrdering = Eigen::SimplicialLDLT<Eigen::SparseMatrix<double>,
Eigen::Upper,
Eigen::MetisOrdering<int>>;
return std::make_unique<EigenSparseCholeskyTemplate<WithMetisOrdering>>();
#else
LOG(FATAL)
<< "Congratulations you have found a bug in Ceres Solver. Please "
"report it to the Ceres Solver developers.";
return nullptr;
#endif // CERES_NO_EIGEN_METIS
}
return std::make_unique<EigenSparseCholeskyTemplate<WithNaturalOrdering>>();
}
EigenSparseCholesky::~EigenSparseCholesky() = default;
@@ -168,16 +182,26 @@ std::unique_ptr<SparseCholesky> FloatEigenSparseCholesky::Create(
Eigen::SimplicialLDLT<Eigen::SparseMatrix<float>,
Eigen::Upper,
Eigen::NaturalOrdering<int>>;
if (ordering_type == AMD) {
if (ordering_type == OrderingType::AMD) {
return std::make_unique<EigenSparseCholeskyTemplate<WithAMDOrdering>>();
} else {
return std::make_unique<EigenSparseCholeskyTemplate<WithNaturalOrdering>>();
} else if (ordering_type == OrderingType::NESDIS) {
#ifndef CERES_NO_EIGEN_METIS
using WithMetisOrdering = Eigen::SimplicialLDLT<Eigen::SparseMatrix<float>,
Eigen::Upper,
Eigen::MetisOrdering<int>>;
return std::make_unique<EigenSparseCholeskyTemplate<WithMetisOrdering>>();
#else
LOG(FATAL)
<< "Congratulations you have found a bug in Ceres Solver. Please "
"report it to the Ceres Solver developers.";
return nullptr;
#endif // CERES_NO_EIGEN_METIS
}
return std::make_unique<EigenSparseCholeskyTemplate<WithNaturalOrdering>>();
}
FloatEigenSparseCholesky::~FloatEigenSparseCholesky() = default;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_USE_EIGEN_SPARSE
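
A self-contained illustration of the Eigen pattern the template above wraps: a SimplicialLDLT parameterized on the ordering, with one symbolic analyzePattern() reused across numeric factorize()/solve() calls. The matrix values are made up for the example.

#include <iostream>
#include <Eigen/OrderingMethods>
#include <Eigen/SparseCholesky>
#include <Eigen/SparseCore>

int main() {
  using SpMat = Eigen::SparseMatrix<double>;
  SpMat A(2, 2);
  A.insert(0, 0) = 4.0;  // a trivially SPD (diagonal) matrix
  A.insert(1, 1) = 9.0;
  A.makeCompressed();
  Eigen::SimplicialLDLT<SpMat, Eigen::Upper, Eigen::AMDOrdering<int>> ldlt;
  ldlt.analyzePattern(A);  // symbolic factorization, done once
  ldlt.factorize(A);       // numeric factorization, repeatable
  const Eigen::Vector2d b(8.0, 18.0);
  const Eigen::Vector2d x = ldlt.solve(b);
  std::cout << x.transpose() << "\n";  // expect: 2 2
  return 0;
}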

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,8 +46,18 @@
#include "ceres/linear_solver.h"
#include "ceres/sparse_cholesky.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class EigenSparse {
public:
static constexpr bool IsNestedDissectionAvailable() noexcept {
#ifdef CERES_NO_EIGEN_METIS
return false;
#else
return true;
#endif
}
};
class CERES_NO_EXPORT EigenSparseCholesky : public SparseCholesky {
public:
@@ -83,8 +93,18 @@ class CERES_NO_EXPORT FloatEigenSparseCholesky : public SparseCholesky {
std::string* message) override = 0;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#else
namespace ceres::internal {
class EigenSparse {
public:
static constexpr bool IsNestedDissectionAvailable() noexcept { return false; }
};
} // namespace ceres::internal
#endif // CERES_USE_EIGEN_SPARSE

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,8 +46,7 @@
#include "ceres/scratch_evaluate_preparer.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
Evaluator::~Evaluator() = default;
@@ -65,10 +64,17 @@ std::unique_ptr<Evaluator> Evaluator::Create(const Evaluator::Options& options,
case DENSE_SCHUR:
case SPARSE_SCHUR:
case ITERATIVE_SCHUR:
case CGNR:
return std::make_unique<
ProgramEvaluator<BlockEvaluatePreparer, BlockJacobianWriter>>(
options, program);
case CGNR: {
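// With CUDA_SPARSE the CGNR path consumes the Jacobian in compressed-row
// form, so it needs the CRS writer; every other backend keeps the
// block-sparse writer.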
if (options.sparse_linear_algebra_library_type == CUDA_SPARSE) {
return std::make_unique<ProgramEvaluator<ScratchEvaluatePreparer,
CompressedRowJacobianWriter>>(
options, program);
} else {
return std::make_unique<
ProgramEvaluator<BlockEvaluatePreparer, BlockJacobianWriter>>(
options, program);
}
}
case SPARSE_NORMAL_CHOLESKY:
if (options.dynamic_sparsity) {
return std::make_unique<
@@ -88,5 +94,4 @@ std::unique_ptr<Evaluator> Evaluator::Create(const Evaluator::Options& options,
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -65,6 +65,8 @@ class CERES_NO_EXPORT Evaluator {
int num_threads = 1;
int num_eliminate_blocks = -1;
LinearSolverType linear_solver_type = DENSE_QR;
SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type =
NO_SPARSE;
bool dynamic_sparsity = false;
ContextImpl* context = nullptr;
EvaluationCallback* evaluation_callback = nullptr;

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -39,8 +39,7 @@
#include "ceres/internal/export.h"
#include "ceres/wall_time.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
struct CallStatistics {
CallStatistics() = default;
@@ -85,7 +84,6 @@ class ScopedExecutionTimer {
ExecutionSummary* summary_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_EXECUTION_SUMMARY_H_

View File

@@ -0,0 +1,120 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: joydeepb@cs.utexas.edu (Joydeep Biswas)
#include "ceres/fake_bundle_adjustment_jacobian.h"
#include <memory>
#include <random>
#include <string>
#include <utility>
#include "Eigen/Dense"
#include "ceres/block_sparse_matrix.h"
#include "ceres/internal/eigen.h"
namespace ceres::internal {
std::unique_ptr<BlockSparseMatrix> CreateFakeBundleAdjustmentJacobian(
int num_cameras,
int num_points,
int camera_size,
int point_size,
double visibility,
std::mt19937& prng) {
constexpr int kResidualSize = 2;
CompressedRowBlockStructure* bs = new CompressedRowBlockStructure;
int c = 0;
// Add column blocks for each point
for (int i = 0; i < num_points; ++i) {
bs->cols.push_back(Block(point_size, c));
c += point_size;
}
// Add column blocks for each camera.
for (int i = 0; i < num_cameras; ++i) {
bs->cols.push_back(Block(camera_size, c));
c += camera_size;
}
std::bernoulli_distribution visibility_distribution(visibility);
int row_pos = 0;
int cell_pos = 0;
for (int i = 0; i < num_points; ++i) {
for (int j = 0; j < num_cameras; ++j) {
if (!visibility_distribution(prng)) {
continue;
}
bs->rows.emplace_back();
auto& row = bs->rows.back();
row.block.position = row_pos;
row.block.size = kResidualSize;
auto& cells = row.cells;
cells.resize(2);
cells[0].block_id = i;
cells[0].position = cell_pos;
cell_pos += kResidualSize * point_size;
cells[1].block_id = num_points + j;
cells[1].position = cell_pos;
cell_pos += kResidualSize * camera_size;
row_pos += kResidualSize;
}
}
auto jacobian = std::make_unique<BlockSparseMatrix>(bs);
VectorRef(jacobian->mutable_values(), jacobian->num_nonzeros()).setRandom();
return jacobian;
}
std::pair<
std::unique_ptr<PartitionedMatrixView<2, Eigen::Dynamic, Eigen::Dynamic>>,
std::unique_ptr<BlockSparseMatrix>>
CreateFakeBundleAdjustmentPartitionedJacobian(int num_cameras,
int num_points,
int camera_size,
int landmark_size,
double visibility,
std::mt19937& rng) {
using PartitionedView =
PartitionedMatrixView<2, Eigen::Dynamic, Eigen::Dynamic>;
auto block_sparse_matrix = CreateFakeBundleAdjustmentJacobian(
num_cameras, num_points, camera_size, landmark_size, visibility, rng);
LinearSolver::Options options;
options.elimination_groups.push_back(num_points);
auto partitioned_view =
std::make_unique<PartitionedView>(options, *block_sparse_matrix);
return std::make_pair(std::move(partitioned_view),
std::move(block_sparse_matrix));
}
} // namespace ceres::internal
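
For intuition, a tiny worked example of the block structure the factory above builds; the numbers follow directly from the loops in CreateFakeBundleAdjustmentJacobian, assuming every point is visible (visibility = 1.0):

// num_points = 2 (point_size = 3), num_cameras = 1 (camera_size = 9):
//   cols:  [3 @ 0] [3 @ 3] [9 @ 6]   (point blocks first, then cameras)
//   row 0 (position 0): cells {point 0 @ values 0, camera 0 @ values 6}
//   row 1 (position 2): cells {point 1 @ values 24, camera 0 @ values 30}
// Each residual row is 2 rows tall and contributes 2*3 + 2*9 = 24 values.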

View File

@@ -0,0 +1,78 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)
#ifndef CERES_INTERNAL_FAKE_BUNDLE_ADJUSTMENT_JACOBIAN
#define CERES_INTERNAL_FAKE_BUNDLE_ADJUSTMENT_JACOBIAN
#include <memory>
#include <random>
#include "ceres/block_sparse_matrix.h"
#include "ceres/partitioned_matrix_view.h"
namespace ceres::internal {
std::unique_ptr<BlockSparseMatrix> CreateFakeBundleAdjustmentJacobian(
int num_cameras,
int num_points,
int camera_size,
int point_size,
double visibility,
std::mt19937& prng);
template <int kEBlockSize = 3, int kFBlockSize = 6>
std::pair<std::unique_ptr<PartitionedMatrixView<2, kEBlockSize, kFBlockSize>>,
std::unique_ptr<BlockSparseMatrix>>
CreateFakeBundleAdjustmentPartitionedJacobian(int num_cameras,
int num_points,
double visibility,
std::mt19937& rng) {
using PartitionedView = PartitionedMatrixView<2, kEBlockSize, kFBlockSize>;
auto block_sparse_matrix = CreateFakeBundleAdjustmentJacobian(
num_cameras, num_points, kFBlockSize, kEBlockSize, visibility, rng);
auto partitioned_view =
std::make_unique<PartitionedView>(*block_sparse_matrix, num_points);
return std::make_pair(std::move(partitioned_view),
std::move(block_sparse_matrix));
}
std::pair<
std::unique_ptr<PartitionedMatrixView<2, Eigen::Dynamic, Eigen::Dynamic>>,
std::unique_ptr<BlockSparseMatrix>>
CreateFakeBundleAdjustmentPartitionedJacobian(int num_cameras,
int num_points,
int camera_size,
int landmark_size,
double visibility,
std::mt19937& rng);
} // namespace ceres::internal
#endif // CERES_INTERNAL_FAKE_BUNDLE_ADJUSTMENT_JACOBIAN
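
A hypothetical benchmark-style call of the templated overload above; all parameter values here are illustrative only:

std::mt19937 rng(42);
// 3-parameter points, 6-parameter cameras; each of the 1000 points is
// observed by roughly half of the 100 cameras.
auto [partitioned_view, jacobian] =
    CreateFakeBundleAdjustmentPartitionedJacobian<3, 6>(
        /*num_cameras=*/100, /*num_points=*/1000, /*visibility=*/0.5, rng);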

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -33,15 +33,14 @@
#include "ceres/file.h"
#include <cstdio>
#include <string>
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
using std::string;
void WriteStringToFileOrDie(const string& data, const string& filename) {
void WriteStringToFileOrDie(const std::string& data,
const std::string& filename) {
FILE* file_descriptor = fopen(filename.c_str(), "wb");
if (!file_descriptor) {
LOG(FATAL) << "Couldn't write to file: " << filename;
@@ -50,7 +49,7 @@ void WriteStringToFileOrDie(const string& data, const string& filename) {
fclose(file_descriptor);
}
void ReadFileToStringOrDie(const string& filename, string* data) {
void ReadFileToStringOrDie(const std::string& filename, std::string* data) {
FILE* file_descriptor = fopen(filename.c_str(), "r");
if (!file_descriptor) {
@@ -59,12 +58,12 @@ void ReadFileToStringOrDie(const string& filename, string* data) {
// Resize the input buffer appropriately.
fseek(file_descriptor, 0L, SEEK_END);
int num_bytes = ftell(file_descriptor);
int64_t num_bytes = ftell(file_descriptor);
data->resize(num_bytes);
// Read the data.
fseek(file_descriptor, 0L, SEEK_SET);
int num_read =
int64_t num_read =
fread(&((*data)[0]), sizeof((*data)[0]), num_bytes, file_descriptor);
if (num_read != num_bytes) {
LOG(FATAL) << "Couldn't read all of " << filename
@@ -74,7 +73,7 @@ void ReadFileToStringOrDie(const string& filename, string* data) {
fclose(file_descriptor);
}
string JoinPath(const string& dirname, const string& basename) {
std::string JoinPath(const std::string& dirname, const std::string& basename) {
#ifdef _WIN32
static const char separator = '\\';
#else
@@ -86,9 +85,8 @@ string JoinPath(const string& dirname, const string& basename) {
} else if (dirname[dirname.size() - 1] == separator) {
return dirname + basename;
} else {
return dirname + string(&separator, 1) + basename;
return dirname + std::string(&separator, 1) + basename;
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -38,8 +38,7 @@
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
CERES_NO_EXPORT
void WriteStringToFileOrDie(const std::string& data,
@@ -52,8 +51,7 @@ void ReadFileToStringOrDie(const std::string& filename, std::string* data);
CERES_NO_EXPORT
std::string JoinPath(const std::string& dirname, const std::string& basename);
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -34,8 +34,7 @@
#if !defined(CERES_NO_SUITESPARSE)
namespace ceres {
namespace internal {
namespace ceres::internal {
std::unique_ptr<SparseCholesky> FloatSuiteSparseCholesky::Create(
OrderingType ordering_type) {
@@ -43,7 +42,6 @@ std::unique_ptr<SparseCholesky> FloatSuiteSparseCholesky::Create(
return {};
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // !defined(CERES_NO_SUITESPARSE)

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -43,8 +43,7 @@
#if !defined(CERES_NO_SUITESPARSE)
namespace ceres {
namespace internal {
namespace ceres::internal {
// Fake implementation of a single precision Sparse Cholesky using
// SuiteSparse.
@@ -53,8 +52,7 @@ class CERES_NO_EXPORT FloatSuiteSparseCholesky : public SparseCholesky {
static std::unique_ptr<SparseCholesky> Create(OrderingType ordering_type);
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // !defined(CERES_NO_SUITESPARSE)

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,8 +32,7 @@
#include "ceres/stringprintf.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
FunctionSample::FunctionSample()
: x(0.0),
@@ -75,5 +74,4 @@ std::string FunctionSample::ToDebugString() const {
gradient_is_valid);
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -37,8 +37,7 @@
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// FunctionSample is used by the line search routines to store and
// communicate the value and (optionally) the gradient of the function
@@ -83,13 +82,12 @@ struct CERES_NO_EXPORT FunctionSample {
//
// where d is the search direction.
double gradient;
// True if the evaluation of the gradient was sucessful and the
// True if the evaluation of the gradient was successful and the
// value is a finite number.
bool gradient_is_valid;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -0,0 +1,305 @@
# Ceres Solver - A fast non-linear least squares minimizer
# Copyright 2023 Google Inc. All rights reserved.
# http://ceres-solver.org/
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * Neither the name of Google Inc. nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: keir@google.com (Keir Mierle)
#
# Generate bundle adjustment tests as separate binaries. Since the bundle
# adjustment tests are fairly processing intensive, serializing them makes the
# tests take forever to run. Splitting them into separate binaries makes it
# easier to parallelize in continuous integration systems, and makes local
# processing on multi-core workstations much faster.
# The product of ORDERINGS, THREAD_CONFIGS, and SOLVER_CONFIGS is the full set
# tests to generate.
ORDERINGS = ["kAutomaticOrdering", "kUserOrdering"]
SINGLE_THREADED = "1"
MULTI_THREADED = "4"
THREAD_CONFIGS = [SINGLE_THREADED, MULTI_THREADED]
DENSE_SOLVER_CONFIGS = [
# Linear solver Dense backend
('DENSE_SCHUR', 'EIGEN'),
('DENSE_SCHUR', 'LAPACK'),
('DENSE_SCHUR', 'CUDA'),
]
SPARSE_SOLVER_CONFIGS = [
# Linear solver Sparse backend
('SPARSE_NORMAL_CHOLESKY', 'SUITE_SPARSE'),
('SPARSE_NORMAL_CHOLESKY', 'EIGEN_SPARSE'),
('SPARSE_NORMAL_CHOLESKY', 'ACCELERATE_SPARSE'),
('SPARSE_SCHUR', 'SUITE_SPARSE'),
('SPARSE_SCHUR', 'EIGEN_SPARSE'),
('SPARSE_SCHUR', 'ACCELERATE_SPARSE'),
]
ITERATIVE_SOLVER_CONFIGS = [
# Linear solver Sparse backend Preconditioner
('ITERATIVE_SCHUR', 'NO_SPARSE', 'JACOBI'),
('ITERATIVE_SCHUR', 'NO_SPARSE', 'SCHUR_JACOBI'),
('ITERATIVE_SCHUR', 'NO_SPARSE', 'SCHUR_POWER_SERIES_EXPANSION'),
('ITERATIVE_SCHUR', 'SUITE_SPARSE', 'CLUSTER_JACOBI'),
('ITERATIVE_SCHUR', 'EIGEN_SPARSE', 'CLUSTER_JACOBI'),
('ITERATIVE_SCHUR', 'ACCELERATE_SPARSE','CLUSTER_JACOBI'),
('ITERATIVE_SCHUR', 'SUITE_SPARSE', 'CLUSTER_TRIDIAGONAL'),
('ITERATIVE_SCHUR', 'EIGEN_SPARSE', 'CLUSTER_TRIDIAGONAL'),
('ITERATIVE_SCHUR', 'ACCELERATE_SPARSE','CLUSTER_TRIDIAGONAL'),
]
FILENAME_SHORTENING_MAP = dict(
DENSE_SCHUR='denseschur',
ITERATIVE_SCHUR='iterschur',
SPARSE_NORMAL_CHOLESKY='sparsecholesky',
SPARSE_SCHUR='sparseschur',
EIGEN='eigen',
LAPACK='lapack',
CUDA='cuda',
NO_SPARSE='', # Omit sparse reference entirely for dense tests.
SUITE_SPARSE='suitesparse',
EIGEN_SPARSE='eigensparse',
ACCELERATE_SPARSE='acceleratesparse',
IDENTITY='identity',
JACOBI='jacobi',
SCHUR_JACOBI='schurjacobi',
CLUSTER_JACOBI='clustjacobi',
CLUSTER_TRIDIAGONAL='clusttri',
SCHUR_POWER_SERIES_EXPANSION='spse',
kAutomaticOrdering='auto',
kUserOrdering='user',
)
COPYRIGHT_HEADER = (
"""// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// ========================================
// THIS FILE IS AUTOGENERATED. DO NOT EDIT.
// THIS FILE IS AUTOGENERATED. DO NOT EDIT.
// THIS FILE IS AUTOGENERATED. DO NOT EDIT.
// THIS FILE IS AUTOGENERATED. DO NOT EDIT.
// ========================================
//
// This file is generated using generate_bundle_adjustment_tests.py.""")
BUNDLE_ADJUSTMENT_TEST_TEMPLATE = (COPYRIGHT_HEADER + """
#include "ceres/bundle_adjustment_test_util.h"
#include "ceres/internal/config.h"
#include "gtest/gtest.h"
%(preprocessor_conditions_begin)s
namespace ceres::internal {
TEST_F(BundleAdjustmentTest,
%(test_class_name)s) { // NOLINT
BundleAdjustmentProblem bundle_adjustment_problem;
Solver::Options* options = bundle_adjustment_problem.mutable_solver_options();
options->eta = 0.01;
options->num_threads = %(num_threads)s;
options->linear_solver_type = %(linear_solver)s;
options->dense_linear_algebra_library_type = %(dense_backend)s;
options->sparse_linear_algebra_library_type = %(sparse_backend)s;
options->preconditioner_type = %(preconditioner)s;
if (%(ordering)s) {
options->linear_solver_ordering = nullptr;
}
Problem* problem = bundle_adjustment_problem.mutable_problem();
RunSolverForConfigAndExpectResidualsMatch(*options, problem);
}
} // namespace ceres::internal
%(preprocessor_conditions_end)s""")
def camelcasify(token):
"""Convert capitalized underscore tokens to camel case"""
return ''.join([x.lower().capitalize() for x in token.split('_')])
def generate_bundle_test(linear_solver,
dense_backend,
sparse_backend,
preconditioner,
ordering,
thread_config):
"""Generate a bundle adjustment test executable configured appropriately"""
# Preconditioner only makes sense for iterative schur; drop it otherwise.
preconditioner_tag = preconditioner
if linear_solver != 'ITERATIVE_SCHUR':
preconditioner_tag = ''
dense_backend_tag = dense_backend
if linear_solver != 'DENSE_SCHUR':
dense_backend_tag = ''
# Omit references to the sparse backend when one is not in use.
sparse_backend_tag = sparse_backend
if sparse_backend == 'NO_SPARSE':
sparse_backend_tag = ''
# Use a double underscore; otherwise the names are harder to understand.
test_class_name = '_'.join(filter(lambda x: x, [
camelcasify(linear_solver),
camelcasify(dense_backend_tag),
camelcasify(sparse_backend_tag),
camelcasify(preconditioner_tag),
ordering[1:], # Strip 'k'
'Threads' if thread_config == MULTI_THREADED else '']))
# Initial template parameters (augmented more below).
template_parameters = dict(
linear_solver=linear_solver,
dense_backend=dense_backend,
sparse_backend=sparse_backend,
preconditioner=preconditioner,
ordering=ordering,
num_threads=thread_config,
test_class_name=test_class_name)
# Accumulate appropriate #ifdef/#ifndefs for the solver's sparse backend.
preprocessor_conditions_begin = []
preprocessor_conditions_end = []
if sparse_backend == 'SUITE_SPARSE':
preprocessor_conditions_begin.append('#ifndef CERES_NO_SUITESPARSE')
preprocessor_conditions_end.insert(0, '#endif // CERES_NO_SUITESPARSE')
elif sparse_backend == 'ACCELERATE_SPARSE':
preprocessor_conditions_begin.append('#ifndef CERES_NO_ACCELERATE_SPARSE')
preprocessor_conditions_end.insert(0, '#endif // CERES_NO_ACCELERATE_SPARSE')
elif sparse_backend == 'EIGEN_SPARSE':
preprocessor_conditions_begin.append('#ifdef CERES_USE_EIGEN_SPARSE')
preprocessor_conditions_end.insert(0, '#endif // CERES_USE_EIGEN_SPARSE')
if dense_backend == "LAPACK":
preprocessor_conditions_begin.append('#ifndef CERES_NO_LAPACK')
preprocessor_conditions_end.insert(0, '#endif // CERES_NO_LAPACK')
elif dense_backend == "CUDA":
preprocessor_conditions_begin.append('#ifndef CERES_NO_CUDA')
preprocessor_conditions_end.insert(0, '#endif // CERES_NO_CUDA')
# If there are #ifdefs, put newlines around them.
if preprocessor_conditions_begin:
preprocessor_conditions_begin.insert(0, '')
preprocessor_conditions_begin.append('')
preprocessor_conditions_end.insert(0, '')
preprocessor_conditions_end.append('')
# Put #ifdef/#ifndef stacks into the template parameters.
template_parameters['preprocessor_conditions_begin'] = '\n'.join(
preprocessor_conditions_begin)
template_parameters['preprocessor_conditions_end'] = '\n'.join(
preprocessor_conditions_end)
# Substitute variables into the test template, and write the result to a file.
filename_tag = '_'.join(FILENAME_SHORTENING_MAP.get(x) for x in [
linear_solver,
dense_backend_tag,
sparse_backend_tag,
preconditioner_tag,
ordering]
if FILENAME_SHORTENING_MAP.get(x))
if (thread_config == MULTI_THREADED):
filename_tag += '_threads'
filename = ('generated_bundle_adjustment_tests/ba_%s_test.cc' %
filename_tag.lower())
with open(filename, 'w') as fd:
fd.write(BUNDLE_ADJUSTMENT_TEST_TEMPLATE % template_parameters)
# All done.
print('Generated', filename)
return filename
if __name__ == '__main__':
# Iterate over all the possible configurations and generate the tests.
generated_files = []
for ordering in ORDERINGS:
for thread_config in THREAD_CONFIGS:
for linear_solver, dense_backend in DENSE_SOLVER_CONFIGS:
generated_files.append(
generate_bundle_test(linear_solver,
dense_backend,
'NO_SPARSE',
'IDENTITY',
ordering,
thread_config))
for linear_solver, sparse_backend, in SPARSE_SOLVER_CONFIGS:
generated_files.append(
generate_bundle_test(linear_solver,
'EIGEN',
sparse_backend,
'IDENTITY',
ordering,
thread_config))
for linear_solver, sparse_backend, preconditioner, in ITERATIVE_SOLVER_CONFIGS:
generated_files.append(
generate_bundle_test(linear_solver,
'EIGEN',
sparse_backend,
preconditioner,
ordering,
thread_config))
# Generate the CMakeLists.txt as well.
with open('generated_bundle_adjustment_tests/CMakeLists.txt', 'w') as fd:
fd.write(COPYRIGHT_HEADER.replace('//', '#').replace('http:#', 'http://'))
fd.write('\n')
fd.write('\n')
for generated_file in generated_files:
fd.write('ceres_test(%s)\n' %
generated_file.split('/')[1].replace('_test.cc', ''))
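
For concreteness, a sketch of what one generated file looks like after substitution into BUNDLE_ADJUSTMENT_TEST_TEMPLATE above: here the dense Eigen configuration, single threaded, automatic ordering, which needs no preprocessor guards (ba_denseschur_eigen_auto_test.cc; kAutomaticOrdering is assumed to be a constant provided by bundle_adjustment_test_util.h):

#include "ceres/bundle_adjustment_test_util.h"
#include "ceres/internal/config.h"
#include "gtest/gtest.h"

namespace ceres::internal {

TEST_F(BundleAdjustmentTest,
       DenseSchur_Eigen_AutomaticOrdering) {  // NOLINT
  BundleAdjustmentProblem bundle_adjustment_problem;
  Solver::Options* options = bundle_adjustment_problem.mutable_solver_options();
  options->eta = 0.01;
  options->num_threads = 1;
  options->linear_solver_type = DENSE_SCHUR;
  options->dense_linear_algebra_library_type = EIGEN;
  options->sparse_linear_algebra_library_type = NO_SPARSE;
  options->preconditioner_type = IDENTITY;
  if (kAutomaticOrdering) {
    options->linear_solver_ordering = nullptr;
  }
  Problem* problem = bundle_adjustment_problem.mutable_problem();
  RunSolverForConfigAndExpectResidualsMatch(*options, problem);
}

}  // namespace ceres::internal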

View File

@@ -0,0 +1,246 @@
# Ceres Solver - A fast non-linear least squares minimizer
# Copyright 2023 Google Inc. All rights reserved.
# http://ceres-solver.org/
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * Neither the name of Google Inc. nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: sameeragarwal@google.com (Sameer Agarwal)
#
# Script for explicitly generating template specialization of the
# SchurEliminator class. It is a rather large class
# and the number of explicit instantiations is also large. Explicitly
# generating these instantiations in separate .cc files breaks the
# compilation into separate compilation units rather than one large cc
# file which takes 2+GB of RAM to compile.
#
# This script creates three sets of files.
#
# 1. schur_eliminator_x_x_x.cc and partitioned_matrix_view_x_x_x.cc
# where, the x indicates the template parameters and
#
# 2. schur_eliminator.cc & partitioned_matrix_view.cc
#
# that contains a factory function for instantiating these classes
# based on runtime parameters.
#
# 3. schur_templates.cc
#
# that contains a function which can be queried to determine what
# template specializations are available.
#
# The following list of tuples, SPECIALIZATIONS, indicates the set of
# specializations that is generated.
SPECIALIZATIONS = [(2, 2, 2),
(2, 2, 3),
(2, 2, 4),
(2, 2, "Eigen::Dynamic"),
(2, 3, 3),
(2, 3, 4),
(2, 3, 6),
(2, 3, 9),
(2, 3, "Eigen::Dynamic"),
(2, 4, 3),
(2, 4, 4),
(2, 4, 6),
(2, 4, 8),
(2, 4, 9),
(2, 4, "Eigen::Dynamic"),
(2, "Eigen::Dynamic", "Eigen::Dynamic"),
(3, 3, 3),
(4, 4, 2),
(4, 4, 3),
(4, 4, 4),
(4, 4, "Eigen::Dynamic")]
import schur_eliminator_template
import partitioned_matrix_view_template
import os
import glob
def SuffixForSize(size):
if size == "Eigen::Dynamic":
return "d"
return str(size)
def SpecializationFilename(prefix, row_block_size, e_block_size, f_block_size):
return "_".join([prefix] + list(map(SuffixForSize, (row_block_size,
e_block_size,
f_block_size))))
def GenerateFactoryConditional(row_block_size, e_block_size, f_block_size):
conditionals = []
if (row_block_size != "Eigen::Dynamic"):
conditionals.append("(options.row_block_size == %s)" % row_block_size)
if (e_block_size != "Eigen::Dynamic"):
conditionals.append("(options.e_block_size == %s)" % e_block_size)
if (f_block_size != "Eigen::Dynamic"):
conditionals.append("(options.f_block_size == %s)" % f_block_size)
if (len(conditionals) == 0):
return "%s"
if (len(conditionals) == 1):
return " if " + conditionals[0] + " {\n %s\n }\n"
return " if (" + " &&\n ".join(conditionals) + ") {\n %s\n }\n"
def Specialize(name, data):
"""
Generate specialization code and the conditionals to instantiate it.
"""
# Specialization files
for row_block_size, e_block_size, f_block_size in SPECIALIZATIONS:
output = SpecializationFilename("generated/" + name,
row_block_size,
e_block_size,
f_block_size) + ".cc"
with open(output, "w") as f:
f.write(data["HEADER"])
f.write(data["SPECIALIZATION_FILE"] %
(row_block_size, e_block_size, f_block_size))
# Generate the _d_d_d specialization.
output = SpecializationFilename("generated/" + name,
"Eigen::Dynamic",
"Eigen::Dynamic",
"Eigen::Dynamic") + ".cc"
with open(output, "w") as f:
f.write(data["HEADER"])
f.write(data["DYNAMIC_FILE"] %
("Eigen::Dynamic", "Eigen::Dynamic", "Eigen::Dynamic"))
# Factory
with open(name + ".cc", "w") as f:
f.write(data["HEADER"])
f.write(data["FACTORY_FILE_HEADER"])
for row_block_size, e_block_size, f_block_size in SPECIALIZATIONS:
factory_conditional = GenerateFactoryConditional(
row_block_size, e_block_size, f_block_size)
factory = data["FACTORY"] % (row_block_size, e_block_size, f_block_size)
f.write(factory_conditional % factory)
f.write(data["FACTORY_FOOTER"])
QUERY_HEADER = """// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)
//
// What template specializations are available.
//
// ========================================
// THIS FILE IS AUTOGENERATED. DO NOT EDIT.
// THIS FILE IS AUTOGENERATED. DO NOT EDIT.
// THIS FILE IS AUTOGENERATED. DO NOT EDIT.
// THIS FILE IS AUTOGENERATED. DO NOT EDIT.
//=========================================
//
// This file is generated using generate_template_specializations.py.
"""
QUERY_FILE_HEADER = """
#include "ceres/internal/eigen.h"
#include "ceres/schur_templates.h"
namespace ceres {
namespace internal {
void GetBestSchurTemplateSpecialization(int* row_block_size,
int* e_block_size,
int* f_block_size) {
LinearSolver::Options options;
options.row_block_size = *row_block_size;
options.e_block_size = *e_block_size;
options.f_block_size = *f_block_size;
*row_block_size = Eigen::Dynamic;
*e_block_size = Eigen::Dynamic;
*f_block_size = Eigen::Dynamic;
#ifndef CERES_RESTRICT_SCHUR_SPECIALIZATION
"""
QUERY_FOOTER = """
#endif
return;
}
} // namespace internal
} // namespace ceres
"""
QUERY_ACTION = """ *row_block_size = %s;
*e_block_size = %s;
*f_block_size = %s;
return;"""
def GenerateQueryFile():
"""
Generate file that allows querying for available template specializations.
"""
with open("schur_templates.cc", "w") as f:
f.write(QUERY_HEADER)
f.write(QUERY_FILE_HEADER)
for row_block_size, e_block_size, f_block_size in SPECIALIZATIONS:
factory_conditional = GenerateFactoryConditional(
row_block_size, e_block_size, f_block_size)
action = QUERY_ACTION % (row_block_size, e_block_size, f_block_size)
f.write(factory_conditional % action)
f.write(QUERY_FOOTER)
if __name__ == "__main__":
for f in glob.glob("generated/*"):
os.remove(f)
Specialize("schur_eliminator",
schur_eliminator_template.__dict__)
Specialize("partitioned_matrix_view",
partitioned_matrix_view_template.__dict__)
GenerateQueryFile()
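
As an example of the generated output, the (2, 3, 9) entry of SPECIALIZATIONS expands, via GenerateFactoryConditional and QUERY_ACTION, into roughly this fragment of schur_templates.cc:

if ((options.row_block_size == 2) &&
    (options.e_block_size == 3) &&
    (options.f_block_size == 9)) {
  *row_block_size = 2;
  *e_block_size = 3;
  *f_block_size = 9;
  return;
}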

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<2, 2, 2>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION
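Aside from the copyright-year bump, the only change repeated across these generated files is collapsing the namespace blocks into a C++17 nested namespace definition. Both spellings open the same scope, as this standalone sketch shows:

// C++17 nested namespace definition: one declaration opens the same scope
// that previously required two nested blocks.
namespace ceres::internal {
inline constexpr int kAnswer = 42;
}  // namespace ceres::internal

// Pre-C++17 spelling of the identical scope; both forms name the same
// entity, so only one definition may appear in a real translation unit.
namespace ceres {
namespace internal {
// inline constexpr int kAnswer = 42;
}  // namespace internal
}  // namespace ceres

Because these files are regenerated rather than hand-edited, the generator's templates were updated once and every specialization file below picks up the same two-line change.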

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<2, 2, 3>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<2, 2, 4>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<2, 2, Eigen::Dynamic>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<2, 3, 3>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<2, 3, 4>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<2, 3, 6>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<2, 3, 9>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<2, 3, Eigen::Dynamic>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<2, 4, 3>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<2, 4, 4>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<2, 4, 6>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<2, 4, 8>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<2, 4, 9>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<2, 4, Eigen::Dynamic>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<2, Eigen::Dynamic, Eigen::Dynamic>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<3, 3, 3>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<4, 4, 2>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<4, 4, 3>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<4, 4, 4>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<4, 4, Eigen::Dynamic>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -41,12 +41,10 @@
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class PartitionedMatrixView<Eigen::Dynamic,
Eigen::Dynamic,
Eigen::Dynamic>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<2, 2, 2>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION
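These explicit instantiations exist so the eliminator's hot loops see compile-time block sizes; the <Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic> build further down is the runtime fallback. A self-contained sketch of why the fixed sizes pay off with Eigen (toy code, not the actual SchurEliminator internals):

#include <Eigen/Dense>

// With kEBlockSize known at compile time, Eigen stack-allocates the
// matrix and can fully unroll the reduction; with Eigen::Dynamic the
// same template degrades to heap-backed, runtime-sized loops.
template <int kEBlockSize>
double EtEDiagonalSum(
    const Eigen::Matrix<double, kEBlockSize, kEBlockSize>& ete) {
  return ete.diagonal().sum();
}

int main() {
  Eigen::Matrix<double, 3, 3> fixed = Eigen::Matrix<double, 3, 3>::Identity();
  Eigen::MatrixXd dynamic = Eigen::MatrixXd::Identity(3, 3);
  const double a = EtEDiagonalSum<3>(fixed);                // fixed-size path
  const double b = EtEDiagonalSum<Eigen::Dynamic>(dynamic); // dynamic fallback
  return (a == b) ? 0 : 1;
}

Avoiding the dynamic fallback for common bundle-adjustment block sizes is exactly what GetBestSchurTemplateSpecialization, generated above, is for.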

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<2, 2, 3>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<2, 2, 4>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<2, 2, Eigen::Dynamic>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<2, 3, 3>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<2, 3, 4>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<2, 3, 6>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<2, 3, 9>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<2, 3, Eigen::Dynamic>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<2, 4, 3>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<2, 4, 4>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<2, 4, 6>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<2, 4, 8>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<2, 4, 9>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<2, 4, Eigen::Dynamic>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<2, Eigen::Dynamic, Eigen::Dynamic>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<3, 3, 3>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<4, 4, 2>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<4, 4, 3>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<4, 4, 4>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,10 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<4, 4, Eigen::Dynamic>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -41,10 +41,8 @@
#include "ceres/schur_eliminator_impl.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template class SchurEliminator<Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic>;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2016 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,7 +40,6 @@
#include <vector>
#include "ceres/is_close.h"
#include "ceres/manifold_adapter.h"
#include "ceres/stringprintf.h"
#include "ceres/types.h"
@@ -49,8 +48,6 @@ namespace ceres {
using internal::IsClose;
using internal::StringAppendF;
using internal::StringPrintf;
using std::string;
using std::vector;
namespace {
// Evaluate the cost function and transform the returned Jacobians to
@@ -65,12 +62,12 @@ bool EvaluateCostFunction(const CostFunction* function,
CHECK(jacobians != nullptr);
CHECK(local_jacobians != nullptr);
const vector<int32_t>& block_sizes = function->parameter_block_sizes();
const std::vector<int32_t>& block_sizes = function->parameter_block_sizes();
const int num_parameter_blocks = block_sizes.size();
// Allocate Jacobian matrices in tangent space.
local_jacobians->resize(num_parameter_blocks);
vector<double*> local_jacobian_data(num_parameter_blocks);
std::vector<double*> local_jacobian_data(num_parameter_blocks);
for (int i = 0; i < num_parameter_blocks; ++i) {
int block_size = block_sizes.at(i);
if (manifolds.at(i) != nullptr) {
@@ -83,7 +80,7 @@ bool EvaluateCostFunction(const CostFunction* function,
// Allocate Jacobian matrices in ambient space.
jacobians->resize(num_parameter_blocks);
vector<double*> jacobian_data(num_parameter_blocks);
std::vector<double*> jacobian_data(num_parameter_blocks);
for (int i = 0; i < num_parameter_blocks; ++i) {
jacobians->at(i).resize(function->num_residuals(), block_sizes.at(i));
jacobians->at(i).setZero();
@@ -116,39 +113,8 @@ bool EvaluateCostFunction(const CostFunction* function,
}
} // namespace
GradientChecker::GradientChecker(
const CostFunction* function,
const vector<const LocalParameterization*>* local_parameterizations,
const NumericDiffOptions& options)
: delete_manifolds_(true), function_(function) {
CHECK(function != nullptr);
manifolds_.resize(function->parameter_block_sizes().size(), nullptr);
// Wrap the local parameterization into manifold objects using
// ManifoldAdapter.
for (int i = 0; i < manifolds_.size(); ++i) {
const LocalParameterization* local_param = local_parameterizations->at(i);
if (local_param == nullptr) {
continue;
}
manifolds_[i] = new internal::ManifoldAdapter(local_param);
}
auto finite_diff_cost_function =
std::make_unique<DynamicNumericDiffCostFunction<CostFunction, RIDDERS>>(
function, DO_NOT_TAKE_OWNERSHIP, options);
const vector<int32_t>& parameter_block_sizes =
function->parameter_block_sizes();
for (int32_t parameter_block_size : parameter_block_sizes) {
finite_diff_cost_function->AddParameterBlock(parameter_block_size);
}
finite_diff_cost_function->SetNumResiduals(function->num_residuals());
finite_diff_cost_function_ = std::move(finite_diff_cost_function);
}
GradientChecker::GradientChecker(const CostFunction* function,
const vector<const Manifold*>* manifolds,
const std::vector<const Manifold*>* manifolds,
const NumericDiffOptions& options)
: function_(function) {
CHECK(function != nullptr);
@@ -161,7 +127,7 @@ GradientChecker::GradientChecker(const CostFunction* function,
auto finite_diff_cost_function =
std::make_unique<DynamicNumericDiffCostFunction<CostFunction, RIDDERS>>(
function, DO_NOT_TAKE_OWNERSHIP, options);
const vector<int32_t>& parameter_block_sizes =
const std::vector<int32_t>& parameter_block_sizes =
function->parameter_block_sizes();
const int num_parameter_blocks = parameter_block_sizes.size();
for (int i = 0; i < num_parameter_blocks; ++i) {
@@ -172,14 +138,6 @@ GradientChecker::GradientChecker(const CostFunction* function,
finite_diff_cost_function_ = std::move(finite_diff_cost_function);
}
GradientChecker::~GradientChecker() {
if (delete_manifolds_) {
for (const auto m : manifolds_) {
delete m;
}
}
}
bool GradientChecker::Probe(double const* const* parameters,
double relative_precision,
ProbeResults* results_param) const {
@@ -204,8 +162,8 @@ bool GradientChecker::Probe(double const* const* parameters,
results->return_value = true;
// Evaluate the derivative using the user supplied code.
vector<Matrix>& jacobians = results->jacobians;
vector<Matrix>& local_jacobians = results->local_jacobians;
std::vector<Matrix>& jacobians = results->jacobians;
std::vector<Matrix>& local_jacobians = results->local_jacobians;
if (!EvaluateCostFunction(function_,
parameters,
manifolds_,
@@ -217,8 +175,9 @@ bool GradientChecker::Probe(double const* const* parameters,
}
// Evaluate the derivative using numeric derivatives.
vector<Matrix>& numeric_jacobians = results->numeric_jacobians;
vector<Matrix>& local_numeric_jacobians = results->local_numeric_jacobians;
std::vector<Matrix>& numeric_jacobians = results->numeric_jacobians;
std::vector<Matrix>& local_numeric_jacobians =
results->local_numeric_jacobians;
Vector finite_diff_residuals;
if (!EvaluateCostFunction(finite_diff_cost_function_.get(),
parameters,
@@ -258,7 +217,7 @@ bool GradientChecker::Probe(double const* const* parameters,
// Accumulate the error message for all the jacobians, since it won't get
// output if there are no bad jacobian components.
string error_log;
std::string error_log;
for (int k = 0; k < function_->parameter_block_sizes().size(); k++) {
StringAppendF(&error_log,
"========== "
@@ -312,7 +271,7 @@ bool GradientChecker::Probe(double const* const* parameters,
// Since there were some bad errors, dump comprehensive debug info.
if (num_bad_jacobian_components) {
string header = StringPrintf(
std::string header = StringPrintf(
"\nDetected %d bad Jacobian component(s). "
"Worst relative error was %g.\n",
num_bad_jacobian_components,
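For context, a hedged usage sketch of the Manifold-based GradientChecker that survives this cleanup; the constructor and Probe signatures follow the code above, while the single 3-dimensional parameter block and the nullptr manifold entry (meaning plain Euclidean space) are illustrative assumptions:

#include <vector>
#include "ceres/gradient_checker.h"
#include "glog/logging.h"

// Sketch only: "cost_function" is any user CostFunction with a single
// 3-dimensional parameter block.
bool ProbeJacobians(const ceres::CostFunction* cost_function,
                    const double* x3) {
  std::vector<const ceres::Manifold*> manifolds = {nullptr};
  ceres::NumericDiffOptions numeric_diff_options;
  ceres::GradientChecker checker(cost_function, &manifolds,
                                 numeric_diff_options);
  ceres::GradientChecker::ProbeResults results;
  const double* parameters[] = {x3};
  if (!checker.Probe(parameters, /*relative_precision=*/1e-9, &results)) {
    LOG(ERROR) << "Jacobian check failed:\n" << results.error_log;
    return false;
  }
  return true;
}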

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -52,13 +52,7 @@
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::abs;
using std::max;
using std::string;
using std::vector;
namespace ceres::internal {
namespace {
@@ -68,7 +62,7 @@ class GradientCheckingCostFunction final : public CostFunction {
const std::vector<const Manifold*>* manifolds,
const NumericDiffOptions& options,
double relative_precision,
string extra_info,
std::string extra_info,
GradientCheckingIterationCallback* callback)
: function_(function),
gradient_checker_(function, manifolds, options),
@@ -76,7 +70,7 @@ class GradientCheckingCostFunction final : public CostFunction {
extra_info_(std::move(extra_info)),
callback_(callback) {
CHECK(callback_ != nullptr);
const vector<int32_t>& parameter_block_sizes =
const std::vector<int32_t>& parameter_block_sizes =
function->parameter_block_sizes();
*mutable_parameter_block_sizes() = parameter_block_sizes;
set_num_residuals(function->num_residuals());
@@ -105,7 +99,8 @@ class GradientCheckingCostFunction final : public CostFunction {
MatrixRef(residuals, num_residuals, 1) = results.residuals;
// Copy the original jacobian blocks into the jacobians array.
const vector<int32_t>& block_sizes = function_->parameter_block_sizes();
const std::vector<int32_t>& block_sizes =
function_->parameter_block_sizes();
for (int k = 0; k < block_sizes.size(); k++) {
if (jacobians[k] != nullptr) {
MatrixRef(jacobians[k],
@@ -127,7 +122,7 @@ class GradientCheckingCostFunction final : public CostFunction {
const CostFunction* function_;
GradientChecker gradient_checker_;
double relative_precision_;
string extra_info_;
std::string extra_info_;
GradientCheckingIterationCallback* callback_;
};
@@ -137,7 +132,7 @@ GradientCheckingIterationCallback::GradientCheckingIterationCallback()
: gradient_error_detected_(false) {}
CallbackReturnType GradientCheckingIterationCallback::operator()(
const IterationSummary& summary) {
const IterationSummary& /*summary*/) {
if (gradient_error_detected_) {
LOG(ERROR) << "Gradient error detected. Terminating solver.";
return SOLVER_ABORT;
@@ -198,7 +193,8 @@ std::unique_ptr<ProblemImpl> CreateGradientCheckingProblemImpl(
// For every ParameterBlock in problem_impl, create a new parameter block with
// the same manifold and constancy.
const vector<ParameterBlock*>& parameter_blocks = program->parameter_blocks();
const std::vector<ParameterBlock*>& parameter_blocks =
program->parameter_blocks();
for (auto* parameter_block : parameter_blocks) {
gradient_checking_problem_impl->AddParameterBlock(
parameter_block->mutable_user_state(),
@@ -225,17 +221,18 @@ std::unique_ptr<ProblemImpl> CreateGradientCheckingProblemImpl(
// For every ResidualBlock in problem_impl, create a new
// ResidualBlock by wrapping its CostFunction inside a
// GradientCheckingCostFunction.
const vector<ResidualBlock*>& residual_blocks = program->residual_blocks();
const std::vector<ResidualBlock*>& residual_blocks =
program->residual_blocks();
for (int i = 0; i < residual_blocks.size(); ++i) {
ResidualBlock* residual_block = residual_blocks[i];
// Build a human readable string which identifies the
// ResidualBlock. This is used by the GradientCheckingCostFunction
// when logging debugging information.
string extra_info =
std::string extra_info =
StringPrintf("Residual block id %d; depends on parameters [", i);
vector<double*> parameter_blocks;
vector<const Manifold*> manifolds;
std::vector<double*> parameter_blocks;
std::vector<const Manifold*> manifolds;
parameter_blocks.reserve(residual_block->NumParameterBlocks());
manifolds.reserve(residual_block->NumParameterBlocks());
for (int j = 0; j < residual_block->NumParameterBlocks(); ++j) {
@@ -280,5 +277,4 @@ std::unique_ptr<ProblemImpl> CreateGradientCheckingProblemImpl(
return gradient_checking_problem_impl;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
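Users normally reach this wrapper through the solver options rather than by constructing it directly. A short sketch of enabling it (the option names are the documented Solver::Options fields; "problem" is assumed to be an already-populated ceres::Problem):

#include "ceres/ceres.h"

// Sketch: with check_gradients set, Ceres builds the gradient-checking
// problem above, and the iteration callback aborts the solve if any
// Jacobian disagrees with its numeric estimate.
void SolveWithGradientChecking(ceres::Problem& problem) {
  ceres::Solver::Options options;
  options.check_gradients = true;                    // wrap all cost functions
  options.gradient_check_relative_precision = 1e-8;  // comparison tolerance
  ceres::Solver::Summary summary;
  ceres::Solve(options, &problem, &summary);
}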

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -42,8 +42,7 @@
#include "ceres/iteration_callback.h"
#include "ceres/manifold.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class ProblemImpl;
@@ -109,8 +108,7 @@ CERES_NO_EXPORT std::unique_ptr<ProblemImpl> CreateGradientCheckingProblemImpl(
double relative_precision,
GradientCheckingIterationCallback* callback);
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,8 +32,6 @@
#include <memory>
#include "ceres/local_parameterization.h"
#include "ceres/manifold_adapter.h"
#include "glog/logging.h"
namespace ceres {
@@ -46,22 +44,6 @@ GradientProblem::GradientProblem(FirstOrderFunction* function)
CHECK(function != nullptr);
}
GradientProblem::GradientProblem(FirstOrderFunction* function,
LocalParameterization* parameterization)
: function_(function),
parameterization_(parameterization),
scratch_(new double[function_->NumParameters()]) {
CHECK(function != nullptr);
if (parameterization != nullptr) {
manifold_ =
std::make_unique<internal::ManifoldAdapter>(parameterization_.get());
} else {
manifold_ = std::make_unique<EuclideanManifold<DYNAMIC>>(
function_->NumParameters());
}
CHECK_EQ(function_->NumParameters(), manifold_->AmbientSize());
}
GradientProblem::GradientProblem(FirstOrderFunction* function,
Manifold* manifold)
: function_(function), scratch_(new double[function_->NumParameters()]) {
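With the LocalParameterization-based constructor deleted (the hunk above ends inside the surviving Manifold-taking overload), a GradientProblem is now built either bare or with a Manifold. A hedged end-to-end sketch; Rosenbrock2D is a placeholder FirstOrderFunction written for this note, not part of the diff:

#include "ceres/gradient_problem.h"
#include "ceres/gradient_problem_solver.h"

// Placeholder objective f(x, y) = (1 - x)^2 + 100 (y - x^2)^2.
class Rosenbrock2D final : public ceres::FirstOrderFunction {
 public:
  bool Evaluate(const double* p, double* cost,
                double* gradient) const override {
    const double x = p[0], y = p[1];
    *cost = (1.0 - x) * (1.0 - x) + 100.0 * (y - x * x) * (y - x * x);
    if (gradient != nullptr) {
      gradient[0] = -2.0 * (1.0 - x) - 400.0 * x * (y - x * x);
      gradient[1] = 200.0 * (y - x * x);
    }
    return true;
  }
  int NumParameters() const override { return 2; }
};

int main() {
  double parameters[2] = {-1.2, 1.0};
  // The problem takes ownership of the function, per the constructor above.
  ceres::GradientProblem problem(new Rosenbrock2D());
  ceres::GradientProblemSolver::Options options;
  ceres::GradientProblemSolver::Summary summary;
  ceres::Solve(options, problem, parameters, &summary);
  return 0;
}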

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -43,8 +43,7 @@
#include "ceres/sparse_matrix.h"
#include "ceres/wall_time.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class CERES_NO_EXPORT GradientProblemEvaluator final : public Evaluator {
public:
@@ -53,10 +52,10 @@ class CERES_NO_EXPORT GradientProblemEvaluator final : public Evaluator {
std::unique_ptr<SparseMatrix> CreateJacobian() const final { return nullptr; }
bool Evaluate(const EvaluateOptions& evaluate_options,
bool Evaluate(const EvaluateOptions& /*evaluate_options*/,
const double* state,
double* cost,
double* residuals,
double* /*residuals*/,
double* gradient,
SparseMatrix* jacobian) final {
CHECK(jacobian == nullptr);
@@ -83,7 +82,7 @@ class CERES_NO_EXPORT GradientProblemEvaluator final : public Evaluator {
int NumParameters() const final { return problem_.NumParameters(); }
int NumEffectiveParameters() const final {
return problem_.NumLocalParameters();
return problem_.NumTangentParameters();
}
int NumResiduals() const final { return 1; }
@@ -97,8 +96,7 @@ class CERES_NO_EXPORT GradientProblemEvaluator final : public Evaluator {
::ceres::internal::ExecutionSummary execution_summary_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,9 @@
#include "ceres/gradient_problem_solver.h"
#include <map>
#include <memory>
#include <string>
#include "ceres/callbacks.h"
#include "ceres/gradient_problem.h"
@@ -48,7 +50,6 @@
namespace ceres {
using internal::StringAppendF;
using internal::StringPrintf;
using std::string;
namespace {
@@ -112,7 +113,7 @@ void GradientProblemSolver::Solve(const GradientProblemSolver::Options& options,
*summary = Summary();
// clang-format off
summary->num_parameters = problem.NumParameters();
summary->num_local_parameters = problem.NumLocalParameters();
summary->num_tangent_parameters = problem.NumTangentParameters();
summary->line_search_direction_type = options.line_search_direction_type; // NOLINT
summary->line_search_interpolation_type = options.line_search_interpolation_type; // NOLINT
summary->line_search_type = options.line_search_type;
@@ -180,7 +181,7 @@ void GradientProblemSolver::Solve(const GradientProblemSolver::Options& options,
SetSummaryFinalCost(summary);
}
const std::map<string, CallStatistics>& evaluator_statistics =
const std::map<std::string, CallStatistics>& evaluator_statistics =
minimizer_options.evaluator->Statistics();
{
const CallStatistics& call_stats = FindWithDefault(
@@ -203,7 +204,7 @@ bool GradientProblemSolver::Summary::IsSolutionUsable() const {
return internal::IsSolutionUsable(*this);
}
string GradientProblemSolver::Summary::BriefReport() const {
std::string GradientProblemSolver::Summary::BriefReport() const {
return StringPrintf(
"Ceres GradientProblemSolver Report: "
"Iterations: %d, "
@@ -216,17 +217,20 @@ string GradientProblemSolver::Summary::BriefReport() const {
TerminationTypeToString(termination_type));
}
string GradientProblemSolver::Summary::FullReport() const {
std::string GradientProblemSolver::Summary::FullReport() const {
using internal::VersionString;
string report = string("\nSolver Summary (v " + VersionString() + ")\n\n");
// NOTE operator+ is not usable for concatenating a string and a string_view.
std::string report =
std::string{"\nSolver Summary (v "}.append(VersionString()) + ")\n\n";
StringAppendF(&report, "Parameters % 25d\n", num_parameters);
if (num_local_parameters != num_parameters) {
StringAppendF(&report, "Local parameters % 25d\n", num_local_parameters);
if (num_tangent_parameters != num_parameters) {
StringAppendF(
&report, "Tangent parameters % 25d\n", num_tangent_parameters);
}
string line_search_direction_string;
std::string line_search_direction_string;
if (line_search_direction_type == LBFGS) {
line_search_direction_string = StringPrintf("LBFGS (%d)", max_lbfgs_rank);
} else if (line_search_direction_type == NONLINEAR_CONJUGATE_GRADIENT) {
@@ -241,7 +245,7 @@ string GradientProblemSolver::Summary::FullReport() const {
"Line search direction %19s\n",
line_search_direction_string.c_str());
const string line_search_type_string = StringPrintf(
const std::string line_search_type_string = StringPrintf(
"%s %s",
LineSearchInterpolationTypeToString(line_search_interpolation_type),
LineSearchTypeToString(line_search_type));
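The new NOTE above is worth unpacking: through C++23 the standard library provides no operator+ overload that mixes std::string (or const char*) with std::string_view, which is why the report header is built with append instead. A standalone sketch:

#include <string>
#include <string_view>

// Sketch of the concatenation workaround used in FullReport() above.
std::string MakeSummaryHeader(std::string_view version) {
  // std::string bad = "\nSolver Summary (v " + version;  // ill-formed:
  // no operator+ taking (const char*, string_view) through C++23.
  std::string report = std::string{"\nSolver Summary (v "}.append(version);
  report += ")\n\n";
  return report;
}

This matters here because VersionString() now returns a string_view-like value rather than a std::string.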

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -42,8 +42,7 @@
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// An unweighted undirected graph templated over the vertex ids. Vertex
// should be hashable.
@@ -206,7 +205,6 @@ class WeightedGraph {
edge_weights_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_GRAPH_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -45,8 +45,7 @@
#include "ceres/wall_time.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Compare two vertices of a graph by their degrees, if the degrees
// are equal then order them by their ids.
@@ -340,7 +339,6 @@ std::unique_ptr<WeightedGraph<Vertex>> Degree2MaximumSpanningForest(
return forest;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_GRAPH_ALGORITHMS_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -35,15 +35,16 @@
#include "ceres/block_structure.h"
#include "ceres/internal/eigen.h"
#include "ceres/linear_solver.h"
#include "ceres/parallel_for.h"
#include "ceres/parallel_vector_ops.h"
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
ImplicitSchurComplement::ImplicitSchurComplement(
const LinearSolver::Options& options)
: options_(options), D_(nullptr), b_(nullptr) {}
: options_(options) {}
void ImplicitSchurComplement::Init(const BlockSparseMatrix& A,
const double* D,
@@ -57,11 +58,16 @@ void ImplicitSchurComplement::Init(const BlockSparseMatrix& A,
D_ = D;
b_ = b;
compute_ftf_inverse_ =
options_.use_spse_initialization ||
options_.preconditioner_type == JACOBI ||
options_.preconditioner_type == SCHUR_POWER_SERIES_EXPANSION;
// Initialize temporary storage and compute the block diagonals of
// E'E and F'E.
if (block_diagonal_EtE_inverse_ == nullptr) {
block_diagonal_EtE_inverse_ = A_->CreateBlockDiagonalEtE();
if (options_.preconditioner_type == JACOBI) {
if (compute_ftf_inverse_) {
block_diagonal_FtF_inverse_ = A_->CreateBlockDiagonalFtF();
}
rhs_.resize(A_->num_cols_f());
@@ -72,7 +78,7 @@ void ImplicitSchurComplement::Init(const BlockSparseMatrix& A,
tmp_f_cols_.resize(A_->num_cols_f());
} else {
A_->UpdateBlockDiagonalEtE(block_diagonal_EtE_inverse_.get());
if (options_.preconditioner_type == JACOBI) {
if (compute_ftf_inverse_) {
A_->UpdateBlockDiagonalFtF(block_diagonal_FtF_inverse_.get());
}
}
@@ -81,7 +87,7 @@ void ImplicitSchurComplement::Init(const BlockSparseMatrix& A,
// contributions from the diagonal D if it is non-null. Add that to
// the block diagonals and invert them.
AddDiagonalAndInvert(D_, block_diagonal_EtE_inverse_.get());
if (options_.preconditioner_type == JACOBI) {
if (compute_ftf_inverse_) {
AddDiagonalAndInvert((D_ == nullptr) ? nullptr : D_ + A_->num_cols_e(),
block_diagonal_FtF_inverse_.get());
}
@@ -97,36 +103,74 @@ void ImplicitSchurComplement::Init(const BlockSparseMatrix& A,
// By breaking it down into individual matrix vector products
// involving the matrices E and F. This is implemented using a
// PartitionedMatrixView of the input matrix A.
void ImplicitSchurComplement::RightMultiply(const double* x, double* y) const {
void ImplicitSchurComplement::RightMultiplyAndAccumulate(const double* x,
double* y) const {
// y1 = F x
tmp_rows_.setZero();
A_->RightMultiplyF(x, tmp_rows_.data());
ParallelSetZero(options_.context, options_.num_threads, tmp_rows_);
A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data());
// y2 = E' y1
tmp_e_cols_.setZero();
A_->LeftMultiplyE(tmp_rows_.data(), tmp_e_cols_.data());
ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_);
A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data());
// y3 = -(E'E)^-1 y2
tmp_e_cols_2_.setZero();
block_diagonal_EtE_inverse_->RightMultiply(tmp_e_cols_.data(),
tmp_e_cols_2_.data());
tmp_e_cols_2_ *= -1.0;
ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_2_);
block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
tmp_e_cols_2_.data(),
options_.context,
options_.num_threads);
ParallelAssign(
options_.context, options_.num_threads, tmp_e_cols_2_, -tmp_e_cols_2_);
// y1 = y1 + E y3
A_->RightMultiplyE(tmp_e_cols_2_.data(), tmp_rows_.data());
A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data());
// y5 = D * x
if (D_ != nullptr) {
ConstVectorRef Dref(D_ + A_->num_cols_e(), num_cols());
VectorRef(y, num_cols()) =
(Dref.array().square() * ConstVectorRef(x, num_cols()).array())
.matrix();
VectorRef y_cols(y, num_cols());
ParallelAssign(
options_.context,
options_.num_threads,
y_cols,
(Dref.array().square() * ConstVectorRef(x, num_cols()).array()));
} else {
VectorRef(y, num_cols()).setZero();
ParallelSetZero(options_.context, options_.num_threads, y, num_cols());
}
// y = y5 + F' y1
A_->LeftMultiplyF(tmp_rows_.data(), y);
A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), y);
}
void ImplicitSchurComplement::InversePowerSeriesOperatorRightMultiplyAccumulate(
const double* x, double* y) const {
CHECK(compute_ftf_inverse_);
// y1 = F x
ParallelSetZero(options_.context, options_.num_threads, tmp_rows_);
A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data());
// y2 = E' y1
ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_);
A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data());
// y3 = (E'E)^-1 y2
ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_2_);
block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
tmp_e_cols_2_.data(),
options_.context,
options_.num_threads);
// y1 = E y3
ParallelSetZero(options_.context, options_.num_threads, tmp_rows_);
A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data());
// y4 = F' y1
ParallelSetZero(options_.context, options_.num_threads, tmp_f_cols_);
A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), tmp_f_cols_.data());
// y += (F'F)^-1 y4
block_diagonal_FtF_inverse_->RightMultiplyAndAccumulate(
tmp_f_cols_.data(), y, options_.context, options_.num_threads);
}
// Given a block diagonal matrix and an optional array of diagonal
@@ -136,26 +180,31 @@ void ImplicitSchurComplement::AddDiagonalAndInvert(
const double* D, BlockSparseMatrix* block_diagonal) {
const CompressedRowBlockStructure* block_diagonal_structure =
block_diagonal->block_structure();
for (const auto& row : block_diagonal_structure->rows) {
const int row_block_pos = row.block.position;
const int row_block_size = row.block.size;
const Cell& cell = row.cells[0];
MatrixRef m(block_diagonal->mutable_values() + cell.position,
row_block_size,
row_block_size);
ParallelFor(options_.context,
0,
block_diagonal_structure->rows.size(),
options_.num_threads,
[block_diagonal_structure, D, block_diagonal](int row_block_id) {
auto& row = block_diagonal_structure->rows[row_block_id];
const int row_block_pos = row.block.position;
const int row_block_size = row.block.size;
const Cell& cell = row.cells[0];
MatrixRef m(block_diagonal->mutable_values() + cell.position,
row_block_size,
row_block_size);
if (D != nullptr) {
ConstVectorRef d(D + row_block_pos, row_block_size);
m += d.array().square().matrix().asDiagonal();
}
if (D != nullptr) {
ConstVectorRef d(D + row_block_pos, row_block_size);
m += d.array().square().matrix().asDiagonal();
}
m = m.selfadjointView<Eigen::Upper>().llt().solve(
Matrix::Identity(row_block_size, row_block_size));
}
m = m.selfadjointView<Eigen::Upper>().llt().solve(
Matrix::Identity(row_block_size, row_block_size));
});
}
// Similar to RightMultiply, use the block structure of the matrix A
// to compute y = (E'E)^-1 (E'b - E'F x).
// Similar to RightMultiplyAndAccumulate, use the block structure of the matrix
// A to compute y = (E'E)^-1 (E'b - E'F x).
void ImplicitSchurComplement::BackSubstitute(const double* x, double* y) {
const int num_cols_e = A_->num_cols_e();
const int num_cols_f = A_->num_cols_f();
@@ -163,26 +212,34 @@ void ImplicitSchurComplement::BackSubstitute(const double* x, double* y) {
const int num_rows = A_->num_rows();
// y1 = F x
tmp_rows_.setZero();
A_->RightMultiplyF(x, tmp_rows_.data());
ParallelSetZero(options_.context, options_.num_threads, tmp_rows_);
A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data());
// y2 = b - y1
tmp_rows_ = ConstVectorRef(b_, num_rows) - tmp_rows_;
ParallelAssign(options_.context,
options_.num_threads,
tmp_rows_,
ConstVectorRef(b_, num_rows) - tmp_rows_);
// y3 = E' y2
tmp_e_cols_.setZero();
A_->LeftMultiplyE(tmp_rows_.data(), tmp_e_cols_.data());
ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_);
A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data());
// y = (E'E)^-1 y3
VectorRef(y, num_cols).setZero();
block_diagonal_EtE_inverse_->RightMultiply(tmp_e_cols_.data(), y);
ParallelSetZero(options_.context, options_.num_threads, y, num_cols);
block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(
tmp_e_cols_.data(), y, options_.context, options_.num_threads);
// The full solution vector y has two blocks. The first block of
// variables corresponds to the eliminated variables, which we just
// computed via back substitution. The second block of variables
// corresponds to the Schur complement system, so we just copy those
// values from the solution to the Schur complement.
VectorRef(y + num_cols_e, num_cols_f) = ConstVectorRef(x, num_cols_f);
VectorRef y_cols_f(y + num_cols_e, num_cols_f);
ParallelAssign(options_.context,
options_.num_threads,
y_cols_f,
ConstVectorRef(x, num_cols_f));
}
// Compute the RHS of the Schur complement system.
@@ -193,24 +250,29 @@ void ImplicitSchurComplement::BackSubstitute(const double* x, double* y) {
// this using a series of matrix vector products.
void ImplicitSchurComplement::UpdateRhs() {
// y1 = E'b
tmp_e_cols_.setZero();
A_->LeftMultiplyE(b_, tmp_e_cols_.data());
ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_);
A_->LeftMultiplyAndAccumulateE(b_, tmp_e_cols_.data());
// y2 = (E'E)^-1 y1
Vector y2 = Vector::Zero(A_->num_cols_e());
block_diagonal_EtE_inverse_->RightMultiply(tmp_e_cols_.data(), y2.data());
ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_2_);
block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
tmp_e_cols_2_.data(),
options_.context,
options_.num_threads);
// y3 = E y2
tmp_rows_.setZero();
A_->RightMultiplyE(y2.data(), tmp_rows_.data());
ParallelSetZero(options_.context, options_.num_threads, tmp_rows_);
A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data());
// y3 = b - y3
tmp_rows_ = ConstVectorRef(b_, A_->num_rows()) - tmp_rows_;
ParallelAssign(options_.context,
options_.num_threads,
tmp_rows_,
ConstVectorRef(b_, A_->num_rows()) - tmp_rows_);
// rhs = F' y3
rhs_.setZero();
A_->LeftMultiplyF(tmp_rows_.data(), rhs_.data());
ParallelSetZero(options_.context, options_.num_threads, rhs_);
A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), rhs_.data());
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
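Taken together, the refactored file still never materializes the Schur complement S. In the E/F notation of the comments above (with the squared diagonal D folded into the block diagonals where present), the implicit operations are:

  S   = F'F - F'E (E'E)^-1 E'F    // RightMultiplyAndAccumulate: y += S x
  rhs = F' (b - E (E'E)^-1 E'b)   // UpdateRhs
  y_e = (E'E)^-1 (E'b - E'F x)    // BackSubstitute
  Z   = (F'F)^-1 F'E (E'E)^-1 E'F // InversePowerSeriesOperatorRightMultiplyAccumulate: y += Z x

What changed in this hunk is only how those products are evaluated: every setZero and dense assignment now goes through ParallelSetZero/ParallelAssign, the block-diagonal multiplies take the context and thread count, and AddDiagonalAndInvert inverts its diagonal blocks inside a ParallelFor.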

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -44,8 +44,7 @@
#include "ceres/partitioned_matrix_view.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class BlockSparseMatrix;
@@ -82,13 +81,13 @@ class BlockSparseMatrix;
// (which for our purposes is an easily inverted block diagonal
// matrix), it can be done in terms of matrix vector products with E,
// F and (E'E)^-1. This class implements this functionality and other
// auxilliary bits needed to implement a CG solver on the Schur
// auxiliary bits needed to implement a CG solver on the Schur
// complement using the PartitionedMatrixView object.
//
// THREAD SAFETY: This class is nqot thread safe. In particular, the
// RightMultiply (and the LeftMultiply) methods are not thread safe as
// they depend on mutable arrays used for the temporaries needed to
// compute the product y += Sx;
// THREAD SAFETY: This class is not thread safe. In particular, the
// RightMultiplyAndAccumulate (and the LeftMultiplyAndAccumulate) methods are
// not thread safe as they depend on mutable arrays used for the temporaries
// needed to compute the product y += Sx;
class CERES_NO_EXPORT ImplicitSchurComplement final : public LinearOperator {
public:
// num_eliminate_blocks is the number of E blocks in the matrix
@@ -115,14 +114,20 @@ class CERES_NO_EXPORT ImplicitSchurComplement final : public LinearOperator {
void Init(const BlockSparseMatrix& A, const double* D, const double* b);
// y += Sx, where S is the Schur complement.
void RightMultiply(const double* x, double* y) const final;
void RightMultiplyAndAccumulate(const double* x, double* y) const final;
// The Schur complement is a symmetric positive definite matrix,
// thus the left and right multiply operators are the same.
void LeftMultiply(const double* x, double* y) const final {
RightMultiply(x, y);
void LeftMultiplyAndAccumulate(const double* x, double* y) const final {
RightMultiplyAndAccumulate(x, y);
}
// Following is useful for approximation of S^-1 via power series expansion.
// Z = (F'F)^-1 F'E (E'E)^-1 E'F
// y += Zx
void InversePowerSeriesOperatorRightMultiplyAccumulate(const double* x,
double* y) const;
// y = (E'E)^-1 (E'b - E'F x). Given an estimate of the solution to
// the Schur complement system, this method computes the value of
// the e_block variables that were eliminated to form the Schur
@@ -138,6 +143,7 @@ class CERES_NO_EXPORT ImplicitSchurComplement final : public LinearOperator {
}
const BlockSparseMatrix* block_diagonal_FtF_inverse() const {
CHECK(compute_ftf_inverse_);
return block_diagonal_FtF_inverse_.get();
}
@@ -146,25 +152,24 @@ class CERES_NO_EXPORT ImplicitSchurComplement final : public LinearOperator {
void UpdateRhs();
const LinearSolver::Options& options_;
bool compute_ftf_inverse_ = false;
std::unique_ptr<PartitionedMatrixViewBase> A_;
const double* D_;
const double* b_;
const double* D_ = nullptr;
const double* b_ = nullptr;
std::unique_ptr<BlockSparseMatrix> block_diagonal_EtE_inverse_;
std::unique_ptr<BlockSparseMatrix> block_diagonal_FtF_inverse_;
Vector rhs_;
// Temporary storage vectors used to implement RightMultiply.
// Temporary storage vectors used to implement RightMultiplyAndAccumulate.
mutable Vector tmp_rows_;
mutable Vector tmp_e_cols_;
mutable Vector tmp_e_cols_2_;
mutable Vector tmp_f_cols_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"
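The comment block above defines the implicit product y += Sx that RightMultiplyAndAccumulate() provides without ever forming S = F'F - F'E (E'E)^-1 E'F. A dense Eigen sketch of that product under illustrative names (the temporaries mirror tmp_rows_ / tmp_e_cols_ in the implementation):

#include <Eigen/Dense>

// y += S x, computed purely through products with E and F.
void SchurProductAccumulateSketch(const Eigen::MatrixXd& E,
                                  const Eigen::MatrixXd& F,
                                  const Eigen::VectorXd& x,
                                  Eigen::VectorXd& y) {
  const Eigen::VectorXd Fx = F * x;                 // tmp_rows_
  const Eigen::VectorXd EtFx = E.transpose() * Fx;  // tmp_e_cols_
  // w = (E'E)^-1 E'F x; block diagonal and cheap to invert in the real code.
  const Eigen::VectorXd w = (E.transpose() * E).llt().solve(EtFx);
  // y += F'(Fx - E w) == (F'F - F'E (E'E)^-1 E'F) x
  y += F.transpose() * (Fx - E * w);
}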

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -35,8 +35,7 @@
#include "ceres/small_blas.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Create the CompressedRowSparseMatrix matrix that will contain the
// inner product.
@@ -52,16 +51,9 @@ InnerProductComputer::CreateResultMatrix(
auto matrix = std::make_unique<CompressedRowSparseMatrix>(
m_.num_cols(), m_.num_cols(), num_nonzeros);
matrix->set_storage_type(storage_type);
const CompressedRowBlockStructure* bs = m_.block_structure();
const std::vector<Block>& blocks = bs->cols;
matrix->mutable_row_blocks()->resize(blocks.size());
matrix->mutable_col_blocks()->resize(blocks.size());
for (int i = 0; i < blocks.size(); ++i) {
(*(matrix->mutable_row_blocks()))[i] = blocks[i].size;
(*(matrix->mutable_col_blocks()))[i] = blocks[i].size;
}
*matrix->mutable_row_blocks() = bs->cols;
*matrix->mutable_col_blocks() = bs->cols;
return matrix;
}
@@ -78,6 +70,10 @@ int InnerProductComputer::ComputeNonzeros(
row_nnz->resize(blocks.size());
std::fill(row_nnz->begin(), row_nnz->end(), 0);
if (product_terms.empty()) {
return 0;
}
// First product term.
(*row_nnz)[product_terms[0].row] = blocks[product_terms[0].col].size;
int num_nonzeros =
@@ -130,8 +126,10 @@ std::unique_ptr<InnerProductComputer> InnerProductComputer::Create(
const int start_row_block,
const int end_row_block,
CompressedRowSparseMatrix::StorageType product_storage_type) {
CHECK(product_storage_type == CompressedRowSparseMatrix::LOWER_TRIANGULAR ||
product_storage_type == CompressedRowSparseMatrix::UPPER_TRIANGULAR);
CHECK(product_storage_type ==
CompressedRowSparseMatrix::StorageType::LOWER_TRIANGULAR ||
product_storage_type ==
CompressedRowSparseMatrix::StorageType::UPPER_TRIANGULAR);
CHECK_GT(m.num_nonzeros(), 0)
<< "Congratulations, you found a bug in Ceres. Please report it.";
std::unique_ptr<InnerProductComputer> inner_product_computer(
@@ -157,7 +155,8 @@ void InnerProductComputer::Init(
for (int c1 = 0; c1 < row.cells.size(); ++c1) {
const Cell& cell1 = row.cells[c1];
int c2_begin, c2_end;
if (product_storage_type == CompressedRowSparseMatrix::LOWER_TRIANGULAR) {
if (product_storage_type ==
CompressedRowSparseMatrix::StorageType::LOWER_TRIANGULAR) {
c2_begin = 0;
c2_end = c1 + 1;
} else {
@@ -195,6 +194,10 @@ void InnerProductComputer::ComputeOffsetsAndCreateResultMatrix(
*(crsm_rows + 1) = *crsm_rows + row_block_nnz[i];
}
}
result_offsets_.resize(product_terms.size());
if (num_nonzeros == 0) {
return;
}
// The following macro FILL_CRSM_COL_BLOCK is key to understanding
// how this class works.
@@ -241,12 +244,11 @@ void InnerProductComputer::ComputeOffsetsAndCreateResultMatrix(
} \
}
result_offsets_.resize(product_terms.size());
int col_nnz = 0;
int nnz = 0;
// Process the first term.
const InnerProductComputer::ProductTerm* current = &product_terms[0];
const InnerProductComputer::ProductTerm* current = product_terms.data();
FILL_CRSM_COL_BLOCK;
// Process the rest of the terms.
@@ -264,7 +266,7 @@ void InnerProductComputer::ComputeOffsetsAndCreateResultMatrix(
if (previous->row == current->row) {
// if the current and previous terms are in the same row block,
// then they differ in the column block, in which case advance
// col_nnz by the column size of the prevous term.
// col_nnz by the column size of the previous term.
col_nnz += col_blocks[previous->col].size;
} else {
// If we have moved to a new row-block, then col_nnz is zero,
@@ -302,7 +304,8 @@ void InnerProductComputer::Compute() {
rows[bs->cols[cell1.block_id].position];
int c2_begin, c2_end;
if (storage_type == CompressedRowSparseMatrix::LOWER_TRIANGULAR) {
if (storage_type ==
CompressedRowSparseMatrix::StorageType::LOWER_TRIANGULAR) {
c2_begin = 0;
c2_end = c1 + 1;
} else {
@@ -330,5 +333,4 @@ void InnerProductComputer::Compute() {
CHECK_EQ(cursor, result_offsets_.size());
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
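InnerProductComputer assembles m'm directly in compressed-row form, storing a single triangle of the symmetric result. A dense Eigen sketch of the quantity it targets (illustrative only; the real computation runs block-wise over the product terms and reuses precomputed offsets):

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd m(3, 2);
  m << 1, 2,
       3, 4,
       5, 6;
  // The full inner product is symmetric ...
  const Eigen::MatrixXd mtm = m.transpose() * m;
  // ... so only one triangle needs to be stored, mirroring
  // StorageType::LOWER_TRIANGULAR above.
  const Eigen::MatrixXd lower = mtm.triangularView<Eigen::Lower>();
  std::cout << lower << "\n";
  return 0;
}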

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -39,8 +39,7 @@
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// This class is used to repeatedly compute the inner product
//
@@ -153,8 +152,7 @@ class CERES_NO_EXPORT InnerProductComputer {
std::vector<int> result_offsets_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -35,8 +35,7 @@
#include "ceres/internal/eigen.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Helper routine to compute the inverse or pseudo-inverse of a
// symmetric positive semi-definite matrix.
@@ -73,7 +72,6 @@ typename EigenTypes<kSize, kSize>::Matrix InvertPSDMatrix(
return svd.solve(MType::Identity(size, size));
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_INVERT_PSD_MATRIX_H_
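A dynamic-size sketch of the strategy this helper implements: attempt a Cholesky factorization, and fall back to an SVD-based pseudo-inverse when the matrix is only positive semi-definite. This is a simplification under stated assumptions; the real template also handles fixed sizes and a compile-time full-rank hint:

#include <Eigen/Dense>

Eigen::MatrixXd InvertPSDSketch(const Eigen::MatrixXd& m) {
  const Eigen::Index size = m.rows();
  const Eigen::MatrixXd identity = Eigen::MatrixXd::Identity(size, size);
  // Fast path: full-rank matrices admit a Cholesky solve.
  Eigen::LLT<Eigen::MatrixXd> llt(m);
  if (llt.info() == Eigen::Success) {
    return llt.solve(identity);
  }
  // Rank-deficient case: solve via SVD, yielding the pseudo-inverse.
  Eigen::JacobiSVD<Eigen::MatrixXd> svd(
      m, Eigen::ComputeThinU | Eigen::ComputeThinV);
  return svd.solve(identity);
}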

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2016 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -33,8 +33,7 @@
#include <algorithm>
#include <cmath>
namespace ceres {
namespace internal {
namespace ceres::internal {
bool IsClose(double x,
double y,
double relative_precision,
@@ -57,5 +56,4 @@ bool IsClose(double x,
}
return *relative_error < std::fabs(relative_precision);
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2016 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -36,8 +36,7 @@
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Returns true if x and y have a relative (unsigned) difference less than
// relative_precision and false otherwise. Stores the relative and absolute
// difference in relative/absolute_error if non-nullptr. If one of the two
@@ -48,8 +47,7 @@ CERES_NO_EXPORT bool IsClose(double x,
double relative_precision,
double* relative_error,
double* absolute_error);
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"
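A self-contained sketch matching the documented contract of IsClose(). The zero-operand behavior is an assumption inferred from the truncated comment above: when either value is exactly zero, the absolute difference stands in for the relative one:

#include <algorithm>
#include <cmath>

bool IsCloseSketch(double x, double y, double relative_precision,
                   double* relative_error, double* absolute_error) {
  double local_relative, local_absolute;
  if (relative_error == nullptr) relative_error = &local_relative;
  if (absolute_error == nullptr) absolute_error = &local_absolute;
  *absolute_error = std::fabs(x - y);
  *relative_error = *absolute_error / std::max(std::fabs(x), std::fabs(y));
  if (x == 0.0 || y == 0.0) {
    // A relative difference is meaningless against a zero operand.
    *relative_error = *absolute_error;
  }
  return *relative_error < std::fabs(relative_precision);
}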

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -33,43 +33,69 @@
#include <string>
#include "Eigen/Core"
#include "ceres/dense_cholesky.h"
#include "ceres/sparse_cholesky.h"
#include "ceres/sparse_matrix.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
IterativeRefiner::IterativeRefiner(const int max_num_iterations)
SparseIterativeRefiner::SparseIterativeRefiner(const int max_num_iterations)
: max_num_iterations_(max_num_iterations) {}
IterativeRefiner::~IterativeRefiner() = default;
SparseIterativeRefiner::~SparseIterativeRefiner() = default;
void IterativeRefiner::Allocate(int num_cols) {
void SparseIterativeRefiner::Allocate(int num_cols) {
residual_.resize(num_cols);
correction_.resize(num_cols);
lhs_x_solution_.resize(num_cols);
}
void IterativeRefiner::Refine(const SparseMatrix& lhs,
const double* rhs_ptr,
SparseCholesky* sparse_cholesky,
double* solution_ptr) {
void SparseIterativeRefiner::Refine(const SparseMatrix& lhs,
const double* rhs_ptr,
SparseCholesky* cholesky,
double* solution_ptr) {
const int num_cols = lhs.num_cols();
Allocate(num_cols);
ConstVectorRef rhs(rhs_ptr, num_cols);
VectorRef solution(solution_ptr, num_cols);
std::string ignored_message;
for (int i = 0; i < max_num_iterations_; ++i) {
// residual = rhs - lhs * solution
lhs_x_solution_.setZero();
lhs.RightMultiply(solution_ptr, lhs_x_solution_.data());
lhs.RightMultiplyAndAccumulate(solution_ptr, lhs_x_solution_.data());
residual_ = rhs - lhs_x_solution_;
// solution += lhs^-1 residual
std::string ignored_message;
sparse_cholesky->Solve(
residual_.data(), correction_.data(), &ignored_message);
cholesky->Solve(residual_.data(), correction_.data(), &ignored_message);
solution += correction_;
}
};
} // namespace internal
} // namespace ceres
DenseIterativeRefiner::DenseIterativeRefiner(const int max_num_iterations)
: max_num_iterations_(max_num_iterations) {}
DenseIterativeRefiner::~DenseIterativeRefiner() = default;
void DenseIterativeRefiner::Allocate(int num_cols) {
residual_.resize(num_cols);
correction_.resize(num_cols);
}
void DenseIterativeRefiner::Refine(const int num_cols,
const double* lhs_ptr,
const double* rhs_ptr,
DenseCholesky* cholesky,
double* solution_ptr) {
Allocate(num_cols);
ConstMatrixRef lhs(lhs_ptr, num_cols, num_cols);
ConstVectorRef rhs(rhs_ptr, num_cols);
VectorRef solution(solution_ptr, num_cols);
std::string ignored_message;
for (int i = 0; i < max_num_iterations_; ++i) {
residual_ = rhs - lhs * solution;
// solution += lhs^-1 residual
cholesky->Solve(residual_.data(), correction_.data(), &ignored_message);
solution += correction_;
}
};
} // namespace ceres::internal
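Both refiners above share the same loop: with a (possibly approximate) factorization of lhs in hand, solve against the current residual and apply the correction. A dense Eigen sketch with LLT standing in for DenseCholesky:

#include <Eigen/Dense>

void RefineSketch(const Eigen::MatrixXd& lhs,
                  const Eigen::VectorXd& rhs,
                  const Eigen::LLT<Eigen::MatrixXd>& cholesky,
                  int max_num_iterations,
                  Eigen::VectorXd& solution) {
  for (int i = 0; i < max_num_iterations; ++i) {
    // residual = rhs - lhs * solution
    const Eigen::VectorXd residual = rhs - lhs * solution;
    // solution += lhs^-1 residual, reusing the precomputed factorization.
    solution += cholesky.solve(residual);
  }
}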

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -39,9 +39,9 @@
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class DenseCholesky;
class SparseCholesky;
class SparseMatrix;
@@ -58,20 +58,20 @@ class SparseMatrix;
// Definite linear systems.
//
// The above iterative loop is run until max_num_iterations is reached.
class CERES_NO_EXPORT IterativeRefiner {
class CERES_NO_EXPORT SparseIterativeRefiner {
public:
// max_num_iterations is the number of refinement iterations to
// perform.
explicit IterativeRefiner(int max_num_iterations);
explicit SparseIterativeRefiner(int max_num_iterations);
// Needed for mocking.
virtual ~IterativeRefiner();
virtual ~SparseIterativeRefiner();
// Given an initial estimate of the solution of lhs * x = rhs, use
// max_num_iterations rounds of iterative refinement to improve it.
//
// sparse_cholesky is assumed to contain an already computed
// factorization (or approximation thereof) of lhs.
// cholesky is assumed to contain an already computed factorization (or
// an approximation thereof) of lhs.
//
// solution is expected to contain an approximation to the solution
// to lhs * x = rhs. It can be zero.
@@ -79,7 +79,7 @@ class CERES_NO_EXPORT IterativeRefiner {
// This method is virtual to facilitate mocking.
virtual void Refine(const SparseMatrix& lhs,
const double* rhs,
SparseCholesky* sparse_cholesky,
SparseCholesky* cholesky,
double* solution);
private:
@@ -91,7 +91,39 @@ class CERES_NO_EXPORT IterativeRefiner {
Vector lhs_x_solution_;
};
} // namespace internal
} // namespace ceres
class CERES_NO_EXPORT DenseIterativeRefiner {
public:
// max_num_iterations is the number of refinement iterations to
// perform.
explicit DenseIterativeRefiner(int max_num_iterations);
// Needed for mocking.
virtual ~DenseIterativeRefiner();
// Given an initial estimate of the solution of lhs * x = rhs, use
// max_num_iterations rounds of iterative refinement to improve it.
//
// cholesky is assumed to contain an already computed factorization (or
// an approximation thereof) of lhs.
//
// solution is expected to contain an approximation to the solution
// to lhs * x = rhs. It can be zero.
//
// This method is virtual to facilitate mocking.
virtual void Refine(int num_cols,
const double* lhs,
const double* rhs,
DenseCholesky* cholesky,
double* solution);
private:
void Allocate(int num_cols);
int max_num_iterations_;
Vector residual_;
Vector correction_;
};
} // namespace ceres::internal
#endif // CERES_INTERNAL_ITERATIVE_REFINER_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -43,6 +43,7 @@
#include "ceres/implicit_schur_complement.h"
#include "ceres/internal/eigen.h"
#include "ceres/linear_solver.h"
#include "ceres/power_series_expansion_preconditioner.h"
#include "ceres/preconditioner.h"
#include "ceres/schur_jacobi_preconditioner.h"
#include "ceres/triplet_sparse_matrix.h"
@@ -51,8 +52,7 @@
#include "ceres/wall_time.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
IterativeSchurComplementSolver::IterativeSchurComplementSolver(
LinearSolver::Options options)
@@ -68,6 +68,8 @@ LinearSolver::Summary IterativeSchurComplementSolver::SolveImpl(
EventLogger event_logger("IterativeSchurComplementSolver::Solve");
CHECK(A->block_structure() != nullptr);
CHECK(A->transpose_block_structure() != nullptr);
const int num_eliminate_blocks = options_.elimination_groups[0];
// Initialize a ImplicitSchurComplement object.
if (schur_complement_ == nullptr) {
@@ -86,45 +88,66 @@ LinearSolver::Summary IterativeSchurComplementSolver::SolveImpl(
VLOG(2) << "No parameter blocks left in the schur complement.";
LinearSolver::Summary summary;
summary.num_iterations = 0;
summary.termination_type = LINEAR_SOLVER_SUCCESS;
summary.termination_type = LinearSolverTerminationType::SUCCESS;
schur_complement_->BackSubstitute(nullptr, x);
return summary;
}
// Initialize the solution to the Schur complement system to zero.
// Initialize the solution to the Schur complement system.
reduced_linear_system_solution_.resize(schur_complement_->num_rows());
reduced_linear_system_solution_.setZero();
LinearSolver::Options cg_options;
cg_options.min_num_iterations = options_.min_num_iterations;
cg_options.max_num_iterations = options_.max_num_iterations;
ConjugateGradientsSolver cg_solver(cg_options);
LinearSolver::PerSolveOptions cg_per_solve_options;
cg_per_solve_options.r_tolerance = per_solve_options.r_tolerance;
cg_per_solve_options.q_tolerance = per_solve_options.q_tolerance;
if (options_.use_spse_initialization) {
Preconditioner::Options preconditioner_options(options_);
preconditioner_options.type = SCHUR_POWER_SERIES_EXPANSION;
PowerSeriesExpansionPreconditioner pse_solver(
schur_complement_.get(),
options_.max_num_spse_iterations,
options_.spse_tolerance,
preconditioner_options);
pse_solver.RightMultiplyAndAccumulate(
schur_complement_->rhs().data(),
reduced_linear_system_solution_.data());
}
CreatePreconditioner(A);
if (preconditioner_.get() != nullptr) {
if (preconditioner_ != nullptr) {
if (!preconditioner_->Update(*A, per_solve_options.D)) {
LinearSolver::Summary summary;
summary.num_iterations = 0;
summary.termination_type = LINEAR_SOLVER_FAILURE;
summary.termination_type = LinearSolverTerminationType::FAILURE;
summary.message = "Preconditioner update failed.";
return summary;
}
cg_per_solve_options.preconditioner = preconditioner_.get();
}
ConjugateGradientsSolverOptions cg_options;
cg_options.min_num_iterations = options_.min_num_iterations;
cg_options.max_num_iterations = options_.max_num_iterations;
cg_options.residual_reset_period = options_.residual_reset_period;
cg_options.q_tolerance = per_solve_options.q_tolerance;
cg_options.r_tolerance = per_solve_options.r_tolerance;
LinearOperatorAdapter lhs(*schur_complement_);
LinearOperatorAdapter preconditioner(*preconditioner_);
Vector scratch[4];
for (int i = 0; i < 4; ++i) {
scratch[i].resize(schur_complement_->num_cols());
}
Vector* scratch_ptr[4] = {&scratch[0], &scratch[1], &scratch[2], &scratch[3]};
event_logger.AddEvent("Setup");
LinearSolver::Summary summary =
cg_solver.Solve(schur_complement_.get(),
schur_complement_->rhs().data(),
cg_per_solve_options,
reduced_linear_system_solution_.data());
if (summary.termination_type != LINEAR_SOLVER_FAILURE &&
summary.termination_type != LINEAR_SOLVER_FATAL_ERROR) {
ConjugateGradientsSolver(cg_options,
lhs,
schur_complement_->rhs(),
preconditioner,
scratch_ptr,
reduced_linear_system_solution_);
if (summary.termination_type != LinearSolverTerminationType::FAILURE &&
summary.termination_type != LinearSolverTerminationType::FATAL_ERROR) {
schur_complement_->BackSubstitute(reduced_linear_system_solution_.data(),
x);
}
@@ -134,29 +157,31 @@ LinearSolver::Summary IterativeSchurComplementSolver::SolveImpl(
void IterativeSchurComplementSolver::CreatePreconditioner(
BlockSparseMatrix* A) {
if (options_.preconditioner_type == IDENTITY ||
preconditioner_.get() != nullptr) {
if (preconditioner_ != nullptr) {
return;
}
Preconditioner::Options preconditioner_options;
preconditioner_options.type = options_.preconditioner_type;
preconditioner_options.visibility_clustering_type =
options_.visibility_clustering_type;
preconditioner_options.sparse_linear_algebra_library_type =
options_.sparse_linear_algebra_library_type;
preconditioner_options.num_threads = options_.num_threads;
preconditioner_options.row_block_size = options_.row_block_size;
preconditioner_options.e_block_size = options_.e_block_size;
preconditioner_options.f_block_size = options_.f_block_size;
preconditioner_options.elimination_groups = options_.elimination_groups;
Preconditioner::Options preconditioner_options(options_);
CHECK(options_.context != nullptr);
preconditioner_options.context = options_.context;
switch (options_.preconditioner_type) {
case IDENTITY:
preconditioner_ = std::make_unique<IdentityPreconditioner>(
schur_complement_->num_cols());
break;
case JACOBI:
preconditioner_ = std::make_unique<SparseMatrixPreconditionerWrapper>(
schur_complement_->block_diagonal_FtF_inverse());
schur_complement_->block_diagonal_FtF_inverse(),
preconditioner_options);
break;
case SCHUR_POWER_SERIES_EXPANSION:
// Ignoring the value of spse_tolerance to ensure preconditioner stays
// fixed during the iterations of cg.
preconditioner_ = std::make_unique<PowerSeriesExpansionPreconditioner>(
schur_complement_.get(),
options_.max_num_spse_iterations,
0,
preconditioner_options);
break;
case SCHUR_JACOBI:
preconditioner_ = std::make_unique<SchurJacobiPreconditioner>(
@@ -172,5 +197,4 @@ void IterativeSchurComplementSolver::CreatePreconditioner(
}
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
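The solver above runs conjugate gradients on the implicit Schur complement, optionally warm-starting from a power-series approximation of S^-1. For orientation, a textbook preconditioned CG skeleton in dense Eigen (illustrative only: the real code never forms S, and it recycles four scratch vectors instead of allocating per iteration):

#include <Eigen/Dense>

Eigen::VectorXd PcgSketch(const Eigen::MatrixXd& S,
                          const Eigen::VectorXd& b,
                          const Eigen::MatrixXd& Minv,
                          int max_num_iterations,
                          double r_tolerance) {
  Eigen::VectorXd x = Eigen::VectorXd::Zero(b.size());
  Eigen::VectorXd r = b - S * x;
  Eigen::VectorXd z = Minv * r;  // Apply the preconditioner.
  Eigen::VectorXd p = z;
  double rz = r.dot(z);
  for (int i = 0; i < max_num_iterations; ++i) {
    if (r.norm() <= r_tolerance * b.norm()) break;
    const Eigen::VectorXd Sp = S * p;
    const double alpha = rz / p.dot(Sp);
    x += alpha * p;   // Advance the iterate.
    r -= alpha * Sp;  // Update the residual.
    z = Minv * r;
    const double rz_new = r.dot(z);
    p = z + (rz_new / rz) * p;  // New conjugate search direction.
    rz = rz_new;
  }
  return x;
}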

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -39,8 +39,7 @@
#include "ceres/linear_solver.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class BlockSparseMatrix;
class ImplicitSchurComplement;
@@ -53,7 +52,7 @@ class Preconditioner;
// The algorithm used by this solver was developed in a series of
// papers - "Agarwal et al, Bundle Adjustment in the Large, ECCV 2010"
// and "Wu et al, Multicore Bundle Adjustment, submitted to CVPR
// 2011" at the Univeristy of Washington.
// 2011" at the University of Washington.
//
// The key idea is that one can run Conjugate Gradients on the Schur
// Complement system without explicitly forming the Schur Complement
@@ -94,8 +93,7 @@ class CERES_NO_EXPORT IterativeSchurComplementSolver final
Vector reduced_linear_system_solution_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -38,13 +38,13 @@
#include "ceres/internal/eigen.h"
#include "ceres/linear_least_squares_problems.h"
#include "ceres/linear_solver.h"
#include "ceres/parallel_vector_ops.h"
#include "ceres/sparse_matrix.h"
#include "ceres/trust_region_strategy.h"
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
LevenbergMarquardtStrategy::LevenbergMarquardtStrategy(
const TrustRegionStrategy::Options& options)
@@ -54,7 +54,9 @@ LevenbergMarquardtStrategy::LevenbergMarquardtStrategy(
min_diagonal_(options.min_lm_diagonal),
max_diagonal_(options.max_lm_diagonal),
decrease_factor_(2.0),
reuse_diagonal_(false) {
reuse_diagonal_(false),
context_(options.context),
num_threads_(options.num_threads) {
CHECK(linear_solver_ != nullptr);
CHECK_GT(min_diagonal_, 0.0);
CHECK_LE(min_diagonal_, max_diagonal_);
@@ -78,14 +80,18 @@ TrustRegionStrategy::Summary LevenbergMarquardtStrategy::ComputeStep(
diagonal_.resize(num_parameters, 1);
}
jacobian->SquaredColumnNorm(diagonal_.data());
for (int i = 0; i < num_parameters; ++i) {
diagonal_[i] =
std::min(std::max(diagonal_[i], min_diagonal_), max_diagonal_);
}
jacobian->SquaredColumnNorm(diagonal_.data(), context_, num_threads_);
ParallelAssign(context_,
num_threads_,
diagonal_,
diagonal_.array().max(min_diagonal_).min(max_diagonal_));
}
lm_diagonal_ = (diagonal_ / radius_).array().sqrt();
if (lm_diagonal_.size() == 0) {
lm_diagonal_.resize(num_parameters);
}
ParallelAssign(
context_, num_threads_, lm_diagonal_, (diagonal_ / radius_).cwiseSqrt());
LinearSolver::PerSolveOptions solve_options;
solve_options.D = lm_diagonal_.data();
@@ -99,7 +105,7 @@ TrustRegionStrategy::Summary LevenbergMarquardtStrategy::ComputeStep(
// Invalidate the output array lm_step, so that we can detect if
// the linear solver generated numerical garbage. This is known
// to happen for the DENSE_QR and then DENSE_SCHUR solver when
// the Jacobin is severely rank deficient and mu is too small.
// the Jacobian is severely rank deficient and mu is too small.
InvalidateArray(num_parameters, step);
// Instead of solving Jx = -r, solve Jy = r.
@@ -108,17 +114,21 @@ TrustRegionStrategy::Summary LevenbergMarquardtStrategy::ComputeStep(
LinearSolver::Summary linear_solver_summary =
linear_solver_->Solve(jacobian, residuals, solve_options, step);
if (linear_solver_summary.termination_type == LINEAR_SOLVER_FATAL_ERROR) {
if (linear_solver_summary.termination_type ==
LinearSolverTerminationType::FATAL_ERROR) {
LOG(WARNING) << "Linear solver fatal error: "
<< linear_solver_summary.message;
} else if (linear_solver_summary.termination_type == LINEAR_SOLVER_FAILURE) {
} else if (linear_solver_summary.termination_type ==
LinearSolverTerminationType::FAILURE) {
LOG(WARNING) << "Linear solver failure. Failed to compute a step: "
<< linear_solver_summary.message;
} else if (!IsArrayValid(num_parameters, step)) {
LOG(WARNING) << "Linear solver failure. Failed to compute a finite step.";
linear_solver_summary.termination_type = LINEAR_SOLVER_FAILURE;
linear_solver_summary.termination_type =
LinearSolverTerminationType::FAILURE;
} else {
VectorRef(step, num_parameters) *= -1.0;
VectorRef step_vec(step, num_parameters);
ParallelAssign(context_, num_threads_, step_vec, -step_vec);
}
reuse_diagonal_ = true;
@@ -153,7 +163,7 @@ void LevenbergMarquardtStrategy::StepAccepted(double step_quality) {
reuse_diagonal_ = false;
}
void LevenbergMarquardtStrategy::StepRejected(double step_quality) {
void LevenbergMarquardtStrategy::StepRejected(double /*step_quality*/) {
radius_ = radius_ / decrease_factor_;
decrease_factor_ *= 2.0;
reuse_diagonal_ = true;
@@ -161,5 +171,4 @@ void LevenbergMarquardtStrategy::StepRejected(double step_quality) {
double LevenbergMarquardtStrategy::Radius() const { return radius_; }
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
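The damping logic in this file reduces to a few lines: clamp the Jacobian's squared column norms, scale by the trust region radius, and shrink the radius geometrically when a step is rejected. A sketch with member names mirroring the hunk (the default values here are illustrative, not the solver's):

#include <Eigen/Dense>

struct LmDampingSketch {
  double radius = 1e4;
  double decrease_factor = 2.0;
  double min_diagonal = 1e-6;
  double max_diagonal = 1e32;

  // diagonal_[i] = clamp(diagonal_[i], min, max);
  // lm_diagonal_ = sqrt(diagonal_ / radius_)
  Eigen::VectorXd LmDiagonal(const Eigen::VectorXd& squared_col_norms) const {
    const Eigen::VectorXd clamped = squared_col_norms.array()
                                        .max(min_diagonal)
                                        .min(max_diagonal)
                                        .matrix();
    return (clamped / radius).cwiseSqrt();
  }

  // Mirrors StepRejected(): smaller radius, faster future shrinkage.
  void StepRejected() {
    radius = radius / decrease_factor;
    decrease_factor *= 2.0;
  }
};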

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -36,8 +36,9 @@
#include "ceres/internal/export.h"
#include "ceres/trust_region_strategy.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class ContextImpl;
// Levenberg-Marquardt step computation and trust region sizing
// strategy based on "Methods for Nonlinear Least Squares" by
@@ -82,10 +83,11 @@ class CERES_NO_EXPORT LevenbergMarquardtStrategy final
// allocations in every iteration and reuse when a step fails and
// ComputeStep is called again.
Vector lm_diagonal_; // lm_diagonal_ = sqrt(diagonal_ / radius_);
ContextImpl* context_;
int num_threads_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -33,8 +33,11 @@
#include <algorithm>
#include <cmath>
#include <iomanip>
#include <iostream> // NOLINT
#include <map>
#include <memory>
#include <ostream> // NOLINT
#include <string>
#include <vector>
#include "ceres/evaluator.h"
#include "ceres/function_sample.h"
@@ -45,23 +48,17 @@
#include "ceres/wall_time.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::map;
using std::ostream;
using std::string;
using std::vector;
namespace ceres::internal {
namespace {
// Precision used for floating point values in error message output.
const int kErrorMessageNumericPrecision = 8;
} // namespace
ostream& operator<<(ostream& os, const FunctionSample& sample);
std::ostream& operator<<(std::ostream& os, const FunctionSample& sample);
// Convenience stream operator for pushing FunctionSamples into log messages.
ostream& operator<<(ostream& os, const FunctionSample& sample) {
std::ostream& operator<<(std::ostream& os, const FunctionSample& sample) {
os << sample.ToDebugString();
return os;
}
@@ -74,16 +71,16 @@ LineSearch::LineSearch(const LineSearch::Options& options)
std::unique_ptr<LineSearch> LineSearch::Create(
const LineSearchType line_search_type,
const LineSearch::Options& options,
string* error) {
std::string* error) {
switch (line_search_type) {
case ceres::ARMIJO:
return std::make_unique<ArmijoLineSearch>(options);
case ceres::WOLFE:
return std::make_unique<WolfeLineSearch>(options);
default:
*error = string("Invalid line search algorithm type: ") +
*error = std::string("Invalid line search algorithm type: ") +
LineSearchTypeToString(line_search_type) +
string(", unable to create line search.");
std::string(", unable to create line search.");
}
return nullptr;
}
@@ -150,7 +147,7 @@ double LineSearchFunction::DirectionInfinityNorm() const {
}
void LineSearchFunction::ResetTimeStatistics() {
const map<string, CallStatistics> evaluator_statistics =
const std::map<std::string, CallStatistics> evaluator_statistics =
evaluator_->Statistics();
initial_evaluator_residual_time_in_seconds =
@@ -166,7 +163,7 @@ void LineSearchFunction::ResetTimeStatistics() {
void LineSearchFunction::TimeStatistics(
double* cost_evaluation_time_in_seconds,
double* gradient_evaluation_time_in_seconds) const {
const map<string, CallStatistics> evaluator_time_statistics =
const std::map<std::string, CallStatistics> evaluator_time_statistics =
evaluator_->Statistics();
*cost_evaluation_time_in_seconds =
FindWithDefault(
@@ -243,7 +240,7 @@ double LineSearch::InterpolatingPolynomialMinimizingStepSize(
// Select step size by interpolating the function and gradient values
// and minimizing the corresponding polynomial.
vector<FunctionSample> samples;
std::vector<FunctionSample> samples;
samples.push_back(lowerbound);
if (interpolation_type == QUADRATIC) {
@@ -427,7 +424,7 @@ void WolfeLineSearch::DoSearch(const double step_size_estimate,
// shrank the bracket width until it was below our minimum tolerance.
// As these are 'artificial' constraints, and we would otherwise fail to
// produce a valid point when ArmijoLineSearch would succeed, we return the
// point with the lowest cost found thus far which satsifies the Armijo
// point with the lowest cost found thus far which satisfies the Armijo
// condition (but not the Wolfe conditions).
summary->optimal_point = bracket_low;
summary->success = true;
@@ -449,8 +446,8 @@ void WolfeLineSearch::DoSearch(const double step_size_estimate,
// defined by bracket_low & bracket_high, which satisfy:
//
// 1. The interval bounded by step sizes: bracket_low.x & bracket_high.x
// contains step sizes that satsify the strong Wolfe conditions.
// 2. bracket_low.x is of all the step sizes evaluated *which satisifed the
// contains step sizes that satisfy the strong Wolfe conditions.
// 2. bracket_low.x is of all the step sizes evaluated *which satisfied the
// Armijo sufficient decrease condition*, the one which generated the
// smallest function value, i.e. bracket_low.value <
// f(all other steps satisfying Armijo).
@@ -494,7 +491,7 @@ void WolfeLineSearch::DoSearch(const double step_size_estimate,
// Or, searching was stopped due to an 'artificial' constraint, i.e. not
// a condition imposed / required by the underlying algorithm, but instead an
// engineering / implementation consideration. But a step which exceeds the
// minimum step size, and satsifies the Armijo condition was still found,
// minimum step size, and satisfies the Armijo condition was still found,
// and should thus be used [zoom not required].
//
// Returns false if no step size > minimum step size was found which
@@ -518,7 +515,7 @@ bool WolfeLineSearch::BracketingPhase(const FunctionSample& initial_position,
// As we require the gradient to evaluate the Wolfe condition, we always
// calculate it together with the value, irrespective of the interpolation
// type. As opposed to only calculating the gradient after the Armijo
// condition is satisifed, as the computational saving from this approach
// condition is satisfied, as the computational saving from this approach
// would be slight (perhaps even negative due to the extra call). Also,
// always calculating the value & gradient together protects against us
// reporting invalid solutions if the cost function returns slightly different
@@ -821,7 +818,7 @@ bool WolfeLineSearch::ZoomPhase(const FunctionSample& initial_position,
// As we require the gradient to evaluate the Wolfe condition, we always
// calculate it together with the value, irrespective of the interpolation
// type. As opposed to only calculating the gradient after the Armijo
// condition is satisifed, as the computational saving from this approach
// condition is satisfied, as the computational saving from this approach
// would be slight (perhaps even negative due to the extra call). Also,
// always calculating the value & gradient together protects against us
// reporting invalid solutions if the cost function returns slightly
@@ -883,5 +880,4 @@ bool WolfeLineSearch::ZoomPhase(const FunctionSample& initial_position,
return true;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
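The Armijo sufficient-decrease condition referenced throughout the bracketing and zoom comments above accepts the first step t with f(t) <= f(0) + c1 * t * f'(0). A generic backtracking sketch (not ArmijoLineSearch itself, which interpolates a polynomial to pick the next trial step rather than shrinking by a fixed factor):

#include <functional>

double ArmijoBacktrackSketch(const std::function<double(double)>& f,
                             double f0,  // f at step size 0.
                             double g0,  // Directional derivative at 0 (< 0).
                             double initial_step,
                             double c1 = 1e-4,
                             double shrink = 0.5,
                             int max_iterations = 50) {
  double t = initial_step;
  for (int i = 0; i < max_iterations; ++i) {
    if (f(t) <= f0 + c1 * t * g0) {
      return t;  // Sufficient decrease achieved.
    }
    t *= shrink;
  }
  return 0.0;  // No acceptable step found.
}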

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -42,8 +42,7 @@
#include "ceres/internal/export.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class Evaluator;
class LineSearchFunction;
@@ -302,7 +301,6 @@ class CERES_NO_EXPORT WolfeLineSearch final : public LineSearch {
Summary* summary) const final;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_LINE_SEARCH_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -38,12 +38,11 @@
#include "ceres/low_rank_inverse_hessian.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class CERES_NO_EXPORT SteepestDescent final : public LineSearchDirection {
public:
bool NextDirection(const LineSearchMinimizer::State& previous,
bool NextDirection(const LineSearchMinimizer::State& /*previous*/,
const LineSearchMinimizer::State& current,
Vector* search_direction) override {
*search_direction = -current.gradient;
@@ -121,8 +120,8 @@ class CERES_NO_EXPORT LBFGS final : public LineSearchDirection {
current.gradient - previous.gradient);
search_direction->setZero();
low_rank_inverse_hessian_.RightMultiply(current.gradient.data(),
search_direction->data());
low_rank_inverse_hessian_.RightMultiplyAndAccumulate(
current.gradient.data(), search_direction->data());
*search_direction *= -1.0;
if (search_direction->dot(current.gradient) >= 0.0) {
@@ -242,7 +241,7 @@ class CERES_NO_EXPORT BFGS final : public LineSearchDirection {
//
// The original origin of this rescaling trick is somewhat unclear, the
// earliest reference appears to be Oren [1], however it is widely
// discussed without specific attributation in various texts including
// discussed without specific attribution in various texts including
// [2] (p143).
//
// [1] Oren S.S., Self-scaling variable metric (SSVM) algorithms
@@ -367,5 +366,4 @@ std::unique_ptr<LineSearchDirection> LineSearchDirection::Create(
return nullptr;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
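The quasi-Newton hunks above share one safeguard: after applying the inverse Hessian approximation, check that the result still points downhill, and fall back to steepest descent otherwise. A sketch where apply_inverse_hessian is a hypothetical stand-in for low_rank_inverse_hessian_.RightMultiplyAndAccumulate():

#include <functional>
#include <Eigen/Dense>

Eigen::VectorXd SafeguardedDirectionSketch(
    const Eigen::VectorXd& gradient,
    const std::function<Eigen::VectorXd(const Eigen::VectorXd&)>&
        apply_inverse_hessian) {
  Eigen::VectorXd direction = -apply_inverse_hessian(gradient);
  if (direction.dot(gradient) >= 0.0) {
    // Numerical breakdown: the quasi-Newton step points uphill.
    direction = -gradient;
  }
  return direction;
}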

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -38,8 +38,7 @@
#include "ceres/line_search_minimizer.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class CERES_NO_EXPORT LineSearchDirection {
public:
@@ -61,7 +60,6 @@ class CERES_NO_EXPORT LineSearchDirection {
Vector* search_direction) = 0;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_LINE_SEARCH_DIRECTION_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@
//
// Generic loop for line search based optimization algorithms.
//
// This is primarily inpsired by the minFunc packaged written by Mark
// This is primarily inspired by the minFunc package written by Mark
// Schmidt.
//
// http://www.di.ens.fr/~mschmidt/Software/minFunc.html
@@ -59,8 +59,7 @@
#include "ceres/wall_time.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
namespace {
bool EvaluateGradientNorms(Evaluator* evaluator,
@@ -473,5 +472,4 @@ void LineSearchMinimizer::Minimize(const Minimizer::Options& options,
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -38,8 +38,7 @@
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Generic line search minimization algorithm.
//
@@ -47,7 +46,7 @@ namespace internal {
class CERES_NO_EXPORT LineSearchMinimizer final : public Minimizer {
public:
struct State {
State(int num_parameters, int num_effective_parameters)
State(int /*num_parameters*/, int num_effective_parameters)
: cost(0.0),
gradient(num_effective_parameters),
gradient_squared_norm(0.0),
@@ -69,7 +68,6 @@ class CERES_NO_EXPORT LineSearchMinimizer final : public Minimizer {
Solver::Summary* summary) final;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_LINE_SEARCH_MINIMIZER_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -41,8 +41,7 @@
#include "ceres/program.h"
#include "ceres/wall_time.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
namespace {
bool IsProgramValid(const Program& program, std::string* error) {
@@ -102,5 +101,4 @@ bool LineSearchPreprocessor::Preprocess(const Solver::Options& options,
return true;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -35,8 +35,7 @@
#include "ceres/internal/export.h"
#include "ceres/preprocessor.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class CERES_NO_EXPORT LineSearchPreprocessor final : public Preprocessor {
public:
@@ -45,8 +44,7 @@ class CERES_NO_EXPORT LineSearchPreprocessor final : public Preprocessor {
PreprocessedProblem* preprocessed_problem) final;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -44,10 +44,7 @@
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::string;
namespace ceres::internal {
std::unique_ptr<LinearLeastSquaresProblem>
CreateLinearLeastSquaresProblemFromId(int id) {
@@ -62,6 +59,10 @@ CreateLinearLeastSquaresProblemFromId(int id) {
return LinearLeastSquaresProblem3();
case 4:
return LinearLeastSquaresProblem4();
case 5:
return LinearLeastSquaresProblem5();
case 6:
return LinearLeastSquaresProblem6();
default:
LOG(FATAL) << "Unknown problem id requested " << id;
}
@@ -87,8 +88,7 @@ x_D = [1.78448275;
2.82327586;]
*/
std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem0() {
std::unique_ptr<LinearLeastSquaresProblem> problem =
std::make_unique<LinearLeastSquaresProblem>();
auto problem = std::make_unique<LinearLeastSquaresProblem>();
auto A = std::make_unique<TripletSparseMatrix>(3, 2, 6);
problem->b = std::make_unique<double[]>(3);
@@ -161,13 +161,15 @@ std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem0() {
12 0 1 17 1
0 30 1 1 37]
cond(A'A) = 200.36
S = [ 42.3419 -1.4000 -11.5806
-1.4000 2.6000 1.0000
-11.5806 1.0000 31.1935]
r = [ 4.3032
5.4000
5.0323]
4.0323]
S\r = [ 0.2102
2.1367
@@ -187,14 +189,21 @@ std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem1() {
int num_rows = 6;
int num_cols = 5;
std::unique_ptr<LinearLeastSquaresProblem> problem =
std::make_unique<LinearLeastSquaresProblem>();
auto problem = std::make_unique<LinearLeastSquaresProblem>();
auto A = std::make_unique<TripletSparseMatrix>(
num_rows, num_cols, num_rows * num_cols);
problem->b = std::make_unique<double[]>(num_rows);
problem->D = std::make_unique<double[]>(num_cols);
problem->num_eliminate_blocks = 2;
problem->x = std::make_unique<double[]>(num_cols);
problem->x[0] = -2.3061;
problem->x[1] = 0.3172;
problem->x[2] = 0.2102;
problem->x[3] = 2.1367;
problem->x[4] = 0.1388;
int* rows = A->mutable_rows();
int* cols = A->mutable_cols();
double* values = A->mutable_values();
@@ -292,16 +301,21 @@ std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem2() {
int num_rows = 6;
int num_cols = 5;
std::unique_ptr<LinearLeastSquaresProblem> problem =
std::make_unique<LinearLeastSquaresProblem>();
auto problem = std::make_unique<LinearLeastSquaresProblem>();
problem->b = std::make_unique<double[]>(num_rows);
problem->D = std::make_unique<double[]>(num_cols);
problem->num_eliminate_blocks = 2;
problem->x = std::make_unique<double[]>(num_cols);
problem->x[0] = -2.3061;
problem->x[1] = 0.3172;
problem->x[2] = 0.2102;
problem->x[3] = 2.1367;
problem->x[4] = 0.1388;
auto* bs = new CompressedRowBlockStructure;
std::unique_ptr<double[]> values =
std::make_unique<double[]>(num_rows * num_cols);
auto values = std::make_unique<double[]>(num_rows * num_cols);
for (int c = 0; c < num_cols; ++c) {
bs->cols.emplace_back();
@@ -427,16 +441,14 @@ std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem3() {
int num_rows = 5;
int num_cols = 2;
std::unique_ptr<LinearLeastSquaresProblem> problem =
std::make_unique<LinearLeastSquaresProblem>();
auto problem = std::make_unique<LinearLeastSquaresProblem>();
problem->b = std::make_unique<double[]>(num_rows);
problem->D = std::make_unique<double[]>(num_cols);
problem->num_eliminate_blocks = 2;
auto* bs = new CompressedRowBlockStructure;
std::unique_ptr<double[]> values =
std::make_unique<double[]>(num_rows * num_cols);
auto values = std::make_unique<double[]>(num_rows * num_cols);
for (int c = 0; c < num_cols; ++c) {
bs->cols.emplace_back();
@@ -536,16 +548,14 @@ std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem4() {
int num_rows = 3;
int num_cols = 7;
std::unique_ptr<LinearLeastSquaresProblem> problem =
std::make_unique<LinearLeastSquaresProblem>();
auto problem = std::make_unique<LinearLeastSquaresProblem>();
problem->b = std::make_unique<double[]>(num_rows);
problem->D = std::make_unique<double[]>(num_cols);
problem->num_eliminate_blocks = 1;
auto* bs = new CompressedRowBlockStructure;
std::unique_ptr<double[]> values =
std::make_unique<double[]>(num_rows * num_cols);
auto values = std::make_unique<double[]>(num_rows * num_cols);
// Column block structure
bs->cols.emplace_back();
@@ -614,12 +624,313 @@ std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem4() {
return problem;
}
/*
A problem with block-diagonal F'F.
A = [1 0 | 0 0 2
3 0 | 0 0 4
0 -1 | 0 1 0
0 -3 | 0 1 0
0 -1 | 3 0 0
0 -2 | 1 0 0]
b = [0
1
2
3
4
5]
c = A'* b = [ 22
-25
17
7
4]
A'A = [10 0 0 0 10
0 15 -5 -4 0
0 -5 10 0 0
0 -4 0 2 0
10 0 0 0 20]
cond(A'A) = 41.402
S = [ 8.3333 -1.3333 0
-1.3333 0.9333 0
0 0 10.0000]
r = [ 8.6667
-1.6667
1.0000]
S\r = [ 0.9778
-0.3889
0.1000]
A\b = [ 0.2
-1.4444
0.9777
-0.3888
0.1]
*/
std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem5() {
int num_rows = 6;
int num_cols = 5;
auto problem = std::make_unique<LinearLeastSquaresProblem>();
problem->b = std::make_unique<double[]>(num_rows);
problem->D = std::make_unique<double[]>(num_cols);
problem->num_eliminate_blocks = 2;
// TODO: add x
problem->x = std::make_unique<double[]>(num_cols);
problem->x[0] = 0.2;
problem->x[1] = -1.4444;
problem->x[2] = 0.9777;
problem->x[3] = -0.3888;
problem->x[4] = 0.1;
auto* bs = new CompressedRowBlockStructure;
auto values = std::make_unique<double[]>(num_rows * num_cols);
for (int c = 0; c < num_cols; ++c) {
bs->cols.emplace_back();
bs->cols.back().size = 1;
bs->cols.back().position = c;
}
int nnz = 0;
// Row 1
{
values[nnz++] = -1;
values[nnz++] = 2;
bs->rows.emplace_back();
CompressedRow& row = bs->rows.back();
row.block.size = 1;
row.block.position = 0;
row.cells.emplace_back(0, 0);
row.cells.emplace_back(4, 1);
}
// Row 2
{
values[nnz++] = 3;
values[nnz++] = 4;
bs->rows.emplace_back();
CompressedRow& row = bs->rows.back();
row.block.size = 1;
row.block.position = 1;
row.cells.emplace_back(0, 2);
row.cells.emplace_back(4, 3);
}
// Row 3
{
values[nnz++] = -1;
values[nnz++] = 1;
bs->rows.emplace_back();
CompressedRow& row = bs->rows.back();
row.block.size = 1;
row.block.position = 2;
row.cells.emplace_back(1, 4);
row.cells.emplace_back(3, 5);
}
// Row 4
{
values[nnz++] = -3;
values[nnz++] = 1;
bs->rows.emplace_back();
CompressedRow& row = bs->rows.back();
row.block.size = 1;
row.block.position = 3;
row.cells.emplace_back(1, 6);
row.cells.emplace_back(3, 7);
}
// Row 5
{
values[nnz++] = -1;
values[nnz++] = 3;
bs->rows.emplace_back();
CompressedRow& row = bs->rows.back();
row.block.size = 1;
row.block.position = 4;
row.cells.emplace_back(1, 8);
row.cells.emplace_back(2, 9);
}
// Row 6
{
// values[nnz++] = 2;
values[nnz++] = -2;
values[nnz++] = 1;
bs->rows.emplace_back();
CompressedRow& row = bs->rows.back();
row.block.size = 1;
row.block.position = 5;
// row.cells.emplace_back(0, 10);
row.cells.emplace_back(1, 10);
row.cells.emplace_back(2, 11);
}
auto A = std::make_unique<BlockSparseMatrix>(bs);
memcpy(A->mutable_values(), values.get(), nnz * sizeof(*A->values()));
for (int i = 0; i < num_cols; ++i) {
problem->D.get()[i] = 1;
}
for (int i = 0; i < num_rows; ++i) {
problem->b.get()[i] = i;
}
problem->A = std::move(A);
return problem;
}
/*
A = [1 2 0 0 0 1 1
1 4 0 0 0 5 6
3 4 0 0 0 7 8
5 6 0 0 0 9 0
0 0 9 0 0 3 1]
b = [0
1
2
3
4]
*/
// BlockSparseMatrix version
//
// This problem has the unique property that it has two different
// sized f-blocks, but only one of them occurs in the rows involving
// the one e-block. So performing Schur elimination on this problem
// tests the Schur Eliminator's ability to handle non-e-block rows
// correctly when their structure does not conform to the static
// structure determined by DetectStructure.
//
// Additionally, this problem has the first row of the last row block of E being
// larger than the number of row blocks in E.
//
// NOTE: This problem is too small and rank deficient to be solved without
// the diagonal regularization.
std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem6() {
int num_rows = 5;
int num_cols = 7;
auto problem = std::make_unique<LinearLeastSquaresProblem>();
problem->b = std::make_unique<double[]>(num_rows);
problem->D = std::make_unique<double[]>(num_cols);
problem->num_eliminate_blocks = 1;
auto* bs = new CompressedRowBlockStructure;
auto values = std::make_unique<double[]>(num_rows * num_cols);
// Column block structure
bs->cols.emplace_back();
bs->cols.back().size = 2;
bs->cols.back().position = 0;
bs->cols.emplace_back();
bs->cols.back().size = 3;
bs->cols.back().position = 2;
bs->cols.emplace_back();
bs->cols.back().size = 2;
bs->cols.back().position = 5;
int nnz = 0;
// Row 1 & 2
{
bs->rows.emplace_back();
CompressedRow& row = bs->rows.back();
row.block.size = 2;
row.block.position = 0;
row.cells.emplace_back(0, nnz);
values[nnz++] = 1;
values[nnz++] = 2;
values[nnz++] = 1;
values[nnz++] = 4;
row.cells.emplace_back(2, nnz);
values[nnz++] = 1;
values[nnz++] = 1;
values[nnz++] = 5;
values[nnz++] = 6;
}
// Row 3 & 4
{
bs->rows.emplace_back();
CompressedRow& row = bs->rows.back();
row.block.size = 2;
row.block.position = 2;
row.cells.emplace_back(0, nnz);
values[nnz++] = 3;
values[nnz++] = 4;
values[nnz++] = 5;
values[nnz++] = 6;
row.cells.emplace_back(2, nnz);
values[nnz++] = 7;
values[nnz++] = 8;
values[nnz++] = 9;
values[nnz++] = 0;
}
// Row 5
{
bs->rows.emplace_back();
CompressedRow& row = bs->rows.back();
row.block.size = 1;
row.block.position = 4;
row.cells.emplace_back(1, nnz);
values[nnz++] = 9;
values[nnz++] = 0;
values[nnz++] = 0;
row.cells.emplace_back(2, nnz);
values[nnz++] = 3;
values[nnz++] = 1;
}
auto A = std::make_unique<BlockSparseMatrix>(bs);
memcpy(A->mutable_values(), values.get(), nnz * sizeof(*A->values()));
for (int i = 0; i < num_cols; ++i) {
problem->D.get()[i] = (i + 1) * 100;
}
for (int i = 0; i < num_rows; ++i) {
problem->b.get()[i] = i;
}
problem->A = std::move(A);
return problem;
}
namespace {
bool DumpLinearLeastSquaresProblemToConsole(const SparseMatrix* A,
const double* D,
const double* b,
const double* x,
int num_eliminate_blocks) {
int /*num_eliminate_blocks*/) {
CHECK(A != nullptr);
Matrix AA;
A->ToDenseMatrix(&AA);
@@ -639,7 +950,7 @@ bool DumpLinearLeastSquaresProblemToConsole(const SparseMatrix* A,
return true;
}
void WriteArrayToFileOrDie(const string& filename,
void WriteArrayToFileOrDie(const std::string& filename,
const double* x,
const int size) {
CHECK(x != nullptr);
@@ -652,23 +963,23 @@ void WriteArrayToFileOrDie(const string& filename,
fclose(fptr);
}
bool DumpLinearLeastSquaresProblemToTextFile(const string& filename_base,
bool DumpLinearLeastSquaresProblemToTextFile(const std::string& filename_base,
const SparseMatrix* A,
const double* D,
const double* b,
const double* x,
int num_eliminate_blocks) {
int /*num_eliminate_blocks*/) {
CHECK(A != nullptr);
LOG(INFO) << "writing to: " << filename_base << "*";
string matlab_script;
std::string matlab_script;
StringAppendF(&matlab_script,
"function lsqp = load_trust_region_problem()\n");
StringAppendF(&matlab_script, "lsqp.num_rows = %d;\n", A->num_rows());
StringAppendF(&matlab_script, "lsqp.num_cols = %d;\n", A->num_cols());
{
string filename = filename_base + "_A.txt";
std::string filename = filename_base + "_A.txt";
FILE* fptr = fopen(filename.c_str(), "w");
CHECK(fptr != nullptr);
A->ToTextFile(fptr);
@@ -683,33 +994,33 @@ bool DumpLinearLeastSquaresProblemToTextFile(const string& filename_base,
}
if (D != nullptr) {
string filename = filename_base + "_D.txt";
std::string filename = filename_base + "_D.txt";
WriteArrayToFileOrDie(filename, D, A->num_cols());
StringAppendF(
&matlab_script, "lsqp.D = load('%s', '-ascii');\n", filename.c_str());
}
if (b != nullptr) {
string filename = filename_base + "_b.txt";
std::string filename = filename_base + "_b.txt";
WriteArrayToFileOrDie(filename, b, A->num_rows());
StringAppendF(
&matlab_script, "lsqp.b = load('%s', '-ascii');\n", filename.c_str());
}
if (x != nullptr) {
string filename = filename_base + "_x.txt";
std::string filename = filename_base + "_x.txt";
WriteArrayToFileOrDie(filename, x, A->num_cols());
StringAppendF(
&matlab_script, "lsqp.x = load('%s', '-ascii');\n", filename.c_str());
}
string matlab_filename = filename_base + ".m";
std::string matlab_filename = filename_base + ".m";
WriteStringToFileOrDie(matlab_script, matlab_filename);
return true;
}
} // namespace
bool DumpLinearLeastSquaresProblem(const string& filename_base,
bool DumpLinearLeastSquaresProblem(const std::string& filename_base,
DumpFormatType dump_format_type,
const SparseMatrix* A,
const double* D,
@@ -730,5 +1041,4 @@ bool DumpLinearLeastSquaresProblem(const string& filename_base,
return true;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -39,8 +39,7 @@
#include "ceres/internal/export.h"
#include "ceres/sparse_matrix.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Structure defining a linear least squares problem and if possible
// ground truth solutions. To be used by various LinearSolver tests.
@@ -74,6 +73,10 @@ CERES_NO_EXPORT
std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem3();
CERES_NO_EXPORT
std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem4();
CERES_NO_EXPORT
std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem5();
CERES_NO_EXPORT
std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem6();
// Write the linear least squares problem to disk. The exact format
// depends on dump_format_type.
@@ -85,8 +88,7 @@ bool DumpLinearLeastSquaresProblem(const std::string& filename_base,
const double* b,
const double* x,
int num_eliminate_blocks);
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -30,10 +30,34 @@
#include "ceres/linear_operator.h"
namespace ceres {
namespace internal {
#include <glog/logging.h>
namespace ceres::internal {
void LinearOperator::RightMultiplyAndAccumulate(const double* x,
double* y,
ContextImpl* context,
int num_threads) const {
(void)context;
if (num_threads != 1) {
VLOG(3) << "Parallel right product is not supported by linear operator "
"implementation";
}
RightMultiplyAndAccumulate(x, y);
}
void LinearOperator::LeftMultiplyAndAccumulate(const double* x,
double* y,
ContextImpl* context,
int num_threads) const {
(void)context;
if (num_threads != 1) {
VLOG(3) << "Parallel left product is not supported by linear operator "
"implementation";
}
LeftMultiplyAndAccumulate(x, y);
}
LinearOperator::~LinearOperator() = default;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -33,11 +33,13 @@
#ifndef CERES_INTERNAL_LINEAR_OPERATOR_H_
#define CERES_INTERNAL_LINEAR_OPERATOR_H_
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class ContextImpl;
// This is an abstract base class for linear operators. It supports
// access to size information and left and right multiply operators.
@@ -46,15 +48,44 @@ class CERES_NO_EXPORT LinearOperator {
virtual ~LinearOperator();
// y = y + Ax;
virtual void RightMultiply(const double* x, double* y) const = 0;
virtual void RightMultiplyAndAccumulate(const double* x, double* y) const = 0;
virtual void RightMultiplyAndAccumulate(const double* x,
double* y,
ContextImpl* context,
int num_threads) const;
// y = y + A'x;
virtual void LeftMultiply(const double* x, double* y) const = 0;
virtual void LeftMultiplyAndAccumulate(const double* x, double* y) const = 0;
virtual void LeftMultiplyAndAccumulate(const double* x,
double* y,
ContextImpl* context,
int num_threads) const;
virtual void RightMultiplyAndAccumulate(const Vector& x, Vector& y) const {
RightMultiplyAndAccumulate(x.data(), y.data());
}
virtual void LeftMultiplyAndAccumulate(const Vector& x, Vector& y) const {
LeftMultiplyAndAccumulate(x.data(), y.data());
}
virtual void RightMultiplyAndAccumulate(const Vector& x,
Vector& y,
ContextImpl* context,
int num_threads) const {
RightMultiplyAndAccumulate(x.data(), y.data(), context, num_threads);
}
virtual void LeftMultiplyAndAccumulate(const Vector& x,
Vector& y,
ContextImpl* context,
int num_threads) const {
LeftMultiplyAndAccumulate(x.data(), y.data(), context, num_threads);
}
virtual int num_rows() const = 0;
virtual int num_cols() const = 0;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_LINEAR_OPERATOR_H_
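
The new context/num_threads overloads fall back to the serial implementations, so existing operators keep working unchanged. Below is a minimal sketch (not part of the patch; DiagonalOperator is a hypothetical name) of a custom operator written against this interface:

#include <utility>
#include <vector>

// Hypothetical diagonal operator: y += D x. Only the two pure-virtual
// accumulate methods and the size queries must be overridden; the threaded
// overloads inherit the serial fallback declared above.
class DiagonalOperator final : public ceres::internal::LinearOperator {
 public:
  explicit DiagonalOperator(std::vector<double> d) : d_(std::move(d)) {}
  void RightMultiplyAndAccumulate(const double* x, double* y) const final {
    for (int i = 0; i < num_rows(); ++i) y[i] += d_[i] * x[i];
  }
  // D is symmetric, so the transposed product is the same.
  void LeftMultiplyAndAccumulate(const double* x, double* y) const final {
    RightMultiplyAndAccumulate(x, y);
  }
  int num_rows() const final { return static_cast<int>(d_.size()); }
  int num_cols() const final { return static_cast<int>(d_.size()); }

 private:
  std::vector<double> d_;
};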

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -43,8 +43,7 @@
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
LinearSolver::~LinearSolver() = default;
@@ -77,8 +76,15 @@ std::unique_ptr<LinearSolver> LinearSolver::Create(
CHECK(options.context != nullptr);
switch (options.type) {
case CGNR:
case CGNR: {
#ifndef CERES_NO_CUDA
if (options.sparse_linear_algebra_library_type == CUDA_SPARSE) {
std::string error;
return CudaCgnrSolver::Create(options, &error);
}
#endif
return std::make_unique<CgnrSolver>(options);
} break;
case SPARSE_NORMAL_CHOLESKY:
#if defined(CERES_NO_SPARSE)
@@ -120,5 +126,4 @@ std::unique_ptr<LinearSolver> LinearSolver::Create(
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
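
With this change, requesting CGNR together with the CUDA_SPARSE backend dispatches to CudaCgnrSolver. A hedged sketch of exercising the factory (internal API; MakeCgnrSolver is a hypothetical helper):

#include <memory>

// Hypothetical helper: picks the CUDA-backed CGNR solver when available.
std::unique_ptr<ceres::internal::LinearSolver> MakeCgnrSolver(
    ceres::internal::ContextImpl* context) {
  ceres::internal::LinearSolver::Options options;
  options.type = ceres::CGNR;
  options.sparse_linear_algebra_library_type = ceres::CUDA_SPARSE;
  options.context = context;  // Create() checks that a context is supplied.
  // In a CERES_NO_CUDA build the CUDA branch is compiled out and the plain
  // CgnrSolver is returned instead.
  return ceres::internal::LinearSolver::Create(options);
}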

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -52,39 +52,81 @@
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
enum LinearSolverTerminationType {
enum class LinearSolverTerminationType {
// Termination criterion was met.
LINEAR_SOLVER_SUCCESS,
SUCCESS,
// Solver ran for max_num_iterations and terminated before the
// termination tolerance could be satisfied.
LINEAR_SOLVER_NO_CONVERGENCE,
NO_CONVERGENCE,
// Solver was terminated due to numerical problems, generally due to
// the linear system being poorly conditioned.
LINEAR_SOLVER_FAILURE,
FAILURE,
// Solver failed with a fatal error that cannot be recovered from,
// e.g. CHOLMOD ran out of memory when computing the symbolic or
// numeric factorization or an underlying library was called with
// the wrong arguments.
LINEAR_SOLVER_FATAL_ERROR
FATAL_ERROR
};
inline std::ostream& operator<<(std::ostream& s,
LinearSolverTerminationType type) {
switch (type) {
case LinearSolverTerminationType::SUCCESS:
s << "LINEAR_SOLVER_SUCCESS";
break;
case LinearSolverTerminationType::NO_CONVERGENCE:
s << "LINEAR_SOLVER_NO_CONVERGENCE";
break;
case LinearSolverTerminationType::FAILURE:
s << "LINEAR_SOLVER_FAILURE";
break;
case LinearSolverTerminationType::FATAL_ERROR:
s << "LINEAR_SOLVER_FATAL_ERROR";
break;
default:
s << "UNKNOWN LinearSolverTerminationType";
}
return s;
}
// This enum controls the fill-reducing ordering a sparse linear
// algebra library should use before computing a sparse factorization
// (usually Cholesky).
enum OrderingType {
//
// TODO(sameeragarwal): Add support for nested dissection
enum class OrderingType {
NATURAL, // Do not re-order the matrix. This is useful when the
// matrix has been ordered using a fill-reducing ordering
// already.
AMD // Use the Approximate Minimum Degree algorithm to re-order
// the matrix.
AMD, // Use the Approximate Minimum Degree algorithm to re-order
// the matrix.
NESDIS, // Use the Nested Dissection algorithm to re-order the matrix.
};
inline std::ostream& operator<<(std::ostream& s, OrderingType type) {
switch (type) {
case OrderingType::NATURAL:
s << "NATURAL";
break;
case OrderingType::AMD:
s << "AMD";
break;
case OrderingType::NESDIS:
s << "NESDIS";
break;
default:
s << "UNKNOWN OrderingType";
}
return s;
}
class LinearOperator;
// Abstract base class for objects that implement algorithms for
@@ -112,9 +154,9 @@ class CERES_NO_EXPORT LinearSolver {
DenseLinearAlgebraLibraryType dense_linear_algebra_library_type = EIGEN;
SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type =
SUITE_SPARSE;
OrderingType ordering_type = OrderingType::NATURAL;
// See solver.h for information about these flags.
bool use_postordering = false;
bool dynamic_sparsity = false;
bool use_explicit_schur_complement = false;
@@ -123,6 +165,23 @@ class CERES_NO_EXPORT LinearSolver {
int min_num_iterations = 1;
int max_num_iterations = 1;
// Maximum number of iterations performed by SCHUR_POWER_SERIES_EXPANSION.
// This value controls the maximum number of iterations, whether it is used
// as a preconditioner or just to initialize the solution for
// ITERATIVE_SCHUR.
int max_num_spse_iterations = 5;
// Use SCHUR_POWER_SERIES_EXPANSION to initialize the solution for
// ITERATIVE_SCHUR. This option can be set true regardless of what
// preconditioner is being used.
bool use_spse_initialization = false;
// When use_spse_initialization is true, this parameter along with
// max_num_spse_iterations controls the number of
// SCHUR_POWER_SERIES_EXPANSION iterations performed for initialization. It
// is not used to control the preconditioner.
double spse_tolerance = 0.1;
// If possible, how many threads can the solver use.
int num_threads = 1;
@@ -261,7 +320,8 @@ class CERES_NO_EXPORT LinearSolver {
struct Summary {
double residual_norm = -1.0;
int num_iterations = -1;
LinearSolverTerminationType termination_type = LINEAR_SOLVER_FAILURE;
LinearSolverTerminationType termination_type =
LinearSolverTerminationType::FAILURE;
std::string message;
};
@@ -329,17 +389,16 @@ class TypedLinearSolver : public LinearSolver {
ExecutionSummary execution_summary_;
};
// Linear solvers that depend on acccess to the low level structure of
// Linear solvers that depend on access to the low level structure of
// a SparseMatrix.
// clang-format off
typedef TypedLinearSolver<BlockSparseMatrix> BlockSparseMatrixSolver; // NOLINT
typedef TypedLinearSolver<CompressedRowSparseMatrix> CompressedRowSparseMatrixSolver; // NOLINT
typedef TypedLinearSolver<DenseSparseMatrix> DenseSparseMatrixSolver; // NOLINT
typedef TypedLinearSolver<TripletSparseMatrix> TripletSparseMatrixSolver; // NOLINT
using BlockSparseMatrixSolver = TypedLinearSolver<BlockSparseMatrix>; // NOLINT
using CompressedRowSparseMatrixSolver = TypedLinearSolver<CompressedRowSparseMatrix>; // NOLINT
using DenseSparseMatrixSolver = TypedLinearSolver<DenseSparseMatrix>; // NOLINT
using TripletSparseMatrixSolver = TypedLinearSolver<TripletSparseMatrix>; // NOLINT
// clang-format on
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"
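
Since LinearSolverTerminationType is now a scoped enum, call sites must qualify the enumerators, while the streaming operator above keeps the historical names in log output. A hedged sketch of the call-site migration (SolveSucceeded is a hypothetical function):

// Hypothetical call site. Pre-2.2 code compared against the unscoped
// LINEAR_SOLVER_SUCCESS; the enumerator now requires qualification.
// Logging is unchanged: operator<< still prints "LINEAR_SOLVER_SUCCESS".
bool SolveSucceeded(const ceres::internal::LinearSolver::Summary& summary) {
  return summary.termination_type ==
         ceres::internal::LinearSolverTerminationType::SUCCESS;
}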

View File

@@ -1,349 +0,0 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)
#include "ceres/local_parameterization.h"
#include <algorithm>
#include "Eigen/Geometry"
#include "ceres/internal/eigen.h"
#include "ceres/internal/fixed_array.h"
#include "ceres/internal/householder_vector.h"
#include "ceres/rotation.h"
#include "glog/logging.h"
namespace ceres {
using std::vector;
LocalParameterization::~LocalParameterization() = default;
bool LocalParameterization::MultiplyByJacobian(const double* x,
const int num_rows,
const double* global_matrix,
double* local_matrix) const {
if (LocalSize() == 0) {
return true;
}
Matrix jacobian(GlobalSize(), LocalSize());
if (!ComputeJacobian(x, jacobian.data())) {
return false;
}
MatrixRef(local_matrix, num_rows, LocalSize()) =
ConstMatrixRef(global_matrix, num_rows, GlobalSize()) * jacobian;
return true;
}
IdentityParameterization::IdentityParameterization(const int size)
: size_(size) {
CHECK_GT(size, 0);
}
bool IdentityParameterization::Plus(const double* x,
const double* delta,
double* x_plus_delta) const {
VectorRef(x_plus_delta, size_) =
ConstVectorRef(x, size_) + ConstVectorRef(delta, size_);
return true;
}
bool IdentityParameterization::ComputeJacobian(const double* x,
double* jacobian) const {
MatrixRef(jacobian, size_, size_).setIdentity();
return true;
}
bool IdentityParameterization::MultiplyByJacobian(const double* x,
const int num_cols,
const double* global_matrix,
double* local_matrix) const {
std::copy(
global_matrix, global_matrix + num_cols * GlobalSize(), local_matrix);
return true;
}
SubsetParameterization::SubsetParameterization(
int size, const vector<int>& constant_parameters)
: local_size_(size - constant_parameters.size()), constancy_mask_(size, 0) {
if (constant_parameters.empty()) {
return;
}
vector<int> constant = constant_parameters;
std::sort(constant.begin(), constant.end());
CHECK_GE(constant.front(), 0) << "Indices indicating constant parameter must "
"be greater than or equal to zero.";
CHECK_LT(constant.back(), size)
<< "Indices indicating constant parameter must be less than the size "
<< "of the parameter block.";
CHECK(std::adjacent_find(constant.begin(), constant.end()) == constant.end())
<< "The set of constant parameters cannot contain duplicates";
for (int parameter : constant_parameters) {
constancy_mask_[parameter] = 1;
}
}
bool SubsetParameterization::Plus(const double* x,
const double* delta,
double* x_plus_delta) const {
const int global_size = GlobalSize();
for (int i = 0, j = 0; i < global_size; ++i) {
if (constancy_mask_[i]) {
x_plus_delta[i] = x[i];
} else {
x_plus_delta[i] = x[i] + delta[j++];
}
}
return true;
}
bool SubsetParameterization::ComputeJacobian(const double* x,
double* jacobian) const {
if (local_size_ == 0) {
return true;
}
const int global_size = GlobalSize();
MatrixRef m(jacobian, global_size, local_size_);
m.setZero();
for (int i = 0, j = 0; i < global_size; ++i) {
if (!constancy_mask_[i]) {
m(i, j++) = 1.0;
}
}
return true;
}
bool SubsetParameterization::MultiplyByJacobian(const double* x,
const int num_cols,
const double* global_matrix,
double* local_matrix) const {
if (local_size_ == 0) {
return true;
}
const int global_size = GlobalSize();
for (int col = 0; col < num_cols; ++col) {
for (int i = 0, j = 0; i < global_size; ++i) {
if (!constancy_mask_[i]) {
local_matrix[col * local_size_ + j++] =
global_matrix[col * global_size + i];
}
}
}
return true;
}
bool QuaternionParameterization::Plus(const double* x,
const double* delta,
double* x_plus_delta) const {
const double norm_delta =
sqrt(delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2]);
if (norm_delta > 0.0) {
const double sin_delta_by_delta = (sin(norm_delta) / norm_delta);
double q_delta[4];
q_delta[0] = cos(norm_delta);
q_delta[1] = sin_delta_by_delta * delta[0];
q_delta[2] = sin_delta_by_delta * delta[1];
q_delta[3] = sin_delta_by_delta * delta[2];
QuaternionProduct(q_delta, x, x_plus_delta);
} else {
for (int i = 0; i < 4; ++i) {
x_plus_delta[i] = x[i];
}
}
return true;
}
bool QuaternionParameterization::ComputeJacobian(const double* x,
double* jacobian) const {
// clang-format off
jacobian[0] = -x[1]; jacobian[1] = -x[2]; jacobian[2] = -x[3];
jacobian[3] = x[0]; jacobian[4] = x[3]; jacobian[5] = -x[2];
jacobian[6] = -x[3]; jacobian[7] = x[0]; jacobian[8] = x[1];
jacobian[9] = x[2]; jacobian[10] = -x[1]; jacobian[11] = x[0];
// clang-format on
return true;
}
bool EigenQuaternionParameterization::Plus(const double* x_ptr,
const double* delta,
double* x_plus_delta_ptr) const {
Eigen::Map<Eigen::Quaterniond> x_plus_delta(x_plus_delta_ptr);
Eigen::Map<const Eigen::Quaterniond> x(x_ptr);
const double norm_delta =
sqrt(delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2]);
if (norm_delta > 0.0) {
const double sin_delta_by_delta = sin(norm_delta) / norm_delta;
// Note, in the constructor w is first.
Eigen::Quaterniond delta_q(cos(norm_delta),
sin_delta_by_delta * delta[0],
sin_delta_by_delta * delta[1],
sin_delta_by_delta * delta[2]);
x_plus_delta = delta_q * x;
} else {
x_plus_delta = x;
}
return true;
}
bool EigenQuaternionParameterization::ComputeJacobian(const double* x,
double* jacobian) const {
// clang-format off
jacobian[0] = x[3]; jacobian[1] = x[2]; jacobian[2] = -x[1];
jacobian[3] = -x[2]; jacobian[4] = x[3]; jacobian[5] = x[0];
jacobian[6] = x[1]; jacobian[7] = -x[0]; jacobian[8] = x[3];
jacobian[9] = -x[0]; jacobian[10] = -x[1]; jacobian[11] = -x[2];
// clang-format on
return true;
}
HomogeneousVectorParameterization::HomogeneousVectorParameterization(int size)
: size_(size) {
CHECK_GT(size_, 1) << "The size of the homogeneous vector needs to be "
<< "greater than 1.";
}
bool HomogeneousVectorParameterization::Plus(const double* x_ptr,
const double* delta_ptr,
double* x_plus_delta_ptr) const {
ConstVectorRef x(x_ptr, size_);
ConstVectorRef delta(delta_ptr, size_ - 1);
VectorRef x_plus_delta(x_plus_delta_ptr, size_);
const double norm_delta = delta.norm();
if (norm_delta == 0.0) {
x_plus_delta = x;
return true;
}
// Map the delta from the minimum representation to the over parameterized
// homogeneous vector. See section A6.9.2 on page 624 of Hartley & Zisserman
// (2nd Edition) for a detailed description. Note there is a typo on Page
// 625, line 4 so check the book errata.
const double norm_delta_div_2 = 0.5 * norm_delta;
const double sin_delta_by_delta =
std::sin(norm_delta_div_2) / norm_delta_div_2;
Vector y(size_);
y.head(size_ - 1) = 0.5 * sin_delta_by_delta * delta;
y(size_ - 1) = std::cos(norm_delta_div_2);
Vector v(size_);
double beta;
// NOTE: The explicit template arguments are needed here because
// ComputeHouseholderVector is templated and some versions of MSVC
// have trouble deducing the type of v automatically.
internal::ComputeHouseholderVector<ConstVectorRef, double, Eigen::Dynamic>(
x, &v, &beta);
// Apply the delta update to remain on the unit sphere. See section A6.9.3
// on page 625 of Hartley & Zisserman (2nd Edition) for a detailed
// description.
x_plus_delta = x.norm() * (y - v * (beta * (v.transpose() * y)));
return true;
}
bool HomogeneousVectorParameterization::ComputeJacobian(
const double* x_ptr, double* jacobian_ptr) const {
ConstVectorRef x(x_ptr, size_);
MatrixRef jacobian(jacobian_ptr, size_, size_ - 1);
Vector v(size_);
double beta;
// NOTE: The explicit template arguments are needed here because
// ComputeHouseholderVector is templated and some versions of MSVC
// have trouble deducing the type of v automatically.
internal::ComputeHouseholderVector<ConstVectorRef, double, Eigen::Dynamic>(
x, &v, &beta);
// The Jacobian is equal to J = 0.5 * H.leftCols(size_ - 1) where H is the
// Householder matrix (H = I - beta * v * v').
for (int i = 0; i < size_ - 1; ++i) {
jacobian.col(i) = -0.5 * beta * v(i) * v;
jacobian.col(i)(i) += 0.5;
}
jacobian *= x.norm();
return true;
}
bool ProductParameterization::Plus(const double* x,
const double* delta,
double* x_plus_delta) const {
int x_cursor = 0;
int delta_cursor = 0;
for (const auto& param : local_params_) {
if (!param->Plus(
x + x_cursor, delta + delta_cursor, x_plus_delta + x_cursor)) {
return false;
}
delta_cursor += param->LocalSize();
x_cursor += param->GlobalSize();
}
return true;
}
bool ProductParameterization::ComputeJacobian(const double* x,
double* jacobian_ptr) const {
MatrixRef jacobian(jacobian_ptr, GlobalSize(), LocalSize());
jacobian.setZero();
internal::FixedArray<double> buffer(buffer_size_);
int x_cursor = 0;
int delta_cursor = 0;
for (const auto& param : local_params_) {
const int local_size = param->LocalSize();
const int global_size = param->GlobalSize();
if (!param->ComputeJacobian(x + x_cursor, buffer.data())) {
return false;
}
jacobian.block(x_cursor, delta_cursor, global_size, local_size) =
MatrixRef(buffer.data(), global_size, local_size);
delta_cursor += local_size;
x_cursor += global_size;
}
return true;
}
} // namespace ceres

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -35,10 +35,7 @@
#include "ceres/internal/eigen.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::list;
namespace ceres::internal {
// The (L)BFGS algorithm explicitly requires that the secant equation:
//
@@ -117,8 +114,8 @@ bool LowRankInverseHessian::Update(const Vector& delta_x,
return true;
}
void LowRankInverseHessian::RightMultiply(const double* x_ptr,
double* y_ptr) const {
void LowRankInverseHessian::RightMultiplyAndAccumulate(const double* x_ptr,
double* y_ptr) const {
ConstVectorRef gradient(x_ptr, num_parameters_);
VectorRef search_direction(y_ptr, num_parameters_);
@@ -159,7 +156,7 @@ void LowRankInverseHessian::RightMultiply(const double* x_ptr,
//
// The original origin of this rescaling trick is somewhat unclear, the
// earliest reference appears to be Oren [1], however it is widely discussed
// without specific attributation in various texts including [2] (p143/178).
// without specific attribution in various texts including [2] (p143/178).
//
// [1] Oren S.S., Self-scaling variable metric (SSVM) algorithms Part II:
// Implementation and experiments, Management Science,
@@ -179,5 +176,4 @@ void LowRankInverseHessian::RightMultiply(const double* x_ptr,
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,8 +40,7 @@
#include "ceres/internal/export.h"
#include "ceres/linear_operator.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// LowRankInverseHessian is a positive definite approximation to the
// Hessian using the limited memory variant of the
@@ -65,7 +64,7 @@ class CERES_NO_EXPORT LowRankInverseHessian final : public LinearOperator {
// num_parameters is the row/column size of the Hessian.
// max_num_corrections is the rank of the Hessian approximation.
// use_approximate_eigenvalue_scaling controls whether the initial
// inverse Hessian used during Right/LeftMultiply() is scaled by
// inverse Hessian used during Right/LeftMultiplyAndAccumulate() is scaled by
// the approximate eigenvalue of the true inverse Hessian at the
// current operating point.
// The approximation uses:
@@ -84,9 +83,9 @@ class CERES_NO_EXPORT LowRankInverseHessian final : public LinearOperator {
bool Update(const Vector& delta_x, const Vector& delta_gradient);
// LinearOperator interface
void RightMultiply(const double* x, double* y) const final;
void LeftMultiply(const double* x, double* y) const final {
RightMultiply(x, y);
void RightMultiplyAndAccumulate(const double* x, double* y) const final;
void LeftMultiplyAndAccumulate(const double* x, double* y) const final {
RightMultiplyAndAccumulate(x, y);
}
int num_rows() const final { return num_parameters_; }
int num_cols() const final { return num_parameters_; }
@@ -102,7 +101,6 @@ class CERES_NO_EXPORT LowRankInverseHessian final : public LinearOperator {
std::list<int> indices_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_LOW_RANK_INVERSE_HESSIAN_H_

View File

@@ -30,13 +30,11 @@ inline void QuaternionPlusImpl(const double* x,
double* x_plus_delta) {
// x_plus_delta = QuaternionProduct(q_delta, x), where q_delta is the
// quaternion constructed from delta.
const double norm_delta = std::sqrt(
delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2]);
const double norm_delta = std::hypot(delta[0], delta[1], delta[2]);
if (norm_delta == 0.0) {
for (int i = 0; i < 4; ++i) {
x_plus_delta[i] = x[i];
}
if (std::fpclassify(norm_delta) == FP_ZERO) {
// No change in rotation: return the quaternion as is.
std::copy_n(x, 4, x_plus_delta);
return;
}
@@ -100,19 +98,16 @@ inline void QuaternionMinusImpl(const double* y,
-y[Order::kW] * x[Order::kZ] - y[Order::kX] * x[Order::kY] +
y[Order::kY] * x[Order::kX] + y[Order::kZ] * x[Order::kW];
const double u_norm =
std::sqrt(ambient_y_minus_x[Order::kX] * ambient_y_minus_x[Order::kX] +
ambient_y_minus_x[Order::kY] * ambient_y_minus_x[Order::kY] +
ambient_y_minus_x[Order::kZ] * ambient_y_minus_x[Order::kZ]);
if (u_norm > 0.0) {
const double u_norm = std::hypot(ambient_y_minus_x[Order::kX],
ambient_y_minus_x[Order::kY],
ambient_y_minus_x[Order::kZ]);
if (std::fpclassify(u_norm) != FP_ZERO) {
const double theta = std::atan2(u_norm, ambient_y_minus_x[Order::kW]);
y_minus_x[0] = theta * ambient_y_minus_x[Order::kX] / u_norm;
y_minus_x[1] = theta * ambient_y_minus_x[Order::kY] / u_norm;
y_minus_x[2] = theta * ambient_y_minus_x[Order::kZ] / u_norm;
} else {
y_minus_x[0] = 0.0;
y_minus_x[1] = 0.0;
y_minus_x[2] = 0.0;
std::fill_n(y_minus_x, 3, 0.0);
}
}
@@ -201,7 +196,7 @@ bool SubsetManifold::Plus(const double* x,
return true;
}
bool SubsetManifold::PlusJacobian(const double* x,
bool SubsetManifold::PlusJacobian(const double* /*x*/,
double* plus_jacobian) const {
if (tangent_size_ == 0) {
return true;
@@ -218,7 +213,7 @@ bool SubsetManifold::PlusJacobian(const double* x,
return true;
}
bool SubsetManifold::RightMultiplyByPlusJacobian(const double* x,
bool SubsetManifold::RightMultiplyByPlusJacobian(const double* /*x*/,
const int num_rows,
const double* ambient_matrix,
double* tangent_matrix) const {
@@ -254,7 +249,7 @@ bool SubsetManifold::Minus(const double* y,
return true;
}
bool SubsetManifold::MinusJacobian(const double* x,
bool SubsetManifold::MinusJacobian(const double* /*x*/,
double* minus_jacobian) const {
const int ambient_size = AmbientSize();
MatrixRef m(minus_jacobian, tangent_size_, ambient_size);
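
The quaternion updates above replace a hand-rolled sqrt of squares with the three-argument std::hypot (C++17), which rescales internally and therefore cannot overflow or underflow on extreme components. A standalone illustration (not from the patch):

#include <cmath>
#include <cstdio>

int main() {
  const double d[3] = {1e200, 1e200, 1e200};
  // Squaring overflows to +inf before the square root is taken.
  const double naive = std::sqrt(d[0] * d[0] + d[1] * d[1] + d[2] * d[2]);
  // std::hypot rescales internally and stays finite.
  const double safe = std::hypot(d[0], d[1], d[2]);
  std::printf("naive=%g safe=%g\n", naive, safe);  // naive=inf, safe~1.73e200
  return 0;
}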

View File

@@ -1,60 +0,0 @@
#include "ceres/internal/export.h"
#include "ceres/local_parameterization.h"
#include "ceres/manifold.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
// Adapter to wrap LocalParameterization and make them look like Manifolds.
//
// ManifoldAdapter NEVER takes ownership of local_parameterization.
class CERES_NO_EXPORT ManifoldAdapter final : public Manifold {
public:
explicit ManifoldAdapter(const LocalParameterization* local_parameterization)
: local_parameterization_(local_parameterization) {
CHECK(local_parameterization != nullptr);
}
bool Plus(const double* x,
const double* delta,
double* x_plus_delta) const override {
return local_parameterization_->Plus(x, delta, x_plus_delta);
}
bool PlusJacobian(const double* x, double* jacobian) const override {
return local_parameterization_->ComputeJacobian(x, jacobian);
}
bool RightMultiplyByPlusJacobian(const double* x,
const int num_rows,
const double* ambient_matrix,
double* tangent_matrix) const override {
return local_parameterization_->MultiplyByJacobian(
x, num_rows, ambient_matrix, tangent_matrix);
}
bool Minus(const double* y, const double* x, double* delta) const override {
LOG(FATAL) << "This should never be called.";
return false;
}
bool MinusJacobian(const double* x, double* jacobian) const override {
LOG(FATAL) << "This should never be called.";
return false;
}
int AmbientSize() const override {
return local_parameterization_->GlobalSize();
}
int TangentSize() const override {
return local_parameterization_->LocalSize();
}
private:
const LocalParameterization* local_parameterization_;
};
} // namespace internal
} // namespace ceres

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -37,8 +37,7 @@
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
std::unique_ptr<Minimizer> Minimizer::Create(MinimizerType minimizer_type) {
if (minimizer_type == TRUST_REGION) {
@@ -89,5 +88,4 @@ bool Minimizer::RunCallbacks(const Minimizer::Options& options,
return false;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,14 +40,14 @@
#include "ceres/iteration_callback.h"
#include "ceres/solver.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class Evaluator;
class SparseMatrix;
class TrustRegionStrategy;
class CoordinateDescentMinimizer;
class LinearSolver;
class ContextImpl;
// Interface for non-linear least squares solvers.
class CERES_NO_EXPORT Minimizer {
@@ -114,6 +114,7 @@ class CERES_NO_EXPORT Minimizer {
int max_num_iterations;
double max_solver_time_in_seconds;
int num_threads;
ContextImpl* context = nullptr;
// Number of times the linear solver should be retried in case of
// numerical failure. The retries are done by exponentially scaling up
@@ -193,8 +194,7 @@ class CERES_NO_EXPORT Minimizer {
Solver::Summary* summary) = 0;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -31,6 +31,7 @@
#include "ceres/normal_prior.h"
#include <cstddef>
#include <utility>
#include <vector>
#include "ceres/internal/eigen.h"
@@ -39,7 +40,7 @@
namespace ceres {
NormalPrior::NormalPrior(const Matrix& A, const Vector& b) : A_(A), b_(b) {
NormalPrior::NormalPrior(const Matrix& A, Vector b) : A_(A), b_(std::move(b)) {
CHECK_GT(b_.rows(), 0);
CHECK_GT(A_.rows(), 0);
CHECK_EQ(b_.rows(), A.cols());
@@ -54,7 +55,7 @@ bool NormalPrior::Evaluate(double const* const* parameters,
VectorRef r(residuals, num_residuals());
// The following line should read
// r = A_ * (p - b_);
// The extra eval is to get around a bug in the eigen library.
// The extra eval is to get around a bug in the Eigen library.
r = A_ * (p - b_).eval();
if ((jacobians != nullptr) && (jacobians[0] != nullptr)) {
MatrixRef(jacobians[0], num_residuals(), parameter_block_sizes()[0]) = A_;
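
Taking b by value and moving it into the member is the sink-parameter idiom: rvalue arguments are moved through with no copy, while lvalue arguments cost exactly one copy. A hedged sketch of the effect at a call site (MakePriors is a hypothetical function):

#include <utility>
#include "ceres/internal/eigen.h"
#include "ceres/normal_prior.h"

void MakePriors() {
  ceres::Matrix A = ceres::Matrix::Identity(3, 3);
  ceres::Vector b = ceres::Vector::Zero(3);
  ceres::NormalPrior copies_b(A, b);            // lvalue: one copy, then a move
  ceres::NormalPrior moves_b(A, std::move(b));  // rvalue: moved, never copied
}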

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,8 +40,7 @@
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
#if defined(_WIN32) && !defined(__MINGW64__) && !defined(__MINGW32__)
#define GG_LONGLONG(x) x##I64
@@ -112,7 +111,6 @@ struct pair_hash {
}
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_PAIR_HASH_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -26,48 +26,161 @@
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: vitus@google.com (Michael Vitus)
// Authors: vitus@google.com (Michael Vitus),
// dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#ifndef CERES_INTERNAL_PARALLEL_FOR_H_
#define CERES_INTERNAL_PARALLEL_FOR_H_
#include <functional>
#include <mutex>
#include <vector>
#include "ceres/context_impl.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "ceres/parallel_invoke.h"
#include "ceres/partition_range_for_parallel_for.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Returns the maximum number of threads supported by the threading backend
// Ceres was compiled with.
CERES_NO_EXPORT
int MaxNumThreadsAvailable();
// Use a dummy (no-op) lock if num_threads == 1.
inline decltype(auto) MakeConditionalLock(const int num_threads,
std::mutex& m) {
return (num_threads == 1) ? std::unique_lock<std::mutex>{}
: std::unique_lock<std::mutex>{m};
}
// Execute the function for every element in the range [start, end) with at most
// num_threads. It will execute all the work on the calling thread if
// num_threads is 1.
CERES_NO_EXPORT void ParallelFor(ContextImpl* context,
int start,
int end,
int num_threads,
const std::function<void(int)>& function);
// num_threads or (end - start) is equal to 1.
// Depending on the function signature, it will be supplied with either a loop
// index or a range of loop indices; the function can also be supplied with a
// thread_id. The following function signatures are supported:
// - Functions accepting a single loop index:
// - [](int index) { ... }
// - [](int thread_id, int index) { ... }
// - Functions accepting a range of loop indices:
// - [](std::tuple<int, int> index) { ... }
// - [](int thread_id, std::tuple<int, int> index) { ... }
//
// When distributing workload between threads, it is assumed that each loop
// iteration takes approximately equal time to complete.
template <typename F>
void ParallelFor(ContextImpl* context,
int start,
int end,
int num_threads,
F&& function,
int min_block_size = 1) {
CHECK_GT(num_threads, 0);
if (start >= end) {
return;
}
// Execute the function for every element in the range [start, end) with at most
// num_threads. It will execute all the work on the calling thread if
// num_threads is 1. Each invocation of function() will be passed a thread_id
// in [0, num_threads) that is guaranteed to be distinct from the value passed
// to any concurrent execution of function().
CERES_NO_EXPORT void ParallelFor(
ContextImpl* context,
int start,
int end,
int num_threads,
const std::function<void(int thread_id, int i)>& function);
} // namespace internal
} // namespace ceres
if (num_threads == 1 || end - start < min_block_size * 2) {
InvokeOnSegment(0, std::make_tuple(start, end), std::forward<F>(function));
return;
}
#include "ceres/internal/disable_warnings.h"
CHECK(context != nullptr);
ParallelInvoke(context,
start,
end,
num_threads,
std::forward<F>(function),
min_block_size);
}
// Execute function for every element in the range [start, end) with at most
// num_threads, using a user-provided partitions array.
// When distributing workload between threads, it is assumed that each segment
// bounded by adjacent elements of the partitions array takes approximately equal
// time to process.
template <typename F>
void ParallelFor(ContextImpl* context,
int start,
int end,
int num_threads,
F&& function,
const std::vector<int>& partitions) {
CHECK_GT(num_threads, 0);
if (start >= end) {
return;
}
CHECK_EQ(partitions.front(), start);
CHECK_EQ(partitions.back(), end);
if (num_threads == 1 || end - start <= num_threads) {
ParallelFor(context, start, end, num_threads, std::forward<F>(function));
return;
}
CHECK_GT(partitions.size(), 1);
const int num_partitions = partitions.size() - 1;
ParallelFor(context,
0,
num_partitions,
num_threads,
[&function, &partitions](int thread_id,
std::tuple<int, int> partition_ids) {
// partition_ids is a range of partition indices
const auto [partition_start, partition_end] = partition_ids;
// Execution over several adjacent segments is equivalent
// to execution over the union of those segments (which is
// also a contiguous segment)
const int range_start = partitions[partition_start];
const int range_end = partitions[partition_end];
// Range of original loop indices
const auto range = std::make_tuple(range_start, range_end);
InvokeOnSegment(thread_id, range, function);
});
}
// Execute function for every element in the range [start, end) with at most
// num_threads, taking into account user-provided integer cumulative costs of
// iterations. Cumulative costs of iterations for indices in the range [0, end)
// are stored in objects from cumulative_cost_data. The user-provided
// cumulative_cost_fun returns non-decreasing integer values corresponding to
// the inclusive cumulative cost of loop iterations when given a reference to a
// user-defined object. Only indices from [start, end) will be referenced. This
// routine assumes that cumulative_cost_fun is non-decreasing (in other words,
// all costs are non-negative).
// When distributing workload between threads, input range of loop indices will
// be partitioned into disjoint contiguous intervals, with the maximal cost
// being minimized.
// For example, with iteration costs of [1, 1, 5, 3, 1, 4] cumulative_cost_fun
// should return [1, 2, 7, 10, 11, 15], and with num_threads = 4 this range
// will be split into segments [0, 2) [2, 3) [3, 5) [5, 6) with costs
// [2, 5, 4, 4].
template <typename F, typename CumulativeCostData, typename CumulativeCostFun>
void ParallelFor(ContextImpl* context,
int start,
int end,
int num_threads,
F&& function,
const CumulativeCostData* cumulative_cost_data,
CumulativeCostFun&& cumulative_cost_fun) {
CHECK_GT(num_threads, 0);
if (start >= end) {
return;
}
if (num_threads == 1 || end - start <= num_threads) {
ParallelFor(context, start, end, num_threads, std::forward<F>(function));
return;
}
// Creating several partitions allows us to tolerate imperfections of
// partitioning and of user-supplied iteration costs, up to a certain extent.
constexpr int kNumPartitionsPerThread = 4;
const int kMaxPartitions = num_threads * kNumPartitionsPerThread;
const auto& partitions = PartitionRangeForParallelFor(
start,
end,
kMaxPartitions,
cumulative_cost_data,
std::forward<CumulativeCostFun>(cumulative_cost_fun));
CHECK_GT(partitions.size(), 1);
ParallelFor(
context, start, end, num_threads, std::forward<F>(function), partitions);
}
} // namespace ceres::internal
#endif // CERES_INTERNAL_PARALLEL_FOR_H_
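
The templated ParallelFor accepts all four callable shapes listed above; the range form keeps a tight serial inner loop per block. A hedged usage sketch (Axpy is a hypothetical kernel; assumes an initialized ContextImpl):

#include <tuple>
#include <vector>

#include "ceres/parallel_for.h"

void Axpy(ceres::internal::ContextImpl* context, int num_threads,
          const std::vector<double>& x, std::vector<double>& y) {
  const int n = static_cast<int>(x.size());
  // Per-index form; [](int thread_id, int i) would also be accepted.
  ceres::internal::ParallelFor(context, 0, n, num_threads,
                               [&](int i) { y[i] += x[i]; });
  // Per-range form: invoked once per contiguous block of indices.
  ceres::internal::ParallelFor(
      context, 0, n, num_threads, [&](std::tuple<int, int> range) {
        const auto [start, end] = range;
        for (int i = start; i < end; ++i) y[i] += x[i];
      });
}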

View File

@@ -1,245 +0,0 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: vitus@google.com (Michael Vitus)
// This include must come before any #ifndef check on Ceres compile options.
#include "ceres/internal/config.h"
#ifdef CERES_USE_CXX_THREADS
#include <cmath>
#include <condition_variable>
#include <memory>
#include <mutex>
#include "ceres/concurrent_queue.h"
#include "ceres/parallel_for.h"
#include "ceres/scoped_thread_token.h"
#include "ceres/thread_token_provider.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace {
// This class creates a thread safe barrier which will block until a
// pre-specified number of threads call Finished. This allows us to block the
// main thread until all the parallel threads are finished processing all the
// work.
class BlockUntilFinished {
public:
explicit BlockUntilFinished(int num_total)
: num_finished_(0), num_total_(num_total) {}
// Increment the number of jobs that have finished and signal the blocking
// thread if all jobs have finished.
void Finished() {
std::lock_guard<std::mutex> lock(mutex_);
++num_finished_;
CHECK_LE(num_finished_, num_total_);
if (num_finished_ == num_total_) {
condition_.notify_one();
}
}
// Block until all threads have signaled they are finished.
void Block() {
std::unique_lock<std::mutex> lock(mutex_);
condition_.wait(lock, [&]() { return num_finished_ == num_total_; });
}
private:
std::mutex mutex_;
std::condition_variable condition_;
// The current number of jobs finished.
int num_finished_;
// The total number of jobs.
int num_total_;
};
// Shared state between the parallel tasks. Each thread will use this
// information to get the next block of work to be performed.
struct SharedState {
SharedState(int start, int end, int num_work_items)
: start(start),
end(end),
num_work_items(num_work_items),
i(0),
thread_token_provider(num_work_items),
block_until_finished(num_work_items) {}
// The start and end index of the for loop.
const int start;
const int end;
// The number of blocks that need to be processed.
const int num_work_items;
// The next block of work to be assigned to a worker. The parallel for loop
// range is split into num_work_items blocks of work, i.e. a single block of
// work is:
// for (int j = start + i; j < end; j += num_work_items) { ... }.
int i;
std::mutex mutex_i;
// Provides a unique thread ID among all active threads working on the same
// group of tasks. Thread-safe.
ThreadTokenProvider thread_token_provider;
// Used to signal when all the work has been completed. Thread safe.
BlockUntilFinished block_until_finished;
};
} // namespace
int MaxNumThreadsAvailable() { return ThreadPool::MaxNumThreadsAvailable(); }
// See ParallelFor (below) for more details.
void ParallelFor(ContextImpl* context,
int start,
int end,
int num_threads,
const std::function<void(int)>& function) {
CHECK_GT(num_threads, 0);
CHECK(context != nullptr);
if (end <= start) {
return;
}
// Fast path for when it is single threaded.
if (num_threads == 1) {
for (int i = start; i < end; ++i) {
function(i);
}
return;
}
ParallelFor(
context, start, end, num_threads, [&function](int /*thread_id*/, int i) {
function(i);
});
}
// This implementation uses a fixed size max worker pool with a shared task
// queue. The problem of executing the function for the interval of [start, end)
// is broken up into at most num_threads blocks and added to the thread pool. To
// avoid deadlocks, the calling thread is allowed to steal work from the worker
// pool. This is implemented via a shared state between the tasks. In order for
// the calling thread or thread pool to get a block of work, it will query the
// shared state for the next block of work to be done. If there is nothing left,
// it will return. We will exit the ParallelFor call when all of the work has
// been done, not when all of the tasks have been popped off the task queue.
//
// A unique thread ID among all active tasks will be acquired once for each
// block of work. This avoids the significant performance penalty for acquiring
// it on every iteration of the for loop. The thread ID is guaranteed to be in
// [0, num_threads).
//
// A performance analysis has shown this implementation is on par with OpenMP and
// TBB.
void ParallelFor(ContextImpl* context,
int start,
int end,
int num_threads,
const std::function<void(int thread_id, int i)>& function) {
CHECK_GT(num_threads, 0);
CHECK(context != nullptr);
if (end <= start) {
return;
}
// Fast path for when it is single threaded.
if (num_threads == 1) {
// Even though we only have one thread, use the thread token provider to
// guarantee the exact same behavior when running with multiple threads.
ThreadTokenProvider thread_token_provider(num_threads);
const ScopedThreadToken scoped_thread_token(&thread_token_provider);
const int thread_id = scoped_thread_token.token();
for (int i = start; i < end; ++i) {
function(thread_id, i);
}
return;
}
// We use a std::shared_ptr because the main thread can finish all
// the work before the tasks have been popped off the queue. So the
// shared state needs to exist for the duration of all the tasks.
const int num_work_items = std::min((end - start), num_threads);
std::shared_ptr<SharedState> shared_state(
new SharedState(start, end, num_work_items));
// A function which tries to perform a chunk of work. This returns false if
// there is no work to be done.
auto task_function = [shared_state, &function]() {
int i = 0;
{
// Get the next available chunk of work to be performed. If there is no
// work, return false.
std::lock_guard<std::mutex> lock(shared_state->mutex_i);
if (shared_state->i >= shared_state->num_work_items) {
return false;
}
i = shared_state->i;
++shared_state->i;
}
const ScopedThreadToken scoped_thread_token(
&shared_state->thread_token_provider);
const int thread_id = scoped_thread_token.token();
// Perform each task.
for (int j = shared_state->start + i; j < shared_state->end;
j += shared_state->num_work_items) {
function(thread_id, j);
}
shared_state->block_until_finished.Finished();
return true;
};
// Add all the tasks to the thread pool.
for (int i = 0; i < num_work_items; ++i) {
// Note we are taking the task_function as value so the shared_state
// shared pointer is copied and the ref count is increased. This is to
// prevent it from being deleted when the main thread finishes all the
// work and exits before the threads finish.
context->thread_pool.AddTask([task_function]() { task_function(); });
}
// Try to do any available work on the main thread. This may steal work from
// the thread pool, but when there is no work left the thread pool tasks
// will be no-ops.
while (task_function()) {
}
// Wait until all tasks have finished.
shared_state->block_until_finished.Block();
}
} // namespace internal
} // namespace ceres
#endif // CERES_USE_CXX_THREADS

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -28,58 +28,50 @@
//
// Author: vitus@google.com (Michael Vitus)
// This include must come before any #ifndef check on Ceres compile options.
#include <algorithm>
#include <atomic>
#include <cmath>
#include <condition_variable>
#include <memory>
#include <mutex>
#include <tuple>
#include "ceres/internal/config.h"
#if defined(CERES_USE_OPENMP)
#include "ceres/parallel_for.h"
#include "ceres/scoped_thread_token.h"
#include "ceres/thread_token_provider.h"
#include "ceres/parallel_vector_ops.h"
#include "glog/logging.h"
#include "omp.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
int MaxNumThreadsAvailable() { return omp_get_max_threads(); }
BlockUntilFinished::BlockUntilFinished(int num_total_jobs)
: num_total_jobs_finished_(0), num_total_jobs_(num_total_jobs) {}
void ParallelFor(ContextImpl* context,
int start,
int end,
int num_threads,
const std::function<void(int)>& function) {
CHECK_GT(num_threads, 0);
CHECK(context != nullptr);
if (end <= start) {
return;
}
#ifdef CERES_USE_OPENMP
#pragma omp parallel for num_threads(num_threads) \
schedule(dynamic) if (num_threads > 1)
#endif // CERES_USE_OPENMP
for (int i = start; i < end; ++i) {
function(i);
void BlockUntilFinished::Finished(int num_jobs_finished) {
if (num_jobs_finished == 0) return;
std::lock_guard<std::mutex> lock(mutex_);
num_total_jobs_finished_ += num_jobs_finished;
CHECK_LE(num_total_jobs_finished_, num_total_jobs_);
if (num_total_jobs_finished_ == num_total_jobs_) {
condition_.notify_one();
}
}
void ParallelFor(ContextImpl* context,
int start,
int end,
int num_threads,
const std::function<void(int thread_id, int i)>& function) {
CHECK(context != nullptr);
ThreadTokenProvider thread_token_provider(num_threads);
ParallelFor(context, start, end, num_threads, [&](int i) {
const ScopedThreadToken scoped_thread_token(&thread_token_provider);
const int thread_id = scoped_thread_token.token();
function(thread_id, i);
});
void BlockUntilFinished::Block() {
std::unique_lock<std::mutex> lock(mutex_);
condition_.wait(
lock, [this]() { return num_total_jobs_finished_ == num_total_jobs_; });
}
} // namespace internal
} // namespace ceres
ParallelInvokeState::ParallelInvokeState(int start,
int end,
int num_work_blocks)
: start(start),
end(end),
num_work_blocks(num_work_blocks),
base_block_size((end - start) / num_work_blocks),
num_base_p1_sized_blocks((end - start) % num_work_blocks),
block_id(0),
thread_id(0),
block_until_finished(num_work_blocks) {}
#endif // defined(CERES_USE_OPENMP)
} // namespace ceres::internal

View File

@@ -0,0 +1,272 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: vitus@google.com (Michael Vitus),
// dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#ifndef CERES_INTERNAL_PARALLEL_INVOKE_H_
#define CERES_INTERNAL_PARALLEL_INVOKE_H_
#include <atomic>
#include <condition_variable>
#include <memory>
#include <mutex>
#include <tuple>
#include <type_traits>
namespace ceres::internal {
// InvokeWithThreadId handles passing thread_id to the function
template <typename F, typename... Args>
void InvokeWithThreadId(int thread_id, F&& function, Args&&... args) {
constexpr bool kPassThreadId = std::is_invocable_v<F, int, Args...>;
if constexpr (kPassThreadId) {
function(thread_id, std::forward<Args>(args)...);
} else {
function(std::forward<Args>(args)...);
}
}
// InvokeOnSegment either runs a loop over segment indices or passes it to the
// function
template <typename F>
void InvokeOnSegment(int thread_id, std::tuple<int, int> range, F&& function) {
constexpr bool kExplicitLoop =
std::is_invocable_v<F, int> || std::is_invocable_v<F, int, int>;
if constexpr (kExplicitLoop) {
const auto [start, end] = range;
for (int i = start; i != end; ++i) {
InvokeWithThreadId(thread_id, std::forward<F>(function), i);
}
} else {
InvokeWithThreadId(thread_id, std::forward<F>(function), range);
}
}
// This class creates a thread safe barrier which will block until a
// pre-specified number of threads call Finished. This allows us to block the
// main thread until all the parallel threads are finished processing all the
// work.
class BlockUntilFinished {
public:
explicit BlockUntilFinished(int num_total_jobs);
// Increment the number of jobs that have been processed by the number of
// jobs processed by the caller and signal the blocking thread if all jobs
// have finished.
void Finished(int num_jobs_finished);
// Block until receiving confirmation of all jobs being finished.
void Block();
private:
std::mutex mutex_;
std::condition_variable condition_;
int num_total_jobs_finished_;
const int num_total_jobs_;
};
// Shared state between the parallel tasks. Each thread will use this
// information to get the next block of work to be performed.
struct ParallelInvokeState {
// The entire range [start, end) is split into num_work_blocks contiguous
// disjoint intervals (blocks), which are as equal as possible given
// total index count and requested number of blocks.
//
// Those num_work_blocks blocks are then processed in parallel.
//
// Total number of integer indices in interval [start, end) is
// end - start, and when splitting them into num_work_blocks blocks
// we can either
// - Split into equal blocks when (end - start) is divisible by
// num_work_blocks
// - Split into blocks with size difference at most 1:
// - Size of the smallest block(s) is (end - start) / num_work_blocks
// - (end - start) % num_work_blocks of the blocks will be 1 index larger
//
// Note that this splitting is optimal in the sense of maximal difference
// between block sizes, since splitting into equal blocks is possible
// if and only if number of indices is divisible by number of blocks.
ParallelInvokeState(int start, int end, int num_work_blocks);
// The start and end index of the for loop.
const int start;
const int end;
// The number of blocks that need to be processed.
const int num_work_blocks;
// Size of the smallest block
const int base_block_size;
// Number of blocks of size base_block_size + 1
const int num_base_p1_sized_blocks;
// The next block of work to be assigned to a worker. The parallel for loop
// range is split into num_work_blocks blocks of work, with a single block of
// work being of size
// - base_block_size + 1 for the first num_base_p1_sized_blocks blocks
// - base_block_size for the rest of the blocks
// Blocks of indices are contiguous and disjoint.
std::atomic<int> block_id;
// Provides a unique thread ID among all active threads.
// We do not schedule more than num_threads threads via the thread pool,
// and the caller thread might steal one ID.
std::atomic<int> thread_id;
// Used to signal when all the work has been completed. Thread safe.
BlockUntilFinished block_until_finished;
};
// This implementation uses a fixed size max worker pool with a shared task
// queue. The problem of executing the function for the interval of [start, end)
// is broken up into at most num_threads * kWorkBlocksPerThread blocks (each of
// size at least min_block_size) and added to the thread pool. To avoid
// deadlocks, the calling thread is allowed to steal work from the worker pool.
// This is implemented via a shared state between the tasks. In order for
// the calling thread or thread pool to get a block of work, it will query the
// shared state for the next block of work to be done. If there is nothing left,
// it will return. We will exit the ParallelFor call when all of the work has
// been done, not when all of the tasks have been popped off the task queue.
//
// A unique thread ID among all active tasks will be acquired once for each
// block of work. This avoids the significant performance penalty for acquiring
// it on every iteration of the for loop. The thread ID is guaranteed to be in
// [0, num_threads).
//
// A performance analysis has shown this implementation is on par with OpenMP
// and TBB.
template <typename F>
void ParallelInvoke(ContextImpl* context,
int start,
int end,
int num_threads,
F&& function,
int min_block_size) {
CHECK(context != nullptr);
// Maximum number of work items scheduled for a single thread.
// - A lower number of work items results in larger runtimes on unequal tasks
// - A higher number of work items results in larger synchronization losses
constexpr int kWorkBlocksPerThread = 4;
// Interval [start, end) is being split into
// num_threads * kWorkBlocksPerThread contiguous disjoint blocks.
//
// In order to avoid creating empty blocks of work, we need to limit
// the number of work blocks by the total number of indices.
const int num_work_blocks = std::min((end - start) / min_block_size,
num_threads * kWorkBlocksPerThread);
// We use a std::shared_ptr because the main thread can finish all
// the work before the tasks have been popped off the queue. So the
// shared state needs to exist for the duration of all the tasks.
auto shared_state =
std::make_shared<ParallelInvokeState>(start, end, num_work_blocks);
// A function which tries to schedule another task in the thread pool and
// perform several chunks of work. The function expects itself as an argument
// in order to schedule the next task in the thread pool.
auto task = [context, shared_state, num_threads, &function](auto& task_copy) {
int num_jobs_finished = 0;
const int thread_id = shared_state->thread_id.fetch_add(1);
// In order to avoid dead-locks in nested parallel for loops, task() will be
// invoked num_threads + 1 times:
// - num_threads times via enqueueing task into thread pool
// - one more time in the main thread
// Tasks enqueued to the thread pool might take some time before execution,
// and the last task to start executing will terminate here in order to
// avoid having more than num_threads active threads.
if (thread_id >= num_threads) return;
const int num_work_blocks = shared_state->num_work_blocks;
if (thread_id + 1 < num_threads &&
shared_state->block_id < num_work_blocks) {
// Add another thread to the thread pool.
// Note we are taking the task by value, so the copy of the shared_state
// shared pointer (captured by value at the declaration of the task
// lambda-function) is copied and the ref count is increased. This is to
// prevent it from being deleted when the main thread finishes all the work
// and exits before the threads finish.
context->thread_pool.AddTask([task_copy]() { task_copy(task_copy); });
}
const int start = shared_state->start;
const int base_block_size = shared_state->base_block_size;
const int num_base_p1_sized_blocks = shared_state->num_base_p1_sized_blocks;
while (true) {
// Get the next available chunk of work to be performed. If there is no
// work, return.
int block_id = shared_state->block_id.fetch_add(1);
if (block_id >= num_work_blocks) {
break;
}
++num_jobs_finished;
// The for-loop interval [start, end) was split into num_work_blocks blocks,
// with num_base_p1_sized_blocks of size base_block_size + 1 and the
// remaining num_work_blocks - num_base_p1_sized_blocks of size
// base_block_size.
//
// Then the start index of block #block_id is given by the total
// length of the preceding blocks:
// * Total length of preceding blocks of size base_block_size + 1:
// min(block_id, num_base_p1_sized_blocks) * (base_block_size + 1)
//
// * Total length of preceding blocks of size base_block_size:
// (block_id - min(block_id, num_base_p1_sized_blocks)) *
// base_block_size
//
// Simplifying the sum of those quantities yields the following
// expression for the start index of block #block_id:
const int curr_start = start + block_id * base_block_size +
std::min(block_id, num_base_p1_sized_blocks);
// The first num_base_p1_sized_blocks blocks have size base_block_size + 1.
//
// Note that it is guaranteed that all blocks are within
// [start, end) interval
const int curr_end = curr_start + base_block_size +
(block_id < num_base_p1_sized_blocks ? 1 : 0);
// Perform each task in the current block.
const auto range = std::make_tuple(curr_start, curr_end);
InvokeOnSegment(thread_id, range, function);
}
shared_state->block_until_finished.Finished(num_jobs_finished);
};
// Start scheduling threads and doing work. We might end up with fewer
// threads scheduled than expected, if the scheduling overhead is larger
// than the amount of work to be done.
task(task);
// Wait until all tasks have finished.
shared_state->block_until_finished.Block();
}
} // namespace ceres::internal
#endif
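To make the block-splitting arithmetic above concrete, here is a minimal standalone sketch (not part of the diff); it derives base_block_size and num_base_p1_sized_blocks from the field comments on ParallelInvokeState and reproduces the per-block boundaries computed inside the worker loop.

// Standalone sketch of the ParallelInvoke block-splitting arithmetic.
#include <algorithm>
#include <cstdio>

int main() {
  const int start = 0, end = 10, num_work_blocks = 4;
  // 10 indices over 4 blocks: base size 2, with 10 % 4 = 2 blocks of size 3.
  const int base_block_size = (end - start) / num_work_blocks;
  const int num_base_p1_sized_blocks = (end - start) % num_work_blocks;
  for (int block_id = 0; block_id < num_work_blocks; ++block_id) {
    const int curr_start = start + block_id * base_block_size +
                           std::min(block_id, num_base_p1_sized_blocks);
    const int curr_end = curr_start + base_block_size +
                         (block_id < num_base_p1_sized_blocks ? 1 : 0);
    // Prints [0, 3) [3, 6) [6, 8) [8, 10): contiguous and disjoint.
    std::printf("block %d -> [%d, %d)\n", block_id, curr_start, curr_end);
  }
  return 0;
}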

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -30,8 +30,7 @@
#include "ceres/parallel_utils.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
void LinearIndexToUpperTriangularIndex(int k, int n, int* i, int* j) {
// This works by unfolding a rectangle into a triangle.
@@ -86,5 +85,4 @@ void LinearIndexToUpperTriangularIndex(int k, int n, int* i, int* j) {
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -33,8 +33,7 @@
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Converts a linear iteration order into a triangular iteration order.
// Suppose you have nested loops that look like
@@ -66,7 +65,6 @@ CERES_NO_EXPORT void LinearIndexToUpperTriangularIndex(int k,
int* i,
int* j);
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_PARALLEL_UTILS_H_
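A minimal usage sketch of LinearIndexToUpperTriangularIndex, assuming (per the header comment) that the conventional nested iteration being linearized is i in [0, n), j in [i, n):

#include <cstdio>
#include "ceres/parallel_utils.h"

int main() {
  const int n = 4;
  // There are n * (n + 1) / 2 upper-triangular pairs (i, j) with i <= j.
  for (int k = 0; k < n * (n + 1) / 2; ++k) {
    int i, j;
    ceres::internal::LinearIndexToUpperTriangularIndex(k, n, &i, &j);
    std::printf("k=%d -> (%d, %d)\n", k, i, j);  // Visits each pair once.
  }
  return 0;
}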

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -25,35 +25,30 @@
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)
#ifndef CERES_INTERNAL_FLOAT_CXSPARSE_H_
#define CERES_INTERNAL_FLOAT_CXSPARSE_H_
#include "ceres/parallel_vector_ops.h"
// This include must come before any #ifndef check on Ceres compile options.
#include "ceres/internal/config.h"
#include <algorithm>
#include <tuple>
#if !defined(CERES_NO_CXSPARSE)
#include "ceres/context_impl.h"
#include "ceres/parallel_for.h"
#include <memory>
namespace ceres::internal {
void ParallelSetZero(ContextImpl* context,
int num_threads,
double* values,
int num_values) {
ParallelFor(
context,
0,
num_values,
num_threads,
[values](std::tuple<int, int> range) {
auto [start, end] = range;
std::fill(values + start, values + end, 0.);
},
kMinBlockSizeParallelVectorOps);
}
#include "ceres/internal/export.h"
#include "ceres/sparse_cholesky.h"
namespace ceres {
namespace internal {
// Fake implementation of a single precision Sparse Cholesky using
// CXSparse.
class CERES_NO_EXPORT FloatCXSparseCholesky : public SparseCholesky {
public:
static std::unique_ptr<SparseCholesky> Create(OrderingType ordering_type);
};
} // namespace internal
} // namespace ceres
#endif // !defined(CERES_NO_CXSPARSE)
#endif // CERES_INTERNAL_FLOAT_CXSPARSE_H_
} // namespace ceres::internal

View File

@@ -0,0 +1,90 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: vitus@google.com (Michael Vitus),
// dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#ifndef CERES_INTERNAL_PARALLEL_VECTOR_OPS_H_
#define CERES_INTERNAL_PARALLEL_VECTOR_OPS_H_
#include <mutex>
#include <vector>
#include "ceres/context_impl.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "ceres/parallel_for.h"
namespace ceres::internal {
// Lower bound on block size for parallel vector operations.
// Operations on vectors with fewer than kMinBlockSizeParallelVectorOps
// elements
// will be executed in a single thread.
constexpr int kMinBlockSizeParallelVectorOps = 1 << 16;
// Evaluate vector expression in parallel
// Assuming LhsExpression and RhsExpression are some sort of column-vector
// expression, the assignment lhs = rhs is evaluated over a set of contiguous
// blocks in parallel. This is expected to work well in the case of
// vector-based expressions (since they typically do not result in
// temporaries). This method expects lhs to be size-compatible with rhs.
template <typename LhsExpression, typename RhsExpression>
void ParallelAssign(ContextImpl* context,
int num_threads,
LhsExpression& lhs,
const RhsExpression& rhs) {
static_assert(LhsExpression::ColsAtCompileTime == 1);
static_assert(RhsExpression::ColsAtCompileTime == 1);
CHECK_EQ(lhs.rows(), rhs.rows());
const int num_rows = lhs.rows();
ParallelFor(
context,
0,
num_rows,
num_threads,
[&lhs, &rhs](const std::tuple<int, int>& range) {
auto [start, end] = range;
lhs.segment(start, end - start) = rhs.segment(start, end - start);
},
kMinBlockSizeParallelVectorOps);
}
// Set the vector to zero using up to num_threads threads.
template <typename VectorType>
void ParallelSetZero(ContextImpl* context,
int num_threads,
VectorType& vector) {
ParallelSetZero(context, num_threads, vector.data(), vector.rows());
}
void ParallelSetZero(ContextImpl* context,
int num_threads,
double* values,
int num_values);
} // namespace ceres::internal
#endif // CERES_INTERNAL_PARALLEL_VECTOR_OPS_H_
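A short sketch of how these helpers compose, assuming a ContextImpl that the caller has already initialized for the requested number of threads:

#include "ceres/context_impl.h"
#include "ceres/internal/eigen.h"
#include "ceres/parallel_vector_ops.h"

void ParallelVectorOpsExample(ceres::internal::ContextImpl* context) {
  using ceres::internal::Vector;  // Eigen column vector of doubles.
  const int num_threads = 4;
  Vector x = Vector::Random(1 << 20);
  Vector y(x.rows());
  // Evaluate y = 2 * x over contiguous blocks in parallel; 2.0 * x is a
  // column-vector expression, so no temporary is materialized.
  ceres::internal::ParallelAssign(context, num_threads, y, 2.0 * x);
  // Zero y using up to num_threads threads.
  ceres::internal::ParallelSetZero(context, num_threads, y);
}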

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2021 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -47,8 +47,7 @@
#include "ceres/stringprintf.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class ProblemImpl;
class ResidualBlock;
@@ -382,8 +381,7 @@ class CERES_NO_EXPORT ParameterBlock {
friend class ProblemImpl;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -30,8 +30,11 @@
#include "ceres/parameter_block_ordering.h"
#include <map>
#include <memory>
#include <set>
#include <unordered_set>
#include <vector>
#include "ceres/graph.h"
#include "ceres/graph_algorithms.h"
@@ -42,22 +45,18 @@
#include "ceres/wall_time.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::map;
using std::set;
using std::vector;
namespace ceres::internal {
int ComputeStableSchurOrdering(const Program& program,
vector<ParameterBlock*>* ordering) {
std::vector<ParameterBlock*>* ordering) {
CHECK(ordering != nullptr);
ordering->clear();
EventLogger event_logger("ComputeStableSchurOrdering");
auto graph = CreateHessianGraph(program);
event_logger.AddEvent("CreateHessianGraph");
const vector<ParameterBlock*>& parameter_blocks = program.parameter_blocks();
const std::vector<ParameterBlock*>& parameter_blocks =
program.parameter_blocks();
const std::unordered_set<ParameterBlock*>& vertices = graph->vertices();
for (auto* parameter_block : parameter_blocks) {
if (vertices.count(parameter_block) > 0) {
@@ -81,13 +80,14 @@ int ComputeStableSchurOrdering(const Program& program,
}
int ComputeSchurOrdering(const Program& program,
vector<ParameterBlock*>* ordering) {
std::vector<ParameterBlock*>* ordering) {
CHECK(ordering != nullptr);
ordering->clear();
auto graph = CreateHessianGraph(program);
int independent_set_size = IndependentSetOrdering(*graph, ordering);
const vector<ParameterBlock*>& parameter_blocks = program.parameter_blocks();
const std::vector<ParameterBlock*>& parameter_blocks =
program.parameter_blocks();
// Add the excluded blocks to the back of the ordering vector.
for (auto* parameter_block : parameter_blocks) {
@@ -103,13 +103,14 @@ void ComputeRecursiveIndependentSetOrdering(const Program& program,
ParameterBlockOrdering* ordering) {
CHECK(ordering != nullptr);
ordering->Clear();
const vector<ParameterBlock*> parameter_blocks = program.parameter_blocks();
const std::vector<ParameterBlock*> parameter_blocks =
program.parameter_blocks();
auto graph = CreateHessianGraph(program);
int num_covered = 0;
int round = 0;
while (num_covered < parameter_blocks.size()) {
vector<ParameterBlock*> independent_set_ordering;
std::vector<ParameterBlock*> independent_set_ordering;
const int independent_set_size =
IndependentSetOrdering(*graph, &independent_set_ordering);
for (int i = 0; i < independent_set_size; ++i) {
@@ -126,14 +127,16 @@ std::unique_ptr<Graph<ParameterBlock*>> CreateHessianGraph(
const Program& program) {
auto graph = std::make_unique<Graph<ParameterBlock*>>();
CHECK(graph != nullptr);
const vector<ParameterBlock*>& parameter_blocks = program.parameter_blocks();
const std::vector<ParameterBlock*>& parameter_blocks =
program.parameter_blocks();
for (auto* parameter_block : parameter_blocks) {
if (!parameter_block->IsConstant()) {
graph->AddVertex(parameter_block);
}
}
const vector<ResidualBlock*>& residual_blocks = program.residual_blocks();
const std::vector<ResidualBlock*>& residual_blocks =
program.residual_blocks();
for (auto* residual_block : residual_blocks) {
const int num_parameter_blocks = residual_block->NumParameterBlocks();
ParameterBlock* const* parameter_blocks =
@@ -157,19 +160,20 @@ std::unique_ptr<Graph<ParameterBlock*>> CreateHessianGraph(
}
void OrderingToGroupSizes(const ParameterBlockOrdering* ordering,
vector<int>* group_sizes) {
std::vector<int>* group_sizes) {
CHECK(group_sizes != nullptr);
group_sizes->clear();
if (ordering == nullptr) {
return;
}
const map<int, set<double*>>& group_to_elements =
// TODO(sameeragarwal): Investigate if this should be a set or an
// unordered_set.
const std::map<int, std::set<double*>>& group_to_elements =
ordering->group_to_elements();
for (const auto& g_t_e : group_to_elements) {
group_sizes->push_back(g_t_e.second.size());
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,15 +40,14 @@
#include "ceres/ordered_groups.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class Program;
class ParameterBlock;
// Uses an approximate independent set ordering to order the parameter
// blocks of a problem so that it is suitable for use with Schur
// complement based solvers. The output variable ordering contains an
// blocks of a problem so that it is suitable for use with Schur-
// complement-based solvers. The output variable ordering contains an
// ordering of the parameter blocks and the return value is size of
// the independent set or the number of e_blocks (see
// schur_complement_solver.h for an explanation). Constant parameters
@@ -88,8 +87,7 @@ CERES_NO_EXPORT std::unique_ptr<Graph<ParameterBlock*>> CreateHessianGraph(
CERES_NO_EXPORT void OrderingToGroupSizes(
const ParameterBlockOrdering* ordering, std::vector<int>* group_sizes);
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -0,0 +1,150 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: vitus@google.com (Michael Vitus),
// dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#ifndef CERES_INTERNAL_PARTITION_RANGE_FOR_PARALLEL_FOR_H_
#define CERES_INTERNAL_PARTITION_RANGE_FOR_PARALLEL_FOR_H_
#include <algorithm>
#include <vector>
namespace ceres::internal {
// Check if it is possible to split the range [start, end) into at most
// max_num_partitions contiguous partitions of cost not greater than
// max_partition_cost. Inclusive integer cumulative costs are provided by
// cumulative_cost_data objects, with cumulative_cost_offset being the total
// cost of all indices (starting from zero) preceding the start element.
// Cumulative costs are returned by cumulative_cost_fun called with a
// reference to a cumulative_cost_data element with index from the range
// [start, end), and should be non-decreasing. The partition of the range is
// returned via the partition argument.
template <typename CumulativeCostData, typename CumulativeCostFun>
bool MaxPartitionCostIsFeasible(int start,
int end,
int max_num_partitions,
int max_partition_cost,
int cumulative_cost_offset,
const CumulativeCostData* cumulative_cost_data,
CumulativeCostFun&& cumulative_cost_fun,
std::vector<int>* partition) {
partition->clear();
partition->push_back(start);
int partition_start = start;
int cost_offset = cumulative_cost_offset;
while (partition_start < end) {
// Already have max_num_partitions partitions.
if (partition->size() > max_num_partitions) {
return false;
}
const int target = max_partition_cost + cost_offset;
const int partition_end =
std::partition_point(
cumulative_cost_data + partition_start,
cumulative_cost_data + end,
[&cumulative_cost_fun, target](const CumulativeCostData& item) {
return cumulative_cost_fun(item) <= target;
}) -
cumulative_cost_data;
// Unable to fit even a single element into the current partition.
if (partition_end == partition_start) {
return false;
}
const int cost_last =
cumulative_cost_fun(cumulative_cost_data[partition_end - 1]);
partition->push_back(partition_end);
partition_start = partition_end;
cost_offset = cost_last;
}
return true;
}
// Split the integer interval [start, end) into at most max_num_partitions
// contiguous intervals, minimizing the maximal total cost of a single
// interval. Inclusive integer cumulative costs for each (zero-based) index
// are provided by cumulative_cost_data objects, and are returned by a
// cumulative_cost_fun call with a reference to one of the objects from the
// range [start, end).
template <typename CumulativeCostData, typename CumulativeCostFun>
std::vector<int> PartitionRangeForParallelFor(
int start,
int end,
int max_num_partitions,
const CumulativeCostData* cumulative_cost_data,
CumulativeCostFun&& cumulative_cost_fun) {
// Given a maximal partition cost, it is possible to verify whether it is
// admissible and to obtain the corresponding partition using the
// MaxPartitionCostIsFeasible function. In order to find the lowest
// admissible value, a binary search over all potentially optimal cost
// values is performed.
const int cumulative_cost_last =
cumulative_cost_fun(cumulative_cost_data[end - 1]);
const int cumulative_cost_offset =
start ? cumulative_cost_fun(cumulative_cost_data[start - 1]) : 0;
const int total_cost = cumulative_cost_last - cumulative_cost_offset;
// The minimal maximal partition cost is not smaller than the average.
// We will use a non-inclusive lower bound.
int partition_cost_lower_bound = total_cost / max_num_partitions - 1;
// The minimal maximal partition cost is not larger than the total cost.
// The upper bound is inclusive.
int partition_cost_upper_bound = total_cost;
std::vector<int> partition;
// Range partition corresponding to the latest evaluated upper bound.
// A single segment covering the whole input interval [start, end) corresponds
// to minimal maximal partition cost of total_cost.
std::vector<int> partition_upper_bound = {start, end};
// Binary search over partition cost, returning the lowest admissible cost
while (partition_cost_upper_bound - partition_cost_lower_bound > 1) {
partition.reserve(max_num_partitions + 1);
const int partition_cost =
partition_cost_lower_bound +
(partition_cost_upper_bound - partition_cost_lower_bound) / 2;
bool admissible = MaxPartitionCostIsFeasible(
start,
end,
max_num_partitions,
partition_cost,
cumulative_cost_offset,
cumulative_cost_data,
std::forward<CumulativeCostFun>(cumulative_cost_fun),
&partition);
if (admissible) {
partition_cost_upper_bound = partition_cost;
std::swap(partition, partition_upper_bound);
} else {
partition_cost_lower_bound = partition_cost;
}
}
return partition_upper_bound;
}
} // namespace ceres::internal
#endif
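As a worked illustration (hypothetical data, not from the diff): partitioning six items whose per-item costs are 1, 1, 1, 10, 1, 1 into at most three intervals isolates the expensive item, and the binary search settles on a maximal partition cost of 10.

#include <cstdio>
#include <vector>
#include "ceres/partition_range_for_parallel_for.h"

int main() {
  // Inclusive cumulative costs of 6 items with per-item costs 1,1,1,10,1,1.
  const std::vector<int> cumulative_cost = {1, 2, 3, 13, 14, 15};
  const std::vector<int> partition =
      ceres::internal::PartitionRangeForParallelFor(
          0,
          static_cast<int>(cumulative_cost.size()),
          /*max_num_partitions=*/3,
          cumulative_cost.data(),
          [](const int& c) { return c; });
  // With one dominant item the best split isolates it: boundaries {0, 3, 4, 6},
  // i.e. intervals [0, 3), [3, 4), [4, 6) with costs 3, 10 and 2.
  for (const int boundary : partition) std::printf("%d ", boundary);
  return 0;
}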

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -44,8 +44,7 @@
#include "ceres/linear_solver.h"
#include "ceres/partitioned_matrix_view.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
PartitionedMatrixViewBase::~PartitionedMatrixViewBase() = default;
@@ -56,121 +55,121 @@ std::unique_ptr<PartitionedMatrixViewBase> PartitionedMatrixViewBase::Create(
(options.e_block_size == 2) &&
(options.f_block_size == 2)) {
return std::make_unique<PartitionedMatrixView<2,2, 2>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 2) &&
(options.e_block_size == 2) &&
(options.f_block_size == 3)) {
return std::make_unique<PartitionedMatrixView<2,2, 3>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 2) &&
(options.e_block_size == 2) &&
(options.f_block_size == 4)) {
return std::make_unique<PartitionedMatrixView<2,2, 4>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 2) &&
(options.e_block_size == 2)) {
return std::make_unique<PartitionedMatrixView<2,2, Eigen::Dynamic>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 2) &&
(options.e_block_size == 3) &&
(options.f_block_size == 3)) {
return std::make_unique<PartitionedMatrixView<2,3, 3>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 2) &&
(options.e_block_size == 3) &&
(options.f_block_size == 4)) {
return std::make_unique<PartitionedMatrixView<2,3, 4>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 2) &&
(options.e_block_size == 3) &&
(options.f_block_size == 6)) {
return std::make_unique<PartitionedMatrixView<2,3, 6>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 2) &&
(options.e_block_size == 3) &&
(options.f_block_size == 9)) {
return std::make_unique<PartitionedMatrixView<2,3, 9>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 2) &&
(options.e_block_size == 3)) {
return std::make_unique<PartitionedMatrixView<2,3, Eigen::Dynamic>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 2) &&
(options.e_block_size == 4) &&
(options.f_block_size == 3)) {
return std::make_unique<PartitionedMatrixView<2,4, 3>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 2) &&
(options.e_block_size == 4) &&
(options.f_block_size == 4)) {
return std::make_unique<PartitionedMatrixView<2,4, 4>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 2) &&
(options.e_block_size == 4) &&
(options.f_block_size == 6)) {
return std::make_unique<PartitionedMatrixView<2,4, 6>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 2) &&
(options.e_block_size == 4) &&
(options.f_block_size == 8)) {
return std::make_unique<PartitionedMatrixView<2,4, 8>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 2) &&
(options.e_block_size == 4) &&
(options.f_block_size == 9)) {
return std::make_unique<PartitionedMatrixView<2,4, 9>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 2) &&
(options.e_block_size == 4)) {
return std::make_unique<PartitionedMatrixView<2,4, Eigen::Dynamic>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if (options.row_block_size == 2) {
return std::make_unique<PartitionedMatrixView<2,Eigen::Dynamic, Eigen::Dynamic>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 3) &&
(options.e_block_size == 3) &&
(options.f_block_size == 3)) {
return std::make_unique<PartitionedMatrixView<3,3, 3>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 4) &&
(options.e_block_size == 4) &&
(options.f_block_size == 2)) {
return std::make_unique<PartitionedMatrixView<4,4, 2>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 4) &&
(options.e_block_size == 4) &&
(options.f_block_size == 3)) {
return std::make_unique<PartitionedMatrixView<4,4, 3>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 4) &&
(options.e_block_size == 4) &&
(options.f_block_size == 4)) {
return std::make_unique<PartitionedMatrixView<4,4, 4>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
if ((options.row_block_size == 4) &&
(options.e_block_size == 4)) {
return std::make_unique<PartitionedMatrixView<4,4, Eigen::Dynamic>>(
matrix, options.elimination_groups[0]);
options, matrix);
}
#endif
@@ -180,8 +179,7 @@ std::unique_ptr<PartitionedMatrixViewBase> PartitionedMatrixViewBase::Create(
return std::make_unique<PartitionedMatrixView<Eigen::Dynamic,
Eigen::Dynamic,
Eigen::Dynamic>>(
matrix, options.elimination_groups[0]);
options, matrix);
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -50,12 +50,13 @@
#include "ceres/small_blas.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class ContextImpl;
// Given generalized bi-partite matrix A = [E F], with the same block
// structure as required by the Schur complement based solver, found
// in explicit_schur_complement_solver.h, provide access to the
// in schur_complement_solver.h, provide access to the
// matrices E and F and their outer products E'E and F'F with
// themselves.
//
@@ -68,16 +69,26 @@ class CERES_NO_EXPORT PartitionedMatrixViewBase {
virtual ~PartitionedMatrixViewBase();
// y += E'x
virtual void LeftMultiplyE(const double* x, double* y) const = 0;
virtual void LeftMultiplyAndAccumulateE(const double* x, double* y) const = 0;
virtual void LeftMultiplyAndAccumulateESingleThreaded(const double* x,
double* y) const = 0;
virtual void LeftMultiplyAndAccumulateEMultiThreaded(const double* x,
double* y) const = 0;
// y += F'x
virtual void LeftMultiplyF(const double* x, double* y) const = 0;
virtual void LeftMultiplyAndAccumulateF(const double* x, double* y) const = 0;
virtual void LeftMultiplyAndAccumulateFSingleThreaded(const double* x,
double* y) const = 0;
virtual void LeftMultiplyAndAccumulateFMultiThreaded(const double* x,
double* y) const = 0;
// y += Ex
virtual void RightMultiplyE(const double* x, double* y) const = 0;
virtual void RightMultiplyAndAccumulateE(const double* x,
double* y) const = 0;
// y += Fx
virtual void RightMultiplyF(const double* x, double* y) const = 0;
virtual void RightMultiplyAndAccumulateF(const double* x,
double* y) const = 0;
// Create and return the block diagonal of the matrix E'E.
virtual std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalEtE() const = 0;
@@ -109,6 +120,8 @@ class CERES_NO_EXPORT PartitionedMatrixViewBase {
virtual int num_cols_f() const = 0;
virtual int num_rows() const = 0;
virtual int num_cols() const = 0;
virtual const std::vector<int>& e_cols_partition() const = 0;
virtual const std::vector<int>& f_cols_partition() const = 0;
// clang-format on
static std::unique_ptr<PartitionedMatrixViewBase> Create(
@@ -122,17 +135,46 @@ class CERES_NO_EXPORT PartitionedMatrixView final
: public PartitionedMatrixViewBase {
public:
// matrix = [E F], where the matrix E contains the first
// num_col_blocks_a column blocks.
PartitionedMatrixView(const BlockSparseMatrix& matrix, int num_col_blocks_e);
// options.elimination_groups[0] column blocks.
PartitionedMatrixView(const LinearSolver::Options& options,
const BlockSparseMatrix& matrix);
// y += E'x
virtual void LeftMultiplyAndAccumulateE(const double* x,
double* y) const final;
virtual void LeftMultiplyAndAccumulateESingleThreaded(const double* x,
double* y) const final;
virtual void LeftMultiplyAndAccumulateEMultiThreaded(const double* x,
double* y) const final;
// y += F'x
virtual void LeftMultiplyAndAccumulateF(const double* x,
double* y) const final;
virtual void LeftMultiplyAndAccumulateFSingleThreaded(const double* x,
double* y) const final;
virtual void LeftMultiplyAndAccumulateFMultiThreaded(const double* x,
double* y) const final;
// y += Ex
virtual void RightMultiplyAndAccumulateE(const double* x,
double* y) const final;
// y += Fx
virtual void RightMultiplyAndAccumulateF(const double* x,
double* y) const final;
void LeftMultiplyE(const double* x, double* y) const final;
void LeftMultiplyF(const double* x, double* y) const final;
void RightMultiplyE(const double* x, double* y) const final;
void RightMultiplyF(const double* x, double* y) const final;
std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalEtE() const final;
std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalFtF() const final;
void UpdateBlockDiagonalEtE(BlockSparseMatrix* block_diagonal) const final;
void UpdateBlockDiagonalEtESingleThreaded(
BlockSparseMatrix* block_diagonal) const;
void UpdateBlockDiagonalEtEMultiThreaded(
BlockSparseMatrix* block_diagonal) const;
void UpdateBlockDiagonalFtF(BlockSparseMatrix* block_diagonal) const final;
void UpdateBlockDiagonalFtFSingleThreaded(
BlockSparseMatrix* block_diagonal) const;
void UpdateBlockDiagonalFtFMultiThreaded(
BlockSparseMatrix* block_diagonal) const;
// clang-format off
int num_col_blocks_e() const final { return num_col_blocks_e_; }
int num_col_blocks_f() const final { return num_col_blocks_f_; }
@@ -141,21 +183,29 @@ class CERES_NO_EXPORT PartitionedMatrixView final
int num_rows() const final { return matrix_.num_rows(); }
int num_cols() const final { return matrix_.num_cols(); }
// clang-format on
const std::vector<int>& e_cols_partition() const final {
return e_cols_partition_;
}
const std::vector<int>& f_cols_partition() const final {
return f_cols_partition_;
}
private:
std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalMatrixLayout(
int start_col_block, int end_col_block) const;
const LinearSolver::Options options_;
const BlockSparseMatrix& matrix_;
int num_row_blocks_e_;
int num_col_blocks_e_;
int num_col_blocks_f_;
int num_cols_e_;
int num_cols_f_;
std::vector<int> e_cols_partition_;
std::vector<int> f_cols_partition_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -36,27 +36,31 @@
#include "ceres/block_sparse_matrix.h"
#include "ceres/block_structure.h"
#include "ceres/internal/eigen.h"
#include "ceres/parallel_for.h"
#include "ceres/partition_range_for_parallel_for.h"
#include "ceres/partitioned_matrix_view.h"
#include "ceres/small_blas.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
PartitionedMatrixView(const BlockSparseMatrix& matrix, int num_col_blocks_e)
: matrix_(matrix), num_col_blocks_e_(num_col_blocks_e) {
PartitionedMatrixView(const LinearSolver::Options& options,
const BlockSparseMatrix& matrix)
: options_(options), matrix_(matrix) {
const CompressedRowBlockStructure* bs = matrix_.block_structure();
CHECK(bs != nullptr);
num_col_blocks_e_ = options_.elimination_groups[0];
num_col_blocks_f_ = bs->cols.size() - num_col_blocks_e_;
// Compute the number of row blocks in E. The number of row blocks
// in E may be less than the number of row blocks in the input matrix
// as some of the row blocks at the bottom may not have any
// e_blocks. For a definition of what an e_block is, please see
// explicit_schur_complement_solver.h
// schur_complement_solver.h
num_row_blocks_e_ = 0;
for (const auto& row : bs->rows) {
const std::vector<Cell>& cells = row.cells;
@@ -79,6 +83,25 @@ PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
}
CHECK_EQ(num_cols_e_ + num_cols_f_, matrix_.num_cols());
auto transpose_bs = matrix_.transpose_block_structure();
const int num_threads = options_.num_threads;
if (transpose_bs != nullptr && num_threads > 1) {
const int kMaxPartitions = num_threads * 4;
e_cols_partition_ = PartitionRangeForParallelFor(
0,
num_col_blocks_e_,
kMaxPartitions,
transpose_bs->rows.data(),
[](const CompressedRow& row) { return row.cumulative_nnz; });
f_cols_partition_ = PartitionRangeForParallelFor(
num_col_blocks_e_,
num_col_blocks_e_ + num_col_blocks_f_,
kMaxPartitions,
transpose_bs->rows.data(),
[](const CompressedRow& row) { return row.cumulative_nnz; });
}
}
// The next four methods don't seem to be particularly cache
@@ -88,77 +111,101 @@ PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
RightMultiplyE(const double* x, double* y) const {
const CompressedRowBlockStructure* bs = matrix_.block_structure();
RightMultiplyAndAccumulateE(const double* x, double* y) const {
// Iterate over the first num_row_blocks_e_ row blocks, and multiply
// by the first cell in each row block.
auto bs = matrix_.block_structure();
const double* values = matrix_.values();
for (int r = 0; r < num_row_blocks_e_; ++r) {
const Cell& cell = bs->rows[r].cells[0];
const int row_block_pos = bs->rows[r].block.position;
const int row_block_size = bs->rows[r].block.size;
const int col_block_id = cell.block_id;
const int col_block_pos = bs->cols[col_block_id].position;
const int col_block_size = bs->cols[col_block_id].size;
// clang-format off
MatrixVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
values + cell.position, row_block_size, col_block_size,
x + col_block_pos,
y + row_block_pos);
// clang-format on
}
ParallelFor(options_.context,
0,
num_row_blocks_e_,
options_.num_threads,
[values, bs, x, y](int row_block_id) {
const Cell& cell = bs->rows[row_block_id].cells[0];
const int row_block_pos = bs->rows[row_block_id].block.position;
const int row_block_size = bs->rows[row_block_id].block.size;
const int col_block_id = cell.block_id;
const int col_block_pos = bs->cols[col_block_id].position;
const int col_block_size = bs->cols[col_block_id].size;
// clang-format off
MatrixVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
values + cell.position, row_block_size, col_block_size,
x + col_block_pos,
y + row_block_pos);
// clang-format on
});
}
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
RightMultiplyF(const double* x, double* y) const {
const CompressedRowBlockStructure* bs = matrix_.block_structure();
RightMultiplyAndAccumulateF(const double* x, double* y) const {
// Iterate over row blocks, and if the row block is in E, then
// multiply by all the cells except the first one, which is of type
// E. If the row block is not in E (i.e. it is in the bottom
// num_row_blocks - num_row_blocks_e row blocks), then all the cells
// are of type F and multiply by them all.
const CompressedRowBlockStructure* bs = matrix_.block_structure();
const int num_row_blocks = bs->rows.size();
const int num_cols_e = num_cols_e_;
const double* values = matrix_.values();
for (int r = 0; r < num_row_blocks_e_; ++r) {
const int row_block_pos = bs->rows[r].block.position;
const int row_block_size = bs->rows[r].block.size;
const std::vector<Cell>& cells = bs->rows[r].cells;
for (int c = 1; c < cells.size(); ++c) {
const int col_block_id = cells[c].block_id;
const int col_block_pos = bs->cols[col_block_id].position;
const int col_block_size = bs->cols[col_block_id].size;
// clang-format off
MatrixVectorMultiply<kRowBlockSize, kFBlockSize, 1>(
values + cells[c].position, row_block_size, col_block_size,
x + col_block_pos - num_cols_e_,
y + row_block_pos);
// clang-format on
}
}
ParallelFor(options_.context,
0,
num_row_blocks_e_,
options_.num_threads,
[values, bs, num_cols_e, x, y](int row_block_id) {
const int row_block_pos = bs->rows[row_block_id].block.position;
const int row_block_size = bs->rows[row_block_id].block.size;
const auto& cells = bs->rows[row_block_id].cells;
for (int c = 1; c < cells.size(); ++c) {
const int col_block_id = cells[c].block_id;
const int col_block_pos = bs->cols[col_block_id].position;
const int col_block_size = bs->cols[col_block_id].size;
// clang-format off
MatrixVectorMultiply<kRowBlockSize, kFBlockSize, 1>(
values + cells[c].position, row_block_size, col_block_size,
x + col_block_pos - num_cols_e,
y + row_block_pos);
// clang-format on
}
});
ParallelFor(options_.context,
num_row_blocks_e_,
num_row_blocks,
options_.num_threads,
[values, bs, num_cols_e, x, y](int row_block_id) {
const int row_block_pos = bs->rows[row_block_id].block.position;
const int row_block_size = bs->rows[row_block_id].block.size;
const auto& cells = bs->rows[row_block_id].cells;
for (const auto& cell : cells) {
const int col_block_id = cell.block_id;
const int col_block_pos = bs->cols[col_block_id].position;
const int col_block_size = bs->cols[col_block_id].size;
// clang-format off
MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
values + cell.position, row_block_size, col_block_size,
x + col_block_pos - num_cols_e,
y + row_block_pos);
// clang-format on
}
});
}
for (int r = num_row_blocks_e_; r < bs->rows.size(); ++r) {
const int row_block_pos = bs->rows[r].block.position;
const int row_block_size = bs->rows[r].block.size;
const std::vector<Cell>& cells = bs->rows[r].cells;
for (const auto& cell : cells) {
const int col_block_id = cell.block_id;
const int col_block_pos = bs->cols[col_block_id].position;
const int col_block_size = bs->cols[col_block_id].size;
// clang-format off
MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
values + cell.position, row_block_size, col_block_size,
x + col_block_pos - num_cols_e_,
y + row_block_pos);
// clang-format on
}
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
LeftMultiplyAndAccumulateE(const double* x, double* y) const {
if (!num_col_blocks_e_) return;
if (!num_row_blocks_e_) return;
if (options_.num_threads == 1) {
LeftMultiplyAndAccumulateESingleThreaded(x, y);
} else {
CHECK(options_.context != nullptr);
LeftMultiplyAndAccumulateEMultiThreaded(x, y);
}
}
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
LeftMultiplyE(const double* x, double* y) const {
LeftMultiplyAndAccumulateESingleThreaded(const double* x, double* y) const {
const CompressedRowBlockStructure* bs = matrix_.block_structure();
// Iterate over the first num_row_blocks_e_ row blocks, and multiply
@@ -182,7 +229,55 @@ void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
LeftMultiplyF(const double* x, double* y) const {
LeftMultiplyAndAccumulateEMultiThreaded(const double* x, double* y) const {
auto transpose_bs = matrix_.transpose_block_structure();
CHECK(transpose_bs != nullptr);
// Local copies of class members in order to avoid capturing a pointer to
// the whole object in the lambda function.
auto values = matrix_.values();
const int num_row_blocks_e = num_row_blocks_e_;
ParallelFor(
options_.context,
0,
num_col_blocks_e_,
options_.num_threads,
[values, transpose_bs, num_row_blocks_e, x, y](int row_block_id) {
int row_block_pos = transpose_bs->rows[row_block_id].block.position;
int row_block_size = transpose_bs->rows[row_block_id].block.size;
auto& cells = transpose_bs->rows[row_block_id].cells;
for (auto& cell : cells) {
const int col_block_id = cell.block_id;
const int col_block_size = transpose_bs->cols[col_block_id].size;
const int col_block_pos = transpose_bs->cols[col_block_id].position;
if (col_block_id >= num_row_blocks_e) break;
MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
values + cell.position,
col_block_size,
row_block_size,
x + col_block_pos,
y + row_block_pos);
}
},
e_cols_partition());
}
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
LeftMultiplyAndAccumulateF(const double* x, double* y) const {
if (!num_col_blocks_f_) return;
if (options_.num_threads == 1) {
LeftMultiplyAndAccumulateFSingleThreaded(x, y);
} else {
CHECK(options_.context != nullptr);
LeftMultiplyAndAccumulateFMultiThreaded(x, y);
}
}
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
LeftMultiplyAndAccumulateFSingleThreaded(const double* x, double* y) const {
const CompressedRowBlockStructure* bs = matrix_.block_structure();
// Iterate over row blocks, and if the row block is in E, then
@@ -226,10 +321,63 @@ void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
}
}
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
LeftMultiplyAndAccumulateFMultiThreaded(const double* x, double* y) const {
auto transpose_bs = matrix_.transpose_block_structure();
CHECK(transpose_bs != nullptr);
// Local copies of class members in order to avoid capturing a pointer to
// the whole object in the lambda function.
auto values = matrix_.values();
const int num_row_blocks_e = num_row_blocks_e_;
const int num_cols_e = num_cols_e_;
ParallelFor(
options_.context,
num_col_blocks_e_,
num_col_blocks_e_ + num_col_blocks_f_,
options_.num_threads,
[values, transpose_bs, num_row_blocks_e, num_cols_e, x, y](
int row_block_id) {
int row_block_pos = transpose_bs->rows[row_block_id].block.position;
int row_block_size = transpose_bs->rows[row_block_id].block.size;
auto& cells = transpose_bs->rows[row_block_id].cells;
const int num_cells = cells.size();
int cell_idx = 0;
for (; cell_idx < num_cells; ++cell_idx) {
auto& cell = cells[cell_idx];
const int col_block_id = cell.block_id;
const int col_block_size = transpose_bs->cols[col_block_id].size;
const int col_block_pos = transpose_bs->cols[col_block_id].position;
if (col_block_id >= num_row_blocks_e) break;
MatrixTransposeVectorMultiply<kRowBlockSize, kFBlockSize, 1>(
values + cell.position,
col_block_size,
row_block_size,
x + col_block_pos,
y + row_block_pos - num_cols_e);
}
for (; cell_idx < num_cells; ++cell_idx) {
auto& cell = cells[cell_idx];
const int col_block_id = cell.block_id;
const int col_block_size = transpose_bs->cols[col_block_id].size;
const int col_block_pos = transpose_bs->cols[col_block_id].position;
MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
values + cell.position,
col_block_size,
row_block_size,
x + col_block_pos,
y + row_block_pos - num_cols_e);
}
},
f_cols_partition());
}
// Given a range of columns blocks of a matrix m, compute the block
// structure of the block diagonal of the matrix m(:,
// start_col_block:end_col_block)'m(:, start_col_block:end_col_block)
// and return a BlockSparseMatrix with the this block structure. The
// and return a BlockSparseMatrix with this block structure. The
// caller owns the result.
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
std::unique_ptr<BlockSparseMatrix>
@@ -290,17 +438,17 @@ PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
return block_diagonal;
}
// Similar to the code in RightMultiplyE, except instead of the matrix
// vector multiply its an outer product.
// Similar to the code in RightMultiplyAndAccumulateE, except instead of the
// matrix-vector multiply it is an outer product.
//
// block_diagonal = block_diagonal(E'E)
//
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
UpdateBlockDiagonalEtE(BlockSparseMatrix* block_diagonal) const {
const CompressedRowBlockStructure* bs = matrix_.block_structure();
const CompressedRowBlockStructure* block_diagonal_structure =
block_diagonal->block_structure();
UpdateBlockDiagonalEtESingleThreaded(
BlockSparseMatrix* block_diagonal) const {
auto bs = matrix_.block_structure();
auto block_diagonal_structure = block_diagonal->block_structure();
block_diagonal->SetZero();
const double* values = matrix_.values();
@@ -323,17 +471,68 @@ void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
}
}
// Similar to the code in RightMultiplyF, except instead of the matrix
// vector multiply its an outer product.
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
UpdateBlockDiagonalEtEMultiThreaded(
BlockSparseMatrix* block_diagonal) const {
auto transpose_block_structure = matrix_.transpose_block_structure();
CHECK(transpose_block_structure != nullptr);
auto block_diagonal_structure = block_diagonal->block_structure();
const double* values = matrix_.values();
double* values_diagonal = block_diagonal->mutable_values();
ParallelFor(
options_.context,
0,
num_col_blocks_e_,
options_.num_threads,
[values,
transpose_block_structure,
values_diagonal,
block_diagonal_structure](int col_block_id) {
int cell_position =
block_diagonal_structure->rows[col_block_id].cells[0].position;
double* cell_values = values_diagonal + cell_position;
int col_block_size =
transpose_block_structure->rows[col_block_id].block.size;
auto& cells = transpose_block_structure->rows[col_block_id].cells;
MatrixRef(cell_values, col_block_size, col_block_size).setZero();
for (auto& c : cells) {
int row_block_size = transpose_block_structure->cols[c.block_id].size;
// clang-format off
MatrixTransposeMatrixMultiply<kRowBlockSize, kEBlockSize, kRowBlockSize, kEBlockSize, 1>(
values + c.position, row_block_size, col_block_size,
values + c.position, row_block_size, col_block_size,
cell_values, 0, 0, col_block_size, col_block_size);
// clang-format on
}
},
e_cols_partition_);
}
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
UpdateBlockDiagonalEtE(BlockSparseMatrix* block_diagonal) const {
if (options_.num_threads == 1) {
UpdateBlockDiagonalEtESingleThreaded(block_diagonal);
} else {
CHECK(options_.context != nullptr);
UpdateBlockDiagonalEtEMultiThreaded(block_diagonal);
}
}
// Similar to the code in RightMultiplyAndAccumulateF, except instead of the
// matrix-vector multiply it is an outer product.
//
// block_diagonal = block_diagonal(F'F)
//
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
UpdateBlockDiagonalFtF(BlockSparseMatrix* block_diagonal) const {
const CompressedRowBlockStructure* bs = matrix_.block_structure();
const CompressedRowBlockStructure* block_diagonal_structure =
block_diagonal->block_structure();
UpdateBlockDiagonalFtFSingleThreaded(
BlockSparseMatrix* block_diagonal) const {
auto bs = matrix_.block_structure();
auto block_diagonal_structure = block_diagonal->block_structure();
block_diagonal->SetZero();
const double* values = matrix_.values();
@@ -380,5 +579,82 @@ void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
}
}
} // namespace internal
} // namespace ceres
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
UpdateBlockDiagonalFtFMultiThreaded(
BlockSparseMatrix* block_diagonal) const {
auto transpose_block_structure = matrix_.transpose_block_structure();
CHECK(transpose_block_structure != nullptr);
auto block_diagonal_structure = block_diagonal->block_structure();
const double* values = matrix_.values();
double* values_diagonal = block_diagonal->mutable_values();
const int num_col_blocks_e = num_col_blocks_e_;
const int num_row_blocks_e = num_row_blocks_e_;
ParallelFor(
options_.context,
num_col_blocks_e_,
num_col_blocks_e + num_col_blocks_f_,
options_.num_threads,
[transpose_block_structure,
block_diagonal_structure,
num_col_blocks_e,
num_row_blocks_e,
values,
values_diagonal](int col_block_id) {
const int col_block_size =
transpose_block_structure->rows[col_block_id].block.size;
const int diagonal_block_id = col_block_id - num_col_blocks_e;
const int cell_position =
block_diagonal_structure->rows[diagonal_block_id].cells[0].position;
double* cell_values = values_diagonal + cell_position;
MatrixRef(cell_values, col_block_size, col_block_size).setZero();
auto& cells = transpose_block_structure->rows[col_block_id].cells;
const int num_cells = cells.size();
int i = 0;
for (; i < num_cells; ++i) {
auto& cell = cells[i];
const int row_block_id = cell.block_id;
if (row_block_id >= num_row_blocks_e) break;
const int row_block_size =
transpose_block_structure->cols[row_block_id].size;
// clang-format off
MatrixTransposeMatrixMultiply
<kRowBlockSize, kFBlockSize, kRowBlockSize, kFBlockSize, 1>(
values + cell.position, row_block_size, col_block_size,
values + cell.position, row_block_size, col_block_size,
cell_values, 0, 0, col_block_size, col_block_size);
// clang-format on
}
for (; i < num_cells; ++i) {
auto& cell = cells[i];
const int row_block_id = cell.block_id;
const int row_block_size =
transpose_block_structure->cols[row_block_id].size;
// clang-format off
MatrixTransposeMatrixMultiply
<Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 1>(
values + cell.position, row_block_size, col_block_size,
values + cell.position, row_block_size, col_block_size,
cell_values, 0, 0, col_block_size, col_block_size);
// clang-format on
}
},
f_cols_partition_);
}
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
UpdateBlockDiagonalFtF(BlockSparseMatrix* block_diagonal) const {
if (options_.num_threads == 1) {
UpdateBlockDiagonalFtFSingleThreaded(block_diagonal);
} else {
CHECK(options_.context != nullptr);
UpdateBlockDiagonalFtFMultiThreaded(block_diagonal);
}
}
} // namespace ceres::internal

View File

@@ -0,0 +1,149 @@
# Ceres Solver - A fast non-linear least squares minimizer
# Copyright 2023 Google Inc. All rights reserved.
# http://ceres-solver.org/
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * Neither the name of Google Inc. nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: sameeragarwal@google.com (Sameer Agarwal)
#
# Script for explicitly generating template specializations of the
# PartitionedMatrixView class. Explicitly generating these
# instantiations in separate .cc files breaks the compilation into
# separate compilation units rather than one large .cc file.
#
# This script creates two sets of files.
#
# 1. partitioned_matrix_view_x_x_x.cc
# where the x indicates the template parameters and
#
# 2. partitioned_matrix_view.cc
#
# that contains a factory function for instantiating these classes
# based on runtime parameters.
#
# The list of tuples named specializations indicates the set of
# specializations that is generated.
HEADER = """// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)
//
// Template specialization of PartitionedMatrixView.
//
// ========================================
// THIS FILE IS AUTOGENERATED. DO NOT EDIT.
// THIS FILE IS AUTOGENERATED. DO NOT EDIT.
// THIS FILE IS AUTOGENERATED. DO NOT EDIT.
// THIS FILE IS AUTOGENERATED. DO NOT EDIT.
//=========================================
//
// This file is generated using generate_template_specializations.py.
"""
DYNAMIC_FILE = """
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres::internal {
template class PartitionedMatrixView<%s,
%s,
%s>;
} // namespace ceres::internal
"""
SPECIALIZATION_FILE = """
// This include must come before any #ifndef check on Ceres compile options.
#include "ceres/internal/config.h"
#ifndef CERES_RESTRICT_SCHUR_SPECIALIZATION
#include "ceres/partitioned_matrix_view_impl.h"
namespace ceres::internal {
template class PartitionedMatrixView<%s, %s, %s>;
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION
"""
FACTORY_FILE_HEADER = """
#include <memory>
#include "ceres/linear_solver.h"
#include "ceres/partitioned_matrix_view.h"
namespace ceres::internal {
PartitionedMatrixViewBase::~PartitionedMatrixViewBase() = default;
std::unique_ptr<PartitionedMatrixViewBase> PartitionedMatrixViewBase::Create(
const LinearSolver::Options& options, const BlockSparseMatrix& matrix) {
#ifndef CERES_RESTRICT_SCHUR_SPECIALIZATION
"""
FACTORY = """ return std::make_unique<PartitionedMatrixView<%s,%s, %s>>(
options, matrix);"""
FACTORY_FOOTER = """
#endif
VLOG(1) << "Template specializations not found for <"
<< options.row_block_size << "," << options.e_block_size << ","
<< options.f_block_size << ">";
return std::make_unique<PartitionedMatrixView<Eigen::Dynamic,
Eigen::Dynamic,
Eigen::Dynamic>>(
options, matrix);
};
} // namespace ceres::internal
"""

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,10 +40,7 @@
#include "ceres/internal/export.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::vector;
namespace ceres::internal {
namespace {
@@ -326,7 +323,7 @@ void MinimizePolynomial(const Vector& polynomial,
}
}
Vector FindInterpolatingPolynomial(const vector<FunctionSample>& samples) {
Vector FindInterpolatingPolynomial(const std::vector<FunctionSample>& samples) {
const int num_samples = samples.size();
int num_constraints = 0;
for (int i = 0; i < num_samples; ++i) {
@@ -369,7 +366,7 @@ Vector FindInterpolatingPolynomial(const vector<FunctionSample>& samples) {
return lu.setThreshold(0.0).solve(rhs);
}
void MinimizeInterpolatingPolynomial(const vector<FunctionSample>& samples,
void MinimizeInterpolatingPolynomial(const std::vector<FunctionSample>& samples,
double x_min,
double x_max,
double* optimal_x,
@@ -389,5 +386,4 @@ void MinimizeInterpolatingPolynomial(const vector<FunctionSample>& samples,
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -38,8 +38,7 @@
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
struct FunctionSample;
@@ -116,8 +115,7 @@ CERES_NO_EXPORT void MinimizeInterpolatingPolynomial(
double* optimal_x,
double* optimal_value);
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -0,0 +1,88 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: markshachkov@gmail.com (Mark Shachkov)
#include "ceres/power_series_expansion_preconditioner.h"
#include "ceres/eigen_vector_ops.h"
#include "ceres/parallel_vector_ops.h"
#include "ceres/preconditioner.h"
namespace ceres::internal {
PowerSeriesExpansionPreconditioner::PowerSeriesExpansionPreconditioner(
const ImplicitSchurComplement* isc,
const int max_num_spse_iterations,
const double spse_tolerance,
const Preconditioner::Options& options)
: isc_(isc),
max_num_spse_iterations_(max_num_spse_iterations),
spse_tolerance_(spse_tolerance),
options_(options) {}
PowerSeriesExpansionPreconditioner::~PowerSeriesExpansionPreconditioner() =
default;
bool PowerSeriesExpansionPreconditioner::Update(const LinearOperator& /*A*/,
const double* /*D*/) {
return true;
}
void PowerSeriesExpansionPreconditioner::RightMultiplyAndAccumulate(
const double* x, double* y) const {
VectorRef yref(y, num_rows());
Vector series_term(num_rows());
Vector previous_series_term(num_rows());
ParallelSetZero(options_.context, options_.num_threads, yref);
isc_->block_diagonal_FtF_inverse()->RightMultiplyAndAccumulate(
x, y, options_.context, options_.num_threads);
ParallelAssign(
options_.context, options_.num_threads, previous_series_term, yref);
const double norm_threshold =
spse_tolerance_ * Norm(yref, options_.context, options_.num_threads);
for (int i = 1;; i++) {
ParallelSetZero(options_.context, options_.num_threads, series_term);
isc_->InversePowerSeriesOperatorRightMultiplyAccumulate(
previous_series_term.data(), series_term.data());
ParallelAssign(
options_.context, options_.num_threads, yref, yref + series_term);
if (i >= max_num_spse_iterations_ || series_term.norm() < norm_threshold) {
break;
}
std::swap(previous_series_term, series_term);
}
}
int PowerSeriesExpansionPreconditioner::num_rows() const {
return isc_->num_rows();
}
} // namespace ceres::internal
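A sketch of the series the loop above accumulates, assuming (per the cited Weber et al. paper) that S denotes the Schur complement and D the block diagonal of F^T F, whose inverse is applied by block_diagonal_FtF_inverse():

  S^{-1} x \approx \sum_{i=0}^{n} \left( D^{-1} (D - S) \right)^{i} D^{-1} x

The loop stops at i = max_num_spse_iterations_, or earlier once the norm of the current series term drops below spse_tolerance_ times the norm of the initial term D^{-1} x.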

View File

@@ -0,0 +1,71 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: markshachkov@gmail.com (Mark Shachkov)
#ifndef CERES_INTERNAL_POWER_SERIES_EXPANSION_PRECONDITIONER_H_
#define CERES_INTERNAL_POWER_SERIES_EXPANSION_PRECONDITIONER_H_
#include "ceres/implicit_schur_complement.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "ceres/preconditioner.h"
namespace ceres::internal {
// This is a preconditioner based on a power series expansion of the
// Schur complement inverse, as described in "Weber et al, Power Bundle
// Adjustment for Large-Scale 3D Reconstruction".
class CERES_NO_EXPORT PowerSeriesExpansionPreconditioner
: public Preconditioner {
public:
// TODO: Consider moving max_num_spse_iterations and spse_tolerance to
// Preconditioner::Options
PowerSeriesExpansionPreconditioner(const ImplicitSchurComplement* isc,
const int max_num_spse_iterations,
const double spse_tolerance,
const Preconditioner::Options& options);
PowerSeriesExpansionPreconditioner(
const PowerSeriesExpansionPreconditioner&) = delete;
void operator=(const PowerSeriesExpansionPreconditioner&) = delete;
~PowerSeriesExpansionPreconditioner() override;
void RightMultiplyAndAccumulate(const double* x, double* y) const final;
bool Update(const LinearOperator& A, const double* D) final;
int num_rows() const final;
private:
const ImplicitSchurComplement* isc_;
const int max_num_spse_iterations_;
const double spse_tolerance_;
const Preconditioner::Options options_;
};
} // namespace ceres::internal
#endif // CERES_INTERNAL_POWER_SERIES_EXPANSION_PRECONDITIONER_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,8 +32,7 @@
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
Preconditioner::~Preconditioner() = default;
@@ -48,27 +47,27 @@ PreconditionerType Preconditioner::PreconditionerForZeroEBlocks(
}
SparseMatrixPreconditionerWrapper::SparseMatrixPreconditionerWrapper(
const SparseMatrix* matrix)
: matrix_(matrix) {
const SparseMatrix* matrix, const Preconditioner::Options& options)
: matrix_(matrix), options_(options) {
CHECK(matrix != nullptr);
}
SparseMatrixPreconditionerWrapper::~SparseMatrixPreconditionerWrapper() =
default;
bool SparseMatrixPreconditionerWrapper::UpdateImpl(const SparseMatrix& A,
const double* D) {
bool SparseMatrixPreconditionerWrapper::UpdateImpl(const SparseMatrix& /*A*/,
const double* /*D*/) {
return true;
}
void SparseMatrixPreconditionerWrapper::RightMultiply(const double* x,
double* y) const {
matrix_->RightMultiply(x, y);
void SparseMatrixPreconditionerWrapper::RightMultiplyAndAccumulate(
const double* x, double* y) const {
matrix_->RightMultiplyAndAccumulate(
x, y, options_.context, options_.num_threads);
}
int SparseMatrixPreconditionerWrapper::num_rows() const {
return matrix_->num_rows();
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -39,11 +39,11 @@
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
#include "ceres/linear_operator.h"
#include "ceres/linear_solver.h"
#include "ceres/sparse_matrix.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class BlockSparseMatrix;
class SparseMatrix;
@@ -51,10 +51,25 @@ class SparseMatrix;
class CERES_NO_EXPORT Preconditioner : public LinearOperator {
public:
struct Options {
Options() = default;
Options(const LinearSolver::Options& linear_solver_options)
: type(linear_solver_options.preconditioner_type),
visibility_clustering_type(
linear_solver_options.visibility_clustering_type),
sparse_linear_algebra_library_type(
linear_solver_options.sparse_linear_algebra_library_type),
num_threads(linear_solver_options.num_threads),
row_block_size(linear_solver_options.row_block_size),
e_block_size(linear_solver_options.e_block_size),
f_block_size(linear_solver_options.f_block_size),
elimination_groups(linear_solver_options.elimination_groups),
context(linear_solver_options.context) {}
PreconditionerType type = JACOBI;
VisibilityClusteringType visibility_clustering_type = CANONICAL_VIEWS;
SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type =
SUITE_SPARSE;
OrderingType ordering_type = OrderingType::NATURAL;
// When using the subset preconditioner, all row blocks starting
// from this row block are used to construct the preconditioner.
@@ -68,9 +83,6 @@ class CERES_NO_EXPORT Preconditioner : public LinearOperator {
// and the preconditioner is the inverse of the matrix Q'Q.
int subset_preconditioner_start_row_block = -1;
// See solver.h for information about these flags.
bool use_postordering = false;
// If possible, how many threads the preconditioner can use.
int num_threads = 1;
@@ -132,18 +144,37 @@ class CERES_NO_EXPORT Preconditioner : public LinearOperator {
virtual bool Update(const LinearOperator& A, const double* D) = 0;
// LinearOperator interface. Since the operator is symmetric,
// LeftMultiply and num_cols are just calls to RightMultiply and
// num_rows respectively. Update() must be called before
// RightMultiply can be called.
void RightMultiply(const double* x, double* y) const override = 0;
void LeftMultiply(const double* x, double* y) const override {
return RightMultiply(x, y);
// LeftMultiplyAndAccumulate and num_cols are just calls to
// RightMultiplyAndAccumulate and num_rows respectively. Update() must be
// called before RightMultiplyAndAccumulate can be called.
void RightMultiplyAndAccumulate(const double* x,
double* y) const override = 0;
void LeftMultiplyAndAccumulate(const double* x, double* y) const override {
return RightMultiplyAndAccumulate(x, y);
}
int num_rows() const override = 0;
int num_cols() const override { return num_rows(); }
};
class CERES_NO_EXPORT IdentityPreconditioner : public Preconditioner {
public:
IdentityPreconditioner(int num_rows) : num_rows_(num_rows) {}
bool Update(const LinearOperator& /*A*/, const double* /*D*/) final {
return true;
}
void RightMultiplyAndAccumulate(const double* x, double* y) const final {
VectorRef(y, num_rows_) += ConstVectorRef(x, num_rows_);
}
int num_rows() const final { return num_rows_; }
private:
int num_rows_ = -1;
};
// This templated subclass of Preconditioner serves as a base class for
// other preconditioners that depend on the particular matrix layout of
// the underlying linear operator.
@@ -171,20 +202,21 @@ class CERES_NO_EXPORT SparseMatrixPreconditionerWrapper final
: public SparseMatrixPreconditioner {
public:
// Wrapper does NOT take ownership of the matrix pointer.
explicit SparseMatrixPreconditionerWrapper(const SparseMatrix* matrix);
explicit SparseMatrixPreconditionerWrapper(
const SparseMatrix* matrix, const Preconditioner::Options& options);
~SparseMatrixPreconditionerWrapper() override;
// Preconditioner interface
void RightMultiply(const double* x, double* y) const override;
void RightMultiplyAndAccumulate(const double* x, double* y) const override;
int num_rows() const override;
private:
bool UpdateImpl(const SparseMatrix& A, const double* D) override;
const SparseMatrix* matrix_;
const Preconditioner::Options options_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -35,13 +35,12 @@
#include "ceres/callbacks.h"
#include "ceres/gradient_checking_cost_function.h"
#include "ceres/line_search_preprocessor.h"
#include "ceres/parallel_for.h"
#include "ceres/problem_impl.h"
#include "ceres/solver.h"
#include "ceres/thread_pool.h"
#include "ceres/trust_region_preprocessor.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
std::unique_ptr<Preprocessor> Preprocessor::Create(
MinimizerType minimizer_type) {
@@ -63,7 +62,7 @@ void ChangeNumThreadsIfNeeded(Solver::Options* options) {
if (options->num_threads == 1) {
return;
}
const int num_threads_available = MaxNumThreadsAvailable();
const int num_threads_available = ThreadPool::MaxNumThreadsAvailable();
if (options->num_threads > num_threads_available) {
LOG(WARNING) << "Specified options.num_threads: " << options->num_threads
<< " exceeds maximum available from the threading model Ceres "
@@ -83,9 +82,11 @@ void SetupCommonMinimizerOptions(PreprocessedProblem* pp) {
double* reduced_parameters = pp->reduced_parameters.data();
program->ParameterBlocksToStateVector(reduced_parameters);
auto context = pp->problem->context();
Minimizer::Options& minimizer_options = pp->minimizer_options;
minimizer_options = Minimizer::Options(options);
minimizer_options.evaluator = pp->evaluator;
minimizer_options.context = context;
if (options.logging_type != SILENT) {
pp->logging_callback = std::make_unique<LoggingCallback>(
@@ -104,5 +105,4 @@ void SetupCommonMinimizerOptions(PreprocessedProblem* pp) {
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -47,8 +47,7 @@
#include "ceres/program.h"
#include "ceres/solver.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
struct PreprocessedProblem;
@@ -118,8 +117,7 @@ void ChangeNumThreadsIfNeeded(Solver::Options* options);
CERES_NO_EXPORT
void SetupCommonMinimizerOptions(PreprocessedProblem* pp);
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2021 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -39,8 +39,6 @@
namespace ceres {
using std::vector;
Problem::Problem() : impl_(new internal::ProblemImpl) {}
Problem::Problem(const Problem::Options& options)
: impl_(new internal::ProblemImpl(options)) {}
@@ -52,7 +50,7 @@ Problem::~Problem() = default;
ResidualBlockId Problem::AddResidualBlock(
CostFunction* cost_function,
LossFunction* loss_function,
const vector<double*>& parameter_blocks) {
const std::vector<double*>& parameter_blocks) {
return impl_->AddResidualBlock(cost_function,
loss_function,
parameter_blocks.data(),
@@ -71,12 +69,6 @@ void Problem::AddParameterBlock(double* values, int size) {
impl_->AddParameterBlock(values, size);
}
void Problem::AddParameterBlock(double* values,
int size,
LocalParameterization* local_parameterization) {
impl_->AddParameterBlock(values, size, local_parameterization);
}
void Problem::AddParameterBlock(double* values, int size, Manifold* manifold) {
impl_->AddParameterBlock(values, size, manifold);
}
@@ -101,20 +93,6 @@ bool Problem::IsParameterBlockConstant(const double* values) const {
return impl_->IsParameterBlockConstant(values);
}
void Problem::SetParameterization(
double* values, LocalParameterization* local_parameterization) {
impl_->SetParameterization(values, local_parameterization);
}
const LocalParameterization* Problem::GetParameterization(
const double* values) const {
return impl_->GetParameterization(values);
}
bool Problem::HasParameterization(const double* values) const {
return impl_->HasParameterization(values);
}
void Problem::SetManifold(double* values, Manifold* manifold) {
impl_->SetManifold(values, manifold);
}
@@ -149,8 +127,8 @@ double Problem::GetParameterLowerBound(const double* values, int index) const {
bool Problem::Evaluate(const EvaluateOptions& evaluate_options,
double* cost,
vector<double>* residuals,
vector<double>* gradient,
std::vector<double>* residuals,
std::vector<double>* gradient,
CRSMatrix* jacobian) {
return impl_->Evaluate(evaluate_options, cost, residuals, gradient, jacobian);
}
@@ -194,10 +172,6 @@ int Problem::ParameterBlockSize(const double* values) const {
return impl_->ParameterBlockSize(values);
}
int Problem::ParameterBlockLocalSize(const double* values) const {
return impl_->ParameterBlockTangentSize(values);
}
int Problem::ParameterBlockTangentSize(const double* values) const {
return impl_->ParameterBlockTangentSize(values);
}
@@ -206,18 +180,18 @@ bool Problem::HasParameterBlock(const double* values) const {
return impl_->HasParameterBlock(values);
}
void Problem::GetParameterBlocks(vector<double*>* parameter_blocks) const {
void Problem::GetParameterBlocks(std::vector<double*>* parameter_blocks) const {
impl_->GetParameterBlocks(parameter_blocks);
}
void Problem::GetResidualBlocks(
vector<ResidualBlockId>* residual_blocks) const {
std::vector<ResidualBlockId>* residual_blocks) const {
impl_->GetResidualBlocks(residual_blocks);
}
void Problem::GetParameterBlocksForResidualBlock(
const ResidualBlockId residual_block,
vector<double*>* parameter_blocks) const {
std::vector<double*>* parameter_blocks) const {
impl_->GetParameterBlocksForResidualBlock(residual_block, parameter_blocks);
}
@@ -232,8 +206,12 @@ const LossFunction* Problem::GetLossFunctionForResidualBlock(
}
void Problem::GetResidualBlocksForParameterBlock(
const double* values, vector<ResidualBlockId>* residual_blocks) const {
const double* values, std::vector<ResidualBlockId>* residual_blocks) const {
impl_->GetResidualBlocksForParameterBlock(values, residual_blocks);
}
const Problem::Options& Problem::options() const { return impl_->options(); }
internal::ProblemImpl* Problem::mutable_impl() { return impl_.get(); }
} // namespace ceres
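The removed overloads complete the LocalParameterization to Manifold transition. A porting sketch, assuming a quaternion parameter block and the stock QuaternionManifold:

  #include "ceres/manifold.h"
  #include "ceres/problem.h"

  void AddQuaternionBlock(ceres::Problem& problem, double* q) {
    // Pre-2.2: problem.AddParameterBlock(q, 4, new ceres::QuaternionParameterization);
    problem.AddParameterBlock(q, 4, new ceres::QuaternionManifold);
  }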

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -53,7 +53,6 @@
#include "ceres/internal/fixed_array.h"
#include "ceres/loss_function.h"
#include "ceres/manifold.h"
#include "ceres/manifold_adapter.h"
#include "ceres/map_util.h"
#include "ceres/parameter_block.h"
#include "ceres/program.h"
@@ -64,8 +63,7 @@
#include "ceres/stringprintf.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
namespace {
// Returns true if two regions of memory, a and b, with sizes size_a and size_b
// respectively, overlap.
@@ -257,10 +255,6 @@ ProblemImpl::~ProblemImpl() {
DeleteBlock(parameter_block);
}
// Delete the owned parameterizations.
STLDeleteUniqueContainerPointers(local_parameterizations_to_delete_.begin(),
local_parameterizations_to_delete_.end());
// Delete the owned manifolds.
STLDeleteUniqueContainerPointers(manifolds_to_delete_.begin(),
manifolds_to_delete_.end());
@@ -365,45 +359,15 @@ void ProblemImpl::AddParameterBlock(double* values, int size) {
InternalAddParameterBlock(values, size);
}
void ProblemImpl::InternalSetParameterization(
double* values,
ParameterBlock* parameter_block,
LocalParameterization* local_parameterization) {
parameter_block_to_local_param_[values] = local_parameterization;
Manifold* manifold = nullptr;
if (local_parameterization != nullptr) {
if (options_.local_parameterization_ownership == TAKE_OWNERSHIP) {
local_parameterizations_to_delete_.push_back(local_parameterization);
}
manifold = new ManifoldAdapter(local_parameterization);
// Add the manifold to manifolds_to_delete_ unconditionally since
// we own it and it will need to be deleted.
manifolds_to_delete_.push_back(manifold);
}
parameter_block->SetManifold(manifold);
}
void ProblemImpl::InternalSetManifold(double* values,
void ProblemImpl::InternalSetManifold(double* /*values*/,
ParameterBlock* parameter_block,
Manifold* manifold) {
// Reset any association between this parameter block and a local
// parameterization. This only needs to be done while we are in the
// transition from LocalParameterization to Manifold.
parameter_block_to_local_param_[values] = nullptr;
if (manifold != nullptr && options_.manifold_ownership == TAKE_OWNERSHIP) {
manifolds_to_delete_.push_back(manifold);
}
parameter_block->SetManifold(manifold);
}
void ProblemImpl::AddParameterBlock(
double* values, int size, LocalParameterization* local_parameterization) {
ParameterBlock* parameter_block = InternalAddParameterBlock(values, size);
InternalSetParameterization(values, parameter_block, local_parameterization);
}
void ProblemImpl::AddParameterBlock(double* values,
int size,
Manifold* manifold) {
@@ -539,19 +503,6 @@ void ProblemImpl::SetParameterBlockVariable(double* values) {
parameter_block->SetVarying();
}
void ProblemImpl::SetParameterization(
double* values, LocalParameterization* local_parameterization) {
ParameterBlock* parameter_block =
FindWithDefault(parameter_block_map_, values, nullptr);
if (parameter_block == nullptr) {
LOG(FATAL) << "Parameter block not found: " << values
<< ". You must add the parameter block to the problem before "
<< "you can set its local parameterization.";
}
InternalSetParameterization(values, parameter_block, local_parameterization);
}
void ProblemImpl::SetManifold(double* values, Manifold* manifold) {
ParameterBlock* parameter_block =
FindWithDefault(parameter_block_map_, values, nullptr);
@@ -564,22 +515,13 @@ void ProblemImpl::SetManifold(double* values, Manifold* manifold) {
InternalSetManifold(values, parameter_block, manifold);
}
const LocalParameterization* ProblemImpl::GetParameterization(
const double* values) const {
return FindWithDefault(parameter_block_to_local_param_, values, nullptr);
}
bool ProblemImpl::HasParameterization(const double* values) const {
return GetParameterization(values) != nullptr;
}
const Manifold* ProblemImpl::GetManifold(const double* values) const {
ParameterBlock* parameter_block = FindWithDefault(
parameter_block_map_, const_cast<double*>(values), nullptr);
if (parameter_block == nullptr) {
LOG(FATAL) << "Parameter block not found: " << values
<< ". You must add the parameter block to the problem before "
<< "you can get its local parameterization.";
<< "you can get its manifold.";
}
return parameter_block->manifold();
@@ -730,17 +672,7 @@ bool ProblemImpl::Evaluate(const Problem::EvaluateOptions& evaluate_options,
// the Evaluator decides the storage for the Jacobian based on the
// type of linear solver being used.
evaluator_options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
#ifdef CERES_NO_THREADS
if (evaluate_options.num_threads > 1) {
LOG(WARNING)
<< "No threading support is compiled into this binary; "
<< "only evaluate_options.num_threads = 1 is supported. Switching "
<< "to single threaded mode.";
}
evaluator_options.num_threads = 1;
#else
evaluator_options.num_threads = evaluate_options.num_threads;
#endif // CERES_NO_THREADS
// The main thread also does work so we only need to launch num_threads - 1.
context_impl_->EnsureMinimumThreads(evaluator_options.num_threads - 1);
@@ -968,5 +900,4 @@ void ProblemImpl::GetResidualBlocksForParameterBlock(
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2021 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -59,7 +59,6 @@ namespace ceres {
class CostFunction;
class EvaluationCallback;
class LossFunction;
class LocalParameterization;
struct CRSMatrix;
namespace internal {
@@ -100,10 +99,6 @@ class CERES_NO_EXPORT ProblemImpl {
}
void AddParameterBlock(double* values, int size);
void AddParameterBlock(double* values,
int size,
LocalParameterization* local_parameterization);
void AddParameterBlock(double* values, int size, Manifold* manifold);
void RemoveResidualBlock(ResidualBlock* residual_block);
@@ -113,11 +108,6 @@ class CERES_NO_EXPORT ProblemImpl {
void SetParameterBlockVariable(double* values);
bool IsParameterBlockConstant(const double* values) const;
void SetParameterization(double* values,
LocalParameterization* local_parameterization);
const LocalParameterization* GetParameterization(const double* values) const;
bool HasParameterization(const double* values) const;
void SetManifold(double* values, Manifold* manifold);
const Manifold* GetManifold(const double* values) const;
bool HasManifold(const double* values) const;
@@ -176,14 +166,12 @@ class CERES_NO_EXPORT ProblemImpl {
return residual_block_set_;
}
const Problem::Options& options() const { return options_; }
ContextImpl* context() { return context_impl_; }
private:
ParameterBlock* InternalAddParameterBlock(double* values, int size);
void InternalSetParameterization(
double* values,
ParameterBlock* parameter_block,
LocalParameterization* local_parameterization);
void InternalSetManifold(double* values,
ParameterBlock* parameter_block,
Manifold* manifold);
@@ -214,15 +202,8 @@ class CERES_NO_EXPORT ProblemImpl {
std::unique_ptr<internal::Program> program_;
// TODO(sameeragarwal): Unify the shared object handling across object types.
// Right now we are using vectors for LocalParameterization and Manifold
// objects and reference counting for CostFunctions and LossFunctions. Ideally
// this should be done uniformly.
// When removing parameter blocks, parameterizations have ambiguous
// ownership. Instead of scanning the entire problem to see if the
// parameterization is shared with other parameter blocks, buffer
// them until destruction.
std::vector<LocalParameterization*> local_parameterizations_to_delete_;
// Right now we are using vectors for Manifold objects and reference counting
// for CostFunctions and LossFunctions. Ideally this should be done uniformly.
// When removing parameter blocks, manifolds have ambiguous
// ownership. Instead of scanning the entire problem to see if the
@@ -236,17 +217,6 @@ class CERES_NO_EXPORT ProblemImpl {
// destroyed.
CostFunctionRefCount cost_function_ref_count_;
LossFunctionRefCount loss_function_ref_count_;
// Because we wrap LocalParameterization objects using a ManifoldAdapter, when
// the user calls GetParameterization we cannot use the same logic as
// GetManifold as the ParameterBlock object only returns a Manifold object. So
// this map stores the association between parameter blocks and local
// parameterizations.
//
// This is a temporary object which will be removed once the
// LocalParameterization to Manifold transition is complete.
std::unordered_map<const double*, LocalParameterization*>
parameter_block_to_local_param_;
};
} // namespace internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -45,14 +45,14 @@
#include "ceres/loss_function.h"
#include "ceres/manifold.h"
#include "ceres/map_util.h"
#include "ceres/parallel_for.h"
#include "ceres/parameter_block.h"
#include "ceres/problem.h"
#include "ceres/residual_block.h"
#include "ceres/stl_util.h"
#include "ceres/triplet_sparse_matrix.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
const std::vector<ParameterBlock*>& Program::parameter_blocks() const {
return parameter_blocks_;
@@ -109,16 +109,32 @@ bool Program::SetParameterBlockStatePtrsToUserStatePtrs() {
bool Program::Plus(const double* state,
const double* delta,
double* state_plus_delta) const {
for (auto* parameter_block : parameter_blocks_) {
if (!parameter_block->Plus(state, delta, state_plus_delta)) {
return false;
}
state += parameter_block->Size();
delta += parameter_block->TangentSize();
state_plus_delta += parameter_block->Size();
}
return true;
double* state_plus_delta,
ContextImpl* context,
int num_threads) const {
std::atomic<bool> abort(false);
auto* parameter_blocks = parameter_blocks_.data();
ParallelFor(
context,
0,
parameter_blocks_.size(),
num_threads,
[&abort, state, delta, state_plus_delta, parameter_blocks](int block_id) {
if (abort) {
return;
}
auto parameter_block = parameter_blocks[block_id];
auto block_state = state + parameter_block->state_offset();
auto block_delta = delta + parameter_block->delta_offset();
auto block_state_plus_delta =
state_plus_delta + parameter_block->state_offset();
if (!parameter_block->Plus(
block_state, block_delta, block_state_plus_delta)) {
abort = true;
}
});
return abort == false;
}
void Program::SetParameterOffsetsAndIndex() {
@@ -545,5 +561,4 @@ std::string Program::ToString() const {
return ret;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,13 +40,13 @@
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class ParameterBlock;
class ProblemImpl;
class ResidualBlock;
class TripletSparseMatrix;
class ContextImpl;
// A nonlinear least squares optimization problem. This is different from the
// similarly-named "Problem" object, which offers a mutation interface for
@@ -87,7 +87,9 @@ class CERES_NO_EXPORT Program {
// Update a state vector for the program given a delta.
bool Plus(const double* state,
const double* delta,
double* state_plus_delta) const;
double* state_plus_delta,
ContextImpl* context,
int num_threads) const;
// Set the parameter indices and offsets. This permits mapping backward
// from a ParameterBlock* to an index in the parameter_blocks() vector. For
@@ -192,8 +194,7 @@ class CERES_NO_EXPORT Program {
friend class ProblemImpl;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -43,7 +43,7 @@
// residual jacobians are written directly into their final position in the
// block sparse matrix by the user's CostFunction; there is no copying.
//
// The evaluation is threaded with OpenMP or C++ threads.
// The evaluation is threaded with C++ threads.
//
// The EvaluatePreparer and JacobianWriter interfaces are as follows:
//
@@ -96,6 +96,7 @@
#include "ceres/execution_summary.h"
#include "ceres/internal/eigen.h"
#include "ceres/parallel_for.h"
#include "ceres/parallel_vector_ops.h"
#include "ceres/parameter_block.h"
#include "ceres/program.h"
#include "ceres/residual_block.h"
@@ -105,7 +106,7 @@ namespace ceres {
namespace internal {
struct NullJacobianFinalizer {
void operator()(SparseMatrix* jacobian, int num_parameters) {}
void operator()(SparseMatrix* /*jacobian*/, int /*num_parameters*/) {}
};
template <typename EvaluatePreparer,
@@ -118,19 +119,11 @@ class ProgramEvaluator final : public Evaluator {
program_(program),
jacobian_writer_(options, program),
evaluate_preparers_(std::move(
jacobian_writer_.CreateEvaluatePreparers(options.num_threads))) {
#ifdef CERES_NO_THREADS
if (options_.num_threads > 1) {
LOG(WARNING) << "No threading support is compiled into this binary; "
<< "only options.num_threads = 1 is supported. Switching "
<< "to single threaded mode.";
options_.num_threads = 1;
}
#endif // CERES_NO_THREADS
jacobian_writer_.CreateEvaluatePreparers(options.num_threads))),
num_parameters_(program->NumEffectiveParameters()) {
BuildResidualLayout(*program, &residual_layout_);
evaluate_scratch_ =
std::move(CreateEvaluatorScratch(*program, options.num_threads));
evaluate_scratch_ = std::move(CreateEvaluatorScratch(
*program, static_cast<unsigned>(options.num_threads)));
}
// Implementation of Evaluator interface.
@@ -164,20 +157,24 @@ class ProgramEvaluator final : public Evaluator {
}
if (residuals != nullptr) {
VectorRef(residuals, program_->NumResiduals()).setZero();
ParallelSetZero(options_.context,
options_.num_threads,
residuals,
program_->NumResiduals());
}
if (jacobian != nullptr) {
jacobian->SetZero();
jacobian->SetZero(options_.context, options_.num_threads);
}
// Each thread gets its own cost and evaluate scratch space.
for (int i = 0; i < options_.num_threads; ++i) {
evaluate_scratch_[i].cost = 0.0;
if (gradient != nullptr) {
VectorRef(evaluate_scratch_[i].gradient.get(),
program_->NumEffectiveParameters())
.setZero();
ParallelSetZero(options_.context,
options_.num_threads,
evaluate_scratch_[i].gradient.get(),
num_parameters_);
}
}
@@ -259,38 +256,55 @@ class ProgramEvaluator final : public Evaluator {
}
});
if (!abort) {
const int num_parameters = program_->NumEffectiveParameters();
if (abort) {
return false;
}
// Sum the cost and gradient (if requested) from each thread.
(*cost) = 0.0;
// Sum the cost and gradient (if requested) from each thread.
(*cost) = 0.0;
if (gradient != nullptr) {
auto gradient_vector = VectorRef(gradient, num_parameters_);
ParallelSetZero(options_.context, options_.num_threads, gradient_vector);
}
for (int i = 0; i < options_.num_threads; ++i) {
(*cost) += evaluate_scratch_[i].cost;
if (gradient != nullptr) {
VectorRef(gradient, num_parameters).setZero();
}
for (int i = 0; i < options_.num_threads; ++i) {
(*cost) += evaluate_scratch_[i].cost;
if (gradient != nullptr) {
VectorRef(gradient, num_parameters) +=
VectorRef(evaluate_scratch_[i].gradient.get(), num_parameters);
}
}
// Finalize the Jacobian if it is available.
// `num_parameters` is passed to the finalizer so that additional
// storage can be reserved for additional diagonal elements if
// necessary.
if (jacobian != nullptr) {
JacobianFinalizer f;
f(jacobian, num_parameters);
auto gradient_vector = VectorRef(gradient, num_parameters_);
ParallelAssign(
options_.context,
options_.num_threads,
gradient_vector,
gradient_vector + VectorRef(evaluate_scratch_[i].gradient.get(),
num_parameters_));
}
}
return !abort;
// It is possible that after accumulation the cost has become infinite
// or a NaN.
if (!std::isfinite(*cost)) {
LOG(ERROR) << "Accumulated cost = " << *cost
<< " is not a finite number. Evaluation failed.";
return false;
}
// Finalize the Jacobian if it is available.
// `num_parameters` is passed to the finalizer so that additional
// storage can be reserved for additional diagonal elements if
// necessary.
if (jacobian != nullptr) {
JacobianFinalizer f;
f(jacobian, num_parameters_);
}
return true;
}
bool Plus(const double* state,
const double* delta,
double* state_plus_delta) const final {
return program_->Plus(state, delta, state_plus_delta);
return program_->Plus(
state, delta, state_plus_delta, options_.context, options_.num_threads);
}
int NumParameters() const final { return program_->NumParameters(); }
@@ -345,7 +359,7 @@ class ProgramEvaluator final : public Evaluator {
// Create scratch space for each thread evaluating the program.
static std::unique_ptr<EvaluateScratch[]> CreateEvaluatorScratch(
const Program& program, int num_threads) {
const Program& program, unsigned num_threads) {
int max_parameters_per_residual_block =
program.MaxParametersPerResidualBlock();
int max_scratch_doubles_needed_for_evaluate =
@@ -370,6 +384,7 @@ class ProgramEvaluator final : public Evaluator {
std::unique_ptr<EvaluatePreparer[]> evaluate_preparers_;
std::unique_ptr<EvaluateScratch[]> evaluate_scratch_;
std::vector<int> residual_layout_;
int num_parameters_;
::ceres::internal::ExecutionSummary execution_summary_;
};
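The per-thread scratch pattern above (each thread accumulates into its own gradient buffer, which the caller then reduces) generalizes; a standalone sketch using plain std::thread rather than Ceres' ParallelFor:

  #include <algorithm>
  #include <cstddef>
  #include <thread>
  #include <vector>

  // Each worker accumulates into its own slot and the caller reduces the
  // slots afterwards, so the hot path needs no locking.
  double ParallelSum(const std::vector<double>& values, int num_threads) {
    std::vector<double> partial(num_threads, 0.0);
    std::vector<std::thread> workers;
    const std::size_t chunk = (values.size() + num_threads - 1) / num_threads;
    for (int t = 0; t < num_threads; ++t) {
      workers.emplace_back([&values, &partial, chunk, t] {
        const std::size_t begin = t * chunk;
        const std::size_t end = std::min(values.size(), begin + chunk);
        for (std::size_t i = begin; i < end; ++i) partial[t] += values[i];
      });
    }
    for (auto& w : workers) w.join();
    double total = 0.0;
    for (const double p : partial) total += p;
    return total;
  }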

View File

@@ -1,73 +0,0 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: keir@google.com (Keir Mierle)
// sameeragarwal@google.com (Sameer Agarwal)
#ifndef CERES_INTERNAL_RANDOM_H_
#define CERES_INTERNAL_RANDOM_H_
#include <cmath>
#include <cstdlib>
#include "ceres/internal/export.h"
namespace ceres {
inline void SetRandomState(int state) { srand(state); }
inline int Uniform(int n) {
if (n) {
return rand() % n;
} else {
return 0;
}
}
inline double RandDouble() {
auto r = static_cast<double>(rand());
return r / RAND_MAX;
}
// Box-Muller algorithm for normal random number generation.
// http://en.wikipedia.org/wiki/Box-Muller_transform
inline double RandNormal() {
double x1, x2, w;
do {
x1 = 2.0 * RandDouble() - 1.0;
x2 = 2.0 * RandDouble() - 1.0;
w = x1 * x1 + x2 * x2;
} while (w >= 1.0 || w == 0.0);
w = sqrt((-2.0 * log(w)) / w);
return x1 * w;
}
} // namespace ceres
#endif // CERES_INTERNAL_RANDOM_H_
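The deleted helpers predate <random>. A sketch of standard-library equivalents (the exact replacement Ceres adopted is not shown in this diff):

  #include <random>

  // A seeded engine replaces SetRandomState()/srand(), e.g. std::mt19937 prng(5);

  int Uniform(std::mt19937& prng, int n) {
    if (n <= 0) return 0;  // mirrors the old n == 0 guard
    return std::uniform_int_distribution<int>(0, n - 1)(prng);
  }

  double RandDouble(std::mt19937& prng) {
    return std::uniform_real_distribution<double>(0.0, 1.0)(prng);
  }

  // std::normal_distribution supersedes the hand-rolled Box-Muller transform.
  double RandNormal(std::mt19937& prng) {
    return std::normal_distribution<double>(0.0, 1.0)(prng);
  }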

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -31,12 +31,14 @@
#include "ceres/reorder_program.h"
#include <algorithm>
#include <map>
#include <memory>
#include <numeric>
#include <set>
#include <string>
#include <vector>
#include "Eigen/SparseCore"
#include "ceres/cxsparse.h"
#include "ceres/internal/config.h"
#include "ceres/internal/export.h"
#include "ceres/ordered_groups.h"
@@ -51,18 +53,19 @@
#include "ceres/types.h"
#ifdef CERES_USE_EIGEN_SPARSE
#ifndef CERES_NO_EIGEN_METIS
#include <iostream> // Need this because MetisSupport refers to std::cerr.
#include "Eigen/MetisSupport"
#endif
#include "Eigen/OrderingMethods"
#endif
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::map;
using std::set;
using std::string;
using std::vector;
namespace ceres::internal {
namespace {
@@ -86,7 +89,6 @@ static int MinParameterBlock(const ResidualBlock* residual_block,
return min_parameter_block_position;
}
#if defined(CERES_USE_EIGEN_SPARSE)
Eigen::SparseMatrix<int> CreateBlockJacobian(
const TripletSparseMatrix& block_jacobian_transpose) {
using SparseMatrix = Eigen::SparseMatrix<int>;
@@ -95,7 +97,7 @@ Eigen::SparseMatrix<int> CreateBlockJacobian(
const int* rows = block_jacobian_transpose.rows();
const int* cols = block_jacobian_transpose.cols();
int num_nonzeros = block_jacobian_transpose.num_nonzeros();
vector<Triplet> triplets;
std::vector<Triplet> triplets;
triplets.reserve(num_nonzeros);
for (int i = 0; i < num_nonzeros; ++i) {
triplets.emplace_back(cols[i], rows[i], 1);
@@ -106,14 +108,20 @@ Eigen::SparseMatrix<int> CreateBlockJacobian(
block_jacobian.setFromTriplets(triplets.begin(), triplets.end());
return block_jacobian;
}
#endif
void OrderingForSparseNormalCholeskyUsingSuiteSparse(
const LinearSolverOrderingType linear_solver_ordering_type,
const TripletSparseMatrix& tsm_block_jacobian_transpose,
const vector<ParameterBlock*>& parameter_blocks,
const std::vector<ParameterBlock*>& parameter_blocks,
const ParameterBlockOrdering& parameter_block_ordering,
int* ordering) {
#ifdef CERES_NO_SUITESPARSE
// "Void"ing values to avoid compiler warnings about unused parameters
(void)linear_solver_ordering_type;
(void)tsm_block_jacobian_transpose;
(void)parameter_blocks;
(void)parameter_block_ordering;
(void)ordering;
LOG(FATAL) << "Congratulations, you found a Ceres bug! "
<< "Please report this error to the developers.";
#else
@@ -121,61 +129,47 @@ void OrderingForSparseNormalCholeskyUsingSuiteSparse(
cholmod_sparse* block_jacobian_transpose = ss.CreateSparseMatrix(
const_cast<TripletSparseMatrix*>(&tsm_block_jacobian_transpose));
// Either no CAMD is available or the user did not supply a useful
// ordering, so just use regular AMD.
if (parameter_block_ordering.NumGroups() <= 1 ||
!SuiteSparse::IsConstrainedApproximateMinimumDegreeOrderingAvailable()) {
ss.ApproximateMinimumDegreeOrdering(block_jacobian_transpose, &ordering[0]);
} else {
vector<int> constraints;
for (auto* parameter_block : parameter_blocks) {
constraints.push_back(parameter_block_ordering.GroupId(
parameter_block->mutable_user_state()));
if (linear_solver_ordering_type == ceres::AMD) {
if (parameter_block_ordering.NumGroups() <= 1) {
// The user did not supply a useful ordering so just go ahead
// and use AMD.
ss.Ordering(block_jacobian_transpose, OrderingType::AMD, ordering);
} else {
// The user supplied an ordering, so use CAMD.
std::vector<int> constraints;
constraints.reserve(parameter_blocks.size());
for (auto* parameter_block : parameter_blocks) {
constraints.push_back(parameter_block_ordering.GroupId(
parameter_block->mutable_user_state()));
}
// Renumber the entries of constraints to be contiguous integers
// as CAMD requires that the group ids be in the range [0,
// parameter_blocks.size() - 1].
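// For example (illustrative values): group ids {7, 3, 7, 9} are remapped
// to {1, 0, 1, 2}, preserving their relative order.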
MapValuesToContiguousRange(constraints.size(), constraints.data());
ss.ConstrainedApproximateMinimumDegreeOrdering(
block_jacobian_transpose, constraints.data(), ordering);
}
// Renumber the entries of constraints to be contiguous integers
// as CAMD requires that the group ids be in the range [0,
// parameter_blocks.size() - 1].
MapValuesToContiguousRange(constraints.size(), &constraints[0]);
ss.ConstrainedApproximateMinimumDegreeOrdering(
block_jacobian_transpose, &constraints[0], ordering);
} else if (linear_solver_ordering_type == ceres::NESDIS) {
// If nested dissection is chosen as an ordering algorithm, then
// ignore any user provided linear_solver_ordering.
CHECK(SuiteSparse::IsNestedDissectionAvailable())
<< "Congratulations, you found a Ceres bug! "
<< "Please report this error to the developers.";
ss.Ordering(block_jacobian_transpose, OrderingType::NESDIS, ordering);
} else {
LOG(FATAL) << "Congratulations, you found a Ceres bug! "
<< "Please report this error to the developers.";
}
VLOG(2) << "Block ordering stats: "
<< " flops: " << ss.mutable_cc()->fl
<< " lnz : " << ss.mutable_cc()->lnz
<< " anz : " << ss.mutable_cc()->anz;
ss.Free(block_jacobian_transpose);
#endif // CERES_NO_SUITESPARSE
}
void OrderingForSparseNormalCholeskyUsingCXSparse(
const TripletSparseMatrix& tsm_block_jacobian_transpose, int* ordering) {
#ifdef CERES_NO_CXSPARSE
LOG(FATAL) << "Congratulations, you found a Ceres bug! "
<< "Please report this error to the developers.";
#else
// CXSparse works with J'J instead of J'. So compute the block
// sparsity for J'J and compute an approximate minimum degree
// ordering.
CXSparse cxsparse;
cs_di* block_jacobian_transpose;
block_jacobian_transpose = cxsparse.CreateSparseMatrix(
const_cast<TripletSparseMatrix*>(&tsm_block_jacobian_transpose));
cs_di* block_jacobian = cxsparse.TransposeMatrix(block_jacobian_transpose);
cs_di* block_hessian =
cxsparse.MatrixMatrixMultiply(block_jacobian_transpose, block_jacobian);
cxsparse.Free(block_jacobian);
cxsparse.Free(block_jacobian_transpose);
cxsparse.ApproximateMinimumDegreeOrdering(block_hessian, ordering);
cxsparse.Free(block_hessian);
#endif // CERES_NO_CXSPARSE
}
void OrderingForSparseNormalCholeskyUsingEigenSparse(
const TripletSparseMatrix& tsm_block_jacobian_transpose, int* ordering) {
const LinearSolverOrderingType linear_solver_ordering_type,
const TripletSparseMatrix& tsm_block_jacobian_transpose,
int* ordering) {
#ifndef CERES_USE_EIGEN_SPARSE
LOG(FATAL) << "SPARSE_NORMAL_CHOLESKY cannot be used with EIGEN_SPARSE "
"because Ceres was not built with support for "
@@ -183,12 +177,12 @@ void OrderingForSparseNormalCholeskyUsingEigenSparse(
"This requires enabling building with -DEIGENSPARSE=ON.";
#else
// This conversion from a TripletSparseMatrix to a Eigen::Triplet
// matrix is unfortunate, but unavoidable for now. It is not a
// significant performance penalty in the grand scheme of
// things. The right thing to do here would be to get a compressed
// row sparse matrix representation of the jacobian and go from
// there. But that is a project for another day.
// TODO(sameeragarwal): This conversion from a TripletSparseMatrix
// to a Eigen::Triplet matrix is unfortunate, but unavoidable for
// now. It is not a significant performance penalty in the grand
// scheme of things. The right thing to do here would be to get a
// compressed row sparse matrix representation of the jacobian and
// go from there. But that is a project for another day.
using SparseMatrix = Eigen::SparseMatrix<int>;
const SparseMatrix block_jacobian =
@@ -196,9 +190,19 @@ void OrderingForSparseNormalCholeskyUsingEigenSparse(
const SparseMatrix block_hessian =
block_jacobian.transpose() * block_jacobian;
Eigen::AMDOrdering<int> amd_ordering;
Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic, int> perm;
amd_ordering(block_hessian, perm);
if (linear_solver_ordering_type == ceres::AMD) {
Eigen::AMDOrdering<int> amd_ordering;
amd_ordering(block_hessian, perm);
} else {
#ifndef CERES_NO_EIGEN_METIS
Eigen::MetisOrdering<int> metis_ordering;
metis_ordering(block_hessian, perm);
#else
perm.setIdentity(block_hessian.rows());
#endif
}
for (int i = 0; i < block_hessian.rows(); ++i) {
ordering[i] = perm.indices()[i];
}
@@ -210,7 +214,7 @@ void OrderingForSparseNormalCholeskyUsingEigenSparse(
bool ApplyOrdering(const ProblemImpl::ParameterMap& parameter_map,
const ParameterBlockOrdering& ordering,
Program* program,
string* error) {
std::string* error) {
const int num_parameter_blocks = program->NumParameterBlocks();
if (ordering.NumElements() != num_parameter_blocks) {
*error = StringPrintf(
@@ -222,13 +226,15 @@ bool ApplyOrdering(const ProblemImpl::ParameterMap& parameter_map,
return false;
}
vector<ParameterBlock*>* parameter_blocks =
std::vector<ParameterBlock*>* parameter_blocks =
program->mutable_parameter_blocks();
parameter_blocks->clear();
const map<int, set<double*>>& groups = ordering.group_to_elements();
// TODO(sameeragarwal): Investigate whether this should be a set or an
// unordered_set.
const std::map<int, std::set<double*>>& groups = ordering.group_to_elements();
for (const auto& p : groups) {
const set<double*>& group = p.second;
const std::set<double*>& group = p.second;
for (double* parameter_block_ptr : group) {
auto it = parameter_map.find(parameter_block_ptr);
if (it == parameter_map.end()) {
@@ -248,16 +254,18 @@ bool ApplyOrdering(const ProblemImpl::ParameterMap& parameter_map,
bool LexicographicallyOrderResidualBlocks(
const int size_of_first_elimination_group,
Program* program,
string* error) {
std::string* /*error*/) {
CHECK_GE(size_of_first_elimination_group, 1)
<< "Congratulations, you found a Ceres bug! Please report this error "
<< "to the developers.";
// Create a histogram of the number of residuals for each E block. There is an
// extra bucket at the end to catch all non-eliminated F blocks.
vector<int> residual_blocks_per_e_block(size_of_first_elimination_group + 1);
vector<ResidualBlock*>* residual_blocks = program->mutable_residual_blocks();
vector<int> min_position_per_residual(residual_blocks->size());
std::vector<int> residual_blocks_per_e_block(size_of_first_elimination_group +
1);
std::vector<ResidualBlock*>* residual_blocks =
program->mutable_residual_blocks();
std::vector<int> min_position_per_residual(residual_blocks->size());
for (int i = 0; i < residual_blocks->size(); ++i) {
ResidualBlock* residual_block = (*residual_blocks)[i];
int position =
@@ -270,7 +278,7 @@ bool LexicographicallyOrderResidualBlocks(
// Run a cumulative sum on the histogram, to obtain offsets to the start of
// each histogram bucket (where each bucket is for the residuals for that
// E-block).
vector<int> offsets(size_of_first_elimination_group + 1);
std::vector<int> offsets(size_of_first_elimination_group + 1);
std::partial_sum(residual_blocks_per_e_block.begin(),
residual_blocks_per_e_block.end(),
offsets.begin());
@@ -289,9 +297,9 @@ bool LexicographicallyOrderResidualBlocks(
// of the bucket. The filling order among the buckets is dictated by the
// residual blocks. This loop uses the offsets as counters; subtracting one
// from each offset as a residual block is placed in the bucket. When the
// filling is finished, the offset pointerts should have shifted down one
// filling is finished, the offset pointers should have shifted down one
// entry (this is verified below).
vector<ResidualBlock*> reordered_residual_blocks(
std::vector<ResidualBlock*> reordered_residual_blocks(
(*residual_blocks).size(), static_cast<ResidualBlock*>(nullptr));
for (int i = 0; i < residual_blocks->size(); ++i) {
int bucket = min_position_per_residual[i];
@@ -326,18 +334,18 @@ bool LexicographicallyOrderResidualBlocks(
return true;
}
// Pre-order the columns corresponding to the schur complement if
// Pre-order the columns corresponding to the Schur complement if
// possible.
static void MaybeReorderSchurComplementColumnsUsingSuiteSparse(
static void ReorderSchurComplementColumnsUsingSuiteSparse(
const ParameterBlockOrdering& parameter_block_ordering, Program* program) {
#ifndef CERES_NO_SUITESPARSE
#ifdef CERES_NO_SUITESPARSE
// "Void"ing values to avoid compiler warnings about unused parameters
(void)parameter_block_ordering;
(void)program;
#else
SuiteSparse ss;
if (!SuiteSparse::IsConstrainedApproximateMinimumDegreeOrderingAvailable()) {
return;
}
vector<int> constraints;
vector<ParameterBlock*>& parameter_blocks =
std::vector<int> constraints;
std::vector<ParameterBlock*>& parameter_blocks =
*(program->mutable_parameter_blocks());
for (auto* parameter_block : parameter_blocks) {
@@ -348,7 +356,7 @@ static void MaybeReorderSchurComplementColumnsUsingSuiteSparse(
// Renumber the entries of constraints to be contiguous integers as
// CAMD requires that the group ids be in the range [0,
// parameter_blocks.size() - 1].
MapValuesToContiguousRange(constraints.size(), &constraints[0]);
MapValuesToContiguousRange(constraints.size(), constraints.data());
// Compute a block sparse representation of J'.
std::unique_ptr<TripletSparseMatrix> tsm_block_jacobian_transpose(
@@ -357,12 +365,12 @@ static void MaybeReorderSchurComplementColumnsUsingSuiteSparse(
cholmod_sparse* block_jacobian_transpose =
ss.CreateSparseMatrix(tsm_block_jacobian_transpose.get());
vector<int> ordering(parameter_blocks.size(), 0);
std::vector<int> ordering(parameter_blocks.size(), 0);
ss.ConstrainedApproximateMinimumDegreeOrdering(
block_jacobian_transpose, &constraints[0], &ordering[0]);
block_jacobian_transpose, constraints.data(), ordering.data());
ss.Free(block_jacobian_transpose);
const vector<ParameterBlock*> parameter_blocks_copy(parameter_blocks);
const std::vector<ParameterBlock*> parameter_blocks_copy(parameter_blocks);
for (int i = 0; i < program->NumParameterBlocks(); ++i) {
parameter_blocks[i] = parameter_blocks_copy[ordering[i]];
}
@@ -371,14 +379,14 @@ static void MaybeReorderSchurComplementColumnsUsingSuiteSparse(
#endif
}
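MapValuesToContiguousRange is needed because CAMD expects the constraint (group) ids to be exactly 0..k-1. One way such a compaction could be implemented, shown here only as an assumed sketch of the behavior:

#include <algorithm>
#include <vector>

// Remaps arbitrary integer labels onto 0..k-1 while preserving their
// relative order, e.g. {10, 3, 10, 7} becomes {2, 0, 2, 1}.
void MapToContiguousRangeSketch(std::vector<int>& values) {
  std::vector<int> unique_sorted(values);
  std::sort(unique_sorted.begin(), unique_sorted.end());
  unique_sorted.erase(std::unique(unique_sorted.begin(), unique_sorted.end()),
                      unique_sorted.end());
  for (int& v : values) {
    v = static_cast<int>(std::lower_bound(unique_sorted.begin(),
                                          unique_sorted.end(), v) -
                         unique_sorted.begin());
  }
}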
static void MaybeReorderSchurComplementColumnsUsingEigen(
static void ReorderSchurComplementColumnsUsingEigen(
LinearSolverOrderingType ordering_type,
const int size_of_first_elimination_group,
const ProblemImpl::ParameterMap& parameter_map,
const ProblemImpl::ParameterMap& /*parameter_map*/,
Program* program) {
#if defined(CERES_USE_EIGEN_SPARSE)
std::unique_ptr<TripletSparseMatrix> tsm_block_jacobian_transpose(
program->CreateJacobianBlockSparsityTranspose());
using SparseMatrix = Eigen::SparseMatrix<int>;
const SparseMatrix block_jacobian =
CreateBlockJacobian(*tsm_block_jacobian_transpose);
@@ -399,12 +407,22 @@ static void MaybeReorderSchurComplementColumnsUsingEigen(
const SparseMatrix block_schur_complement =
F.transpose() * F - F.transpose() * E * E.transpose() * F;
Eigen::AMDOrdering<int> amd_ordering;
Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic, int> perm;
amd_ordering(block_schur_complement, perm);
if (ordering_type == ceres::AMD) {
Eigen::AMDOrdering<int> amd_ordering;
amd_ordering(block_schur_complement, perm);
} else {
#ifndef CERES_NO_EIGEN_METIS
Eigen::MetisOrdering<int> metis_ordering;
metis_ordering(block_schur_complement, perm);
#else
perm.setIdentity(block_schur_complement.rows());
#endif
}
const vector<ParameterBlock*>& parameter_blocks = program->parameter_blocks();
vector<ParameterBlock*> ordering(num_cols);
const std::vector<ParameterBlock*>& parameter_blocks =
program->parameter_blocks();
std::vector<ParameterBlock*> ordering(num_cols);
// The ordering of the first size_of_first_elimination_group does
// not matter, so we preserve the existing ordering.
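For reference, the Eigen ordering call used above reduces to a few lines on any sparse pattern; a minimal sketch (assumes Eigen's OrderingMethods module):

#include <Eigen/OrderingMethods>
#include <Eigen/SparseCore>

int main() {
  // A small symmetric sparsity pattern; only the structure matters.
  Eigen::SparseMatrix<int> pattern(3, 3);
  pattern.insert(0, 0) = 1;
  pattern.insert(1, 1) = 1;
  pattern.insert(2, 2) = 1;
  pattern.insert(0, 2) = 1;
  pattern.insert(2, 0) = 1;
  pattern.makeCompressed();

  Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic, int> perm;
  Eigen::AMDOrdering<int> amd_ordering;
  amd_ordering(pattern, perm);  // perm now holds a fill-reducing ordering
  return 0;
}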
@@ -426,10 +444,11 @@ static void MaybeReorderSchurComplementColumnsUsingEigen(
bool ReorderProgramForSchurTypeLinearSolver(
const LinearSolverType linear_solver_type,
const SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type,
const LinearSolverOrderingType linear_solver_ordering_type,
const ProblemImpl::ParameterMap& parameter_map,
ParameterBlockOrdering* parameter_block_ordering,
Program* program,
string* error) {
std::string* error) {
if (parameter_block_ordering->NumElements() !=
program->NumParameterBlocks()) {
*error = StringPrintf(
@@ -447,7 +466,7 @@ bool ReorderProgramForSchurTypeLinearSolver(
// parameter block ordering as it sees fit. For Schur type solvers,
// this means that the user wishes for Ceres to identify the
// e_blocks, which we do by computing a maximal independent set.
vector<ParameterBlock*> schur_ordering;
std::vector<ParameterBlock*> schur_ordering;
const int size_of_first_elimination_group =
ComputeStableSchurOrdering(*program, &schur_ordering);
@@ -470,7 +489,10 @@ bool ReorderProgramForSchurTypeLinearSolver(
// group.
// Verify that the first elimination group is an independent set.
const set<double*>& first_elimination_group =
// TODO(sameeragarwal): Investigate if this should be a set or an
// unordered_set.
const std::set<double*>& first_elimination_group =
parameter_block_ordering->group_to_elements().begin()->second;
if (!program->IsParameterBlockSetIndependent(first_elimination_group)) {
*error = StringPrintf(
@@ -492,12 +514,20 @@ bool ReorderProgramForSchurTypeLinearSolver(
parameter_block_ordering->group_to_elements().begin()->second.size();
if (linear_solver_type == SPARSE_SCHUR) {
if (sparse_linear_algebra_library_type == SUITE_SPARSE) {
MaybeReorderSchurComplementColumnsUsingSuiteSparse(
*parameter_block_ordering, program);
if (sparse_linear_algebra_library_type == SUITE_SPARSE &&
linear_solver_ordering_type == ceres::AMD) {
// Pre-ordering support for the Schur complement only works with AMD
// for now, since we are using CAMD.
//
// TODO(sameeragarwal): It may be worth adding pre-ordering support for
// nested dissection too.
ReorderSchurComplementColumnsUsingSuiteSparse(*parameter_block_ordering,
program);
} else if (sparse_linear_algebra_library_type == EIGEN_SPARSE) {
MaybeReorderSchurComplementColumnsUsingEigen(
size_of_first_elimination_group, parameter_map, program);
ReorderSchurComplementColumnsUsingEigen(linear_solver_ordering_type,
size_of_first_elimination_group,
parameter_map,
program);
}
}
@@ -509,10 +539,11 @@ bool ReorderProgramForSchurTypeLinearSolver(
bool ReorderProgramForSparseCholesky(
const SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type,
const LinearSolverOrderingType linear_solver_ordering_type,
const ParameterBlockOrdering& parameter_block_ordering,
int start_row_block,
Program* program,
string* error) {
std::string* error) {
if (parameter_block_ordering.NumElements() != program->NumParameterBlocks()) {
*error = StringPrintf(
"The program has %d parameter blocks, but the parameter block "
@@ -526,19 +557,17 @@ bool ReorderProgramForSparseCholesky(
std::unique_ptr<TripletSparseMatrix> tsm_block_jacobian_transpose(
program->CreateJacobianBlockSparsityTranspose(start_row_block));
vector<int> ordering(program->NumParameterBlocks(), 0);
vector<ParameterBlock*>& parameter_blocks =
std::vector<int> ordering(program->NumParameterBlocks(), 0);
std::vector<ParameterBlock*>& parameter_blocks =
*(program->mutable_parameter_blocks());
if (sparse_linear_algebra_library_type == SUITE_SPARSE) {
OrderingForSparseNormalCholeskyUsingSuiteSparse(
linear_solver_ordering_type,
*tsm_block_jacobian_transpose,
parameter_blocks,
parameter_block_ordering,
&ordering[0]);
} else if (sparse_linear_algebra_library_type == CX_SPARSE) {
OrderingForSparseNormalCholeskyUsingCXSparse(*tsm_block_jacobian_transpose,
&ordering[0]);
ordering.data());
} else if (sparse_linear_algebra_library_type == ACCELERATE_SPARSE) {
// Accelerate does not provide a function to perform reordering without
// performing a full symbolic factorisation. As such, we have nothing
@@ -550,11 +579,13 @@ bool ReorderProgramForSparseCholesky(
} else if (sparse_linear_algebra_library_type == EIGEN_SPARSE) {
OrderingForSparseNormalCholeskyUsingEigenSparse(
*tsm_block_jacobian_transpose, &ordering[0]);
linear_solver_ordering_type,
*tsm_block_jacobian_transpose,
ordering.data());
}
// Apply ordering.
const vector<ParameterBlock*> parameter_blocks_copy(parameter_blocks);
const std::vector<ParameterBlock*> parameter_blocks_copy(parameter_blocks);
for (int i = 0; i < program->NumParameterBlocks(); ++i) {
parameter_blocks[i] = parameter_blocks_copy[ordering[i]];
}
@@ -575,5 +606,39 @@ int ReorderResidualBlocksByPartition(
return it - residual_blocks->begin();
}
} // namespace internal
} // namespace ceres
bool AreJacobianColumnsOrdered(
const LinearSolverType linear_solver_type,
const PreconditionerType preconditioner_type,
const SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type,
const LinearSolverOrderingType linear_solver_ordering_type) {
if (sparse_linear_algebra_library_type == SUITE_SPARSE) {
if (linear_solver_type == SPARSE_NORMAL_CHOLESKY ||
(linear_solver_type == CGNR && preconditioner_type == SUBSET)) {
return true;
}
if (linear_solver_type == SPARSE_SCHUR &&
linear_solver_ordering_type == ceres::AMD) {
return true;
}
return false;
}
if (sparse_linear_algebra_library_type == ceres::EIGEN_SPARSE) {
if (linear_solver_type == SPARSE_NORMAL_CHOLESKY ||
linear_solver_type == SPARSE_SCHUR ||
(linear_solver_type == CGNR && preconditioner_type == SUBSET)) {
return true;
}
return false;
}
if (sparse_linear_algebra_library_type == ceres::ACCELERATE_SPARSE) {
// Apple's Accelerate framework does not allow direct access to
// ordering algorithms, so Jacobian columns are never pre-ordered.
return false;
}
return false;
}
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -35,12 +35,12 @@
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
#include "ceres/linear_solver.h"
#include "ceres/parameter_block_ordering.h"
#include "ceres/problem_impl.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class Program;
@@ -76,6 +76,7 @@ CERES_NO_EXPORT bool LexicographicallyOrderResidualBlocks(
CERES_NO_EXPORT bool ReorderProgramForSchurTypeLinearSolver(
LinearSolverType linear_solver_type,
SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type,
LinearSolverOrderingType linear_solver_ordering_type,
const ProblemImpl::ParameterMap& parameter_map,
ParameterBlockOrdering* parameter_block_ordering,
Program* program,
@@ -93,6 +94,7 @@ CERES_NO_EXPORT bool ReorderProgramForSchurTypeLinearSolver(
// ordering will take it into account, otherwise it will be ignored.
CERES_NO_EXPORT bool ReorderProgramForSparseCholesky(
SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type,
LinearSolverOrderingType linear_solver_ordering_type,
const ParameterBlockOrdering& parameter_block_ordering,
int start_row_block,
Program* program,
@@ -112,8 +114,15 @@ CERES_NO_EXPORT int ReorderResidualBlocksByPartition(
const std::unordered_set<ResidualBlockId>& bottom_residual_blocks,
Program* program);
} // namespace internal
} // namespace ceres
// The return value of this function indicates whether the columns of
// the Jacobian can be reordered using a fill reducing ordering.
CERES_NO_EXPORT bool AreJacobianColumnsOrdered(
LinearSolverType linear_solver_type,
PreconditionerType preconditioner_type,
SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type,
LinearSolverOrderingType linear_solver_ordering_type);
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -47,8 +47,7 @@
using Eigen::Dynamic;
namespace ceres {
namespace internal {
namespace ceres::internal {
ResidualBlock::ResidualBlock(
const CostFunction* cost_function,
@@ -114,8 +113,7 @@ bool ResidualBlock::Evaluate(const bool apply_loss_function,
return false;
}
if (!IsEvaluationValid(
*this, parameters.data(), cost, residuals, eval_jacobians)) {
if (!IsEvaluationValid(*this, parameters.data(), residuals, eval_jacobians)) {
// clang-format off
std::string message =
"\n\n"
@@ -216,5 +214,4 @@ int ResidualBlock::NumScratchDoublesForEvaluate() const {
return scratch_doubles;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -33,6 +33,7 @@
#include <cmath>
#include <cstddef>
#include <limits>
#include <string>
#include "ceres/array_utils.h"
#include "ceres/internal/eigen.h"
@@ -42,10 +43,7 @@
#include "ceres/stringprintf.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::string;
namespace ceres::internal {
void InvalidateEvaluation(const ResidualBlock& block,
double* cost,
@@ -64,17 +62,17 @@ void InvalidateEvaluation(const ResidualBlock& block,
}
}
string EvaluationToString(const ResidualBlock& block,
double const* const* parameters,
double* cost,
double* residuals,
double** jacobians) {
std::string EvaluationToString(const ResidualBlock& block,
double const* const* parameters,
double* cost,
double* residuals,
double** jacobians) {
CHECK(cost != nullptr);
CHECK(residuals != nullptr);
const int num_parameter_blocks = block.NumParameterBlocks();
const int num_residuals = block.NumResiduals();
string result = "";
std::string result = "";
// clang-format off
StringAppendF(&result,
@@ -89,7 +87,7 @@ string EvaluationToString(const ResidualBlock& block,
"to Inf or NaN is also an error. \n\n"; // NOLINT
// clang-format on
string space = "Residuals: ";
std::string space = "Residuals: ";
result += space;
AppendArrayToString(num_residuals, residuals, &result);
StringAppendF(&result, "\n\n");
@@ -117,9 +115,11 @@ string EvaluationToString(const ResidualBlock& block,
return result;
}
// TODO(sameeragarwal) Check cost value validness here
// The cost value is part of the evaluation but is not checked here because,
// per residual_block.cc, it is not yet valid when this method is called.
bool IsEvaluationValid(const ResidualBlock& block,
double const* const* parameters,
double* cost,
double const* const* /*parameters*/,
double* residuals,
double** jacobians) {
const int num_parameter_blocks = block.NumParameterBlocks();
@@ -141,5 +141,4 @@ bool IsEvaluationValid(const ResidualBlock& block,
return true;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
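After this change IsEvaluationValid no longer inspects the cost; it only scans the residual and Jacobian arrays for non-finite values. The core check amounts to the following sketch (hypothetical helper, not the Ceres function):

#include <cmath>

// True iff all n doubles are finite (no NaN or +/-Inf). A null array
// means the block was not requested and is therefore skipped.
inline bool AllFinite(const double* values, int n) {
  if (values == nullptr) return true;
  for (int i = 0; i < n; ++i) {
    if (!std::isfinite(values[i])) return false;
  }
  return true;
}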

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -47,8 +47,7 @@
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class ResidualBlock;
@@ -64,7 +63,6 @@ void InvalidateEvaluation(const ResidualBlock& block,
CERES_NO_EXPORT
bool IsEvaluationValid(const ResidualBlock& block,
double const* const* parameters,
double* cost,
double* residuals,
double** jacobians);
@@ -78,7 +76,6 @@ std::string EvaluationToString(const ResidualBlock& block,
double* residuals,
double** jacobians);
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_RESIDUAL_BLOCK_UTILS_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -34,6 +34,7 @@
#include <ctime>
#include <memory>
#include <set>
#include <utility>
#include <vector>
#include "Eigen/Dense"
@@ -52,58 +53,36 @@
#include "ceres/types.h"
#include "ceres/wall_time.h"
namespace ceres {
namespace internal {
using std::make_pair;
using std::pair;
using std::set;
using std::vector;
namespace ceres::internal {
namespace {
class BlockRandomAccessSparseMatrixAdapter final : public LinearOperator {
class BlockRandomAccessSparseMatrixAdapter final
: public ConjugateGradientsLinearOperator<Vector> {
public:
explicit BlockRandomAccessSparseMatrixAdapter(
const BlockRandomAccessSparseMatrix& m)
: m_(m) {}
// y = y + Ax;
void RightMultiply(const double* x, double* y) const final {
m_.SymmetricRightMultiply(x, y);
void RightMultiplyAndAccumulate(const Vector& x, Vector& y) final {
m_.SymmetricRightMultiplyAndAccumulate(x.data(), y.data());
}
// y = y + A'x;
void LeftMultiply(const double* x, double* y) const final {
m_.SymmetricRightMultiply(x, y);
}
int num_rows() const final { return m_.num_rows(); }
int num_cols() const final { return m_.num_rows(); }
private:
const BlockRandomAccessSparseMatrix& m_;
};
class BlockRandomAccessDiagonalMatrixAdapter final : public LinearOperator {
class BlockRandomAccessDiagonalMatrixAdapter final
: public ConjugateGradientsLinearOperator<Vector> {
public:
explicit BlockRandomAccessDiagonalMatrixAdapter(
const BlockRandomAccessDiagonalMatrix& m)
: m_(m) {}
// y = y + Ax;
void RightMultiply(const double* x, double* y) const final {
m_.RightMultiply(x, y);
void RightMultiplyAndAccumulate(const Vector& x, Vector& y) final {
m_.RightMultiplyAndAccumulate(x.data(), y.data());
}
// y = y + A'x;
void LeftMultiply(const double* x, double* y) const final {
m_.RightMultiply(x, y);
}
int num_rows() const final { return m_.num_rows(); }
int num_cols() const final { return m_.num_rows(); }
private:
const BlockRandomAccessDiagonalMatrix& m_;
};
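The two adapters above wrap random-access matrices in the new vector-typed conjugate gradients operator interface. The same pattern applied to a plain dense Eigen matrix would look roughly like this (a sketch against an assumed interface shape, not the Ceres class):

#include <Eigen/Dense>

using Vector = Eigen::VectorXd;

// Assumed shape of the interface the adapters above implement.
template <typename DenseVectorType>
class ConjugateGradientsLinearOperatorSketch {
 public:
  virtual ~ConjugateGradientsLinearOperatorSketch() = default;
  // y = y + A x
  virtual void RightMultiplyAndAccumulate(const DenseVectorType& x,
                                          DenseVectorType& y) = 0;
};

class DenseMatrixAdapter final
    : public ConjugateGradientsLinearOperatorSketch<Vector> {
 public:
  explicit DenseMatrixAdapter(const Eigen::MatrixXd& m) : m_(m) {}
  void RightMultiplyAndAccumulate(const Vector& x, Vector& y) override {
    y.noalias() += m_ * x;  // accumulate, matching the y = y + Ax contract
  }

 private:
  const Eigen::MatrixXd& m_;
};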
@@ -126,7 +105,7 @@ LinearSolver::Summary SchurComplementSolver::SolveImpl(
EventLogger event_logger("SchurComplementSolver::Solve");
const CompressedRowBlockStructure* bs = A->block_structure();
if (eliminator_.get() == nullptr) {
if (eliminator_ == nullptr) {
const int num_eliminate_blocks = options_.elimination_groups[0];
const int num_f_blocks = bs->cols.size() - num_eliminate_blocks;
@@ -161,7 +140,7 @@ LinearSolver::Summary SchurComplementSolver::SolveImpl(
b,
per_solve_options.D,
lhs_.get(),
rhs_.get());
rhs_.data());
event_logger.AddEvent("Eliminate");
double* reduced_solution = x + A->num_cols() - lhs_->num_cols();
@@ -169,7 +148,7 @@ LinearSolver::Summary SchurComplementSolver::SolveImpl(
SolveReducedLinearSystem(per_solve_options, reduced_solution);
event_logger.AddEvent("ReducedSolve");
if (summary.termination_type == LINEAR_SOLVER_SUCCESS) {
if (summary.termination_type == LinearSolverTerminationType::SUCCESS) {
eliminator_->BackSubstitute(
BlockSparseMatrixData(*A), b, per_solve_options.D, reduced_solution, x);
event_logger.AddEvent("BackSubstitute");
@@ -190,24 +169,21 @@ void DenseSchurComplementSolver::InitStorage(
const CompressedRowBlockStructure* bs) {
const int num_eliminate_blocks = options().elimination_groups[0];
const int num_col_blocks = bs->cols.size();
vector<int> blocks(num_col_blocks - num_eliminate_blocks, 0);
for (int i = num_eliminate_blocks, j = 0; i < num_col_blocks; ++i, ++j) {
blocks[j] = bs->cols[i].size;
}
set_lhs(std::make_unique<BlockRandomAccessDenseMatrix>(blocks));
set_rhs(std::make_unique<double[]>(lhs()->num_rows()));
auto blocks = Tail(bs->cols, num_col_blocks - num_eliminate_blocks);
set_lhs(std::make_unique<BlockRandomAccessDenseMatrix>(
blocks, options().context, options().num_threads));
ResizeRhs(lhs()->num_rows());
}
// Solve the system Sx = r, assuming that the matrix S is stored in a
// BlockRandomAccessDenseMatrix. The linear system is solved using
// Eigen's Cholesky factorization.
LinearSolver::Summary DenseSchurComplementSolver::SolveReducedLinearSystem(
const LinearSolver::PerSolveOptions& per_solve_options, double* solution) {
const LinearSolver::PerSolveOptions& /*per_solve_options*/,
double* solution) {
LinearSolver::Summary summary;
summary.num_iterations = 0;
summary.termination_type = LINEAR_SOLVER_SUCCESS;
summary.termination_type = LinearSolverTerminationType::SUCCESS;
summary.message = "Success.";
auto* m = down_cast<BlockRandomAccessDenseMatrix*>(mutable_lhs());
@@ -221,7 +197,7 @@ LinearSolver::Summary DenseSchurComplementSolver::SolveReducedLinearSystem(
summary.num_iterations = 1;
summary.termination_type = cholesky_->FactorAndSolve(
num_rows, m->mutable_values(), rhs(), solution, &summary.message);
num_rows, m->mutable_values(), rhs().data(), solution, &summary.message);
return summary;
}
@@ -233,7 +209,14 @@ SparseSchurComplementSolver::SparseSchurComplementSolver(
}
}
SparseSchurComplementSolver::~SparseSchurComplementSolver() = default;
SparseSchurComplementSolver::~SparseSchurComplementSolver() {
for (int i = 0; i < 4; ++i) {
if (scratch_[i]) {
delete scratch_[i];
scratch_[i] = nullptr;
}
}
}
// Determine the non-zero blocks in the Schur complement matrix, and
// initialize a BlockRandomAccessSparseMatrix object.
@@ -243,14 +226,11 @@ void SparseSchurComplementSolver::InitStorage(
const int num_col_blocks = bs->cols.size();
const int num_row_blocks = bs->rows.size();
blocks_.resize(num_col_blocks - num_eliminate_blocks, 0);
for (int i = num_eliminate_blocks; i < num_col_blocks; ++i) {
blocks_[i - num_eliminate_blocks] = bs->cols[i].size;
}
blocks_ = Tail(bs->cols, num_col_blocks - num_eliminate_blocks);
set<pair<int, int>> block_pairs;
std::set<std::pair<int, int>> block_pairs;
for (int i = 0; i < blocks_.size(); ++i) {
block_pairs.insert(make_pair(i, i));
block_pairs.emplace(i, i);
}
int r = 0;
@@ -259,7 +239,7 @@ void SparseSchurComplementSolver::InitStorage(
if (e_block_id >= num_eliminate_blocks) {
break;
}
vector<int> f_blocks;
std::vector<int> f_blocks;
// Add to the chunk until the first block in the row is
// different from the one in the first row for the chunk.
@@ -281,7 +261,7 @@ void SparseSchurComplementSolver::InitStorage(
f_blocks.erase(unique(f_blocks.begin(), f_blocks.end()), f_blocks.end());
for (int i = 0; i < f_blocks.size(); ++i) {
for (int j = i + 1; j < f_blocks.size(); ++j) {
block_pairs.insert(make_pair(f_blocks[i], f_blocks[j]));
block_pairs.emplace(f_blocks[i], f_blocks[j]);
}
}
}
@@ -296,15 +276,15 @@ void SparseSchurComplementSolver::InitStorage(
for (const auto& cell : row.cells) {
int r_block2_id = cell.block_id - num_eliminate_blocks;
if (r_block1_id <= r_block2_id) {
block_pairs.insert(make_pair(r_block1_id, r_block2_id));
block_pairs.emplace(r_block1_id, r_block2_id);
}
}
}
}
set_lhs(
std::make_unique<BlockRandomAccessSparseMatrix>(blocks_, block_pairs));
set_rhs(std::make_unique<double[]>(lhs()->num_rows()));
set_lhs(std::make_unique<BlockRandomAccessSparseMatrix>(
blocks_, block_pairs, options().context, options().num_threads));
ResizeRhs(lhs()->num_rows());
}
LinearSolver::Summary SparseSchurComplementSolver::SolveReducedLinearSystem(
@@ -316,32 +296,39 @@ LinearSolver::Summary SparseSchurComplementSolver::SolveReducedLinearSystem(
LinearSolver::Summary summary;
summary.num_iterations = 0;
summary.termination_type = LINEAR_SOLVER_SUCCESS;
summary.termination_type = LinearSolverTerminationType::SUCCESS;
summary.message = "Success.";
const TripletSparseMatrix* tsm =
const BlockSparseMatrix* bsm =
down_cast<const BlockRandomAccessSparseMatrix*>(lhs())->matrix();
if (tsm->num_rows() == 0) {
if (bsm->num_rows() == 0) {
return summary;
}
std::unique_ptr<CompressedRowSparseMatrix> lhs;
const CompressedRowSparseMatrix::StorageType storage_type =
sparse_cholesky_->StorageType();
if (storage_type == CompressedRowSparseMatrix::UPPER_TRIANGULAR) {
lhs = CompressedRowSparseMatrix::FromTripletSparseMatrix(*tsm);
lhs->set_storage_type(CompressedRowSparseMatrix::UPPER_TRIANGULAR);
if (storage_type ==
CompressedRowSparseMatrix::StorageType::UPPER_TRIANGULAR) {
if (!crs_lhs_) {
crs_lhs_ = bsm->ToCompressedRowSparseMatrix();
crs_lhs_->set_storage_type(
CompressedRowSparseMatrix::StorageType::UPPER_TRIANGULAR);
} else {
bsm->UpdateCompressedRowSparseMatrix(crs_lhs_.get());
}
} else {
lhs = CompressedRowSparseMatrix::FromTripletSparseMatrixTransposed(*tsm);
lhs->set_storage_type(CompressedRowSparseMatrix::LOWER_TRIANGULAR);
if (!crs_lhs_) {
crs_lhs_ = bsm->ToCompressedRowSparseMatrixTranspose();
crs_lhs_->set_storage_type(
CompressedRowSparseMatrix::StorageType::LOWER_TRIANGULAR);
} else {
bsm->UpdateCompressedRowSparseMatrixTranspose(crs_lhs_.get());
}
}
*lhs->mutable_col_blocks() = blocks_;
*lhs->mutable_row_blocks() = blocks_;
summary.num_iterations = 1;
summary.termination_type = sparse_cholesky_->FactorAndSolve(
lhs.get(), rhs(), solution, &summary.message);
crs_lhs_.get(), rhs().data(), solution, &summary.message);
return summary;
}
@@ -355,7 +342,7 @@ SparseSchurComplementSolver::SolveReducedLinearSystemUsingConjugateGradients(
if (num_rows == 0) {
LinearSolver::Summary summary;
summary.num_iterations = 0;
summary.termination_type = LINEAR_SOLVER_SUCCESS;
summary.termination_type = LinearSolverTerminationType::SUCCESS;
summary.message = "Success.";
return summary;
}
@@ -363,9 +350,9 @@ SparseSchurComplementSolver::SolveReducedLinearSystemUsingConjugateGradients(
// Only SCHUR_JACOBI is supported here right now.
CHECK_EQ(options().preconditioner_type, SCHUR_JACOBI);
if (preconditioner_.get() == nullptr) {
preconditioner_ =
std::make_unique<BlockRandomAccessDiagonalMatrix>(blocks_);
if (preconditioner_ == nullptr) {
preconditioner_ = std::make_unique<BlockRandomAccessDiagonalMatrix>(
blocks_, options().context, options().num_threads);
}
auto* sc = down_cast<BlockRandomAccessSparseMatrix*>(mutable_lhs());
@@ -373,7 +360,7 @@ SparseSchurComplementSolver::SolveReducedLinearSystemUsingConjugateGradients(
// Extract block diagonal from the Schur complement to construct the
// schur_jacobi preconditioner.
for (int i = 0; i < blocks_.size(); ++i) {
const int block_size = blocks_[i];
const int block_size = blocks_[i].size;
int sc_r, sc_c, sc_row_stride, sc_col_stride;
CellInfo* sc_cell_info =
@@ -394,25 +381,28 @@ SparseSchurComplementSolver::SolveReducedLinearSystemUsingConjugateGradients(
VectorRef(solution, num_rows).setZero();
std::unique_ptr<LinearOperator> lhs_adapter =
std::make_unique<BlockRandomAccessSparseMatrixAdapter>(*sc);
std::unique_ptr<LinearOperator> preconditioner_adapter =
auto lhs = std::make_unique<BlockRandomAccessSparseMatrixAdapter>(*sc);
auto preconditioner =
std::make_unique<BlockRandomAccessDiagonalMatrixAdapter>(
*preconditioner_);
LinearSolver::Options cg_options;
ConjugateGradientsSolverOptions cg_options;
cg_options.min_num_iterations = options().min_num_iterations;
cg_options.max_num_iterations = options().max_num_iterations;
ConjugateGradientsSolver cg_solver(cg_options);
cg_options.residual_reset_period = options().residual_reset_period;
cg_options.q_tolerance = per_solve_options.q_tolerance;
cg_options.r_tolerance = per_solve_options.r_tolerance;
LinearSolver::PerSolveOptions cg_per_solve_options;
cg_per_solve_options.r_tolerance = per_solve_options.r_tolerance;
cg_per_solve_options.q_tolerance = per_solve_options.q_tolerance;
cg_per_solve_options.preconditioner = preconditioner_adapter.get();
return cg_solver.Solve(
lhs_adapter.get(), rhs(), cg_per_solve_options, solution);
cg_solution_ = Vector::Zero(sc->num_rows());
for (int i = 0; i < 4; ++i) {
if (scratch_[i] == nullptr) {
scratch_[i] = new Vector(sc->num_rows());
}
}
auto summary = ConjugateGradientsSolver<Vector>(
cg_options, *lhs, rhs(), *preconditioner, scratch_, cg_solution_);
VectorRef(solution, sc->num_rows()) = cg_solution_;
return summary;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
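The four scratch vectors threaded through the solve above are roughly what a preconditioned CG iteration needs: the residual, the preconditioned residual, the search direction, and A times the direction. For orientation, a textbook unpreconditioned CG in Eigen looks like this (illustrative only, not the Ceres ConjugateGradientsSolver):

#include <cmath>

#include <Eigen/Dense>

using Vector = Eigen::VectorXd;
using Matrix = Eigen::MatrixXd;

// Minimal conjugate gradients for a symmetric positive definite A.
Vector SolveCG(const Matrix& A, const Vector& b, int max_iter, double tol) {
  Vector x = Vector::Zero(b.size());
  Vector r = b;         // residual b - A x, with x = 0
  Vector p = r;         // search direction
  Vector Ap(b.size());  // scratch for A * p
  double rTr = r.squaredNorm();
  for (int i = 0; i < max_iter && std::sqrt(rTr) > tol; ++i) {
    Ap.noalias() = A * p;
    const double alpha = rTr / p.dot(Ap);
    x += alpha * p;
    r -= alpha * Ap;
    const double rTr_new = r.squaredNorm();
    p = r + (rTr_new / rTr) * p;
    rTr = rTr_new;
  }
  return x;
}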

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -54,8 +54,7 @@
#include "ceres/internal/disable_warnings.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class BlockSparseMatrix;
class SparseCholesky;
@@ -66,7 +65,7 @@ class SparseCholesky;
//
// E y + F z = b
//
// Where x = [y;z] is a partition of the variables. The paritioning
// Where x = [y;z] is a partition of the variables. The partitioning
// of the variables is such that, E'E is a block diagonal
// matrix. Further, the rows of A are ordered so that for every
// variable block in y, all the rows containing that variable block
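Eliminating y from the normal equations of this partitioned system gives the reduced system the solvers below work with; in the comment's notation (a standard derivation, not text from the commit):

\[
\begin{aligned}
\begin{bmatrix} E^\top E & E^\top F \\ F^\top E & F^\top F \end{bmatrix}
\begin{bmatrix} y \\ z \end{bmatrix}
&= \begin{bmatrix} E^\top b \\ F^\top b \end{bmatrix}, \\
S &= F^\top F - F^\top E\,(E^\top E)^{-1}E^\top F, \qquad
S\,z = F^\top b - F^\top E\,(E^\top E)^{-1}E^\top b, \\
y &= (E^\top E)^{-1}E^\top\!\left(b - F z\right).
\end{aligned}
\]

Because E'E is block diagonal, inverting it is cheap, which is what makes the elimination worthwhile.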
@@ -131,9 +130,8 @@ class CERES_NO_EXPORT SchurComplementSolver : public BlockSparseMatrixSolver {
}
const BlockRandomAccessMatrix* lhs() const { return lhs_.get(); }
BlockRandomAccessMatrix* mutable_lhs() { return lhs_.get(); }
void set_rhs(std::unique_ptr<double[]> rhs) { rhs_ = std::move(rhs); }
const double* rhs() const { return rhs_.get(); }
void ResizeRhs(int n) { rhs_.resize(n); }
const Vector& rhs() const { return rhs_; }
private:
virtual void InitStorage(const CompressedRowBlockStructure* bs) = 0;
@@ -145,7 +143,7 @@ class CERES_NO_EXPORT SchurComplementSolver : public BlockSparseMatrixSolver {
std::unique_ptr<SchurEliminatorBase> eliminator_;
std::unique_ptr<BlockRandomAccessMatrix> lhs_;
std::unique_ptr<double[]> rhs_;
Vector rhs_;
};
// Dense Cholesky factorization based solver.
@@ -185,14 +183,15 @@ class CERES_NO_EXPORT SparseSchurComplementSolver final
LinearSolver::Summary SolveReducedLinearSystemUsingConjugateGradients(
const LinearSolver::PerSolveOptions& per_solve_options, double* solution);
// Size of the blocks in the Schur complement.
std::vector<int> blocks_;
std::vector<Block> blocks_;
std::unique_ptr<SparseCholesky> sparse_cholesky_;
std::unique_ptr<BlockRandomAccessDiagonalMatrix> preconditioner_;
std::unique_ptr<CompressedRowSparseMatrix> crs_lhs_;
Vector cg_solution_;
Vector* scratch_[4] = {nullptr, nullptr, nullptr, nullptr};
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -44,8 +44,7 @@
#include "ceres/linear_solver.h"
#include "ceres/schur_eliminator.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
SchurEliminatorBase::~SchurEliminatorBase() = default;
@@ -161,5 +160,4 @@ std::unique_ptr<SchurEliminatorBase> SchurEliminatorBase::Create(
Eigen::Dynamic>>(options);
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2019 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -46,8 +46,7 @@
#include "ceres/internal/export.h"
#include "ceres/linear_solver.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Classes implementing the SchurEliminatorBase interface implement
// variable elimination for linear least squares problems. Assuming
@@ -169,9 +168,9 @@ class CERES_NO_EXPORT SchurEliminatorBase {
public:
virtual ~SchurEliminatorBase();
// Initialize the eliminator. It is the user's responsibilty to call
// Initialize the eliminator. It is the user's responsibility to call
// this function before calling Eliminate or BackSubstitute. It is
// also the caller's responsibilty to ensure that the
// also the caller's responsibility to ensure that the
// CompressedRowBlockStructure object passed to this method is the
// same one (or is equivalent to) the one associated with the
// BlockSparseMatrix objects below.
@@ -383,8 +382,9 @@ template <int kRowBlockSize = Eigen::Dynamic,
class CERES_NO_EXPORT SchurEliminatorForOneFBlock final
: public SchurEliminatorBase {
public:
// TODO(sameeragarwal) Find out why "assume_full_rank_ete" is not used here
void Init(int num_eliminate_blocks,
bool assume_full_rank_ete,
bool /*assume_full_rank_ete*/,
const CompressedRowBlockStructure* bs) override {
CHECK_GT(num_eliminate_blocks, 0)
<< "SchurComplementSolver cannot be initialized with "
@@ -447,7 +447,7 @@ class CERES_NO_EXPORT SchurEliminatorForOneFBlock final
const CompressedRowBlockStructure* bs = A.block_structure();
const double* values = A.values();
// Add the diagonal to the schur complement.
// Add the diagonal to the Schur complement.
if (D != nullptr) {
typename EigenTypes<kFBlockSize>::ConstVectorRef diag(
D + bs->cols[num_eliminate_blocks_].position, kFBlockSize);
@@ -479,7 +479,7 @@ class CERES_NO_EXPORT SchurEliminatorForOneFBlock final
const Chunk& chunk = chunks_[i];
const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
// Naming covention, e_t_e = e_block.transpose() * e_block;
// Naming convention, e_t_e = e_block.transpose() * e_block;
Eigen::Matrix<double, kEBlockSize, kEBlockSize> e_t_e;
Eigen::Matrix<double, kEBlockSize, kFBlockSize> e_t_f;
Eigen::Matrix<double, kEBlockSize, 1> e_t_b;
@@ -570,7 +570,7 @@ class CERES_NO_EXPORT SchurEliminatorForOneFBlock final
// y_i = e_t_e_inverse * sum_i e_i^T * (b_i - f_i * z);
void BackSubstitute(const BlockSparseMatrixData& A,
const double* b,
const double* D,
const double* /*D*/,
const double* z_ptr,
double* y) override {
typename EigenTypes<kFBlockSize>::ConstVectorRef z(z_ptr, kFBlockSize);
@@ -623,8 +623,7 @@ class CERES_NO_EXPORT SchurEliminatorForOneFBlock final
std::vector<double> e_t_e_inverse_matrices_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -69,8 +69,7 @@
#include "ceres/thread_token_provider.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::~SchurEliminator() {
@@ -107,7 +106,7 @@ void SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::Init(
}
// TODO(sameeragarwal): Now that we may have subset block structure,
// we need to make sure that we account for the fact that somep
// we need to make sure that we account for the fact that some
// point blocks only have a "diagonal" row and nothing more.
//
// This likely requires a slightly different algorithm, which works
@@ -206,8 +205,6 @@ void SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::Eliminate(
const int block_size = bs->cols[i].size;
typename EigenTypes<Eigen::Dynamic>::ConstVectorRef diag(
D + bs->cols[i].position, block_size);
std::lock_guard<std::mutex> l(cell_info->m);
MatrixRef m(cell_info->values, row_stride, col_stride);
m.block(r, c, block_size, block_size).diagonal() +=
diag.array().square().matrix();
@@ -301,7 +298,7 @@ void SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::Eliminate(
thread_id, bs, inverse_ete, buffer, chunk.buffer_layout, lhs);
});
// For rows with no e_blocks, the schur complement update reduces to
// For rows with no e_blocks, the Schur complement update reduces to
// S += F'F.
NoEBlockRowsUpdate(A, b, uneliminated_row_begins_, lhs, rhs);
}
@@ -410,7 +407,7 @@ void SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::UpdateRhs(
const int block_id = row.cells[c].block_id;
const int block_size = bs->cols[block_id].size;
const int block = block_id - num_eliminate_blocks_;
std::lock_guard<std::mutex> l(*rhs_locks_[block]);
auto lock = MakeConditionalLock(num_threads_, *rhs_locks_[block]);
// clang-format off
MatrixTransposeVectorMultiply<kRowBlockSize, kFBlockSize, 1>(
values + row.cells[c].position,
@@ -433,7 +430,7 @@ void SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::UpdateRhs(
//
// ete = y11 * y11' + y12 * y12'
//
// and the off diagonal blocks in the Guass Newton Hessian.
// and the off diagonal blocks in the Gauss Newton Hessian.
//
// buffer = [y11'(z11 + z12), y12' * z22, y11' * z51]
//
@@ -550,7 +547,7 @@ void SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
lhs->GetCell(block1, block2, &r, &c, &row_stride, &col_stride);
if (cell_info != nullptr) {
const int block2_size = bs->cols[it2->first].size;
std::lock_guard<std::mutex> l(cell_info->m);
auto lock = MakeConditionalLock(num_threads_, cell_info->m);
// clang-format off
MatrixMatrixMultiply
<kFBlockSize, kEBlockSize, kEBlockSize, kFBlockSize, -1>(
@@ -563,7 +560,7 @@ void SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
}
}
// For rows with no e_blocks, the schur complement update reduces to S
// For rows with no e_blocks, the Schur complement update reduces to S
// += F'F. This function iterates over the rows of A with no e_block,
// and calls NoEBlockRowOuterProduct on each row.
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
@@ -596,7 +593,7 @@ void SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
}
// A row r of A, which has no e_blocks gets added to the Schur
// Complement as S += r r'. This function is responsible for computing
// complement as S += r r'. This function is responsible for computing
// the contribution of a single row r to the Schur complement. It is
// very similar in structure to EBlockRowOuterProduct except for
// one difference. It does not use any of the template
@@ -627,7 +624,7 @@ void SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
CellInfo* cell_info =
lhs->GetCell(block1, block1, &r, &c, &row_stride, &col_stride);
if (cell_info != nullptr) {
std::lock_guard<std::mutex> l(cell_info->m);
auto lock = MakeConditionalLock(num_threads_, cell_info->m);
// This multiply currently ignores the fact that this is a
// symmetric outer product.
// clang-format off
@@ -648,7 +645,7 @@ void SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
lhs->GetCell(block1, block2, &r, &c, &row_stride, &col_stride);
if (cell_info != nullptr) {
const int block2_size = bs->cols[row.cells[j].block_id].size;
std::lock_guard<std::mutex> l(cell_info->m);
auto lock = MakeConditionalLock(num_threads_, cell_info->m);
// clang-format off
MatrixTransposeMatrixMultiply
<Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 1>(
@@ -682,7 +679,7 @@ void SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
CellInfo* cell_info =
lhs->GetCell(block1, block1, &r, &c, &row_stride, &col_stride);
if (cell_info != nullptr) {
std::lock_guard<std::mutex> l(cell_info->m);
auto lock = MakeConditionalLock(num_threads_, cell_info->m);
// block += b1.transpose() * b1;
// clang-format off
MatrixTransposeMatrixMultiply
@@ -703,7 +700,7 @@ void SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
lhs->GetCell(block1, block2, &r, &c, &row_stride, &col_stride);
if (cell_info != nullptr) {
// block += b1.transpose() * b2;
std::lock_guard<std::mutex> l(cell_info->m);
auto lock = MakeConditionalLock(num_threads_, cell_info->m);
// clang-format off
MatrixTransposeMatrixMultiply
<kRowBlockSize, kFBlockSize, kRowBlockSize, kFBlockSize, 1>(
@@ -716,7 +713,6 @@ void SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_SCHUR_ELIMINATOR_IMPL_H_
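The repeated switch from std::lock_guard to MakeConditionalLock lets the single-threaded path skip mutex traffic entirely. A plausible shape for such a helper, shown as an assumption rather than the actual Ceres implementation:

#include <mutex>

// Engages the mutex only when the eliminator actually runs multi-threaded;
// a default-constructed unique_lock owns nothing and costs nothing.
inline std::unique_lock<std::mutex> MakeConditionalLockSketch(int num_threads,
                                                              std::mutex& m) {
  return num_threads > 1 ? std::unique_lock<std::mutex>(m)
                         : std::unique_lock<std::mutex>();
}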

View File

@@ -0,0 +1,150 @@
# Ceres Solver - A fast non-linear least squares minimizer
# Copyright 2023 Google Inc. All rights reserved.
# http://ceres-solver.org/
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * Neither the name of Google Inc. nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: sameeragarwal@google.com (Sameer Agarwal)
#
# Script for explicitly generating template specialization of the
# SchurEliminator class. It is a rather large class
# and the number of explicit instantiations is also large. Explicitly
# generating these instantiations in separate .cc files breaks the
# compilation into separate compilation units rather than one large .cc
# file, which takes 2+ GB of RAM to compile.
#
# This script creates two sets of files.
#
# 1. schur_eliminator_x_x_x.cc
# where, the x indicates the template parameters and
#
# 2. schur_eliminator.cc
#
# that contains a factory function for instantiating these classes
# based on runtime parameters.
#
# The list of tuples, specializations, indicates the set of
# specializations that is generated.
# Set of template specializations to generate
HEADER = """// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)
//
// Template specialization of SchurEliminator.
//
// ========================================
// THIS FILE IS AUTOGENERATED. DO NOT EDIT.
// THIS FILE IS AUTOGENERATED. DO NOT EDIT.
// THIS FILE IS AUTOGENERATED. DO NOT EDIT.
// THIS FILE IS AUTOGENERATED. DO NOT EDIT.
//=========================================
//
// This file is generated using generate_template_specializations.py.
"""
DYNAMIC_FILE = """
#include "ceres/schur_eliminator_impl.h"
namespace ceres::internal {
template class SchurEliminator<%s, %s, %s>;
} // namespace ceres::internal
"""
SPECIALIZATION_FILE = """
// This include must come before any #ifndef check on Ceres compile options.
#include "ceres/internal/config.h"
#ifndef CERES_RESTRICT_SCHUR_SPECIALIZATION
#include "ceres/schur_eliminator_impl.h"
namespace ceres::internal {
template class SchurEliminator<%s, %s, %s>;
} // namespace ceres::internal
#endif // CERES_RESTRICT_SCHUR_SPECIALIZATION
"""
FACTORY_FILE_HEADER = """
#include <memory>
#include "ceres/linear_solver.h"
#include "ceres/schur_eliminator.h"
namespace ceres::internal {
SchurEliminatorBase::~SchurEliminatorBase() = default;
std::unique_ptr<SchurEliminatorBase> SchurEliminatorBase::Create(
const LinearSolver::Options& options) {
#ifndef CERES_RESTRICT_SCHUR_SPECIALIZATION
"""
FACTORY = """ return std::make_unique<SchurEliminator<%s, %s, %s>>(options);"""
FACTORY_FOOTER = """
#endif
VLOG(1) << "Template specializations not found for <"
<< options.row_block_size << "," << options.e_block_size << ","
<< options.f_block_size << ">";
return std::make_unique<SchurEliminator<Eigen::Dynamic,
Eigen::Dynamic,
Eigen::Dynamic>>(options);
}
} // namespace ceres::internal
"""

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -30,6 +30,7 @@
#include "ceres/schur_jacobi_preconditioner.h"
#include <memory>
#include <utility>
#include <vector>
@@ -39,8 +40,7 @@
#include "ceres/schur_eliminator.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
SchurJacobiPreconditioner::SchurJacobiPreconditioner(
const CompressedRowBlockStructure& bs, Preconditioner::Options options)
@@ -52,12 +52,16 @@ SchurJacobiPreconditioner::SchurJacobiPreconditioner(
<< "SCHUR_JACOBI preconditioner.";
CHECK(options_.context != nullptr);
std::vector<int> blocks(num_blocks);
std::vector<Block> blocks(num_blocks);
int position = 0;
for (int i = 0; i < num_blocks; ++i) {
blocks[i] = bs.cols[i + options_.elimination_groups[0]].size;
blocks[i] =
Block(bs.cols[i + options_.elimination_groups[0]].size, position);
position += blocks[i].size;
}
m_ = std::make_unique<BlockRandomAccessDiagonalMatrix>(blocks);
m_ = std::make_unique<BlockRandomAccessDiagonalMatrix>(
blocks, options_.context, options_.num_threads);
InitEliminator(bs);
}
@@ -92,12 +96,11 @@ bool SchurJacobiPreconditioner::UpdateImpl(const BlockSparseMatrix& A,
return true;
}
void SchurJacobiPreconditioner::RightMultiply(const double* x,
double* y) const {
m_->RightMultiply(x, y);
void SchurJacobiPreconditioner::RightMultiplyAndAccumulate(const double* x,
double* y) const {
m_->RightMultiplyAndAccumulate(x, y);
}
int SchurJacobiPreconditioner::num_rows() const { return m_->num_rows(); }
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -47,8 +47,7 @@
#include "ceres/internal/export.h"
#include "ceres/preconditioner.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class BlockRandomAccessDiagonalMatrix;
class BlockSparseMatrix;
@@ -72,8 +71,10 @@ class SchurEliminatorBase;
// SchurJacobiPreconditioner preconditioner(
// *A.block_structure(), options);
// preconditioner.Update(A, nullptr);
// preconditioner.RightMultiply(x, y);
// preconditioner.RightMultiplyAndAccumulate(x, y);
//
// TODO(https://github.com/ceres-solver/ceres-solver/issues/935):
// SchurJacobiPreconditioner::RightMultiply will benefit from multithreading
class CERES_NO_EXPORT SchurJacobiPreconditioner
: public BlockSparseMatrixPreconditioner {
public:
@@ -91,7 +92,7 @@ class CERES_NO_EXPORT SchurJacobiPreconditioner
~SchurJacobiPreconditioner() override;
// Preconditioner interface.
void RightMultiply(const double* x, double* y) const final;
void RightMultiplyAndAccumulate(const double* x, double* y) const final;
int num_rows() const final;
private:
@@ -104,8 +105,7 @@ class CERES_NO_EXPORT SchurJacobiPreconditioner
std::unique_ptr<BlockRandomAccessDiagonalMatrix> m_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -36,14 +36,12 @@
#include "ceres/internal/export.h"
#include "ceres/linear_solver.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
CERES_NO_EXPORT
void GetBestSchurTemplateSpecialization(int* row_block_size,
int* e_block_size,
int* f_block_size);
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_SCHUR_TEMPLATES_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -34,8 +34,7 @@
#include "ceres/internal/export.h"
#include "ceres/thread_token_provider.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Helper class for ThreadTokenProvider. This object acquires a token in its
// constructor and puts that token back with destruction.
@@ -55,7 +54,6 @@ class CERES_NO_EXPORT ScopedThreadToken {
int token_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_SCOPED_THREAD_TOKEN_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -36,23 +36,22 @@
#include "ceres/program.h"
#include "ceres/residual_block.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
std::unique_ptr<ScratchEvaluatePreparer[]> ScratchEvaluatePreparer::Create(
const Program& program, int num_threads) {
const Program& program, unsigned num_threads) {
auto preparers = std::make_unique<ScratchEvaluatePreparer[]>(num_threads);
int max_derivatives_per_residual_block =
program.MaxDerivativesPerResidualBlock();
for (int i = 0; i < num_threads; i++) {
for (unsigned i = 0; i < num_threads; i++) {
preparers[i].Init(max_derivatives_per_residual_block);
}
return preparers;
}
void ScratchEvaluatePreparer::Init(int max_derivatives_per_residual_block) {
jacobian_scratch_ =
std::make_unique<double[]>(max_derivatives_per_residual_block);
jacobian_scratch_ = std::make_unique<double[]>(
static_cast<std::size_t>(max_derivatives_per_residual_block));
}
// Point the Jacobian blocks into the scratch area of this evaluate preparer.
@@ -75,5 +74,4 @@ void ScratchEvaluatePreparer::Prepare(const ResidualBlock* residual_block,
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,8 +40,7 @@
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class Program;
class ResidualBlock;
@@ -51,7 +50,7 @@ class CERES_NO_EXPORT ScratchEvaluatePreparer {
public:
// Create num_threads ScratchEvaluatePreparers.
static std::unique_ptr<ScratchEvaluatePreparer[]> Create(
const Program& program, int num_threads);
const Program& program, unsigned num_threads);
// EvaluatePreparer interface
void Init(int max_derivatives_per_residual_block);
@@ -66,8 +65,7 @@ class CERES_NO_EXPORT ScratchEvaluatePreparer {
std::unique_ptr<double[]> jacobian_scratch_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -36,8 +36,7 @@
#include "ceres/graph.h"
#include "ceres/graph_algorithms.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
int ComputeSingleLinkageClustering(
const SingleLinkageClusteringOptions& options,
@@ -91,5 +90,4 @@ int ComputeSingleLinkageClustering(
return num_clusters;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -37,8 +37,7 @@
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
struct SingleLinkageClusteringOptions {
// Graph edges with edge weight less than min_similarity are ignored
@@ -61,8 +60,7 @@ CERES_NO_EXPORT int ComputeSingleLinkageClustering(
const WeightedGraph<int>& graph,
std::unordered_map<int, int>* membership);
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,8 +40,7 @@
#include "glog/logging.h"
#include "small_blas_generic.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// The following three macros are used to share code and reduce
// template junk across the various GEMM variants.
@@ -561,7 +560,6 @@ inline void MatrixTransposeVectorMultiply(const double* A,
#undef CERES_GEMM_STORE_SINGLE
#undef CERES_GEMM_STORE_PAIR
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_SMALL_BLAS_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -35,38 +35,35 @@
#ifndef CERES_INTERNAL_SMALL_BLAS_GENERIC_H_
#define CERES_INTERNAL_SMALL_BLAS_GENERIC_H_
namespace ceres {
namespace internal {
namespace ceres::internal {
// The following macros are used to share code
#define CERES_GEMM_OPT_NAIVE_HEADER \
double c0 = 0.0; \
double c1 = 0.0; \
double c2 = 0.0; \
double c3 = 0.0; \
const double* pa = a; \
const double* pb = b; \
const int span = 4; \
int col_r = col_a & (span - 1); \
#define CERES_GEMM_OPT_NAIVE_HEADER \
double cvec4[4] = {0.0, 0.0, 0.0, 0.0}; \
const double* pa = a; \
const double* pb = b; \
const int span = 4; \
int col_r = col_a & (span - 1); \
int col_m = col_a - col_r;
#define CERES_GEMM_OPT_STORE_MAT1X4 \
if (kOperation > 0) { \
*c++ += c0; \
*c++ += c1; \
*c++ += c2; \
*c++ += c3; \
c[0] += cvec4[0]; \
c[1] += cvec4[1]; \
c[2] += cvec4[2]; \
c[3] += cvec4[3]; \
} else if (kOperation < 0) { \
*c++ -= c0; \
*c++ -= c1; \
*c++ -= c2; \
*c++ -= c3; \
c[0] -= cvec4[0]; \
c[1] -= cvec4[1]; \
c[2] -= cvec4[2]; \
c[3] -= cvec4[3]; \
} else { \
*c++ = c0; \
*c++ = c1; \
*c++ = c2; \
*c++ = c3; \
}
c[0] = cvec4[0]; \
c[1] = cvec4[1]; \
c[2] = cvec4[2]; \
c[3] = cvec4[3]; \
} \
c += 4;
// Matrix-Matrix Multiplication
// Figure out 1x4 of Matrix C in one batch
@@ -100,10 +97,10 @@ static inline void MMM_mat1x4(const int col_a,
#define CERES_GEMM_OPT_MMM_MAT1X4_MUL \
av = pa[k]; \
pb = b + bi; \
c0 += av * pb[0]; \
c1 += av * pb[1]; \
c2 += av * pb[2]; \
c3 += av * pb[3]; \
cvec4[0] += av * pb[0]; \
cvec4[1] += av * pb[1]; \
cvec4[2] += av * pb[2]; \
cvec4[3] += av * pb[3]; \
pb += 4; \
bi += col_stride_b; \
k++;
@@ -168,10 +165,10 @@ static inline void MTM_mat1x4(const int col_a,
#define CERES_GEMM_OPT_MTM_MAT1X4_MUL \
av = pa[ai]; \
pb = b + bi; \
c0 += av * pb[0]; \
c1 += av * pb[1]; \
c2 += av * pb[2]; \
c3 += av * pb[3]; \
cvec4[0] += av * pb[0]; \
cvec4[1] += av * pb[1]; \
cvec4[2] += av * pb[2]; \
cvec4[3] += av * pb[3]; \
pb += 4; \
ai += col_stride_a; \
bi += col_stride_b;
@@ -221,13 +218,13 @@ static inline void MVM_mat4x1(const int col_a,
double bv = 0.0;
// clang-format off
#define CERES_GEMM_OPT_MVM_MAT4X1_MUL \
bv = *pb; \
c0 += *(pa ) * bv; \
c1 += *(pa + col_stride_a ) * bv; \
c2 += *(pa + col_stride_a * 2) * bv; \
c3 += *(pa + col_stride_a * 3) * bv; \
pa++; \
#define CERES_GEMM_OPT_MVM_MAT4X1_MUL \
bv = *pb; \
cvec4[0] += *(pa ) * bv; \
cvec4[1] += *(pa + col_stride_a ) * bv; \
cvec4[2] += *(pa + col_stride_a * 2) * bv; \
cvec4[3] += *(pa + col_stride_a * 3) * bv; \
pa++; \
pb++;
// clang-format on
@@ -285,16 +282,14 @@ static inline void MTV_mat4x1(const int col_a,
CERES_GEMM_OPT_NAIVE_HEADER
double bv = 0.0;
// clang-format off
#define CERES_GEMM_OPT_MTV_MAT4X1_MUL \
bv = *pb; \
c0 += *(pa ) * bv; \
c1 += *(pa + 1) * bv; \
c2 += *(pa + 2) * bv; \
c3 += *(pa + 3) * bv; \
cvec4[0] += pa[0] * bv; \
cvec4[1] += pa[1] * bv; \
cvec4[2] += pa[2] * bv; \
cvec4[3] += pa[3] * bv; \
pa += col_stride_a; \
pb++;
// clang-format on
for (int k = 0; k < col_m; k += span) {
CERES_GEMM_OPT_MTV_MAT4X1_MUL
@@ -315,7 +310,6 @@ static inline void MTV_mat4x1(const int col_a,
#undef CERES_GEMM_OPT_NAIVE_HEADER
#undef CERES_GEMM_OPT_STORE_MAT1X4
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_SMALL_BLAS_GENERIC_H_

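The change above replaces the four independent scalar accumulators (c0..c3) with a single four-element array, cvec4, and hoists the pointer advance (c += 4) out of the add/subtract/store branches. Keeping the partial sums in one array makes the reduction pattern easier for compilers to auto-vectorize. A minimal, self-contained sketch of the same accumulation pattern (the function name and driver loop are illustrative, not part of the diff):

#include <cstddef>

// y[0..3] += A^T * x for a row-major (rows x 4) matrix A. The four
// partial sums live in one array so the compiler can keep them in a
// single vector register across the loop.
void Mat1x4TransposeAccumulate(const double* A, const double* x,
                               std::size_t rows, double y[4]) {
  double acc[4] = {0.0, 0.0, 0.0, 0.0};
  for (std::size_t k = 0; k < rows; ++k) {
    const double av = x[k];
    acc[0] += av * A[4 * k + 0];
    acc[1] += av * A[4 * k + 1];
    acc[2] += av * A[4 * k + 2];
    acc[3] += av * A[4 * k + 3];
  }
  for (int i = 0; i < 4; ++i) {
    y[i] += acc[i];
  }
}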
View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,14 +32,17 @@
#include "ceres/solver.h"
#include <algorithm>
#include <map>
#include <memory>
#include <sstream> // NOLINT
#include <string>
#include <vector>
#include "ceres/casts.h"
#include "ceres/context.h"
#include "ceres/context_impl.h"
#include "ceres/detect_structure.h"
#include "ceres/eigensparse.h"
#include "ceres/gradient_checking_cost_function.h"
#include "ceres/internal/export.h"
#include "ceres/parameter_block_ordering.h"
@@ -50,6 +53,7 @@
#include "ceres/schur_templates.h"
#include "ceres/solver_utils.h"
#include "ceres/stringprintf.h"
#include "ceres/suitesparse.h"
#include "ceres/types.h"
#include "ceres/wall_time.h"
@@ -58,32 +62,29 @@ namespace {
using internal::StringAppendF;
using internal::StringPrintf;
using std::map;
using std::string;
using std::vector;
#define OPTION_OP(x, y, OP) \
if (!(options.x OP y)) { \
std::stringstream ss; \
ss << "Invalid configuration. "; \
ss << string("Solver::Options::" #x " = ") << options.x << ". "; \
ss << "Violated constraint: "; \
ss << string("Solver::Options::" #x " " #OP " " #y); \
*error = ss.str(); \
return false; \
#define OPTION_OP(x, y, OP) \
if (!(options.x OP y)) { \
std::stringstream ss; \
ss << "Invalid configuration. "; \
ss << std::string("Solver::Options::" #x " = ") << options.x << ". "; \
ss << "Violated constraint: "; \
ss << std::string("Solver::Options::" #x " " #OP " " #y); \
*error = ss.str(); \
return false; \
}
#define OPTION_OP_OPTION(x, y, OP) \
if (!(options.x OP options.y)) { \
std::stringstream ss; \
ss << "Invalid configuration. "; \
ss << string("Solver::Options::" #x " = ") << options.x << ". "; \
ss << string("Solver::Options::" #y " = ") << options.y << ". "; \
ss << "Violated constraint: "; \
ss << string("Solver::Options::" #x); \
ss << string(#OP " Solver::Options::" #y "."); \
*error = ss.str(); \
return false; \
#define OPTION_OP_OPTION(x, y, OP) \
if (!(options.x OP options.y)) { \
std::stringstream ss; \
ss << "Invalid configuration. "; \
ss << std::string("Solver::Options::" #x " = ") << options.x << ". "; \
ss << std::string("Solver::Options::" #y " = ") << options.y << ". "; \
ss << "Violated constraint: "; \
ss << std::string("Solver::Options::" #x); \
ss << std::string(#OP " Solver::Options::" #y "."); \
*error = ss.str(); \
return false; \
}
#define OPTION_GE(x, y) OPTION_OP(x, y, >=);
@@ -93,7 +94,7 @@ using std::vector;
#define OPTION_LE_OPTION(x, y) OPTION_OP_OPTION(x, y, <=)
#define OPTION_LT_OPTION(x, y) OPTION_OP_OPTION(x, y, <)
bool CommonOptionsAreValid(const Solver::Options& options, string* error) {
bool CommonOptionsAreValid(const Solver::Options& options, std::string* error) {
OPTION_GE(max_num_iterations, 0);
OPTION_GE(max_solver_time_in_seconds, 0.0);
OPTION_GE(function_tolerance, 0.0);
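For reference, a single instantiation of the macro above, OPTION_GE(max_num_iterations, 0), hand-expands to roughly the following (with the stringized tokens spelled out):

if (!(options.max_num_iterations >= 0)) {
  std::stringstream ss;
  ss << "Invalid configuration. ";
  ss << std::string("Solver::Options::max_num_iterations = ")
     << options.max_num_iterations << ". ";
  ss << "Violated constraint: ";
  ss << std::string("Solver::Options::max_num_iterations >= 0");
  *error = ss.str();
  return false;
}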
@@ -107,7 +108,286 @@ bool CommonOptionsAreValid(const Solver::Options& options, string* error) {
return true;
}
bool TrustRegionOptionsAreValid(const Solver::Options& options, string* error) {
bool IsNestedDissectionAvailable(SparseLinearAlgebraLibraryType type) {
return (((type == SUITE_SPARSE) &&
internal::SuiteSparse::IsNestedDissectionAvailable()) ||
(type == ACCELERATE_SPARSE) ||
((type == EIGEN_SPARSE) &&
internal::EigenSparse::IsNestedDissectionAvailable()));
}
bool IsIterativeSolver(LinearSolverType type) {
return (type == CGNR || type == ITERATIVE_SCHUR);
}
bool OptionsAreValidForDenseSolver(const Solver::Options& options,
std::string* error) {
const char* library_name = DenseLinearAlgebraLibraryTypeToString(
options.dense_linear_algebra_library_type);
const char* solver_name =
LinearSolverTypeToString(options.linear_solver_type);
constexpr char kFormat[] =
"Can't use %s with dense_linear_algebra_library_type = %s "
"because support was not enabled when Ceres was built.";
if (!IsDenseLinearAlgebraLibraryTypeAvailable(
options.dense_linear_algebra_library_type)) {
*error = StringPrintf(kFormat, solver_name, library_name);
return false;
}
return true;
}
bool OptionsAreValidForSparseCholeskyBasedSolver(const Solver::Options& options,
std::string* error) {
const char* library_name = SparseLinearAlgebraLibraryTypeToString(
options.sparse_linear_algebra_library_type);
// Sparse factorization based solvers and some preconditioners require a
// sparse Cholesky factorization.
const char* solver_name =
IsIterativeSolver(options.linear_solver_type)
? PreconditionerTypeToString(options.preconditioner_type)
: LinearSolverTypeToString(options.linear_solver_type);
constexpr char kNoSparseFormat[] =
"Can't use %s with sparse_linear_algebra_library_type = %s.";
constexpr char kNoLibraryFormat[] =
"Can't use %s with sparse_linear_algebra_library_type = %s, because support "
"was not enabled when Ceres Solver was built.";
constexpr char kNoNesdisFormat[] =
"NESDIS is not available with sparse_linear_algebra_library_type = %s.";
constexpr char kMixedFormat[] =
"use_mixed_precision_solves with %s is not supported with "
"sparse_linear_algebra_library_type = %s";
constexpr char kDynamicSparsityFormat[] =
"dynamic sparsity is not supported with "
"sparse_linear_algebra_library_type = %s";
if (options.sparse_linear_algebra_library_type == NO_SPARSE) {
*error = StringPrintf(kNoSparseFormat, solver_name, library_name);
return false;
}
if (!IsSparseLinearAlgebraLibraryTypeAvailable(
options.sparse_linear_algebra_library_type)) {
*error = StringPrintf(kNoLibraryFormat, solver_name, library_name);
return false;
}
if (options.linear_solver_ordering_type == ceres::NESDIS &&
!IsNestedDissectionAvailable(
options.sparse_linear_algebra_library_type)) {
*error = StringPrintf(kNoNesdisFormat, library_name);
return false;
}
if (options.use_mixed_precision_solves &&
options.sparse_linear_algebra_library_type == SUITE_SPARSE) {
*error = StringPrintf(kMixedFormat, solver_name, library_name);
return false;
}
if (options.dynamic_sparsity &&
options.sparse_linear_algebra_library_type == ACCELERATE_SPARSE) {
*error = StringPrintf(kDynamicSparsityFormat, library_name);
return false;
}
return true;
}
bool OptionsAreValidForDenseNormalCholesky(const Solver::Options& options,
std::string* error) {
CHECK_EQ(options.linear_solver_type, DENSE_NORMAL_CHOLESKY);
return OptionsAreValidForDenseSolver(options, error);
}
bool OptionsAreValidForDenseQr(const Solver::Options& options,
std::string* error) {
CHECK_EQ(options.linear_solver_type, DENSE_QR);
if (!OptionsAreValidForDenseSolver(options, error)) {
return false;
}
if (options.use_mixed_precision_solves) {
*error = "Can't use use_mixed_precision_solves with DENSE_QR.";
return false;
}
return true;
}
bool OptionsAreValidForSparseNormalCholesky(const Solver::Options& options,
std::string* error) {
CHECK_EQ(options.linear_solver_type, SPARSE_NORMAL_CHOLESKY);
return OptionsAreValidForSparseCholeskyBasedSolver(options, error);
}
bool OptionsAreValidForDenseSchur(const Solver::Options& options,
std::string* error) {
CHECK_EQ(options.linear_solver_type, DENSE_SCHUR);
if (options.dynamic_sparsity) {
*error = "dynamic sparsity is only supported with SPARSE_NORMAL_CHOLESKY";
return false;
}
if (!OptionsAreValidForDenseSolver(options, error)) {
return false;
}
return true;
}
bool OptionsAreValidForSparseSchur(const Solver::Options& options,
std::string* error) {
CHECK_EQ(options.linear_solver_type, SPARSE_SCHUR);
if (options.dynamic_sparsity) {
*error = "Dynamic sparsity is only supported with SPARSE_NORMAL_CHOLESKY.";
return false;
}
return OptionsAreValidForSparseCholeskyBasedSolver(options, error);
}
bool OptionsAreValidForIterativeSchur(const Solver::Options& options,
std::string* error) {
CHECK_EQ(options.linear_solver_type, ITERATIVE_SCHUR);
if (options.dynamic_sparsity) {
*error = "Dynamic sparsity is only supported with SPARSE_NORMAL_CHOLESKY.";
return false;
}
if (options.use_explicit_schur_complement) {
if (options.preconditioner_type != SCHUR_JACOBI) {
*error =
"use_explicit_schur_complement only supports "
"SCHUR_JACOBI as the preconditioner.";
return false;
}
if (options.use_spse_initialization) {
*error =
"use_explicit_schur_complement does not support "
"use_spse_initialization.";
return false;
}
}
if (options.use_spse_initialization ||
options.preconditioner_type == SCHUR_POWER_SERIES_EXPANSION) {
OPTION_GE(max_num_spse_iterations, 1)
OPTION_GE(spse_tolerance, 0.0)
}
if (options.use_mixed_precision_solves) {
*error = "Can't use use_mixed_precision_solves with ITERATIVE_SCHUR";
return false;
}
if (options.dynamic_sparsity) {
*error = "Dynamic sparsity is only supported with SPARSE_NORMAL_CHOLESKY.";
return false;
}
if (options.preconditioner_type == SUBSET) {
*error = "Can't use SUBSET preconditioner with ITERATIVE_SCHUR";
return false;
}
// CLUSTER_JACOBI and CLUSTER_TRIDIAGONAL require sparse Cholesky
// factorization.
if (options.preconditioner_type == CLUSTER_JACOBI ||
options.preconditioner_type == CLUSTER_TRIDIAGONAL) {
return OptionsAreValidForSparseCholeskyBasedSolver(options, error);
}
return true;
}
bool OptionsAreValidForCgnr(const Solver::Options& options,
std::string* error) {
CHECK_EQ(options.linear_solver_type, CGNR);
if (options.preconditioner_type != IDENTITY &&
options.preconditioner_type != JACOBI &&
options.preconditioner_type != SUBSET) {
*error =
StringPrintf("Can't use CGNR with preconditioner_type = %s.",
PreconditionerTypeToString(options.preconditioner_type));
return false;
}
if (options.use_mixed_precision_solves) {
*error = "use_mixed_precision_solves cannot be used with CGNR";
return false;
}
if (options.dynamic_sparsity) {
*error = "Dynamic sparsity is only supported with SPARSE_NORMAL_CHOLESKY.";
return false;
}
if (options.preconditioner_type == SUBSET) {
if (options.sparse_linear_algebra_library_type == CUDA_SPARSE) {
*error =
"Can't use CGNR with preconditioner_type = SUBSET when "
"sparse_linear_algebra_library_type = CUDA_SPARSE.";
return false;
}
if (options.residual_blocks_for_subset_preconditioner.empty()) {
*error =
"When using SUBSET preconditioner, "
"residual_blocks_for_subset_preconditioner cannot be empty";
return false;
}
// SUBSET preconditioner requires sparse Cholesky factorization.
if (!OptionsAreValidForSparseCholeskyBasedSolver(options, error)) {
return false;
}
}
// Check options for CGNR with CUDA_SPARSE.
if (options.sparse_linear_algebra_library_type == CUDA_SPARSE) {
if (!IsSparseLinearAlgebraLibraryTypeAvailable(CUDA_SPARSE)) {
*error =
"Can't use CGNR with sparse_linear_algebra_library_type = "
"CUDA_SPARSE because support was not enabled when Ceres was built.";
return false;
}
}
return true;
}
bool OptionsAreValidForLinearSolver(const Solver::Options& options,
std::string* error) {
switch (options.linear_solver_type) {
case DENSE_NORMAL_CHOLESKY:
return OptionsAreValidForDenseNormalCholesky(options, error);
case DENSE_QR:
return OptionsAreValidForDenseQr(options, error);
case SPARSE_NORMAL_CHOLESKY:
return OptionsAreValidForSparseNormalCholesky(options, error);
case DENSE_SCHUR:
return OptionsAreValidForDenseSchur(options, error);
case SPARSE_SCHUR:
return OptionsAreValidForSparseSchur(options, error);
case ITERATIVE_SCHUR:
return OptionsAreValidForIterativeSchur(options, error);
case CGNR:
return OptionsAreValidForCgnr(options, error);
default:
LOG(FATAL) << "Congratulations you have found a bug. Please report "
"this to the "
"Ceres Solver developers. Unknown linear solver type: "
<< LinearSolverTypeToString(options.linear_solver_type);
}
return false;
}
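The net effect of this refactoring is that every linear solver type now has a dedicated validator, so an invalid combination fails fast with a targeted message. A small usage sketch (the option values are chosen purely for illustration; the error text matches OptionsAreValidForDenseQr above):

ceres::Solver::Options options;
options.linear_solver_type = ceres::DENSE_QR;
options.use_mixed_precision_solves = true;

std::string error;
if (!options.IsValid(&error)) {
  // error == "Can't use use_mixed_precision_solves with DENSE_QR."
}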
bool TrustRegionOptionsAreValid(const Solver::Options& options,
std::string* error) {
OPTION_GT(initial_trust_region_radius, 0.0);
OPTION_GT(min_trust_region_radius, 0.0);
OPTION_GT(max_trust_region_radius, 0.0);
@@ -121,7 +401,7 @@ bool TrustRegionOptionsAreValid(const Solver::Options& options, string* error) {
OPTION_GE(max_num_consecutive_invalid_steps, 0);
OPTION_GT(eta, 0.0);
OPTION_GE(min_linear_solver_iterations, 0);
OPTION_GE(max_linear_solver_iterations, 1);
OPTION_GE(max_linear_solver_iterations, 0);
OPTION_LE_OPTION(min_linear_solver_iterations, max_linear_solver_iterations);
if (options.use_inner_iterations) {
@@ -132,80 +412,19 @@ bool TrustRegionOptionsAreValid(const Solver::Options& options, string* error) {
OPTION_GT(max_consecutive_nonmonotonic_steps, 0);
}
if (options.linear_solver_type == ITERATIVE_SCHUR &&
options.use_explicit_schur_complement &&
options.preconditioner_type != SCHUR_JACOBI) {
if ((options.trust_region_strategy_type == DOGLEG) &&
IsIterativeSolver(options.linear_solver_type)) {
*error =
"use_explicit_schur_complement only supports "
"SCHUR_JACOBI as the preconditioner.";
"DOGLEG only supports exact factorization based linear "
"solvers. If you want to use an iterative solver please "
"use LEVENBERG_MARQUARDT as the trust_region_strategy_type";
return false;
}
if (!IsDenseLinearAlgebraLibraryTypeAvailable(
options.dense_linear_algebra_library_type) &&
(options.linear_solver_type == DENSE_NORMAL_CHOLESKY ||
options.linear_solver_type == DENSE_QR ||
options.linear_solver_type == DENSE_SCHUR)) {
*error = StringPrintf(
"Can't use %s with "
"Solver::Options::dense_linear_algebra_library_type = %s "
"because %s was not enabled when Ceres was built.",
LinearSolverTypeToString(options.linear_solver_type),
DenseLinearAlgebraLibraryTypeToString(
options.dense_linear_algebra_library_type),
DenseLinearAlgebraLibraryTypeToString(
options.dense_linear_algebra_library_type));
if (!OptionsAreValidForLinearSolver(options, error)) {
return false;
}
{
const char* sparse_linear_algebra_library_name =
SparseLinearAlgebraLibraryTypeToString(
options.sparse_linear_algebra_library_type);
const char* name = nullptr;
if (options.linear_solver_type == SPARSE_NORMAL_CHOLESKY ||
options.linear_solver_type == SPARSE_SCHUR) {
name = LinearSolverTypeToString(options.linear_solver_type);
} else if ((options.linear_solver_type == ITERATIVE_SCHUR &&
(options.preconditioner_type == CLUSTER_JACOBI ||
options.preconditioner_type == CLUSTER_TRIDIAGONAL)) ||
(options.linear_solver_type == CGNR &&
options.preconditioner_type == SUBSET)) {
name = PreconditionerTypeToString(options.preconditioner_type);
}
if (name) {
if (options.sparse_linear_algebra_library_type == NO_SPARSE) {
*error = StringPrintf(
"Can't use %s with "
"Solver::Options::sparse_linear_algebra_library_type = %s.",
name,
sparse_linear_algebra_library_name);
return false;
} else if (!IsSparseLinearAlgebraLibraryTypeAvailable(
options.sparse_linear_algebra_library_type)) {
*error = StringPrintf(
"Can't use %s with "
"Solver::Options::sparse_linear_algebra_library_type = %s, "
"because support was not enabled when Ceres Solver was built.",
name,
sparse_linear_algebra_library_name);
return false;
}
}
}
if (options.trust_region_strategy_type == DOGLEG) {
if (options.linear_solver_type == ITERATIVE_SCHUR ||
options.linear_solver_type == CGNR) {
*error =
"DOGLEG only supports exact factorization based linear "
"solvers. If you want to use an iterative solver please "
"use LEVENBERG_MARQUARDT as the trust_region_strategy_type";
return false;
}
}
if (!options.trust_region_minimizer_iterations_to_dump.empty() &&
options.trust_region_problem_dump_format_type != CONSOLE &&
options.trust_region_problem_dump_directory.empty()) {
@@ -213,33 +432,11 @@ bool TrustRegionOptionsAreValid(const Solver::Options& options, string* error) {
return false;
}
if (options.dynamic_sparsity) {
if (options.linear_solver_type != SPARSE_NORMAL_CHOLESKY) {
*error =
"Dynamic sparsity is only supported with SPARSE_NORMAL_CHOLESKY.";
return false;
}
if (options.sparse_linear_algebra_library_type == ACCELERATE_SPARSE) {
*error =
"ACCELERATE_SPARSE is not currently supported with dynamic sparsity.";
return false;
}
}
if (options.linear_solver_type == CGNR &&
options.preconditioner_type == SUBSET &&
options.residual_blocks_for_subset_preconditioner.empty()) {
*error =
"When using SUBSET preconditioner, "
"Solver::Options::residual_blocks_for_subset_preconditioner cannot be "
"empty";
return false;
}
return true;
}
bool LineSearchOptionsAreValid(const Solver::Options& options, string* error) {
bool LineSearchOptionsAreValid(const Solver::Options& options,
std::string* error) {
OPTION_GT(max_lbfgs_rank, 0);
OPTION_GT(min_line_search_step_size, 0.0);
OPTION_GT(max_line_search_step_contraction, 0.0);
@@ -259,9 +456,10 @@ bool LineSearchOptionsAreValid(const Solver::Options& options, string* error) {
options.line_search_direction_type == ceres::LBFGS) &&
options.line_search_type != ceres::WOLFE) {
*error =
string("Invalid configuration: Solver::Options::line_search_type = ") +
string(LineSearchTypeToString(options.line_search_type)) +
string(
std::string(
"Invalid configuration: Solver::Options::line_search_type = ") +
std::string(LineSearchTypeToString(options.line_search_type)) +
std::string(
". When using (L)BFGS, "
"Solver::Options::line_search_type must be set to WOLFE.");
return false;
@@ -269,8 +467,8 @@ bool LineSearchOptionsAreValid(const Solver::Options& options, string* error) {
// Warn user if they have requested BISECTION interpolation, but constraints
// on max/min step size change during line search prevent bisection scaling
// from occurring. Warn only, as this is likely a user mistake, but one which
// does not prevent us from continuing.
// from occurring. Warn only, as this is likely a user mistake, but one
// which does not prevent us from continuing.
if (options.line_search_interpolation_type == ceres::BISECTION &&
(options.max_line_search_step_contraction > 0.5 ||
options.min_line_search_step_contraction < 0.5)) {
@@ -295,7 +493,7 @@ bool LineSearchOptionsAreValid(const Solver::Options& options, string* error) {
#undef OPTION_LE_OPTION
#undef OPTION_LT_OPTION
void StringifyOrdering(const vector<int>& ordering, string* report) {
void StringifyOrdering(const std::vector<int>& ordering, std::string* report) {
if (ordering.empty()) {
internal::StringAppendF(report, "AUTOMATIC");
return;
@@ -339,7 +537,7 @@ void PreSolveSummarize(const Solver::Options& options,
&(summary->inner_iteration_ordering_given));
// clang-format off
summary->dense_linear_algebra_library_type = options.dense_linear_algebra_library_type; // NOLINT
summary->dense_linear_algebra_library_type = options.dense_linear_algebra_library_type;
summary->dogleg_type = options.dogleg_type;
summary->inner_iteration_time_in_seconds = 0.0;
summary->num_line_search_steps = 0;
@@ -348,18 +546,19 @@ void PreSolveSummarize(const Solver::Options& options,
summary->line_search_polynomial_minimization_time_in_seconds = 0.0;
summary->line_search_total_time_in_seconds = 0.0;
summary->inner_iterations_given = options.use_inner_iterations;
summary->line_search_direction_type = options.line_search_direction_type; // NOLINT
summary->line_search_interpolation_type = options.line_search_interpolation_type; // NOLINT
summary->line_search_direction_type = options.line_search_direction_type;
summary->line_search_interpolation_type = options.line_search_interpolation_type;
summary->line_search_type = options.line_search_type;
summary->linear_solver_type_given = options.linear_solver_type;
summary->max_lbfgs_rank = options.max_lbfgs_rank;
summary->minimizer_type = options.minimizer_type;
summary->nonlinear_conjugate_gradient_type = options.nonlinear_conjugate_gradient_type; // NOLINT
summary->nonlinear_conjugate_gradient_type = options.nonlinear_conjugate_gradient_type;
summary->num_threads_given = options.num_threads;
summary->preconditioner_type_given = options.preconditioner_type;
summary->sparse_linear_algebra_library_type = options.sparse_linear_algebra_library_type; // NOLINT
summary->trust_region_strategy_type = options.trust_region_strategy_type; // NOLINT
summary->visibility_clustering_type = options.visibility_clustering_type; // NOLINT
summary->sparse_linear_algebra_library_type = options.sparse_linear_algebra_library_type;
summary->linear_solver_ordering_type = options.linear_solver_ordering_type;
summary->trust_region_strategy_type = options.trust_region_strategy_type;
summary->visibility_clustering_type = options.visibility_clustering_type;
// clang-format on
}
@@ -367,19 +566,23 @@ void PostSolveSummarize(const internal::PreprocessedProblem& pp,
Solver::Summary* summary) {
internal::OrderingToGroupSizes(pp.options.linear_solver_ordering.get(),
&(summary->linear_solver_ordering_used));
// TODO(sameeragarwal): Update the preprocessor to collapse the
// second and higher groups into one group when nested dissection is
// used.
internal::OrderingToGroupSizes(pp.options.inner_iteration_ordering.get(),
&(summary->inner_iteration_ordering_used));
// clang-format off
summary->inner_iterations_used = pp.inner_iteration_minimizer.get() != nullptr; // NOLINT
summary->inner_iterations_used = pp.inner_iteration_minimizer != nullptr;
summary->linear_solver_type_used = pp.linear_solver_options.type;
summary->mixed_precision_solves_used = pp.options.use_mixed_precision_solves;
summary->num_threads_used = pp.options.num_threads;
summary->preconditioner_type_used = pp.options.preconditioner_type;
// clang-format on
internal::SetSummaryFinalCost(summary);
if (pp.reduced_program.get() != nullptr) {
if (pp.reduced_program != nullptr) {
SummarizeReducedProgram(*pp.reduced_program, summary);
}
@@ -389,8 +592,8 @@ void PostSolveSummarize(const internal::PreprocessedProblem& pp,
// case if the preprocessor failed, or if the reduced problem did
// not contain any parameter blocks. Thus, only extract the
// evaluator statistics if one exists.
if (pp.evaluator.get() != nullptr) {
const map<string, CallStatistics>& evaluator_statistics =
if (pp.evaluator != nullptr) {
const std::map<std::string, CallStatistics>& evaluator_statistics =
pp.evaluator->Statistics();
{
const CallStatistics& call_stats = FindWithDefault(
@@ -411,8 +614,8 @@ void PostSolveSummarize(const internal::PreprocessedProblem& pp,
// Again, like the evaluator, there may or may not be a linear
// solver from which we can extract run time statistics. In
// particular the line search solver does not use a linear solver.
if (pp.linear_solver.get() != nullptr) {
const map<string, CallStatistics>& linear_solver_statistics =
if (pp.linear_solver != nullptr) {
const std::map<std::string, CallStatistics>& linear_solver_statistics =
pp.linear_solver->Statistics();
const CallStatistics& call_stats = FindWithDefault(
linear_solver_statistics, "LinearSolver::Solve", CallStatistics());
@@ -468,9 +671,23 @@ std::string SchurStructureToString(const int row_block_size,
return internal::StringPrintf("%s,%s,%s", row.c_str(), e.c_str(), f.c_str());
}
#ifndef CERES_NO_CUDA
bool IsCudaRequired(const Solver::Options& options) {
if (options.linear_solver_type == DENSE_NORMAL_CHOLESKY ||
options.linear_solver_type == DENSE_SCHUR ||
options.linear_solver_type == DENSE_QR) {
return (options.dense_linear_algebra_library_type == CUDA);
}
if (options.linear_solver_type == CGNR) {
return (options.sparse_linear_algebra_library_type == CUDA_SPARSE);
}
return false;
}
#endif
} // namespace
bool Solver::Options::IsValid(string* error) const {
bool Solver::Options::IsValid(std::string* error) const {
if (!CommonOptionsAreValid(*this, error)) {
return false;
}
@@ -509,10 +726,19 @@ void Solver::Solve(const Solver::Options& options,
return;
}
ProblemImpl* problem_impl = problem->impl_.get();
ProblemImpl* problem_impl = problem->mutable_impl();
Program* program = problem_impl->mutable_program();
PreSolveSummarize(options, problem_impl, summary);
#ifndef CERES_NO_CUDA
if (IsCudaRequired(options)) {
if (!problem_impl->context()->InitCuda(&summary->message)) {
LOG(ERROR) << "Terminating: " << summary->message;
return;
}
}
#endif // CERES_NO_CUDA
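With this hook, CUDA is initialized on the problem's context before preprocessing, but only when the selected configuration actually needs it, as decided by IsCudaRequired above. A sketch of a configuration that takes this path, assuming Ceres was built with CUDA support and that `problem` is an existing ceres::Problem:

ceres::Solver::Options options;
options.linear_solver_type = ceres::DENSE_SCHUR;
options.dense_linear_algebra_library_type = ceres::CUDA;

ceres::Solver::Summary summary;
ceres::Solve(options, &problem, &summary);  // InitCuda() runs before preprocessing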
// If gradient_checking is enabled, wrap all cost functions in a
// gradient checker and install a callback that terminates if any gradient
// error is detected.
@@ -582,7 +808,7 @@ void Solver::Solve(const Solver::Options& options,
}
const double postprocessor_start_time = WallTimeInSeconds();
problem_impl = problem->impl_.get();
problem_impl = problem->mutable_impl();
program = problem_impl->mutable_program();
// On exit, ensure that the parameter blocks again point at the user
// provided values and the parameter blocks are numbered according
@@ -610,7 +836,7 @@ void Solve(const Solver::Options& options,
solver.Solve(options, problem, summary);
}
string Solver::Summary::BriefReport() const {
std::string Solver::Summary::BriefReport() const {
return StringPrintf(
"Ceres Solver Report: "
"Iterations: %d, "
@@ -623,10 +849,12 @@ string Solver::Summary::BriefReport() const {
TerminationTypeToString(termination_type));
}
string Solver::Summary::FullReport() const {
std::string Solver::Summary::FullReport() const {
using internal::VersionString;
string report = string("\nSolver Summary (v " + VersionString() + ")\n\n");
// NOTE operator+ is not usable for concatenating a string and a string_view.
std::string report =
std::string{"\nSolver Summary (v "}.append(VersionString()) + ")\n\n";
StringAppendF(&report, "%45s %21s\n", "Original", "Reduced");
StringAppendF(&report,
@@ -660,21 +888,13 @@ string Solver::Summary::FullReport() const {
if (linear_solver_type_used == DENSE_NORMAL_CHOLESKY ||
linear_solver_type_used == DENSE_SCHUR ||
linear_solver_type_used == DENSE_QR) {
const char* mixed_precision_suffix =
(mixed_precision_solves_used ? "(Mixed Precision)" : "");
StringAppendF(&report,
"\nDense linear algebra library %15s\n",
"\nDense linear algebra library %15s %s\n",
DenseLinearAlgebraLibraryTypeToString(
dense_linear_algebra_library_type));
}
if (linear_solver_type_used == SPARSE_NORMAL_CHOLESKY ||
linear_solver_type_used == SPARSE_SCHUR ||
(linear_solver_type_used == ITERATIVE_SCHUR &&
(preconditioner_type_used == CLUSTER_JACOBI ||
preconditioner_type_used == CLUSTER_TRIDIAGONAL))) {
StringAppendF(&report,
"\nSparse linear algebra library %15s\n",
SparseLinearAlgebraLibraryTypeToString(
sparse_linear_algebra_library_type));
dense_linear_algebra_library_type),
mixed_precision_suffix);
}
StringAppendF(&report,
@@ -687,17 +907,50 @@ string Solver::Summary::FullReport() const {
StringAppendF(&report, " (SUBSPACE)");
}
}
StringAppendF(&report, "\n");
StringAppendF(&report, "\n");
const bool used_sparse_linear_algebra_library =
linear_solver_type_used == SPARSE_NORMAL_CHOLESKY ||
linear_solver_type_used == SPARSE_SCHUR ||
linear_solver_type_used == CGNR ||
(linear_solver_type_used == ITERATIVE_SCHUR &&
(preconditioner_type_used == CLUSTER_JACOBI ||
preconditioner_type_used == CLUSTER_TRIDIAGONAL));
const bool linear_solver_ordering_required =
linear_solver_type_used == SPARSE_SCHUR ||
(linear_solver_type_used == ITERATIVE_SCHUR &&
(preconditioner_type_used == CLUSTER_JACOBI ||
preconditioner_type_used == CLUSTER_TRIDIAGONAL)) ||
(linear_solver_type_used == CGNR && preconditioner_type_used == SUBSET);
if (used_sparse_linear_algebra_library) {
const char* mixed_precision_suffix =
(mixed_precision_solves_used ? "(Mixed Precision)" : "");
if (linear_solver_ordering_required) {
StringAppendF(
&report,
"\nSparse linear algebra library %15s + %s %s\n",
SparseLinearAlgebraLibraryTypeToString(
sparse_linear_algebra_library_type),
LinearSolverOrderingTypeToString(linear_solver_ordering_type),
mixed_precision_suffix);
} else {
StringAppendF(&report,
"\nSparse linear algebra library %15s %s\n",
SparseLinearAlgebraLibraryTypeToString(
sparse_linear_algebra_library_type),
mixed_precision_suffix);
}
}
StringAppendF(&report, "\n");
StringAppendF(&report, "%45s %21s\n", "Given", "Used");
StringAppendF(&report,
"Linear solver %25s%25s\n",
LinearSolverTypeToString(linear_solver_type_given),
LinearSolverTypeToString(linear_solver_type_used));
if (linear_solver_type_given == CGNR ||
linear_solver_type_given == ITERATIVE_SCHUR) {
if (IsIterativeSolver(linear_solver_type_given)) {
StringAppendF(&report,
"Preconditioner %25s%25s\n",
PreconditionerTypeToString(preconditioner_type_given),
@@ -717,9 +970,9 @@ string Solver::Summary::FullReport() const {
num_threads_given,
num_threads_used);
string given;
std::string given;
StringifyOrdering(linear_solver_ordering_given, &given);
string used;
std::string used;
StringifyOrdering(linear_solver_ordering_used, &used);
StringAppendF(&report,
"Linear solver ordering %22s %24s\n",
@@ -740,9 +993,9 @@ string Solver::Summary::FullReport() const {
}
if (inner_iterations_used) {
string given;
std::string given;
StringifyOrdering(inner_iteration_ordering_given, &given);
string used;
std::string used;
StringifyOrdering(inner_iteration_ordering_used, &used);
StringAppendF(&report,
"Inner iteration ordering %20s %24s\n",
@@ -753,7 +1006,7 @@ string Solver::Summary::FullReport() const {
// LINE_SEARCH HEADER
StringAppendF(&report, "\nMinimizer %19s\n", "LINE_SEARCH");
string line_search_direction_string;
std::string line_search_direction_string;
if (line_search_direction_type == LBFGS) {
line_search_direction_string = StringPrintf("LBFGS (%d)", max_lbfgs_rank);
} else if (line_search_direction_type == NONLINEAR_CONJUGATE_GRADIENT) {
@@ -768,7 +1021,7 @@ string Solver::Summary::FullReport() const {
"Line search direction %19s\n",
line_search_direction_string.c_str());
const string line_search_type_string = StringPrintf(
const std::string line_search_type_string = StringPrintf(
"%s %s",
LineSearchInterpolationTypeToString(line_search_interpolation_type),
LineSearchTypeToString(line_search_type));

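A note on the .append() construction used for the report header above: VersionString() now returns std::string_view, and the standard library provides no operator+ for mixing std::string and std::string_view, while std::string::append accepts a string_view directly. The constraint in isolation (Greet and the names here are illustrative):

#include <string>
#include <string_view>

std::string Greet(std::string_view name) {
  // return std::string("Hello, ") + name;  // ill-formed: no such operator+
  return std::string("Hello, ").append(name) + "!";
}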
View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -30,8 +30,6 @@
#include "ceres/solver_utils.h"
#include <string>
#include "Eigen/Core"
#include "ceres/internal/config.h"
#include "ceres/internal/export.h"
@@ -40,8 +38,7 @@
#include "cuda_runtime.h"
#endif // CERES_NO_CUDA
namespace ceres {
namespace internal {
namespace ceres::internal {
// clang-format off
#define CERES_EIGEN_VERSION \
@@ -50,52 +47,47 @@ namespace internal {
CERES_TO_STRING(EIGEN_MINOR_VERSION)
// clang-format on
std::string VersionString() {
std::string value = std::string(CERES_VERSION_STRING);
value += "-eigen-(" + std::string(CERES_EIGEN_VERSION) + ")";
constexpr char kVersion[] =
// clang-format off
CERES_VERSION_STRING
"-eigen-(" CERES_EIGEN_VERSION ")"
#ifdef CERES_NO_LAPACK
value += "-no_lapack";
"-no_lapack"
#else
value += "-lapack";
"-lapack"
#endif
#ifndef CERES_NO_SUITESPARSE
value += "-suitesparse-(" + std::string(CERES_SUITESPARSE_VERSION) + ")";
"-suitesparse-(" CERES_SUITESPARSE_VERSION ")"
#endif
#ifndef CERES_NO_CXSPARSE
value += "-cxsparse-(" + std::string(CERES_CXSPARSE_VERSION) + ")";
#if !defined(CERES_NO_EIGEN_METIS) || !defined(CERES_NO_CHOLMOD_PARTITION)
"-metis-(" CERES_METIS_VERSION ")"
#endif
#ifndef CERES_NO_ACCELERATE_SPARSE
value += "-acceleratesparse";
"-acceleratesparse"
#endif
#ifdef CERES_USE_EIGEN_SPARSE
value += "-eigensparse";
"-eigensparse"
#endif
#ifdef CERES_RESTRUCT_SCHUR_SPECIALIZATIONS
value += "-no_schur_specializations";
#endif
#ifdef CERES_USE_OPENMP
value += "-openmp";
#else
value += "-no_openmp";
"-no_schur_specializations"
#endif
#ifdef CERES_NO_CUSTOM_BLAS
value += "-no_custom_blas";
"-no_custom_blas"
#endif
#ifndef CERES_NO_CUDA
value += "-cuda-(" + std::to_string(CUDART_VERSION) + ")";
"-cuda-(" CERES_TO_STRING(CUDART_VERSION) ")"
#endif
;
// clang-format on
return value;
}
std::string_view VersionString() noexcept { return kVersion; }
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

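The rewrite swaps runtime += concatenation for preprocessor assembly: adjacent string literals are merged at translation time, so the entire version string becomes a single constexpr character array and VersionString() is reduced to returning a view of it. The same technique in isolation (all macro names here are illustrative, not Ceres macros):

#define MY_TO_STRING_HELPER(x) #x
#define MY_TO_STRING(x) MY_TO_STRING_HELPER(x)

#define MY_VERSION_MAJOR 2
#define MY_VERSION_MINOR 2

// Adjacent string literals merge into one array at compile time.
constexpr char kMyVersion[] =
    "mylib-" MY_TO_STRING(MY_VERSION_MAJOR) "." MY_TO_STRING(MY_VERSION_MINOR)
#ifdef MY_WITH_CUDA
    "-cuda"
#endif
    ;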
View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,15 +32,14 @@
#define CERES_INTERNAL_SOLVER_UTILS_H_
#include <algorithm>
#include <string>
#include <string_view>
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
#include "ceres/iteration_callback.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
template <typename SummaryType>
bool IsSolutionUsable(const SummaryType& summary) {
@@ -61,10 +60,9 @@ void SetSummaryFinalCost(SummaryType* summary) {
}
CERES_NO_EXPORT
std::string VersionString();
std::string_view VersionString() noexcept;
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -31,30 +31,28 @@
#include "ceres/sparse_cholesky.h"
#include <memory>
#include <utility>
#include "ceres/accelerate_sparse.h"
#include "ceres/cxsparse.h"
#include "ceres/eigensparse.h"
#include "ceres/float_cxsparse.h"
#include "ceres/float_suitesparse.h"
#include "ceres/iterative_refiner.h"
#include "ceres/suitesparse.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
std::unique_ptr<SparseCholesky> SparseCholesky::Create(
const LinearSolver::Options& options) {
const OrderingType ordering_type = options.use_postordering ? AMD : NATURAL;
std::unique_ptr<SparseCholesky> sparse_cholesky;
switch (options.sparse_linear_algebra_library_type) {
case SUITE_SPARSE:
#ifndef CERES_NO_SUITESPARSE
if (options.use_mixed_precision_solves) {
sparse_cholesky = FloatSuiteSparseCholesky::Create(ordering_type);
sparse_cholesky =
FloatSuiteSparseCholesky::Create(options.ordering_type);
} else {
sparse_cholesky = SuiteSparseCholesky::Create(ordering_type);
sparse_cholesky = SuiteSparseCholesky::Create(options.ordering_type);
}
break;
#else
@@ -64,9 +62,10 @@ std::unique_ptr<SparseCholesky> SparseCholesky::Create(
case EIGEN_SPARSE:
#ifdef CERES_USE_EIGEN_SPARSE
if (options.use_mixed_precision_solves) {
sparse_cholesky = FloatEigenSparseCholesky::Create(ordering_type);
sparse_cholesky =
FloatEigenSparseCholesky::Create(options.ordering_type);
} else {
sparse_cholesky = EigenSparseCholesky::Create(ordering_type);
sparse_cholesky = EigenSparseCholesky::Create(options.ordering_type);
}
break;
#else
@@ -74,25 +73,14 @@ std::unique_ptr<SparseCholesky> SparseCholesky::Create(
<< "Eigen's sparse Cholesky factorization routines.";
#endif
case CX_SPARSE:
#ifndef CERES_NO_CXSPARSE
if (options.use_mixed_precision_solves) {
sparse_cholesky = FloatCXSparseCholesky::Create(ordering_type);
} else {
sparse_cholesky = CXSparseCholesky::Create(ordering_type);
}
break;
#else
LOG(FATAL) << "Ceres was compiled without support for CXSparse.";
#endif
case ACCELERATE_SPARSE:
#ifndef CERES_NO_ACCELERATE_SPARSE
if (options.use_mixed_precision_solves) {
sparse_cholesky = AppleAccelerateCholesky<float>::Create(ordering_type);
sparse_cholesky =
AppleAccelerateCholesky<float>::Create(options.ordering_type);
} else {
sparse_cholesky =
AppleAccelerateCholesky<double>::Create(ordering_type);
AppleAccelerateCholesky<double>::Create(options.ordering_type);
}
break;
#else
@@ -107,10 +95,10 @@ std::unique_ptr<SparseCholesky> SparseCholesky::Create(
}
if (options.max_num_refinement_iterations > 0) {
std::unique_ptr<IterativeRefiner> refiner(
new IterativeRefiner(options.max_num_refinement_iterations));
sparse_cholesky = std::unique_ptr<SparseCholesky>(new RefinedSparseCholesky(
std::move(sparse_cholesky), std::move(refiner)));
auto refiner = std::make_unique<SparseIterativeRefiner>(
options.max_num_refinement_iterations);
sparse_cholesky = std::make_unique<RefinedSparseCholesky>(
std::move(sparse_cholesky), std::move(refiner));
}
return sparse_cholesky;
}
@@ -123,7 +111,7 @@ LinearSolverTerminationType SparseCholesky::FactorAndSolve(
double* solution,
std::string* message) {
LinearSolverTerminationType termination_type = Factorize(lhs, message);
if (termination_type == LINEAR_SOLVER_SUCCESS) {
if (termination_type == LinearSolverTerminationType::SUCCESS) {
termination_type = Solve(rhs, solution, message);
}
return termination_type;
@@ -131,7 +119,7 @@ LinearSolverTerminationType SparseCholesky::FactorAndSolve(
RefinedSparseCholesky::RefinedSparseCholesky(
std::unique_ptr<SparseCholesky> sparse_cholesky,
std::unique_ptr<IterativeRefiner> iterative_refiner)
std::unique_ptr<SparseIterativeRefiner> iterative_refiner)
: sparse_cholesky_(std::move(sparse_cholesky)),
iterative_refiner_(std::move(iterative_refiner)) {}
@@ -153,13 +141,12 @@ LinearSolverTerminationType RefinedSparseCholesky::Solve(const double* rhs,
std::string* message) {
CHECK(lhs_ != nullptr);
auto termination_type = sparse_cholesky_->Solve(rhs, solution, message);
if (termination_type != LINEAR_SOLVER_SUCCESS) {
if (termination_type != LinearSolverTerminationType::SUCCESS) {
return termination_type;
}
iterative_refiner_->Refine(*lhs_, rhs, sparse_cholesky_.get(), solution);
return LINEAR_SOLVER_SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

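Visible throughout this commit: the unscoped LINEAR_SOLVER_* constants are replaced by enumerators of a scoped enum, which no longer leak into the enclosing namespace and do not convert implicitly to int. The pattern in miniature (a sketch analogous to, but not, the actual LinearSolverTerminationType definition):

enum class TerminationType { SUCCESS, FAILURE, FATAL_ERROR };

bool Succeeded(TerminationType t) {
  // Enumerators must be qualified with the enum name.
  return t == TerminationType::SUCCESS;
}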
View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -43,8 +43,7 @@
#include "ceres/linear_solver.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// An interface that abstracts away the internal details of various
// sparse linear algebra libraries and offers a simple API for solving
@@ -63,11 +62,12 @@ namespace internal {
//
// CompressedRowSparseMatrix lhs = ...;
// std::string message;
// CHECK_EQ(sparse_cholesky->Factorize(&lhs, &message), LINEAR_SOLVER_SUCCESS);
// CHECK_EQ(sparse_cholesky->Factorize(&lhs, &message),
// LinearSolverTerminationType::SUCCESS);
// Vector rhs = ...;
// Vector solution = ...;
// CHECK_EQ(sparse_cholesky->Solve(rhs.data(), solution.data(), &message),
// LINEAR_SOLVER_SUCCESS);
// LinearSolverTerminationType::SUCCESS);
class CERES_NO_EXPORT SparseCholesky {
public:
@@ -105,21 +105,22 @@ class CERES_NO_EXPORT SparseCholesky {
// Convenience method which combines a call to Factorize and
// Solve. Solve is only called if Factorize returns
// LINEAR_SOLVER_SUCCESS.
// LinearSolverTerminationType::SUCCESS.
LinearSolverTerminationType FactorAndSolve(CompressedRowSparseMatrix* lhs,
const double* rhs,
double* solution,
std::string* message);
};
class IterativeRefiner;
class SparseIterativeRefiner;
// Computes an initial solution using the given instance of
// SparseCholesky, and then refines it using the IterativeRefiner.
// SparseCholesky, and then refines it using the SparseIterativeRefiner.
class CERES_NO_EXPORT RefinedSparseCholesky final : public SparseCholesky {
public:
RefinedSparseCholesky(std::unique_ptr<SparseCholesky> sparse_cholesky,
std::unique_ptr<IterativeRefiner> iterative_refiner);
RefinedSparseCholesky(
std::unique_ptr<SparseCholesky> sparse_cholesky,
std::unique_ptr<SparseIterativeRefiner> iterative_refiner);
~RefinedSparseCholesky() override;
CompressedRowSparseMatrix::StorageType StorageType() const override;
@@ -131,12 +132,11 @@ class CERES_NO_EXPORT RefinedSparseCholesky final : public SparseCholesky {
private:
std::unique_ptr<SparseCholesky> sparse_cholesky_;
std::unique_ptr<IterativeRefiner> iterative_refiner_;
std::unique_ptr<SparseIterativeRefiner> iterative_refiner_;
CompressedRowSparseMatrix* lhs_ = nullptr;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

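RefinedSparseCholesky is a decorator: it owns an inner SparseCholesky plus a SparseIterativeRefiner and itself implements the SparseCholesky interface, so callers are oblivious to whether refinement happens. A hedged sketch of the composition, mirroring the factory logic in sparse_cholesky.cc (the helper function name is illustrative):

std::unique_ptr<SparseCholesky> MaybeAddRefinement(
    std::unique_ptr<SparseCholesky> cholesky,
    int max_num_refinement_iterations) {
  if (max_num_refinement_iterations > 0) {
    auto refiner = std::make_unique<SparseIterativeRefiner>(
        max_num_refinement_iterations);
    cholesky = std::make_unique<RefinedSparseCholesky>(std::move(cholesky),
                                                       std::move(refiner));
  }
  return cholesky;  // callers see only the SparseCholesky interface
}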
View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -30,10 +30,24 @@
#include "ceres/sparse_matrix.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
SparseMatrix::~SparseMatrix() = default;
} // namespace internal
} // namespace ceres
void SparseMatrix::SquaredColumnNorm(double* x,
ContextImpl* context,
int num_threads) const {
(void)context;
(void)num_threads;
SquaredColumnNorm(x);
}
void SparseMatrix::ScaleColumns(const double* scale,
ContextImpl* context,
int num_threads) {
(void)context;
(void)num_threads;
ScaleColumns(scale);
}
} // namespace ceres::internal

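The new overloads give SparseMatrix a context-aware API without forcing every subclass to implement threading: the base class simply forwards to the mandatory single-threaded virtual, and subclasses that can parallelize override the three-argument form. The shape of the pattern in isolation (Context and Matrix are stand-ins for ContextImpl and SparseMatrix):

struct Context;  // stand-in for ceres::internal::ContextImpl

class Matrix {
 public:
  virtual ~Matrix() = default;

  // Mandatory single-threaded implementation.
  virtual void ScaleColumns(const double* scale) = 0;

  // Optional threaded entry point; defaults to the serial path. Overriding
  // only one overload in a subclass hides the other, which is why
  // sparse_matrix.h re-exposes RightMultiplyAndAccumulate with a
  // using-declaration.
  virtual void ScaleColumns(const double* scale, Context* /*context*/,
                            int /*num_threads*/) {
    ScaleColumns(scale);
  }
};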
View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,8 +40,8 @@
#include "ceres/linear_operator.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class ContextImpl;
// This class defines the interface for storing and manipulating
// sparse matrices. The key property that differentiates different
@@ -69,18 +69,30 @@ class CERES_NO_EXPORT SparseMatrix : public LinearOperator {
~SparseMatrix() override;
// y += Ax;
void RightMultiply(const double* x, double* y) const override = 0;
using LinearOperator::RightMultiplyAndAccumulate;
void RightMultiplyAndAccumulate(const double* x,
double* y) const override = 0;
// y += A'x;
void LeftMultiply(const double* x, double* y) const override = 0;
void LeftMultiplyAndAccumulate(const double* x, double* y) const override = 0;
// In MATLAB notation sum(A.*A, 1)
virtual void SquaredColumnNorm(double* x) const = 0;
virtual void SquaredColumnNorm(double* x,
ContextImpl* context,
int num_threads) const;
// A = A * diag(scale)
virtual void ScaleColumns(const double* scale) = 0;
virtual void ScaleColumns(const double* scale,
ContextImpl* context,
int num_threads);
// A = 0. A->num_nonzeros() == 0 is true after this call. The
// sparsity pattern is preserved.
virtual void SetZero() = 0;
virtual void SetZero(ContextImpl* /*context*/, int /*num_threads*/) {
SetZero();
}
// Resize and populate dense_matrix with a dense version of the
// sparse matrix.
@@ -103,7 +115,6 @@ class CERES_NO_EXPORT SparseMatrix : public LinearOperator {
virtual int num_nonzeros() const = 0;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_SPARSE_MATRIX_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -45,8 +45,7 @@
#include "ceres/types.h"
#include "ceres/wall_time.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
SparseNormalCholeskySolver::SparseNormalCholeskySolver(
const LinearSolver::Options& options)
@@ -64,7 +63,7 @@ LinearSolver::Summary SparseNormalCholeskySolver::SolveImpl(
EventLogger event_logger("SparseNormalCholeskySolver::Solve");
LinearSolver::Summary summary;
summary.num_iterations = 1;
summary.termination_type = LINEAR_SOLVER_SUCCESS;
summary.termination_type = LinearSolverTerminationType::SUCCESS;
summary.message = "Success.";
const int num_cols = A->num_cols();
@@ -72,7 +71,7 @@ LinearSolver::Summary SparseNormalCholeskySolver::SolveImpl(
xref.setZero();
rhs_.resize(num_cols);
rhs_.setZero();
A->LeftMultiply(b, rhs_.data());
A->LeftMultiplyAndAccumulate(b, rhs_.data());
event_logger.AddEvent("Compute RHS");
if (per_solve_options.D != nullptr) {
@@ -110,5 +109,4 @@ LinearSolver::Summary SparseNormalCholeskySolver::SolveImpl(
return summary;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -45,8 +45,7 @@
#include "ceres/internal/export.h"
#include "ceres/linear_solver.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class CompressedRowSparseMatrix;
class InnerProductComputer;
@@ -75,7 +74,6 @@ class CERES_NO_EXPORT SparseNormalCholeskySolver
std::unique_ptr<InnerProductComputer> inner_product_computer_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_SPARSE_NORMAL_CHOLESKY_SOLVER_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -38,12 +38,9 @@
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
using std::string;
void StringAppendV(string* dst, const char* format, va_list ap) {
void StringAppendV(std::string* dst, const char* format, va_list ap) {
// First try with a small fixed size buffer
char space[1024];
@@ -93,16 +90,16 @@ void StringAppendV(string* dst, const char* format, va_list ap) {
delete[] buf;
}
string StringPrintf(const char* format, ...) {
std::string StringPrintf(const char* format, ...) {
va_list ap;
va_start(ap, format);
string result;
std::string result;
StringAppendV(&result, format, ap);
va_end(ap);
return result;
}
const string& SStringPrintf(string* dst, const char* format, ...) {
const std::string& SStringPrintf(std::string* dst, const char* format, ...) {
va_list ap;
va_start(ap, format);
dst->clear();
@@ -111,12 +108,11 @@ const string& SStringPrintf(string* dst, const char* format, ...) {
return *dst;
}
void StringAppendF(string* dst, const char* format, ...) {
void StringAppendF(std::string* dst, const char* format, ...) {
va_list ap;
va_start(ap, format);
StringAppendV(dst, format, ap);
va_end(ap);
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

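StringAppendV follows the classic two-pass vsnprintf idiom hinted at by the "First try with a small fixed size buffer" comment: format into a stack buffer, and only when the result does not fit, allocate exactly the reported size and format again. A condensed sketch of the idiom, not the Ceres implementation itself (note the va_list must be copied before the first pass, since vsnprintf consumes it):

#include <cstdarg>
#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

void AppendV(std::string* dst, const char* format, va_list ap) {
  char space[1024];
  va_list backup;
  va_copy(backup, ap);
  const int needed = std::vsnprintf(space, sizeof(space), format, backup);
  va_end(backup);
  if (needed < 0) {
    return;  // formatting error
  }
  if (static_cast<std::size_t>(needed) < sizeof(space)) {
    dst->append(space, needed);
    return;
  }
  // Result was truncated; allocate the exact size (+1 for the NUL).
  std::vector<char> buf(needed + 1);
  std::vsnprintf(buf.data(), buf.size(), format, ap);
  dst->append(buf.data(), needed);
}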
View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -44,8 +44,7 @@
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
#if (defined(__GNUC__) || defined(__clang__))
// Tell the compiler to do printf format string checking if the compiler
@@ -90,8 +89,7 @@ CERES_NO_EXPORT extern void StringAppendV(std::string* dst,
#undef CERES_PRINTF_ATTRIBUTE
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -40,8 +40,7 @@
#include "ceres/sparse_cholesky.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
SubsetPreconditioner::SubsetPreconditioner(Preconditioner::Options options,
const BlockSparseMatrix& A)
@@ -52,13 +51,14 @@ SubsetPreconditioner::SubsetPreconditioner(Preconditioner::Options options,
LinearSolver::Options sparse_cholesky_options;
sparse_cholesky_options.sparse_linear_algebra_library_type =
options_.sparse_linear_algebra_library_type;
sparse_cholesky_options.use_postordering = options_.use_postordering;
sparse_cholesky_options.ordering_type = options_.ordering_type;
sparse_cholesky_ = SparseCholesky::Create(sparse_cholesky_options);
}
SubsetPreconditioner::~SubsetPreconditioner() = default;
void SubsetPreconditioner::RightMultiply(const double* x, double* y) const {
void SubsetPreconditioner::RightMultiplyAndAccumulate(const double* x,
double* y) const {
CHECK(x != nullptr);
CHECK(y != nullptr);
std::string message;
@@ -106,7 +106,7 @@ bool SubsetPreconditioner::UpdateImpl(const BlockSparseMatrix& A,
const LinearSolverTerminationType termination_type =
sparse_cholesky_->Factorize(inner_product_computer_->mutable_result(),
&message);
if (termination_type != LINEAR_SOLVER_SUCCESS) {
if (termination_type != LinearSolverTerminationType::SUCCESS) {
LOG(ERROR) << "Preconditioner factorization failed: " << message;
return false;
}
@@ -114,5 +114,4 @@ bool SubsetPreconditioner::UpdateImpl(const BlockSparseMatrix& A,
return true;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -37,8 +37,7 @@
#include "ceres/internal/export.h"
#include "ceres/preconditioner.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class BlockSparseMatrix;
class SparseCholesky;
@@ -76,7 +75,7 @@ class CERES_NO_EXPORT SubsetPreconditioner
~SubsetPreconditioner() override;
// Preconditioner interface
void RightMultiply(const double* x, double* y) const final;
void RightMultiplyAndAccumulate(const double* x, double* y) const final;
int num_rows() const final { return num_cols_; }
int num_cols() const final { return num_cols_; }
@@ -89,8 +88,7 @@ class CERES_NO_EXPORT SubsetPreconditioner
std::unique_ptr<InnerProductComputer> inner_product_computer_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,7 +32,9 @@
#include "ceres/internal/config.h"
#ifndef CERES_NO_SUITESPARSE
#include <memory>
#include <string>
#include <vector>
#include "ceres/compressed_col_sparse_matrix_utils.h"
@@ -42,11 +44,24 @@
#include "ceres/triplet_sparse_matrix.h"
#include "cholmod.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
namespace {
int OrderingTypeToCHOLMODEnum(OrderingType ordering_type) {
if (ordering_type == OrderingType::AMD) {
return CHOLMOD_AMD;
}
if (ordering_type == OrderingType::NESDIS) {
return CHOLMOD_NESDIS;
}
using std::string;
using std::vector;
if (ordering_type == OrderingType::NATURAL) {
return CHOLMOD_NATURAL;
}
LOG(FATAL) << "Congratulations you have discovered a bug in Ceres Solver. "
<< "Please report it to the developers. " << ordering_type;
return -1;
}
} // namespace
SuiteSparse::SuiteSparse() { cholmod_start(&cc_); }
@@ -103,9 +118,11 @@ cholmod_sparse SuiteSparse::CreateSparseMatrixTransposeView(
m.x = reinterpret_cast<void*>(A->mutable_values());
m.z = nullptr;
if (A->storage_type() == CompressedRowSparseMatrix::LOWER_TRIANGULAR) {
if (A->storage_type() ==
CompressedRowSparseMatrix::StorageType::LOWER_TRIANGULAR) {
m.stype = 1;
} else if (A->storage_type() == CompressedRowSparseMatrix::UPPER_TRIANGULAR) {
} else if (A->storage_type() ==
CompressedRowSparseMatrix::StorageType::UPPER_TRIANGULAR) {
m.stype = -1;
} else {
m.stype = 0;
@@ -144,19 +161,18 @@ cholmod_dense* SuiteSparse::CreateDenseVector(const double* x,
}
cholmod_factor* SuiteSparse::AnalyzeCholesky(cholmod_sparse* A,
string* message) {
// Cholmod can try multiple re-ordering strategies to find a fill
// reducing ordering. Here we just tell it use AMD with automatic
// matrix dependence choice of supernodal versus simplicial
// factorization.
OrderingType ordering_type,
std::string* message) {
cc_.nmethods = 1;
cc_.method[0].ordering = CHOLMOD_AMD;
cc_.supernodal = CHOLMOD_AUTO;
cc_.method[0].ordering = OrderingTypeToCHOLMODEnum(ordering_type);
// postordering with a NATURAL ordering leads to a significant regression in
// performance. See https://github.com/ceres-solver/ceres-solver/issues/905
if (ordering_type == OrderingType::NATURAL) {
cc_.postorder = 0;
}
cholmod_factor* factor = cholmod_analyze(A, &cc_);
if (VLOG_IS_ON(2)) {
cholmod_print_common(const_cast<char*>("Symbolic Analysis"), &cc_);
}
if (cc_.status != CHOLMOD_OK) {
*message =
@@ -165,32 +181,22 @@ cholmod_factor* SuiteSparse::AnalyzeCholesky(cholmod_sparse* A,
}
CHECK(factor != nullptr);
if (VLOG_IS_ON(2)) {
cholmod_print_common(const_cast<char*>("Symbolic Analysis"), &cc_);
}
return factor;
}
cholmod_factor* SuiteSparse::BlockAnalyzeCholesky(cholmod_sparse* A,
const vector<int>& row_blocks,
const vector<int>& col_blocks,
string* message) {
vector<int> ordering;
if (!BlockAMDOrdering(A, row_blocks, col_blocks, &ordering)) {
return nullptr;
}
return AnalyzeCholeskyWithUserOrdering(A, ordering, message);
}
cholmod_factor* SuiteSparse::AnalyzeCholeskyWithUserOrdering(
cholmod_sparse* A, const vector<int>& ordering, string* message) {
cholmod_factor* SuiteSparse::AnalyzeCholeskyWithGivenOrdering(
cholmod_sparse* A, const std::vector<int>& ordering, std::string* message) {
CHECK_EQ(ordering.size(), A->nrow);
cc_.nmethods = 1;
cc_.method[0].ordering = CHOLMOD_GIVEN;
cholmod_factor* factor =
cholmod_analyze_p(A, const_cast<int*>(&ordering[0]), nullptr, 0, &cc_);
if (VLOG_IS_ON(2)) {
cholmod_print_common(const_cast<char*>("Symbolic Analysis"), &cc_);
}
cholmod_analyze_p(A, const_cast<int*>(ordering.data()), nullptr, 0, &cc_);
if (cc_.status != CHOLMOD_OK) {
*message =
StringPrintf("cholmod_analyze failed. error code: %d", cc_.status);
@@ -198,40 +204,33 @@ cholmod_factor* SuiteSparse::AnalyzeCholeskyWithUserOrdering(
}
CHECK(factor != nullptr);
return factor;
}
cholmod_factor* SuiteSparse::AnalyzeCholeskyWithNaturalOrdering(
cholmod_sparse* A, string* message) {
cc_.nmethods = 1;
cc_.method[0].ordering = CHOLMOD_NATURAL;
cc_.postorder = 0;
cholmod_factor* factor = cholmod_analyze(A, &cc_);
if (VLOG_IS_ON(2)) {
cholmod_print_common(const_cast<char*>("Symbolic Analysis"), &cc_);
}
if (cc_.status != CHOLMOD_OK) {
*message =
StringPrintf("cholmod_analyze failed. error code: %d", cc_.status);
return nullptr;
}
CHECK(factor != nullptr);
return factor;
}
bool SuiteSparse::BlockAMDOrdering(const cholmod_sparse* A,
const vector<int>& row_blocks,
const vector<int>& col_blocks,
vector<int>* ordering) {
bool SuiteSparse::BlockOrdering(const cholmod_sparse* A,
OrderingType ordering_type,
const std::vector<Block>& row_blocks,
const std::vector<Block>& col_blocks,
std::vector<int>* ordering) {
if (ordering_type == OrderingType::NATURAL) {
ordering->resize(A->nrow);
for (int i = 0; i < A->nrow; ++i) {
(*ordering)[i] = i;
}
return true;
}
const int num_row_blocks = row_blocks.size();
const int num_col_blocks = col_blocks.size();
// Arrays storing the compressed column structure of the matrix
// incoding the block sparsity of A.
vector<int> block_cols;
vector<int> block_rows;
// encoding the block sparsity of A.
std::vector<int> block_cols;
std::vector<int> block_rows;
CompressedColumnScalarMatrixToBlockMatrix(reinterpret_cast<const int*>(A->i),
reinterpret_cast<const int*>(A->p),
@@ -243,8 +242,8 @@ bool SuiteSparse::BlockAMDOrdering(const cholmod_sparse* A,
block_matrix.nrow = num_row_blocks;
block_matrix.ncol = num_col_blocks;
block_matrix.nzmax = block_rows.size();
block_matrix.p = reinterpret_cast<void*>(&block_cols[0]);
block_matrix.i = reinterpret_cast<void*>(&block_rows[0]);
block_matrix.p = reinterpret_cast<void*>(block_cols.data());
block_matrix.i = reinterpret_cast<void*>(block_rows.data());
block_matrix.x = nullptr;
block_matrix.stype = A->stype;
block_matrix.itype = CHOLMOD_INT;
@@ -253,8 +252,8 @@ bool SuiteSparse::BlockAMDOrdering(const cholmod_sparse* A,
block_matrix.sorted = 1;
block_matrix.packed = 1;
vector<int> block_ordering(num_row_blocks);
if (!cholmod_amd(&block_matrix, nullptr, 0, &block_ordering[0], &cc_)) {
std::vector<int> block_ordering(num_row_blocks);
if (!Ordering(&block_matrix, ordering_type, block_ordering.data())) {
return false;
}
@@ -262,9 +261,22 @@ bool SuiteSparse::BlockAMDOrdering(const cholmod_sparse* A,
return true;
}
cholmod_factor* SuiteSparse::BlockAnalyzeCholesky(
cholmod_sparse* A,
OrderingType ordering_type,
const std::vector<Block>& row_blocks,
const std::vector<Block>& col_blocks,
std::string* message) {
std::vector<int> ordering;
if (!BlockOrdering(A, ordering_type, row_blocks, col_blocks, &ordering)) {
return nullptr;
}
return AnalyzeCholeskyWithGivenOrdering(A, ordering, message);
}
LinearSolverTerminationType SuiteSparse::Cholesky(cholmod_sparse* A,
cholmod_factor* L,
string* message) {
std::string* message) {
CHECK(A != nullptr);
CHECK(L != nullptr);
@@ -282,48 +294,48 @@ LinearSolverTerminationType SuiteSparse::Cholesky(cholmod_sparse* A,
switch (cc_.status) {
case CHOLMOD_NOT_INSTALLED:
*message = "CHOLMOD failure: Method not installed.";
return LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
case CHOLMOD_OUT_OF_MEMORY:
*message = "CHOLMOD failure: Out of memory.";
return LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
case CHOLMOD_TOO_LARGE:
*message = "CHOLMOD failure: Integer overflow occurred.";
return LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
case CHOLMOD_INVALID:
*message = "CHOLMOD failure: Invalid input.";
return LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
case CHOLMOD_NOT_POSDEF:
*message = "CHOLMOD warning: Matrix not positive definite.";
return LINEAR_SOLVER_FAILURE;
return LinearSolverTerminationType::FAILURE;
case CHOLMOD_DSMALL:
*message =
"CHOLMOD warning: D for LDL' or diag(L) or "
"LL' has tiny absolute value.";
return LINEAR_SOLVER_FAILURE;
return LinearSolverTerminationType::FAILURE;
case CHOLMOD_OK:
if (cholmod_status != 0) {
return LINEAR_SOLVER_SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
*message =
"CHOLMOD failure: cholmod_factorize returned false "
"but cholmod_common::status is CHOLMOD_OK."
"Please report this to ceres-solver@googlegroups.com.";
return LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
default:
*message = StringPrintf(
"Unknown cholmod return code: %d. "
"Please report this to ceres-solver@googlegroups.com.",
cc_.status);
return LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
}
return LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
}
cholmod_dense* SuiteSparse::Solve(cholmod_factor* L,
cholmod_dense* b,
string* message) {
std::string* message) {
if (cc_.status != CHOLMOD_OK) {
*message = "cholmod_solve failed. CHOLMOD status is not CHOLMOD_OK";
return nullptr;
@@ -332,22 +344,34 @@ cholmod_dense* SuiteSparse::Solve(cholmod_factor* L,
return cholmod_solve(CHOLMOD_A, L, b, &cc_);
}
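For orientation, the analyze/factorize/solve methods in this file compose into the usual direct-solver pipeline: symbolic analysis once, then numeric factorization and back-substitution per system. A minimal sketch, assuming a SuiteSparse instance ss, a valid cholmod_sparse* A and cholmod_dense* b (error handling elided):
  std::string message;
  // Symbolic analysis: fill reducing ordering + elimination tree.
  cholmod_factor* factor = ss.AnalyzeCholesky(A, OrderingType::AMD, &message);
  if (factor != nullptr &&
      ss.Cholesky(A, factor, &message) ==
          LinearSolverTerminationType::SUCCESS) {  // numeric factorization
    cholmod_dense* x = ss.Solve(factor, b, &message);  // solves A x = b
    ss.Free(x);
  }
  ss.Free(factor);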
bool SuiteSparse::ApproximateMinimumDegreeOrdering(cholmod_sparse* matrix,
int* ordering) {
return cholmod_amd(matrix, nullptr, 0, ordering, &cc_);
bool SuiteSparse::Ordering(cholmod_sparse* matrix,
OrderingType ordering_type,
int* ordering) {
CHECK_NE(ordering_type, OrderingType::NATURAL);
if (ordering_type == OrderingType::AMD) {
return cholmod_amd(matrix, nullptr, 0, ordering, &cc_);
}
#ifdef CERES_NO_CHOLMOD_PARTITION
return false;
#else
std::vector<int> CParent(matrix->nrow, 0);
std::vector<int> CMember(matrix->nrow, 0);
return cholmod_nested_dissection(
matrix, nullptr, 0, ordering, CParent.data(), CMember.data(), &cc_);
#endif
}
bool SuiteSparse::ConstrainedApproximateMinimumDegreeOrdering(
cholmod_sparse* matrix, int* constraints, int* ordering) {
#ifndef CERES_NO_CAMD
return cholmod_camd(matrix, nullptr, 0, constraints, ordering, &cc_);
#else
LOG(FATAL) << "Congratulations you have found a bug in Ceres."
<< "Ceres Solver was compiled with SuiteSparse "
<< "version 4.1.0 or less. Calling this function "
<< "in that case is a bug. Please contact the"
<< "the Ceres Solver developers.";
}
bool SuiteSparse::IsNestedDissectionAvailable() {
#ifdef CERES_NO_CHOLMOD_PARTITION
return false;
#else
return true;
#endif
}
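Since nested dissection is only available when CHOLMOD is built with METIS (CERES_NO_CHOLMOD_PARTITION otherwise), callers would typically pick the ordering type at runtime. A hedged sketch, assuming a SuiteSparse instance ss and a cholmod_sparse* matrix:
  OrderingType ordering_type = OrderingType::AMD;
  if (SuiteSparse::IsNestedDissectionAvailable()) {
    // NESDIS can yield lower fill-in on large problems, at higher ordering cost.
    ordering_type = OrderingType::NESDIS;
  }
  std::vector<int> ordering(matrix->nrow);
  if (!ss.Ordering(matrix, ordering_type, ordering.data())) {
    // Ordering failed; fall back to AMD or report the error.
  }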
@@ -367,48 +391,61 @@ SuiteSparseCholesky::~SuiteSparseCholesky() {
}
LinearSolverTerminationType SuiteSparseCholesky::Factorize(
CompressedRowSparseMatrix* lhs, string* message) {
CompressedRowSparseMatrix* lhs, std::string* message) {
if (lhs == nullptr) {
*message = "Failure: Input lhs is nullptr.";
return LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
}
cholmod_sparse cholmod_lhs = ss_.CreateSparseMatrixTransposeView(lhs);
// If a factorization does not exist, compute the symbolic
// factorization first.
//
// If the ordering type is NATURAL, then there is no fill reducing
// ordering to be computed, regardless of block structure, so we can
// just call the scalar version of symbolic factorization. For
// SuiteSparse this is the common case since we have already
// pre-ordered the columns of the Jacobian.
//
// Similarly, regardless of ordering type, if there is no block
// structure in the matrix we call the scalar version of symbolic
// factorization.
if (factor_ == nullptr) {
if (ordering_type_ == NATURAL) {
factor_ = ss_.AnalyzeCholeskyWithNaturalOrdering(&cholmod_lhs, message);
if (ordering_type_ == OrderingType::NATURAL ||
(lhs->col_blocks().empty() || lhs->row_blocks().empty())) {
factor_ = ss_.AnalyzeCholesky(&cholmod_lhs, ordering_type_, message);
} else {
if (!lhs->col_blocks().empty() && !(lhs->row_blocks().empty())) {
factor_ = ss_.BlockAnalyzeCholesky(
&cholmod_lhs, lhs->col_blocks(), lhs->row_blocks(), message);
} else {
factor_ = ss_.AnalyzeCholesky(&cholmod_lhs, message);
}
}
if (factor_ == nullptr) {
return LINEAR_SOLVER_FATAL_ERROR;
factor_ = ss_.BlockAnalyzeCholesky(&cholmod_lhs,
ordering_type_,
lhs->col_blocks(),
lhs->row_blocks(),
message);
}
}
if (factor_ == nullptr) {
return LinearSolverTerminationType::FATAL_ERROR;
}
// Compute and return the numeric factorization.
return ss_.Cholesky(&cholmod_lhs, factor_, message);
}
CompressedRowSparseMatrix::StorageType SuiteSparseCholesky::StorageType()
const {
return ((ordering_type_ == NATURAL)
? CompressedRowSparseMatrix::UPPER_TRIANGULAR
: CompressedRowSparseMatrix::LOWER_TRIANGULAR);
return ((ordering_type_ == OrderingType::NATURAL)
? CompressedRowSparseMatrix::StorageType::UPPER_TRIANGULAR
: CompressedRowSparseMatrix::StorageType::LOWER_TRIANGULAR);
}
LinearSolverTerminationType SuiteSparseCholesky::Solve(const double* rhs,
double* solution,
string* message) {
std::string* message) {
// Error checking
if (factor_ == nullptr) {
*message = "Solve called without a call to Factorize first.";
return LINEAR_SOLVER_FATAL_ERROR;
return LinearSolverTerminationType::FATAL_ERROR;
}
const int num_cols = factor_->n;
@@ -417,15 +454,14 @@ LinearSolverTerminationType SuiteSparseCholesky::Solve(const double* rhs,
ss_.Solve(factor_, &cholmod_rhs, message);
if (cholmod_dense_solution == nullptr) {
return LINEAR_SOLVER_FAILURE;
return LinearSolverTerminationType::FAILURE;
}
memcpy(solution, cholmod_dense_solution->x, num_cols * sizeof(*solution));
ss_.Free(cholmod_dense_solution);
return LINEAR_SOLVER_SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_NO_SUITESPARSE

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -44,37 +44,14 @@
#include <vector>
#include "SuiteSparseQR.hpp"
#include "ceres/block_structure.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/linear_solver.h"
#include "ceres/sparse_cholesky.h"
#include "cholmod.h"
#include "glog/logging.h"
// Before SuiteSparse version 4.2.0, cholmod_camd was only enabled
// if SuiteSparse was compiled with Metis support. This makes
// calling and linking into cholmod_camd problematic even though it
// has nothing to do with Metis. This has been fixed reliably in
// 4.2.0.
//
// The fix was actually committed in 4.1.0, but there is
// some confusion about a silent update to the tar ball, so we are
// being conservative and choosing the next minor version where
// things are stable.
#if (SUITESPARSE_VERSION < 4002)
#define CERES_NO_CAMD
#endif
// UF_long is deprecated but SuiteSparse_long is only available in
// newer versions of SuiteSparse. So for older versions of
// SuiteSparse, we define SuiteSparse_long to be the same as UF_long,
// which is what recent versions of SuiteSparse do anyways.
#ifndef SuiteSparse_long
#define SuiteSparse_long UF_long
#endif
#include "ceres/internal/disable_warnings.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class CompressedRowSparseMatrix;
class TripletSparseMatrix;
@@ -91,7 +68,7 @@ class CERES_NO_EXPORT SuiteSparse {
// Functions for building cholmod_sparse objects from sparse
// matrices stored in triplet form. The matrix A is not
// modifed. Called owns the result.
// modified. Caller owns the result.
cholmod_sparse* CreateSparseMatrix(TripletSparseMatrix* A);
// This function works like CreateSparseMatrix, except that the
@@ -142,12 +119,11 @@ class CERES_NO_EXPORT SuiteSparse {
cholmod_sdmult(A, 0, alpha_, beta_, x, y, &cc_);
}
// Find an ordering of A or AA' (if A is unsymmetric) that minimizes
// the fill-in in the Cholesky factorization of the corresponding
// matrix. This is done by using the AMD algorithm.
//
// Using this ordering, the symbolic Cholesky factorization of A (or
// AA') is computed and returned.
// Compute a symbolic factorization for A or AA' (if A is
// unsymmetric). If ordering_type is NATURAL, then no fill reducing
// ordering is computed; otherwise, depending on the value of
// ordering_type, AMD or nested dissection is used to compute a fill
// reducing ordering before the symbolic factorization is computed.
//
// A is not modified, only the pattern of non-zeros of A is used,
// the actual numerical values in A are of no consequence.
@@ -155,11 +131,15 @@ class CERES_NO_EXPORT SuiteSparse {
// message contains an explanation of the failures if any.
//
// Caller owns the result.
cholmod_factor* AnalyzeCholesky(cholmod_sparse* A, std::string* message);
cholmod_factor* AnalyzeCholesky(cholmod_sparse* A,
OrderingType ordering_type,
std::string* message);
// Block oriented version of AnalyzeCholesky.
cholmod_factor* BlockAnalyzeCholesky(cholmod_sparse* A,
const std::vector<int>& row_blocks,
const std::vector<int>& col_blocks,
OrderingType ordering_type,
const std::vector<Block>& row_blocks,
const std::vector<Block>& col_blocks,
std::string* message);
// If A is symmetric, then compute the symbolic Cholesky
@@ -173,20 +153,11 @@ class CERES_NO_EXPORT SuiteSparse {
// message contains an explanation of the failures if any.
//
// Caller owns the result.
cholmod_factor* AnalyzeCholeskyWithUserOrdering(
cholmod_factor* AnalyzeCholeskyWithGivenOrdering(
cholmod_sparse* A,
const std::vector<int>& ordering,
std::string* message);
// Perform a symbolic factorization of A without re-ordering A. No
// postordering of the elimination tree is performed. This ensures
// that the symbolic factor does not introduce an extra permutation
// on the matrix. See the documentation for CHOLMOD for more details.
//
// message contains an explanation of the failures if any.
cholmod_factor* AnalyzeCholeskyWithNaturalOrdering(cholmod_sparse* A,
std::string* message);
// Use the symbolic factorization in L, to find the numerical
// factorization for the matrix A or AA^T. Return true if
// successful, false otherwise. L contains the numeric factorization
@@ -206,51 +177,39 @@ class CERES_NO_EXPORT SuiteSparse {
cholmod_dense* b,
std::string* message);
// Find a fill reducing ordering. ordering is expected to be large
// enough to hold the ordering. ordering_type must be AMD or NESDIS.
bool Ordering(cholmod_sparse* matrix,
OrderingType ordering_type,
int* ordering);
// Find the block oriented fill reducing ordering of a matrix A,
// whose row and column blocks are given by row_blocks, and
// col_blocks respectively. The matrix may or may not be
// symmetric. The entries of col_blocks do not need to sum to the
// number of columns in A. If this is the case, only the first
// sum(col_blocks) are used to compute the ordering.
//
// By virtue of the modeling layer in Ceres being block oriented,
// all the matrices used by Ceres are also block oriented. When
// doing sparse direct factorization of these matrices the
// fill-reducing ordering algorithms (in particular AMD) can either
// be run on the block or the scalar form of these matrices. The two
// SuiteSparse::AnalyzeCholesky methods allow the client to
// compute the symbolic factorization of a matrix by either using
// AMD on the matrix or a user provided ordering of the rows.
//
// But since the underlying matrices are block oriented, it is worth
// running AMD on just the block structure of these matrices and then
// lifting these block orderings to a full scalar ordering. This
// preserves the block structure of the permuted matrix, and exposes
// more of the super-nodal structure of the matrix to the numerical
// factorization routines.
//
// Find the block oriented AMD ordering of a matrix A, whose row and
// column blocks are given by row_blocks, and col_blocks
// respectively. The matrix may or may not be symmetric. The entries
// of col_blocks do not need to sum to the number of columns in
// A. If this is the case, only the first sum(col_blocks) are used
// to compute the ordering.
bool BlockAMDOrdering(const cholmod_sparse* A,
const std::vector<int>& row_blocks,
const std::vector<int>& col_blocks,
std::vector<int>* ordering);
// fill-reducing ordering algorithms can either be run on the block
// or the scalar form of these matrices. But since the underlying
// matrices are block oriented, it is worth running the fill
// reducing ordering on just the block structure of these matrices
// and then lifting these block orderings to a full scalar
// ordering. This preserves the block structure of the permuted
// matrix, and exposes more of the super-nodal structure of the
// matrix to the numerical factorization routines.
bool BlockOrdering(const cholmod_sparse* A,
OrderingType ordering_type,
const std::vector<Block>& row_blocks,
const std::vector<Block>& col_blocks,
std::vector<int>* ordering);
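The lifting the comment describes is mechanical: permute whole blocks, then expand each block into its run of scalar column indices. A sketch, under the assumption that Block exposes its size and position (illustrative, not the actual Ceres helper):
  // Expand a block permutation into the equivalent scalar permutation.
  std::vector<int> scalar_ordering;
  for (const int block_id : block_ordering) {
    const Block& block = blocks[block_id];
    for (int i = 0; i < block.size; ++i) {
      scalar_ordering.push_back(block.position + i);
    }
  }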
// Find a fill reducing approximate minimum degree
// ordering. ordering is expected to be large enough to hold the
// ordering.
bool ApproximateMinimumDegreeOrdering(cholmod_sparse* matrix, int* ordering);
// Before SuiteSparse version 4.2.0, cholmod_camd was only enabled
// if SuiteSparse was compiled with Metis support. This makes
// calling and linking into cholmod_camd problematic even though it
// has nothing to do with Metis. This has been fixed reliably in
// 4.2.0.
//
// The fix was actually committed in 4.1.0, but there is
// some confusion about a silent update to the tar ball, so we are
// being conservative and choosing the next minor version where
// things are stable.
static bool IsConstrainedApproximateMinimumDegreeOrderingAvailable() {
return (SUITESPARSE_VERSION > 4001);
}
// Nested dissection is only available if SuiteSparse is compiled
// with Metis support.
static bool IsNestedDissectionAvailable();
// Find a fill reducing approximate minimum degree
// ordering. constraints is an array which associates with each
@@ -262,9 +221,6 @@ class CERES_NO_EXPORT SuiteSparse {
// Calling ApproximateMinimumDegreeOrdering is equivalent to calling
// ConstrainedApproximateMinimumDegreeOrdering with a constraint
// array that puts all columns in the same elimination group.
//
// If CERES_NO_CAMD is defined then calling this function will
// result in a crash.
bool ConstrainedApproximateMinimumDegreeOrdering(cholmod_sparse* matrix,
int* constraints,
int* ordering);
@@ -312,14 +268,13 @@ class CERES_NO_EXPORT SuiteSparseCholesky final : public SparseCholesky {
cholmod_factor* factor_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"
#else // CERES_NO_SUITESPARSE
typedef void cholmod_factor;
using cholmod_factor = void;
#include "ceres/internal/disable_warnings.h"
@@ -328,17 +283,9 @@ namespace internal {
class CERES_NO_EXPORT SuiteSparse {
public:
// Defining this static function even when SuiteSparse is not
// available, allows client code to check for the presence of CAMD
// without checking for the absence of the CERES_NO_CAMD symbol.
//
// This is safer because the symbol maybe missing due to a user
// accidentally not including suitesparse.h in their code when
// checking for the symbol.
static bool IsConstrainedApproximateMinimumDegreeOrderingAvailable() {
return false;
}
// Nested dissection is only available if SuiteSparse is compiled
// with Metis support.
static bool IsNestedDissectionAvailable() { return false; }
void Free(void* /*arg*/) {}
};

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -28,18 +28,14 @@
//
// Author: vitus@google.com (Michael Vitus)
// This include must come before any #ifndef check on Ceres compile options.
#include "ceres/internal/config.h"
#ifdef CERES_USE_CXX_THREADS
#include "ceres/thread_pool.h"
#include <cmath>
#include <limits>
#include "ceres/thread_pool.h"
#include "ceres/internal/config.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
namespace {
// Constrain the total number of threads to the amount the hardware can support.
@@ -105,7 +101,4 @@ void ThreadPool::ThreadMainLoop() {
void ThreadPool::Stop() { task_queue_.StopWaiters(); }
} // namespace internal
} // namespace ceres
#endif // CERES_USE_CXX_THREADS
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -39,8 +39,7 @@
#include "ceres/concurrent_queue.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// A thread-safe thread pool with an unbounded task queue and a resizable number
// of workers. The size of the thread pool can be increased but never decreased
@@ -115,7 +114,6 @@ class CERES_NO_EXPORT ThreadPool {
std::mutex thread_pool_mutex_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_THREAD_POOL_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -30,44 +30,20 @@
#include "ceres/thread_token_provider.h"
#ifdef CERES_USE_OPENMP
#include <omp.h>
#endif
namespace ceres {
namespace internal {
namespace ceres::internal {
ThreadTokenProvider::ThreadTokenProvider(int num_threads) {
(void)num_threads;
#ifdef CERES_USE_CXX_THREADS
for (int i = 0; i < num_threads; i++) {
pool_.Push(i);
}
#endif
}
int ThreadTokenProvider::Acquire() {
#ifdef CERES_USE_OPENMP
return omp_get_thread_num();
#endif
#ifdef CERES_NO_THREADS
return 0;
#endif
#ifdef CERES_USE_CXX_THREADS
int thread_id;
CHECK(pool_.Wait(&thread_id));
return thread_id;
#endif
}
void ThreadTokenProvider::Release(int thread_id) {
(void)thread_id;
#ifdef CERES_USE_CXX_THREADS
pool_.Push(thread_id);
#endif
}
void ThreadTokenProvider::Release(int thread_id) { pool_.Push(thread_id); }
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
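The intended usage pattern, as a sketch (parallel_for and the per-thread scratch array are illustrative assumptions, not part of this file):
  ThreadTokenProvider provider(num_threads);
  parallel_for(0, n, [&](int i) {
    const int thread_id = provider.Acquire();  // blocks until a token is free
    scratch[thread_id].Accumulate(i);          // index a per-thread workspace
    provider.Release(thread_id);               // hand the token back
  });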

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -31,15 +31,11 @@
#ifndef CERES_INTERNAL_THREAD_TOKEN_PROVIDER_H_
#define CERES_INTERNAL_THREAD_TOKEN_PROVIDER_H_
#include "ceres/concurrent_queue.h"
#include "ceres/internal/config.h"
#include "ceres/internal/export.h"
#ifdef CERES_USE_CXX_THREADS
#include "ceres/concurrent_queue.h"
#endif
namespace ceres {
namespace internal {
namespace ceres::internal {
// Helper for C++ thread number identification that is similar to
// omp_get_thread_num() behaviour. This is necessary to support C++
@@ -48,12 +44,6 @@ namespace internal {
// 0 to num_threads - 1 that can be acquired to identify the thread in a thread
// pool.
//
// If CERES_NO_THREADS is defined, Acquire() always returns 0 and Release()
// takes no action.
//
// If CERES_USE_OPENMP, omp_get_thread_num() is used to Acquire() with no action
// in Release()
//
//
// Example usage pseudocode:
//
@@ -78,20 +68,16 @@ class CERES_NO_EXPORT ThreadTokenProvider {
void Release(int thread_id);
private:
#ifdef CERES_USE_CXX_THREADS
// This queue initially holds a sequence from 0..num_threads-1. On every
// Acquire() call the first number is removed from here. When the token is no
// longer needed it shall be given back with a corresponding Release()
// call. This concurrent queue is more expensive than TBB's version, so you
// should not acquire the thread ID on every for loop iteration.
ConcurrentQueue<int> pool_;
#endif
ThreadTokenProvider(ThreadTokenProvider&) = delete;
ThreadTokenProvider& operator=(ThreadTokenProvider&) = delete;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_THREAD_TOKEN_PROVIDER_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,15 +32,16 @@
#include <algorithm>
#include <memory>
#include <random>
#include "ceres/compressed_row_sparse_matrix.h"
#include "ceres/crs_matrix.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "ceres/random.h"
#include "ceres/types.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
TripletSparseMatrix::TripletSparseMatrix()
: num_rows_(0), num_cols_(0), max_num_nonzeros_(0), num_nonzeros_(0) {}
@@ -168,13 +169,15 @@ void TripletSparseMatrix::CopyData(const TripletSparseMatrix& orig) {
}
}
void TripletSparseMatrix::RightMultiply(const double* x, double* y) const {
void TripletSparseMatrix::RightMultiplyAndAccumulate(const double* x,
double* y) const {
for (int i = 0; i < num_nonzeros_; ++i) {
y[rows_[i]] += values_[i] * x[cols_[i]];
}
}
void TripletSparseMatrix::LeftMultiply(const double* x, double* y) const {
void TripletSparseMatrix::LeftMultiplyAndAccumulate(const double* x,
double* y) const {
for (int i = 0; i < num_nonzeros_; ++i) {
y[cols_[i]] += values_[i] * x[rows_[i]];
}
@@ -195,6 +198,11 @@ void TripletSparseMatrix::ScaleColumns(const double* scale) {
}
}
void TripletSparseMatrix::ToCRSMatrix(CRSMatrix* crs_matrix) const {
CompressedRowSparseMatrix::FromTripletSparseMatrix(*this)->ToCRSMatrix(
crs_matrix);
}
void TripletSparseMatrix::ToDenseMatrix(Matrix* dense_matrix) const {
dense_matrix->resize(num_rows_, num_cols_);
dense_matrix->setZero();
@@ -276,8 +284,34 @@ void TripletSparseMatrix::ToTextFile(FILE* file) const {
}
}
std::unique_ptr<TripletSparseMatrix> TripletSparseMatrix::CreateFromTextFile(
FILE* file) {
CHECK(file != nullptr);
int num_rows = 0;
int num_cols = 0;
std::vector<int> rows;
std::vector<int> cols;
std::vector<double> values;
while (true) {
int row, col;
double value;
if (fscanf(file, "%d %d %lf", &row, &col, &value) != 3) {
break;
}
rows.push_back(row);
cols.push_back(col);
values.push_back(value);
num_rows = std::max(num_rows, row + 1);
num_cols = std::max(num_cols, col + 1);
}
VLOG(1) << "Read " << rows.size() << " nonzeros from file.";
return std::make_unique<TripletSparseMatrix>(
num_rows, num_cols, rows, cols, values);
}
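The format is one "row col value" triple per nonzero, which is exactly what ToTextFile emits, so the two functions round-trip. A sketch (the path is illustrative):
  FILE* file = fopen("/tmp/matrix.txt", "w");
  matrix.ToTextFile(file);  // one "row col value" line per nonzero
  fclose(file);

  file = fopen("/tmp/matrix.txt", "r");
  std::unique_ptr<TripletSparseMatrix> copy =
      TripletSparseMatrix::CreateFromTextFile(file);
  fclose(file);
Note that the dimensions are inferred from the largest indices seen, so trailing all-zero rows or columns do not survive the round trip.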
std::unique_ptr<TripletSparseMatrix> TripletSparseMatrix::CreateRandomMatrix(
const TripletSparseMatrix::RandomMatrixOptions& options) {
const TripletSparseMatrix::RandomMatrixOptions& options,
std::mt19937& prng) {
CHECK_GT(options.num_rows, 0);
CHECK_GT(options.num_cols, 0);
CHECK_GT(options.density, 0.0);
@@ -286,16 +320,18 @@ std::unique_ptr<TripletSparseMatrix> TripletSparseMatrix::CreateRandomMatrix(
std::vector<int> rows;
std::vector<int> cols;
std::vector<double> values;
std::uniform_real_distribution<double> uniform01(0.0, 1.0);
std::normal_distribution<double> standard_normal;
while (rows.empty()) {
rows.clear();
cols.clear();
values.clear();
for (int r = 0; r < options.num_rows; ++r) {
for (int c = 0; c < options.num_cols; ++c) {
if (RandDouble() <= options.density) {
if (uniform01(prng) <= options.density) {
rows.push_back(r);
cols.push_back(c);
values.push_back(RandNormal());
values.push_back(standard_normal(prng));
}
}
}
@@ -305,5 +341,4 @@ std::unique_ptr<TripletSparseMatrix> TripletSparseMatrix::CreateRandomMatrix(
options.num_rows, options.num_cols, rows, cols, values);
}
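With the RNG now passed in by the caller, random matrices are reproducible across runs. A sketch:
  TripletSparseMatrix::RandomMatrixOptions options;
  options.num_rows = 100;
  options.num_cols = 50;
  options.density = 0.05;  // expected fraction of nonzero entries
  std::mt19937 prng(42);   // fixed seed => identical matrix every run
  std::unique_ptr<TripletSparseMatrix> matrix =
      TripletSparseMatrix::CreateRandomMatrix(options, prng);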
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,16 +32,17 @@
#define CERES_INTERNAL_TRIPLET_SPARSE_MATRIX_H_
#include <memory>
#include <random>
#include <vector>
#include "ceres/crs_matrix.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "ceres/sparse_matrix.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// An implementation of the SparseMatrix interface to store and
// manipulate sparse matrices in triplet (i,j,s) form. This object is
@@ -65,10 +66,11 @@ class CERES_NO_EXPORT TripletSparseMatrix final : public SparseMatrix {
// Implementation of the SparseMatrix interface.
void SetZero() final;
void RightMultiply(const double* x, double* y) const final;
void LeftMultiply(const double* x, double* y) const final;
void RightMultiplyAndAccumulate(const double* x, double* y) const final;
void LeftMultiplyAndAccumulate(const double* x, double* y) const final;
void SquaredColumnNorm(double* x) const final;
void ScaleColumns(const double* scale) final;
void ToCRSMatrix(CRSMatrix* matrix) const;
void ToDenseMatrix(Matrix* dense_matrix) const final;
void ToTextFile(FILE* file) const final;
// clang-format off
@@ -134,7 +136,11 @@ class CERES_NO_EXPORT TripletSparseMatrix final : public SparseMatrix {
// normally distributed and whose structure is determined by
// RandomMatrixOptions.
static std::unique_ptr<TripletSparseMatrix> CreateRandomMatrix(
const TripletSparseMatrix::RandomMatrixOptions& options);
const TripletSparseMatrix::RandomMatrixOptions& options,
std::mt19937& prng);
// Load a triplet sparse matrix from a text file.
static std::unique_ptr<TripletSparseMatrix> CreateFromTextFile(FILE* file);
private:
void AllocateMemory();
@@ -154,8 +160,7 @@ class CERES_NO_EXPORT TripletSparseMatrix final : public SparseMatrix {
std::unique_ptr<double[]> values_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2016 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -42,9 +42,11 @@
#include "Eigen/Core"
#include "ceres/array_utils.h"
#include "ceres/coordinate_descent_minimizer.h"
#include "ceres/eigen_vector_ops.h"
#include "ceres/evaluator.h"
#include "ceres/file.h"
#include "ceres/line_search.h"
#include "ceres/parallel_for.h"
#include "ceres/stringprintf.h"
#include "ceres/types.h"
#include "ceres/wall_time.h"
@@ -59,8 +61,7 @@
} \
} while (0)
namespace ceres {
namespace internal {
namespace ceres::internal {
void TrustRegionMinimizer::Minimize(const Minimizer::Options& options,
double* parameters,
@@ -79,6 +80,7 @@ void TrustRegionMinimizer::Minimize(const Minimizer::Options& options,
? options_.max_consecutive_nonmonotonic_steps
: 0);
bool atleast_one_successful_step = false;
while (FinalizeIterationAndCheckIfMinimizerCanContinue()) {
iteration_start_time_in_secs_ = WallTimeInSeconds();
@@ -106,7 +108,7 @@ void TrustRegionMinimizer::Minimize(const Minimizer::Options& options,
ComputeCandidatePointAndEvaluateCost();
DoInnerIterationsIfNeeded();
if (ParameterToleranceReached()) {
if (atleast_one_successful_step && ParameterToleranceReached()) {
return;
}
@@ -115,6 +117,7 @@ void TrustRegionMinimizer::Minimize(const Minimizer::Options& options,
}
if (IsStepSuccessful()) {
atleast_one_successful_step = true;
RETURN_IF_ERROR_AND_LOG(HandleSuccessfulStep());
} else {
// Declare the step unsuccessful and inform the trust region strategy.
@@ -137,8 +140,8 @@ void TrustRegionMinimizer::Init(const Minimizer::Options& options,
double* parameters,
Solver::Summary* solver_summary) {
options_ = options;
sort(options_.trust_region_minimizer_iterations_to_dump.begin(),
options_.trust_region_minimizer_iterations_to_dump.end());
std::sort(options_.trust_region_minimizer_iterations_to_dump.begin(),
options_.trust_region_minimizer_iterations_to_dump.end());
parameters_ = parameters;
@@ -166,7 +169,6 @@ void TrustRegionMinimizer::Init(const Minimizer::Options& options,
num_consecutive_invalid_steps_ = 0;
x_ = ConstVectorRef(parameters_, num_parameters_);
x_norm_ = x_.norm();
residuals_.resize(num_residuals_);
trust_region_step_.resize(num_effective_parameters_);
delta_.resize(num_effective_parameters_);
@@ -180,7 +182,6 @@ void TrustRegionMinimizer::Init(const Minimizer::Options& options,
// the Jacobian, we will compute and overwrite this vector.
jacobian_scaling_ = Vector::Ones(num_effective_parameters_);
x_norm_ = -1; // Invalid value
x_cost_ = std::numeric_limits<double>::max();
minimum_cost_ = x_cost_;
model_cost_change_ = 0.0;
@@ -214,10 +215,11 @@ bool TrustRegionMinimizer::IterationZero() {
}
x_ = candidate_x_;
x_norm_ = x_.norm();
}
if (!EvaluateGradientAndJacobian(/*new_evaluation_point=*/true)) {
solver_summary_->message =
"Initial residual and Jacobian evaluation failed.";
return false;
}
@@ -270,7 +272,8 @@ bool TrustRegionMinimizer::EvaluateGradientAndJacobian(
}
// jacobian = jacobian * diag(J'J) ^{-1}
jacobian_->ScaleColumns(jacobian_scaling_.data());
jacobian_->ScaleColumns(
jacobian_scaling_.data(), options_.context, options_.num_threads);
}
// The gradient exists in the local tangent space. To account for
@@ -357,13 +360,13 @@ bool TrustRegionMinimizer::FinalizeIterationAndCheckIfMinimizerCanContinue() {
// Compute the trust region step using the TrustRegionStrategy chosen
// by the user.
//
// If the strategy returns with LINEAR_SOLVER_FATAL_ERROR, which
// If the strategy returns with LinearSolverTerminationType::FATAL_ERROR, which
// indicates an unrecoverable error, return false. This is the only
// condition that returns false.
//
// If the strategy returns with LINEAR_SOLVER_FAILURE, which indicates
// a numerical failure that could be recovered from by retrying
// (e.g. by increasing the strength of the regularization), we set
// If the strategy returns with LinearSolverTerminationType::FAILURE, which
// indicates a numerical failure that could be recovered from by retrying (e.g.
// by increasing the strength of the regularization), we set
// iteration_summary_.step_is_valid to false and return true.
//
// In all other cases, we compute the decrease in the trust region
@@ -395,7 +398,8 @@ bool TrustRegionMinimizer::ComputeTrustRegionStep() {
residuals_.data(),
trust_region_step_.data());
if (strategy_summary.termination_type == LINEAR_SOLVER_FATAL_ERROR) {
if (strategy_summary.termination_type ==
LinearSolverTerminationType::FATAL_ERROR) {
solver_summary_->message =
"Linear solver failed due to unrecoverable "
"non-numeric causes. Please see the error log for clues. ";
@@ -407,7 +411,8 @@ bool TrustRegionMinimizer::ComputeTrustRegionStep() {
WallTimeInSeconds() - strategy_start_time;
iteration_summary_.linear_solver_iterations = strategy_summary.num_iterations;
if (strategy_summary.termination_type == LINEAR_SOLVER_FAILURE) {
if (strategy_summary.termination_type ==
LinearSolverTerminationType::FAILURE) {
return true;
}
@@ -419,10 +424,15 @@ bool TrustRegionMinimizer::ComputeTrustRegionStep() {
// = f'f/2 - 1/2 [ f'f + 2f'J * step + step' * J' * J * step]
// = -f'J * step - step' * J' * J * step / 2
// = -(J * step)'(f + J * step / 2)
model_residuals_.setZero();
jacobian_->RightMultiply(trust_region_step_.data(), model_residuals_.data());
model_cost_change_ =
-model_residuals_.dot(residuals_ + model_residuals_ / 2.0);
ParallelSetZero(options_.context, options_.num_threads, model_residuals_);
jacobian_->RightMultiplyAndAccumulate(trust_region_step_.data(),
model_residuals_.data(),
options_.context,
options_.num_threads);
model_cost_change_ = -Dot(model_residuals_,
residuals_ + model_residuals_ / 2.0,
options_.context,
options_.num_threads);
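As a sanity check of the algebra in the comment above: with dense Eigen stand-ins for J, f and the step (illustrative names), the direct and folded forms of the model cost change agree up to rounding:
  // model_cost(step) = 1/2 ||f + J * step||^2
  const Eigen::VectorXd Jstep = J * step;
  const double direct =
      0.5 * f.squaredNorm() - 0.5 * (f + Jstep).squaredNorm();
  const double folded = -Jstep.dot(f + Jstep / 2.0);
  // direct == folded, up to floating point rounding.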
// TODO(sameeragarwal)
//
@@ -432,7 +442,10 @@ bool TrustRegionMinimizer::ComputeTrustRegionStep() {
iteration_summary_.step_is_valid = (model_cost_change_ > 0.0);
if (iteration_summary_.step_is_valid) {
// Undo the Jacobian column scaling.
delta_ = (trust_region_step_.array() * jacobian_scaling_.array()).matrix();
ParallelAssign(options_.context,
options_.num_threads,
delta_,
(trust_region_step_.array() * jacobian_scaling_.array()));
num_consecutive_invalid_steps_ = 0;
}
@@ -702,10 +715,12 @@ bool TrustRegionMinimizer::MinTrustRegionRadiusReached() {
// Solver::Options::parameter_tolerance based convergence check.
bool TrustRegionMinimizer::ParameterToleranceReached() {
const double x_norm = x_.norm();
// Compute the norm of the step in the ambient space.
iteration_summary_.step_norm = (x_ - candidate_x_).norm();
const double step_size_tolerance =
options_.parameter_tolerance * (x_norm_ + options_.parameter_tolerance);
options_.parameter_tolerance * (x_norm + options_.parameter_tolerance);
if (iteration_summary_.step_norm > step_size_tolerance) {
return false;
@@ -714,7 +729,7 @@ bool TrustRegionMinimizer::ParameterToleranceReached() {
solver_summary_->message = StringPrintf(
"Parameter tolerance reached. "
"Relative step_norm: %e <= %e.",
(iteration_summary_.step_norm / (x_norm_ + options_.parameter_tolerance)),
(iteration_summary_.step_norm / (x_norm + options_.parameter_tolerance)),
options_.parameter_tolerance);
solver_summary_->termination_type = CONVERGENCE;
if (is_not_silent_) {
@@ -807,7 +822,6 @@ bool TrustRegionMinimizer::IsStepSuccessful() {
// evaluator know that the step has been accepted.
bool TrustRegionMinimizer::HandleSuccessfulStep() {
x_ = candidate_x_;
x_norm_ = x_.norm();
// Since the step was successful, this point has already had the residual
// evaluated (but not the jacobian). So indicate that to the evaluator.
@@ -821,5 +835,4 @@ bool TrustRegionMinimizer::HandleSuccessfulStep() {
return true;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2016 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -43,8 +43,7 @@
#include "ceres/trust_region_strategy.h"
#include "ceres/types.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Generic trust region minimization algorithm.
//
@@ -139,8 +138,6 @@ class CERES_NO_EXPORT TrustRegionMinimizer final : public Minimizer {
// Scaling vector to scale the columns of the Jacobian.
Vector jacobian_scaling_;
// Euclidean norm of x_.
double x_norm_;
// Cost at x_.
double x_cost_;
// Minimum cost encountered up till now.
@@ -160,8 +157,7 @@ class CERES_NO_EXPORT TrustRegionMinimizer final : public Minimizer {
int num_consecutive_invalid_steps_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -32,6 +32,7 @@
#include <numeric>
#include <string>
#include <vector>
#include "ceres/callbacks.h"
#include "ceres/context_impl.h"
@@ -48,10 +49,7 @@
#include "ceres/trust_region_strategy.h"
#include "ceres/wall_time.h"
namespace ceres {
namespace internal {
using std::vector;
namespace ceres::internal {
namespace {
@@ -59,7 +57,8 @@ std::shared_ptr<ParameterBlockOrdering> CreateDefaultLinearSolverOrdering(
const Program& program) {
std::shared_ptr<ParameterBlockOrdering> ordering =
std::make_shared<ParameterBlockOrdering>();
const vector<ParameterBlock*>& parameter_blocks = program.parameter_blocks();
const std::vector<ParameterBlock*>& parameter_blocks =
program.parameter_blocks();
for (auto* parameter_block : parameter_blocks) {
ordering->AddElementToGroup(
const_cast<double*>(parameter_block->user_state()), 0);
@@ -114,6 +113,7 @@ bool ReorderProgram(PreprocessedProblem* pp) {
return ReorderProgramForSchurTypeLinearSolver(
options.linear_solver_type,
options.sparse_linear_algebra_library_type,
options.linear_solver_ordering_type,
pp->problem->parameter_map(),
options.linear_solver_ordering.get(),
pp->reduced_program.get(),
@@ -124,6 +124,7 @@ bool ReorderProgram(PreprocessedProblem* pp) {
!options.dynamic_sparsity) {
return ReorderProgramForSparseCholesky(
options.sparse_linear_algebra_library_type,
options.linear_solver_ordering_type,
*options.linear_solver_ordering,
0, /* use all the rows of the jacobian */
pp->reduced_program.get(),
@@ -139,6 +140,7 @@ bool ReorderProgram(PreprocessedProblem* pp) {
return ReorderProgramForSparseCholesky(
options.sparse_linear_algebra_library_type,
options.linear_solver_ordering_type,
*options.linear_solver_ordering,
pp->linear_solver_options.subset_preconditioner_start_row_block,
pp->reduced_program.get(),
@@ -197,10 +199,16 @@ bool SetupLinearSolver(PreprocessedProblem* pp) {
options.max_linear_solver_iterations;
pp->linear_solver_options.type = options.linear_solver_type;
pp->linear_solver_options.preconditioner_type = options.preconditioner_type;
pp->linear_solver_options.use_spse_initialization =
options.use_spse_initialization;
pp->linear_solver_options.spse_tolerance = options.spse_tolerance;
pp->linear_solver_options.max_num_spse_iterations =
options.max_num_spse_iterations;
pp->linear_solver_options.visibility_clustering_type =
options.visibility_clustering_type;
pp->linear_solver_options.sparse_linear_algebra_library_type =
options.sparse_linear_algebra_library_type;
pp->linear_solver_options.dense_linear_algebra_library_type =
options.dense_linear_algebra_library_type;
pp->linear_solver_options.use_explicit_schur_complement =
@@ -211,7 +219,6 @@ bool SetupLinearSolver(PreprocessedProblem* pp) {
pp->linear_solver_options.max_num_refinement_iterations =
options.max_num_refinement_iterations;
pp->linear_solver_options.num_threads = options.num_threads;
pp->linear_solver_options.use_postordering = options.use_postordering;
pp->linear_solver_options.context = pp->problem->context();
if (IsSchurType(pp->linear_solver_options.type)) {
@@ -225,26 +232,23 @@ bool SetupLinearSolver(PreprocessedProblem* pp) {
if (pp->linear_solver_options.elimination_groups.size() == 1) {
pp->linear_solver_options.elimination_groups.push_back(0);
}
}
if (options.linear_solver_type == SPARSE_SCHUR) {
// When using SPARSE_SCHUR, we ignore the user's postordering
// preferences in certain cases.
//
// 1. SUITE_SPARSE is the sparse linear algebra library requested
// but cholmod_camd is not available.
// 2. CX_SPARSE is the sparse linear algebra library requested.
//
// This ensures that the linear solver does not assume that a
// fill-reducing pre-ordering has been done.
//
// TODO(sameeragarwal): Implement the reordering of parameter
// blocks for CX_SPARSE.
if ((options.sparse_linear_algebra_library_type == SUITE_SPARSE &&
!SuiteSparse::
IsConstrainedApproximateMinimumDegreeOrderingAvailable()) ||
(options.sparse_linear_algebra_library_type == CX_SPARSE)) {
pp->linear_solver_options.use_postordering = true;
}
if (!options.dynamic_sparsity &&
AreJacobianColumnsOrdered(options.linear_solver_type,
options.preconditioner_type,
options.sparse_linear_algebra_library_type,
options.linear_solver_ordering_type)) {
pp->linear_solver_options.ordering_type = OrderingType::NATURAL;
} else {
if (options.linear_solver_ordering_type == ceres::AMD) {
pp->linear_solver_options.ordering_type = OrderingType::AMD;
} else if (options.linear_solver_ordering_type == ceres::NESDIS) {
pp->linear_solver_options.ordering_type = OrderingType::NESDIS;
} else {
LOG(FATAL) << "Congratulations you have found a bug in Ceres Solver."
<< " Please report this to the maintainers. : "
<< options.linear_solver_ordering_type;
}
}
@@ -257,6 +261,8 @@ bool SetupEvaluator(PreprocessedProblem* pp) {
const Solver::Options& options = pp->options;
pp->evaluator_options = Evaluator::Options();
pp->evaluator_options.linear_solver_type = options.linear_solver_type;
pp->evaluator_options.sparse_linear_algebra_library_type =
options.sparse_linear_algebra_library_type;
pp->evaluator_options.num_eliminate_blocks = 0;
if (IsSchurType(options.linear_solver_type)) {
pp->evaluator_options.num_eliminate_blocks =
@@ -330,13 +336,19 @@ bool SetupInnerIterationMinimizer(PreprocessedProblem* pp) {
}
// Configure and create a TrustRegionMinimizer object.
void SetupMinimizerOptions(PreprocessedProblem* pp) {
bool SetupMinimizerOptions(PreprocessedProblem* pp) {
const Solver::Options& options = pp->options;
SetupCommonMinimizerOptions(pp);
pp->minimizer_options.is_constrained =
pp->reduced_program->IsBoundsConstrained();
pp->minimizer_options.jacobian = pp->evaluator->CreateJacobian();
if (pp->minimizer_options.jacobian == nullptr) {
pp->error =
"Unable to create Jacobian matrix. Likely because it is too large.";
return false;
}
pp->minimizer_options.inner_iteration_minimizer =
pp->inner_iteration_minimizer;
@@ -349,9 +361,12 @@ void SetupMinimizerOptions(PreprocessedProblem* pp) {
strategy_options.trust_region_strategy_type =
options.trust_region_strategy_type;
strategy_options.dogleg_type = options.dogleg_type;
strategy_options.context = pp->problem->context();
strategy_options.num_threads = options.num_threads;
pp->minimizer_options.trust_region_strategy =
TrustRegionStrategy::Create(strategy_options);
CHECK(pp->minimizer_options.trust_region_strategy != nullptr);
return true;
}
} // namespace
@@ -387,9 +402,7 @@ bool TrustRegionPreprocessor::Preprocess(const Solver::Options& options,
return false;
}
SetupMinimizerOptions(pp);
return true;
return SetupMinimizerOptions(pp);
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -35,8 +35,7 @@
#include "ceres/internal/export.h"
#include "ceres/preprocessor.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class CERES_NO_EXPORT TrustRegionPreprocessor final : public Preprocessor {
public:
@@ -45,8 +44,7 @@ class CERES_NO_EXPORT TrustRegionPreprocessor final : public Preprocessor {
PreprocessedProblem* preprocessed_problem) override;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2016 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -35,8 +35,7 @@
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
TrustRegionStepEvaluator::TrustRegionStepEvaluator(
const double initial_cost, const int max_consecutive_nonmonotonic_steps)
@@ -111,5 +110,4 @@ void TrustRegionStepEvaluator::StepAccepted(const double cost,
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2016 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -33,8 +33,7 @@
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// The job of the TrustRegionStepEvaluator is to evaluate the quality
// of a step, i.e., how the cost of a step compares with the reduction
@@ -118,7 +117,6 @@ class CERES_NO_EXPORT TrustRegionStepEvaluator {
int num_consecutive_nonmonotonic_steps_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_TRUST_REGION_STEP_EVALUATOR_H_

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -37,8 +37,7 @@
#include "ceres/dogleg_strategy.h"
#include "ceres/levenberg_marquardt_strategy.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
TrustRegionStrategy::~TrustRegionStrategy() = default;
@@ -59,5 +58,4 @@ std::unique_ptr<TrustRegionStrategy> TrustRegionStrategy::Create(
return nullptr;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -38,8 +38,7 @@
#include "ceres/internal/export.h"
#include "ceres/linear_solver.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class LinearSolver;
class SparseMatrix;
@@ -74,6 +73,9 @@ class CERES_NO_EXPORT TrustRegionStrategy {
// Further specify which dogleg method to use
DoglegType dogleg_type = TRADITIONAL_DOGLEG;
ContextImpl* context = nullptr;
int num_threads = 1;
};
// Factory.
@@ -112,7 +114,8 @@ class CERES_NO_EXPORT TrustRegionStrategy {
int num_iterations = -1;
// Status of the linear solver used to solve the Newton system.
LinearSolverTerminationType termination_type = LINEAR_SOLVER_FAILURE;
LinearSolverTerminationType termination_type =
LinearSolverTerminationType::FAILURE;
};
// Use the current radius to solve for the trust region step.
@@ -141,8 +144,7 @@ class CERES_NO_EXPORT TrustRegionStrategy {
virtual double Radius() const = 0;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -39,14 +39,12 @@
namespace ceres {
using std::string;
// clang-format off
#define CASESTR(x) case x: return #x
#define STRENUM(x) if (value == #x) { *type = x; return true; }
// clang-format on
static void UpperCase(string* input) {
static void UpperCase(std::string* input) {
std::transform(input->begin(), input->end(), input->begin(), ::toupper);
}
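For readers unfamiliar with the macro pair: CASESTR maps an enum value to its name, STRENUM does the reverse. For example, CASESTR(IDENTITY) and STRENUM(IDENTITY) expand to:
  case IDENTITY: return "IDENTITY";
  if (value == "IDENTITY") { *type = IDENTITY; return true; }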
@@ -64,7 +62,7 @@ const char* LinearSolverTypeToString(LinearSolverType type) {
}
}
bool StringToLinearSolverType(string value, LinearSolverType* type) {
bool StringToLinearSolverType(std::string value, LinearSolverType* type) {
UpperCase(&value);
STRENUM(DENSE_NORMAL_CHOLESKY);
STRENUM(DENSE_QR);
@@ -81,6 +79,7 @@ const char* PreconditionerTypeToString(PreconditionerType type) {
CASESTR(IDENTITY);
CASESTR(JACOBI);
CASESTR(SCHUR_JACOBI);
CASESTR(SCHUR_POWER_SERIES_EXPANSION);
CASESTR(CLUSTER_JACOBI);
CASESTR(CLUSTER_TRIDIAGONAL);
CASESTR(SUBSET);
@@ -89,11 +88,12 @@ const char* PreconditionerTypeToString(PreconditionerType type) {
}
}
bool StringToPreconditionerType(string value, PreconditionerType* type) {
bool StringToPreconditionerType(std::string value, PreconditionerType* type) {
UpperCase(&value);
STRENUM(IDENTITY);
STRENUM(JACOBI);
STRENUM(SCHUR_JACOBI);
STRENUM(SCHUR_POWER_SERIES_EXPANSION);
STRENUM(CLUSTER_JACOBI);
STRENUM(CLUSTER_TRIDIAGONAL);
STRENUM(SUBSET);
@@ -104,9 +104,9 @@ const char* SparseLinearAlgebraLibraryTypeToString(
SparseLinearAlgebraLibraryType type) {
switch (type) {
CASESTR(SUITE_SPARSE);
CASESTR(CX_SPARSE);
CASESTR(EIGEN_SPARSE);
CASESTR(ACCELERATE_SPARSE);
CASESTR(CUDA_SPARSE);
CASESTR(NO_SPARSE);
default:
return "UNKNOWN";
@@ -114,16 +114,33 @@ const char* SparseLinearAlgebraLibraryTypeToString(
}
bool StringToSparseLinearAlgebraLibraryType(
string value, SparseLinearAlgebraLibraryType* type) {
std::string value, SparseLinearAlgebraLibraryType* type) {
UpperCase(&value);
STRENUM(SUITE_SPARSE);
STRENUM(CX_SPARSE);
STRENUM(EIGEN_SPARSE);
STRENUM(ACCELERATE_SPARSE);
STRENUM(CUDA_SPARSE);
STRENUM(NO_SPARSE);
return false;
}
const char* LinearSolverOrderingTypeToString(LinearSolverOrderingType type) {
switch (type) {
CASESTR(AMD);
CASESTR(NESDIS);
default:
return "UNKNOWN";
}
}
bool StringToLinearSolverOrderingType(std::string value,
LinearSolverOrderingType* type) {
UpperCase(&value);
STRENUM(AMD);
STRENUM(NESDIS);
return false;
}
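Because the input is upper-cased first, the string-to-enum conversion is case-insensitive. A usage sketch:
  LinearSolverOrderingType ordering_type;
  CHECK(StringToLinearSolverOrderingType("nesdis", &ordering_type));
  // ordering_type == NESDIS, and
  // LinearSolverOrderingTypeToString(ordering_type) returns "NESDIS".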
const char* DenseLinearAlgebraLibraryTypeToString(
DenseLinearAlgebraLibraryType type) {
switch (type) {
@@ -136,7 +153,7 @@ const char* DenseLinearAlgebraLibraryTypeToString(
}
bool StringToDenseLinearAlgebraLibraryType(
string value, DenseLinearAlgebraLibraryType* type) {
std::string value, DenseLinearAlgebraLibraryType* type) {
UpperCase(&value);
STRENUM(EIGEN);
STRENUM(LAPACK);
@@ -153,7 +170,7 @@ const char* TrustRegionStrategyTypeToString(TrustRegionStrategyType type) {
}
}
bool StringToTrustRegionStrategyType(string value,
bool StringToTrustRegionStrategyType(std::string value,
TrustRegionStrategyType* type) {
UpperCase(&value);
STRENUM(LEVENBERG_MARQUARDT);
@@ -170,7 +187,7 @@ const char* DoglegTypeToString(DoglegType type) {
}
}
bool StringToDoglegType(string value, DoglegType* type) {
bool StringToDoglegType(std::string value, DoglegType* type) {
UpperCase(&value);
STRENUM(TRADITIONAL_DOGLEG);
STRENUM(SUBSPACE_DOGLEG);
@@ -186,7 +203,7 @@ const char* MinimizerTypeToString(MinimizerType type) {
}
}
bool StringToMinimizerType(string value, MinimizerType* type) {
bool StringToMinimizerType(std::string value, MinimizerType* type) {
UpperCase(&value);
STRENUM(TRUST_REGION);
STRENUM(LINE_SEARCH);
@@ -204,7 +221,7 @@ const char* LineSearchDirectionTypeToString(LineSearchDirectionType type) {
}
}
bool StringToLineSearchDirectionType(string value,
bool StringToLineSearchDirectionType(std::string value,
LineSearchDirectionType* type) {
UpperCase(&value);
STRENUM(STEEPEST_DESCENT);
@@ -223,7 +240,7 @@ const char* LineSearchTypeToString(LineSearchType type) {
}
}
bool StringToLineSearchType(string value, LineSearchType* type) {
bool StringToLineSearchType(std::string value, LineSearchType* type) {
UpperCase(&value);
STRENUM(ARMIJO);
STRENUM(WOLFE);
@@ -241,7 +258,7 @@ const char* LineSearchInterpolationTypeToString(
}
}
bool StringToLineSearchInterpolationType(string value,
bool StringToLineSearchInterpolationType(std::string value,
LineSearchInterpolationType* type) {
UpperCase(&value);
STRENUM(BISECTION);
@@ -262,7 +279,7 @@ const char* NonlinearConjugateGradientTypeToString(
}
bool StringToNonlinearConjugateGradientType(
string value, NonlinearConjugateGradientType* type) {
std::string value, NonlinearConjugateGradientType* type) {
UpperCase(&value);
STRENUM(FLETCHER_REEVES);
STRENUM(POLAK_RIBIERE);
@@ -279,7 +296,7 @@ const char* CovarianceAlgorithmTypeToString(CovarianceAlgorithmType type) {
}
}
bool StringToCovarianceAlgorithmType(string value,
bool StringToCovarianceAlgorithmType(std::string value,
CovarianceAlgorithmType* type) {
UpperCase(&value);
STRENUM(DENSE_SVD);
@@ -297,7 +314,8 @@ const char* NumericDiffMethodTypeToString(NumericDiffMethodType type) {
}
}
bool StringToNumericDiffMethodType(string value, NumericDiffMethodType* type) {
bool StringToNumericDiffMethodType(std::string value,
NumericDiffMethodType* type) {
UpperCase(&value);
STRENUM(CENTRAL);
STRENUM(FORWARD);
@@ -314,7 +332,7 @@ const char* VisibilityClusteringTypeToString(VisibilityClusteringType type) {
}
}
bool StringToVisibilityClusteringType(string value,
bool StringToVisibilityClusteringType(std::string value,
VisibilityClusteringType* type) {
UpperCase(&value);
STRENUM(CANONICAL_VIEWS);
@@ -387,14 +405,6 @@ bool IsSparseLinearAlgebraLibraryTypeAvailable(
#endif
}
if (type == CX_SPARSE) {
#ifdef CERES_NO_CXSPARSE
return false;
#else
return true;
#endif
}
if (type == ACCELERATE_SPARSE) {
#ifdef CERES_NO_ACCELERATE_SPARSE
return false;
@@ -411,6 +421,18 @@ bool IsSparseLinearAlgebraLibraryTypeAvailable(
#endif
}
if (type == CUDA_SPARSE) {
#ifdef CERES_NO_CUDA
return false;
#else
return true;
#endif
}
if (type == NO_SPARSE) {
return true;
}
LOG(WARNING) << "Unknown sparse linear algebra library " << type;
return false;
}

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -44,18 +44,11 @@
#include "ceres/pair_hash.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::make_pair;
using std::max;
using std::pair;
using std::set;
using std::vector;
namespace ceres::internal {
void ComputeVisibility(const CompressedRowBlockStructure& block_structure,
const int num_eliminate_blocks,
vector<set<int>>* visibility) {
std::vector<std::set<int>>* visibility) {
CHECK(visibility != nullptr);
// Clear the visibility vector and resize it to hold a
@@ -64,7 +57,7 @@ void ComputeVisibility(const CompressedRowBlockStructure& block_structure,
visibility->resize(block_structure.cols.size() - num_eliminate_blocks);
for (const auto& row : block_structure.rows) {
const vector<Cell>& cells = row.cells;
const std::vector<Cell>& cells = row.cells;
int block_id = cells[0].block_id;
// If the first block is not an e_block, then skip this row block.
if (block_id >= num_eliminate_blocks) {
@@ -81,7 +74,7 @@ void ComputeVisibility(const CompressedRowBlockStructure& block_structure,
}
std::unique_ptr<WeightedGraph<int>> CreateSchurComplementGraph(
const vector<set<int>>& visibility) {
const std::vector<std::set<int>>& visibility) {
const time_t start_time = time(nullptr);
// Compute the number of e_blocks/point blocks. Since the visibility
// set for each f_block/camera contains the set of e_blocks/points
@@ -89,7 +82,7 @@ std::unique_ptr<WeightedGraph<int>> CreateSchurComplementGraph(
int num_points = 0;
for (const auto& visible : visibility) {
if (!visible.empty()) {
num_points = max(num_points, (*visible.rbegin()) + 1);
num_points = std::max(num_points, (*visible.rbegin()) + 1);
}
}
@@ -98,9 +91,9 @@ std::unique_ptr<WeightedGraph<int>> CreateSchurComplementGraph(
// cameras. However, to compute the sparsity structure of the Schur
// Complement efficiently, it's better to have the point->camera
// mapping.
vector<set<int>> inverse_visibility(num_points);
std::vector<std::set<int>> inverse_visibility(num_points);
for (int i = 0; i < visibility.size(); i++) {
const set<int>& visibility_set = visibility[i];
const std::set<int>& visibility_set = visibility[i];
for (int v : visibility_set) {
inverse_visibility[v].insert(i);
}
@@ -108,7 +101,7 @@ std::unique_ptr<WeightedGraph<int>> CreateSchurComplementGraph(
// Map from camera pairs to number of points visible to both cameras
// in the pair.
std::unordered_map<pair<int, int>, int, pair_hash> camera_pairs;
std::unordered_map<std::pair<int, int>, int, pair_hash> camera_pairs;
// Count the number of points visible to each camera/f_block pair.
for (const auto& inverse_visibility_set : inverse_visibility) {
@@ -117,7 +110,7 @@ std::unique_ptr<WeightedGraph<int>> CreateSchurComplementGraph(
++camera1) {
auto camera2 = camera1;
for (++camera2; camera2 != inverse_visibility_set.end(); ++camera2) {
++(camera_pairs[make_pair(*camera1, *camera2)]);
++(camera_pairs[std::make_pair(*camera1, *camera2)]);
}
}
}
@@ -151,5 +144,4 @@ std::unique_ptr<WeightedGraph<int>> CreateSchurComplementGraph(
return graph;
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
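For reference, the pair-counting loop above can be exercised in isolation. Below is a small sketch of the same idiom; PairHash is a stand-in for Ceres' internal pair_hash, and the visibility data is made up.

#include <cstdint>
#include <functional>
#include <set>
#include <unordered_map>
#include <utility>
#include <vector>

// Stand-in for ceres::internal::pair_hash (an assumption, not the real one).
struct PairHash {
  std::size_t operator()(const std::pair<int, int>& p) const {
    return std::hash<std::int64_t>()(
        (static_cast<std::int64_t>(p.first) << 32) ^
        static_cast<std::uint32_t>(p.second));
  }
};

int main() {
  // inverse_visibility[p] = cameras observing point p.
  std::vector<std::set<int>> inverse_visibility = {{0, 1, 2}, {1, 2}};
  std::unordered_map<std::pair<int, int>, int, PairHash> camera_pairs;
  for (const auto& cameras : inverse_visibility) {
    for (auto c1 = cameras.begin(); c1 != cameras.end(); ++c1) {
      auto c2 = c1;
      for (++c2; c2 != cameras.end(); ++c2) {
        ++camera_pairs[std::make_pair(*c1, *c2)];  // shared-point count
      }
    }
  }
  // camera_pairs is now {(0,1):1, (0,2):1, (1,2):2} — the edge weights
  // of the Schur complement graph.
}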

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -43,8 +43,7 @@
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/export.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
struct CompressedRowBlockStructure;
@@ -77,8 +76,7 @@ CERES_NO_EXPORT void ComputeVisibility(
CERES_NO_EXPORT std::unique_ptr<WeightedGraph<int>> CreateSchurComplementGraph(
const std::vector<std::set<int>>& visibility);
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"
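A recurring change across these files is the switch to C++17's nested-namespace syntax. The two spellings below declare exactly the same namespace; Foo is a placeholder for illustration.

// Pre-C++17 spelling, as previously used throughout Ceres:
namespace ceres {
namespace internal {
void Foo();
}  // namespace internal
}  // namespace ceres

// C++17 nested-namespace spelling, now used instead:
namespace ceres::internal {
void Foo();
}  // namespace ceres::internal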

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -35,6 +35,8 @@
#include <iterator>
#include <memory>
#include <set>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
@@ -50,14 +52,7 @@
#include "ceres/visibility.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
using std::make_pair;
using std::pair;
using std::set;
using std::swap;
using std::vector;
namespace ceres::internal {
// TODO(sameeragarwal): Currently these are magic weights for the
// preconditioner construction. Move these higher up into the Options
@@ -82,10 +77,7 @@ VisibilityBasedPreconditioner::VisibilityBasedPreconditioner(
CHECK(options_.context != nullptr);
// Vector of camera block sizes
block_size_.resize(num_blocks_);
for (int i = 0; i < num_blocks_; ++i) {
block_size_[i] = bs.cols[i + options_.elimination_groups[0]].size;
}
blocks_ = Tail(bs.cols, bs.cols.size() - options_.elimination_groups[0]);
const time_t start_time = time(nullptr);
switch (options_.type) {
@@ -107,14 +99,7 @@ VisibilityBasedPreconditioner::VisibilityBasedPreconditioner(
LinearSolver::Options sparse_cholesky_options;
sparse_cholesky_options.sparse_linear_algebra_library_type =
options_.sparse_linear_algebra_library_type;
// The preconditioner's sparsity is not available in the
// preprocessor, so the columns of the Jacobian have not been
// reordered to minimize fill in when computing its sparse Cholesky
// factorization. So we must tell the SparseCholesky object to
// perform approximate minimum-degree reordering, which is done by
// setting use_postordering to true.
sparse_cholesky_options.use_postordering = true;
sparse_cholesky_options.ordering_type = options_.ordering_type;
sparse_cholesky_ = SparseCholesky::Create(sparse_cholesky_options);
const time_t init_time = time(nullptr);
@@ -132,13 +117,13 @@ VisibilityBasedPreconditioner::~VisibilityBasedPreconditioner() = default;
// preconditioner matrix.
void VisibilityBasedPreconditioner::ComputeClusterJacobiSparsity(
const CompressedRowBlockStructure& bs) {
vector<set<int>> visibility;
std::vector<std::set<int>> visibility;
ComputeVisibility(bs, options_.elimination_groups[0], &visibility);
CHECK_EQ(num_blocks_, visibility.size());
ClusterCameras(visibility);
cluster_pairs_.clear();
for (int i = 0; i < num_clusters_; ++i) {
cluster_pairs_.insert(make_pair(i, i));
cluster_pairs_.insert(std::make_pair(i, i));
}
}
@@ -150,7 +135,7 @@ void VisibilityBasedPreconditioner::ComputeClusterJacobiSparsity(
// of edges in this forest are the cluster pairs.
void VisibilityBasedPreconditioner::ComputeClusterTridiagonalSparsity(
const CompressedRowBlockStructure& bs) {
vector<set<int>> visibility;
std::vector<std::set<int>> visibility;
ComputeVisibility(bs, options_.elimination_groups[0], &visibility);
CHECK_EQ(num_blocks_, visibility.size());
ClusterCameras(visibility);
@@ -159,7 +144,7 @@ void VisibilityBasedPreconditioner::ComputeClusterTridiagonalSparsity(
// edges are the number of 3D points/e_blocks visible in both the
// clusters at the ends of the edge. Return an approximate degree-2
// maximum spanning forest of this graph.
vector<set<int>> cluster_visibility;
std::vector<std::set<int>> cluster_visibility;
ComputeClusterVisibility(visibility, &cluster_visibility);
auto cluster_graph = CreateClusterGraph(cluster_visibility);
CHECK(cluster_graph != nullptr);
@@ -172,8 +157,8 @@ void VisibilityBasedPreconditioner::ComputeClusterTridiagonalSparsity(
void VisibilityBasedPreconditioner::InitStorage(
const CompressedRowBlockStructure& bs) {
ComputeBlockPairsInPreconditioner(bs);
m_ = std::make_unique<BlockRandomAccessSparseMatrix>(block_size_,
block_pairs_);
m_ = std::make_unique<BlockRandomAccessSparseMatrix>(
blocks_, block_pairs_, options_.context, options_.num_threads);
}
// Call the canonical views algorithm and cluster the cameras based on
@@ -183,14 +168,14 @@ void VisibilityBasedPreconditioner::InitStorage(
// The cluster_membership_ vector is updated to indicate cluster
// memberships for each camera block.
void VisibilityBasedPreconditioner::ClusterCameras(
const vector<set<int>>& visibility) {
const std::vector<std::set<int>>& visibility) {
auto schur_complement_graph = CreateSchurComplementGraph(visibility);
CHECK(schur_complement_graph != nullptr);
std::unordered_map<int, int> membership;
if (options_.visibility_clustering_type == CANONICAL_VIEWS) {
vector<int> centers;
std::vector<int> centers;
CanonicalViewsClusteringOptions clustering_options;
clustering_options.size_penalty_weight = kCanonicalViewsSizePenaltyWeight;
clustering_options.similarity_penalty_weight =
@@ -236,7 +221,7 @@ void VisibilityBasedPreconditioner::ComputeBlockPairsInPreconditioner(
const CompressedRowBlockStructure& bs) {
block_pairs_.clear();
for (int i = 0; i < num_blocks_; ++i) {
block_pairs_.insert(make_pair(i, i));
block_pairs_.insert(std::make_pair(i, i));
}
int r = 0;
@@ -264,7 +249,7 @@ void VisibilityBasedPreconditioner::ComputeBlockPairsInPreconditioner(
break;
}
set<int> f_blocks;
std::set<int> f_blocks;
for (; r < num_row_blocks; ++r) {
const CompressedRow& row = bs.rows[r];
if (row.cells.front().block_id != e_block_id) {
@@ -303,7 +288,7 @@ void VisibilityBasedPreconditioner::ComputeBlockPairsInPreconditioner(
const int block2 = cell.block_id - num_eliminate_blocks;
if (block1 <= block2) {
if (IsBlockPairInPreconditioner(block1, block2)) {
block_pairs_.insert(make_pair(block1, block2));
block_pairs_.insert(std::make_pair(block1, block2));
}
}
}
@@ -354,7 +339,7 @@ bool VisibilityBasedPreconditioner::UpdateImpl(const BlockSparseMatrix& A,
// scaling is not needed, which is quite often in our experience.
LinearSolverTerminationType status = Factorize();
if (status == LINEAR_SOLVER_FATAL_ERROR) {
if (status == LinearSolverTerminationType::FATAL_ERROR) {
return false;
}
@@ -363,7 +348,8 @@ bool VisibilityBasedPreconditioner::UpdateImpl(const BlockSparseMatrix& A,
// belong to the edges of the degree-2 forest. In the CLUSTER_JACOBI
// case, the preconditioner is guaranteed to be positive
// semidefinite.
if (status == LINEAR_SOLVER_FAILURE && options_.type == CLUSTER_TRIDIAGONAL) {
if (status == LinearSolverTerminationType::FAILURE &&
options_.type == CLUSTER_TRIDIAGONAL) {
VLOG(1) << "Unscaled factorization failed. Retrying with off-diagonal "
<< "scaling";
ScaleOffDiagonalCells();
@@ -371,7 +357,7 @@ bool VisibilityBasedPreconditioner::UpdateImpl(const BlockSparseMatrix& A,
}
VLOG(2) << "Compute time: " << time(nullptr) - start_time;
return (status == LINEAR_SOLVER_SUCCESS);
return (status == LinearSolverTerminationType::SUCCESS);
}
// Consider the preconditioner matrix as a meta-block matrix, whose
@@ -399,35 +385,44 @@ void VisibilityBasedPreconditioner::ScaleOffDiagonalCells() {
// dominance. See Lemma 1 in "Visibility Based Preconditioning
// For Bundle Adjustment".
MatrixRef m(cell_info->values, row_stride, col_stride);
m.block(r, c, block_size_[block1], block_size_[block2]) *= 0.5;
m.block(r, c, blocks_[block1].size, blocks_[block2].size) *= 0.5;
}
}
// Compute the sparse Cholesky factorization of the preconditioner
// matrix.
LinearSolverTerminationType VisibilityBasedPreconditioner::Factorize() {
// Extract the TripletSparseMatrix that is used for actually storing
// Extract the BlockSparseMatrix that is used for actually storing
// S and convert it into a CompressedRowSparseMatrix.
const TripletSparseMatrix* tsm =
down_cast<BlockRandomAccessSparseMatrix*>(m_.get())->mutable_matrix();
std::unique_ptr<CompressedRowSparseMatrix> lhs;
const BlockSparseMatrix* bsm =
down_cast<BlockRandomAccessSparseMatrix*>(m_.get())->matrix();
const CompressedRowSparseMatrix::StorageType storage_type =
sparse_cholesky_->StorageType();
if (storage_type == CompressedRowSparseMatrix::UPPER_TRIANGULAR) {
lhs = CompressedRowSparseMatrix::FromTripletSparseMatrix(*tsm);
lhs->set_storage_type(CompressedRowSparseMatrix::UPPER_TRIANGULAR);
if (storage_type ==
CompressedRowSparseMatrix::StorageType::UPPER_TRIANGULAR) {
if (!m_crs_) {
m_crs_ = bsm->ToCompressedRowSparseMatrix();
m_crs_->set_storage_type(
CompressedRowSparseMatrix::StorageType::UPPER_TRIANGULAR);
} else {
bsm->UpdateCompressedRowSparseMatrix(m_crs_.get());
}
} else {
lhs = CompressedRowSparseMatrix::FromTripletSparseMatrixTransposed(*tsm);
lhs->set_storage_type(CompressedRowSparseMatrix::LOWER_TRIANGULAR);
if (!m_crs_) {
m_crs_ = bsm->ToCompressedRowSparseMatrixTranspose();
m_crs_->set_storage_type(
CompressedRowSparseMatrix::StorageType::LOWER_TRIANGULAR);
} else {
bsm->UpdateCompressedRowSparseMatrixTranspose(m_crs_.get());
}
}
std::string message;
return sparse_cholesky_->Factorize(lhs.get(), &message);
return sparse_cholesky_->Factorize(m_crs_.get(), &message);
}
void VisibilityBasedPreconditioner::RightMultiply(const double* x,
double* y) const {
void VisibilityBasedPreconditioner::RightMultiplyAndAccumulate(
const double* x, double* y) const {
CHECK(x != nullptr);
CHECK(y != nullptr);
CHECK(sparse_cholesky_ != nullptr);
@@ -445,9 +440,9 @@ bool VisibilityBasedPreconditioner::IsBlockPairInPreconditioner(
int cluster1 = cluster_membership_[block1];
int cluster2 = cluster_membership_[block2];
if (cluster1 > cluster2) {
swap(cluster1, cluster2);
std::swap(cluster1, cluster2);
}
return (cluster_pairs_.count(make_pair(cluster1, cluster2)) > 0);
return (cluster_pairs_.count(std::make_pair(cluster1, cluster2)) > 0);
}
bool VisibilityBasedPreconditioner::IsBlockPairOffDiagonal(
@@ -459,7 +454,7 @@ bool VisibilityBasedPreconditioner::IsBlockPairOffDiagonal(
// each vertex.
void VisibilityBasedPreconditioner::ForestToClusterPairs(
const WeightedGraph<int>& forest,
std::unordered_set<pair<int, int>, pair_hash>* cluster_pairs) const {
std::unordered_set<std::pair<int, int>, pair_hash>* cluster_pairs) const {
CHECK(cluster_pairs != nullptr);
cluster_pairs->clear();
const std::unordered_set<int>& vertices = forest.vertices();
@@ -468,11 +463,11 @@ void VisibilityBasedPreconditioner::ForestToClusterPairs(
// Add all the cluster pairs corresponding to the edges in the
// forest.
for (const int cluster1 : vertices) {
cluster_pairs->insert(make_pair(cluster1, cluster1));
cluster_pairs->insert(std::make_pair(cluster1, cluster1));
const std::unordered_set<int>& neighbors = forest.Neighbors(cluster1);
for (const int cluster2 : neighbors) {
if (cluster1 < cluster2) {
cluster_pairs->insert(make_pair(cluster1, cluster2));
cluster_pairs->insert(std::make_pair(cluster1, cluster2));
}
}
}
@@ -482,8 +477,8 @@ void VisibilityBasedPreconditioner::ForestToClusterPairs(
// of all its cameras. In other words, the set of points visible to
// any camera in the cluster.
void VisibilityBasedPreconditioner::ComputeClusterVisibility(
const vector<set<int>>& visibility,
vector<set<int>>* cluster_visibility) const {
const std::vector<std::set<int>>& visibility,
std::vector<std::set<int>>* cluster_visibility) const {
CHECK(cluster_visibility != nullptr);
cluster_visibility->resize(0);
cluster_visibility->resize(num_clusters_);
@@ -499,7 +494,7 @@ void VisibilityBasedPreconditioner::ComputeClusterVisibility(
// vertices.
std::unique_ptr<WeightedGraph<int>>
VisibilityBasedPreconditioner::CreateClusterGraph(
const vector<set<int>>& cluster_visibility) const {
const std::vector<std::set<int>>& cluster_visibility) const {
auto cluster_graph = std::make_unique<WeightedGraph<int>>();
for (int i = 0; i < num_clusters_; ++i) {
@@ -507,15 +502,15 @@ VisibilityBasedPreconditioner::CreateClusterGraph(
}
for (int i = 0; i < num_clusters_; ++i) {
const set<int>& cluster_i = cluster_visibility[i];
const std::set<int>& cluster_i = cluster_visibility[i];
for (int j = i + 1; j < num_clusters_; ++j) {
vector<int> intersection;
const set<int>& cluster_j = cluster_visibility[j];
set_intersection(cluster_i.begin(),
cluster_i.end(),
cluster_j.begin(),
cluster_j.end(),
back_inserter(intersection));
std::vector<int> intersection;
const std::set<int>& cluster_j = cluster_visibility[j];
std::set_intersection(cluster_i.begin(),
cluster_i.end(),
cluster_j.begin(),
cluster_j.end(),
std::back_inserter(intersection));
if (intersection.size() > 0) {
// Clusters interact strongly when they share a large number
@@ -540,7 +535,7 @@ VisibilityBasedPreconditioner::CreateClusterGraph(
// of integers so that the cluster ids are in [0, num_clusters_).
void VisibilityBasedPreconditioner::FlattenMembershipMap(
const std::unordered_map<int, int>& membership_map,
vector<int>* membership_vector) const {
std::vector<int>* membership_vector) const {
CHECK(membership_vector != nullptr);
membership_vector->resize(0);
membership_vector->resize(num_blocks_, -1);
@@ -576,5 +571,4 @@ void VisibilityBasedPreconditioner::FlattenMembershipMap(
}
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
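Two smaller shifts in this file are worth noting. First, termination statuses moved from unscoped constants such as LINEAR_SOLVER_SUCCESS to a scoped enum, so every use must now be qualified. A minimal illustration (the surrounding function is hypothetical, only the enumerator names follow the diff):

enum class LinearSolverTerminationType { SUCCESS, FAILURE, FATAL_ERROR };

bool HandleStatus(LinearSolverTerminationType status) {
  if (status == LinearSolverTerminationType::FATAL_ERROR) {
    return false;  // unrecoverable, matching the UpdateImpl logic above
  }
  return status == LinearSolverTerminationType::SUCCESS;
}

Second, Factorize now builds the CompressedRowSparseMatrix once, caches it in m_crs_, and only refreshes its values on later calls, instead of reconverting from a triplet matrix on every update; since the preconditioner's sparsity pattern is fixed after InitStorage, only the numeric values change between iterations.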

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2017 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -55,14 +55,14 @@
#include <utility>
#include <vector>
#include "ceres/block_structure.h"
#include "ceres/graph.h"
#include "ceres/linear_solver.h"
#include "ceres/pair_hash.h"
#include "ceres/preconditioner.h"
#include "ceres/sparse_cholesky.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
class BlockRandomAccessSparseMatrix;
class BlockSparseMatrix;
@@ -123,7 +123,7 @@ class SchurEliminatorBase;
// VisibilityBasedPreconditioner preconditioner(
// *A.block_structure(), options);
// preconditioner.Update(A, nullptr);
// preconditioner.RightMultiply(x, y);
// preconditioner.RightMultiplyAndAccumulate(x, y);
class CERES_NO_EXPORT VisibilityBasedPreconditioner
: public BlockSparseMatrixPreconditioner {
public:
@@ -141,7 +141,7 @@ class CERES_NO_EXPORT VisibilityBasedPreconditioner
~VisibilityBasedPreconditioner() override;
// Preconditioner interface
void RightMultiply(const double* x, double* y) const final;
void RightMultiplyAndAccumulate(const double* x, double* y) const final;
int num_rows() const final;
friend class VisibilityBasedPreconditionerTest;
@@ -177,7 +177,7 @@ class CERES_NO_EXPORT VisibilityBasedPreconditioner
int num_clusters_;
// Sizes of the blocks in the Schur complement.
std::vector<int> block_size_;
std::vector<Block> blocks_;
// Mapping from cameras to clusters.
std::vector<int> cluster_membership_;
@@ -194,10 +194,10 @@ class CERES_NO_EXPORT VisibilityBasedPreconditioner
// Preconditioner matrix.
std::unique_ptr<BlockRandomAccessSparseMatrix> m_;
std::unique_ptr<CompressedRowSparseMatrix> m_crs_;
std::unique_ptr<SparseCholesky> sparse_cholesky_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#endif // CERES_INTERNAL_VISIBILITY_BASED_PRECONDITIONER_H_
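The RightMultiply -> RightMultiplyAndAccumulate rename makes the contract explicit: the preconditioner adds its product into y rather than overwriting it. A dense stand-in sketch of that contract follows; it is illustrative only, as the real implementation applies the sparse Cholesky factor.

#include <cstddef>
#include <vector>

// Computes y += M * x; callers must pre-initialise y.
void RightMultiplyAndAccumulate(const std::vector<std::vector<double>>& M,
                                const double* x, double* y) {
  for (std::size_t r = 0; r < M.size(); ++r) {
    for (std::size_t c = 0; c < M[r].size(); ++c) {
      y[r] += M[r][c] * x[c];
    }
  }
}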

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -30,13 +30,9 @@
#include "ceres/wall_time.h"
#include "ceres/internal/config.h"
#ifdef CERES_USE_OPENMP
#include <omp.h>
#else
#include <ctime>
#endif
#include "ceres/internal/config.h"
#ifdef _WIN32
#include <windows.h>
@@ -44,13 +40,9 @@
#include <sys/time.h>
#endif
namespace ceres {
namespace internal {
namespace ceres::internal {
double WallTimeInSeconds() {
#ifdef CERES_USE_OPENMP
return omp_get_wtime();
#else
#ifdef _WIN32
LARGE_INTEGER count;
LARGE_INTEGER frequency;
@@ -63,7 +55,6 @@ double WallTimeInSeconds() {
gettimeofday(&time_val, nullptr);
return (time_val.tv_sec + time_val.tv_usec * 1e-6);
#endif
#endif
}
EventLogger::EventLogger(const std::string& logger_name) {
@@ -74,7 +65,7 @@ EventLogger::EventLogger(const std::string& logger_name) {
start_time_ = WallTimeInSeconds();
last_event_time_ = start_time_;
events_ = StringPrintf(
"\n%s\n Delta Cumulative\n",
"\n%s\n Delta Cumulative\n",
logger_name.c_str());
}
@@ -103,5 +94,4 @@ void EventLogger::AddEvent(const std::string& event_name) {
absolute_time_delta);
}
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
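With the OpenMP branch removed, WallTimeInSeconds keeps only the Windows and gettimeofday paths shown above. For comparison, an equivalent portable sketch in standard C++ (not what Ceres ships):

#include <chrono>

double WallTimeInSeconds() {
  // Seconds from an arbitrary fixed origin; microsecond-or-better
  // granularity on common platforms.
  using clock = std::chrono::steady_clock;
  static const clock::time_point origin = clock::now();
  return std::chrono::duration<double>(clock::now() - origin).count();
}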

View File

@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -39,13 +39,10 @@
#include "ceres/stringprintf.h"
#include "glog/logging.h"
namespace ceres {
namespace internal {
namespace ceres::internal {
// Returns time, in seconds, from some arbitrary starting point. If
// OpenMP is available then the high precision openmp_get_wtime()
// function is used. Otherwise on unixes, gettimeofday is used. The
// granularity is in seconds on windows systems.
// Returns time, in seconds, from some arbitrary starting point. On unixes,
// gettimeofday is used. The granularity is microseconds.
CERES_NO_EXPORT double WallTimeInSeconds();
// Log a series of events, recording for each event the time elapsed
@@ -84,8 +81,7 @@ class CERES_NO_EXPORT EventLogger {
std::string events_;
};
} // namespace internal
} // namespace ceres
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"
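Typical use of the EventLogger declared here, for orientation. This is a hypothetical call site, not taken from the diff; only the constructor and AddEvent signatures are as shown above.

#include "ceres/wall_time.h"

void SolveProblem() {
  ceres::internal::EventLogger logger("SolveProblem");
  // ... build the program ...
  logger.AddEvent("Setup");     // records delta since construction
  // ... run the minimizer ...
  logger.AddEvent("Minimize");  // records delta since "Setup" plus cumulative
}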