Brings a lot of performance improvements and bug fixes. Keyframe selection in bundle-adjustment.blend goes down from 4.5 seconds to 3.0 on M2 Ultra. The reconstruction itself stays within 0.2 seconds. Full change log can be found at http://ceres-solver.org/version_history.html Pull Request: https://projects.blender.org/blender/blender/pulls/136896
457 lines
16 KiB
C++
457 lines
16 KiB
C++
// Ceres Solver - A fast non-linear least squares minimizer
|
|
// Copyright 2023 Google Inc. All rights reserved.
|
|
// http://ceres-solver.org/
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions are met:
|
|
//
|
|
// * Redistributions of source code must retain the above copyright notice,
|
|
// this list of conditions and the following disclaimer.
|
|
// * Redistributions in binary form must reproduce the above copyright notice,
|
|
// this list of conditions and the following disclaimer in the documentation
|
|
// and/or other materials provided with the distribution.
|
|
// * Neither the name of Google Inc. nor the names of its contributors may be
|
|
// used to endorse or promote products derived from this software without
|
|
// specific prior written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
// POSSIBILITY OF SUCH DAMAGE.
|
|
//
|
|
// Author: sameeragarwal@google.com (Sameer Agarwal)
|
|
|
|
#include "ceres/dense_qr.h"
|
|
|
|
#include <algorithm>
|
|
#include <memory>
|
|
#include <string>
|
|
|
|
#ifndef CERES_NO_CUDA
|
|
#include "ceres/context_impl.h"
|
|
#include "cublas_v2.h"
|
|
#include "cusolverDn.h"
|
|
#endif // CERES_NO_CUDA
|
|
|
|
#ifndef CERES_NO_LAPACK
|
|
|
|
// LAPACK routines for solving a linear least squares problem using QR
|
|
// factorization. This is done in three stages:
|
|
//
|
|
// A * x = b
|
|
// Q * R * x = b (dgeqrf)
|
|
// R * x = Q' * b (dormqr)
|
|
// x = R^{-1} * Q'* b (dtrtrs)
|
|
|
|
// clang-format off
|
|
|
|
// Compute the QR factorization of a.
|
|
//
|
|
// a is an m x n column major matrix (Denoted by "A" in the above description)
|
|
// lda is the leading dimension of a. lda >= max(1, num_rows)
|
|
// tau is an array of size min(m,n). It contains the scalar factors of the
|
|
// elementary reflectors.
|
|
// work is an array of size max(1,lwork). On exit, if info=0, work[0] contains
|
|
// the optimal size of work.
|
|
//
|
|
// if lwork >= 1 it is the size of work. If lwork = -1, then a workspace query is assumed.
|
|
// dgeqrf computes the optimal size of the work array and returns it as work[0].
|
|
//
|
|
// info = 0, successful exit.
|
|
// info < 0, if info = -i, then the i^th argument had illegal value.
|
|
extern "C" void dgeqrf_(const int* m, const int* n, double* a, const int* lda,
|
|
double* tau, double* work, const int* lwork, int* info);
|
|
|
|
// Apply Q or Q' to b.
|
|
//
|
|
// b is a m times n column major matrix.
|
|
// size = 'L' applies Q or Q' on the left, size = 'R' applies Q or Q' on the right.
|
|
// trans = 'N', applies Q, trans = 'T', applies Q'.
|
|
// k is the number of elementary reflectors whose product defines the matrix Q.
|
|
// If size = 'L', m >= k >= 0 and if side = 'R', n >= k >= 0.
|
|
// a is an lda x k column major matrix containing the reflectors as returned by dgeqrf.
|
|
// ldb is the leading dimension of b.
|
|
// work is an array of size max(1, lwork)
|
|
// lwork if positive is the size of work. If lwork = -1, then a
|
|
// workspace query is assumed.
|
|
//
|
|
// info = 0, successful exit.
|
|
// info < 0, if info = -i, then the i^th argument had illegal value.
|
|
extern "C" void dormqr_(const char* side, const char* trans, const int* m,
|
|
const int* n ,const int* k, double* a, const int* lda,
|
|
double* tau, double* b, const int* ldb, double* work,
|
|
const int* lwork, int* info);
|
|
|
|
// Solve a triangular system of the form A * x = b
|
|
//
|
|
// uplo = 'U', A is upper triangular. uplo = 'L' is lower triangular.
|
|
// trans = 'N', 'T', 'C' specifies the form - A, A^T, A^H.
|
|
// DIAG = 'N', A is not unit triangular. 'U' is unit triangular.
|
|
// n is the order of the matrix A.
|
|
// nrhs number of columns of b.
|
|
// a is a column major lda x n.
|
|
// b is a column major matrix of ldb x nrhs
|
|
//
|
|
// info = 0 successful.
|
|
// = -i < 0 i^th argument is an illegal value.
|
|
// = i > 0, i^th diagonal element of A is zero.
|
|
extern "C" void dtrtrs_(const char* uplo, const char* trans, const char* diag,
|
|
const int* n, const int* nrhs, double* a, const int* lda,
|
|
double* b, const int* ldb, int* info);
|
|
// clang-format on
|
|
|
|
#endif
|
|
|
|
namespace ceres::internal {
|
|
|
|
DenseQR::~DenseQR() = default;
|
|
|
|
std::unique_ptr<DenseQR> DenseQR::Create(const LinearSolver::Options& options) {
|
|
std::unique_ptr<DenseQR> dense_qr;
|
|
|
|
switch (options.dense_linear_algebra_library_type) {
|
|
case EIGEN:
|
|
dense_qr = std::make_unique<EigenDenseQR>();
|
|
break;
|
|
|
|
case LAPACK:
|
|
#ifndef CERES_NO_LAPACK
|
|
dense_qr = std::make_unique<LAPACKDenseQR>();
|
|
break;
|
|
#else
|
|
LOG(FATAL) << "Ceres was compiled without support for LAPACK.";
|
|
#endif
|
|
|
|
case CUDA:
|
|
#ifndef CERES_NO_CUDA
|
|
dense_qr = CUDADenseQR::Create(options);
|
|
break;
|
|
#else
|
|
LOG(FATAL) << "Ceres was compiled without support for CUDA.";
|
|
#endif
|
|
|
|
default:
|
|
LOG(FATAL) << "Unknown dense linear algebra library type : "
|
|
<< DenseLinearAlgebraLibraryTypeToString(
|
|
options.dense_linear_algebra_library_type);
|
|
}
|
|
return dense_qr;
|
|
}
|
|
|
|
LinearSolverTerminationType DenseQR::FactorAndSolve(int num_rows,
|
|
int num_cols,
|
|
double* lhs,
|
|
const double* rhs,
|
|
double* solution,
|
|
std::string* message) {
|
|
LinearSolverTerminationType termination_type =
|
|
Factorize(num_rows, num_cols, lhs, message);
|
|
if (termination_type == LinearSolverTerminationType::SUCCESS) {
|
|
termination_type = Solve(rhs, solution, message);
|
|
}
|
|
return termination_type;
|
|
}
|
|
|
|
LinearSolverTerminationType EigenDenseQR::Factorize(int num_rows,
|
|
int num_cols,
|
|
double* lhs,
|
|
std::string* message) {
|
|
Eigen::Map<ColMajorMatrix> m(lhs, num_rows, num_cols);
|
|
qr_ = std::make_unique<QRType>(m);
|
|
*message = "Success.";
|
|
return LinearSolverTerminationType::SUCCESS;
|
|
}
|
|
|
|
LinearSolverTerminationType EigenDenseQR::Solve(const double* rhs,
|
|
double* solution,
|
|
std::string* message) {
|
|
VectorRef(solution, qr_->cols()) =
|
|
qr_->solve(ConstVectorRef(rhs, qr_->rows()));
|
|
*message = "Success.";
|
|
return LinearSolverTerminationType::SUCCESS;
|
|
}
|
|
|
|
#ifndef CERES_NO_LAPACK
|
|
LinearSolverTerminationType LAPACKDenseQR::Factorize(int num_rows,
|
|
int num_cols,
|
|
double* lhs,
|
|
std::string* message) {
|
|
int lwork = -1;
|
|
double work_size;
|
|
int info = 0;
|
|
|
|
// Compute the size of the temporary workspace needed to compute the QR
|
|
// factorization in the dgeqrf call below.
|
|
dgeqrf_(&num_rows,
|
|
&num_cols,
|
|
lhs_,
|
|
&num_rows,
|
|
tau_.data(),
|
|
&work_size,
|
|
&lwork,
|
|
&info);
|
|
if (info < 0) {
|
|
LOG(FATAL) << "Congratulations, you found a bug in Ceres."
|
|
<< "Please report it."
|
|
<< "LAPACK::dgels fatal error."
|
|
<< "Argument: " << -info << " is invalid.";
|
|
}
|
|
|
|
lhs_ = lhs;
|
|
num_rows_ = num_rows;
|
|
num_cols_ = num_cols;
|
|
|
|
lwork = static_cast<int>(work_size);
|
|
|
|
if (work_.size() < lwork) {
|
|
work_.resize(lwork);
|
|
}
|
|
if (tau_.size() < num_cols) {
|
|
tau_.resize(num_cols);
|
|
}
|
|
|
|
if (q_transpose_rhs_.size() < num_rows) {
|
|
q_transpose_rhs_.resize(num_rows);
|
|
}
|
|
|
|
// Factorize the lhs_ using the workspace that we just constructed above.
|
|
dgeqrf_(&num_rows,
|
|
&num_cols,
|
|
lhs_,
|
|
&num_rows,
|
|
tau_.data(),
|
|
work_.data(),
|
|
&lwork,
|
|
&info);
|
|
|
|
if (info < 0) {
|
|
LOG(FATAL) << "Congratulations, you found a bug in Ceres."
|
|
<< "Please report it. dgeqrf fatal error."
|
|
<< "Argument: " << -info << " is invalid.";
|
|
}
|
|
|
|
termination_type_ = LinearSolverTerminationType::SUCCESS;
|
|
*message = "Success.";
|
|
return termination_type_;
|
|
}
|
|
|
|
LinearSolverTerminationType LAPACKDenseQR::Solve(const double* rhs,
|
|
double* solution,
|
|
std::string* message) {
|
|
if (termination_type_ != LinearSolverTerminationType::SUCCESS) {
|
|
*message = "QR factorization failed and solve called.";
|
|
return termination_type_;
|
|
}
|
|
|
|
std::copy_n(rhs, num_rows_, q_transpose_rhs_.data());
|
|
|
|
const char side = 'L';
|
|
char trans = 'T';
|
|
const int num_c_cols = 1;
|
|
const int lwork = work_.size();
|
|
int info = 0;
|
|
dormqr_(&side,
|
|
&trans,
|
|
&num_rows_,
|
|
&num_c_cols,
|
|
&num_cols_,
|
|
lhs_,
|
|
&num_rows_,
|
|
tau_.data(),
|
|
q_transpose_rhs_.data(),
|
|
&num_rows_,
|
|
work_.data(),
|
|
&lwork,
|
|
&info);
|
|
if (info < 0) {
|
|
LOG(FATAL) << "Congratulations, you found a bug in Ceres."
|
|
<< "Please report it. dormr fatal error."
|
|
<< "Argument: " << -info << " is invalid.";
|
|
}
|
|
|
|
const char uplo = 'U';
|
|
trans = 'N';
|
|
const char diag = 'N';
|
|
dtrtrs_(&uplo,
|
|
&trans,
|
|
&diag,
|
|
&num_cols_,
|
|
&num_c_cols,
|
|
lhs_,
|
|
&num_rows_,
|
|
q_transpose_rhs_.data(),
|
|
&num_rows_,
|
|
&info);
|
|
|
|
if (info < 0) {
|
|
LOG(FATAL) << "Congratulations, you found a bug in Ceres."
|
|
<< "Please report it. dormr fatal error."
|
|
<< "Argument: " << -info << " is invalid.";
|
|
} else if (info > 0) {
|
|
*message =
|
|
"QR factorization failure. The factorization is not full rank. R has "
|
|
"zeros on the diagonal.";
|
|
termination_type_ = LinearSolverTerminationType::FAILURE;
|
|
} else {
|
|
std::copy_n(q_transpose_rhs_.data(), num_cols_, solution);
|
|
termination_type_ = LinearSolverTerminationType::SUCCESS;
|
|
}
|
|
|
|
return termination_type_;
|
|
}
|
|
|
|
#endif // CERES_NO_LAPACK
|
|
|
|
#ifndef CERES_NO_CUDA
|
|
|
|
CUDADenseQR::CUDADenseQR(ContextImpl* context)
|
|
: context_(context),
|
|
lhs_{context},
|
|
rhs_{context},
|
|
tau_{context},
|
|
device_workspace_{context},
|
|
error_(context, 1) {}
|
|
|
|
LinearSolverTerminationType CUDADenseQR::Factorize(int num_rows,
|
|
int num_cols,
|
|
double* lhs,
|
|
std::string* message) {
|
|
factorize_result_ = LinearSolverTerminationType::FATAL_ERROR;
|
|
lhs_.Reserve(num_rows * num_cols);
|
|
tau_.Reserve(std::min(num_rows, num_cols));
|
|
num_rows_ = num_rows;
|
|
num_cols_ = num_cols;
|
|
lhs_.CopyFromCpu(lhs, num_rows * num_cols);
|
|
int device_workspace_size = 0;
|
|
if (cusolverDnDgeqrf_bufferSize(context_->cusolver_handle_,
|
|
num_rows,
|
|
num_cols,
|
|
lhs_.data(),
|
|
num_rows,
|
|
&device_workspace_size) !=
|
|
CUSOLVER_STATUS_SUCCESS) {
|
|
*message = "cuSolverDN::cusolverDnDgeqrf_bufferSize failed.";
|
|
return LinearSolverTerminationType::FATAL_ERROR;
|
|
}
|
|
device_workspace_.Reserve(device_workspace_size);
|
|
if (cusolverDnDgeqrf(context_->cusolver_handle_,
|
|
num_rows,
|
|
num_cols,
|
|
lhs_.data(),
|
|
num_rows,
|
|
tau_.data(),
|
|
reinterpret_cast<double*>(device_workspace_.data()),
|
|
device_workspace_.size(),
|
|
error_.data()) != CUSOLVER_STATUS_SUCCESS) {
|
|
*message = "cuSolverDN::cusolverDnDgeqrf failed.";
|
|
return LinearSolverTerminationType::FATAL_ERROR;
|
|
}
|
|
int error = 0;
|
|
error_.CopyToCpu(&error, 1);
|
|
if (error < 0) {
|
|
LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
|
|
<< "please report it. "
|
|
<< "cuSolverDN::cusolverDnDgeqrf fatal error. "
|
|
<< "Argument: " << -error << " is invalid.";
|
|
// The following line is unreachable, but return failure just to be
|
|
// pedantic, since the compiler does not know that.
|
|
return LinearSolverTerminationType::FATAL_ERROR;
|
|
}
|
|
|
|
*message = "Success";
|
|
factorize_result_ = LinearSolverTerminationType::SUCCESS;
|
|
return LinearSolverTerminationType::SUCCESS;
|
|
}
|
|
|
|
LinearSolverTerminationType CUDADenseQR::Solve(const double* rhs,
|
|
double* solution,
|
|
std::string* message) {
|
|
if (factorize_result_ != LinearSolverTerminationType::SUCCESS) {
|
|
*message = "Factorize did not complete successfully previously.";
|
|
return factorize_result_;
|
|
}
|
|
rhs_.CopyFromCpu(rhs, num_rows_);
|
|
int device_workspace_size = 0;
|
|
if (cusolverDnDormqr_bufferSize(context_->cusolver_handle_,
|
|
CUBLAS_SIDE_LEFT,
|
|
CUBLAS_OP_T,
|
|
num_rows_,
|
|
1,
|
|
num_cols_,
|
|
lhs_.data(),
|
|
num_rows_,
|
|
tau_.data(),
|
|
rhs_.data(),
|
|
num_rows_,
|
|
&device_workspace_size) !=
|
|
CUSOLVER_STATUS_SUCCESS) {
|
|
*message = "cuSolverDN::cusolverDnDormqr_bufferSize failed.";
|
|
return LinearSolverTerminationType::FATAL_ERROR;
|
|
}
|
|
device_workspace_.Reserve(device_workspace_size);
|
|
// Compute rhs = Q^T * rhs, assuming that lhs has already been factorized.
|
|
// The result of factorization would have stored Q in a packed form in lhs_.
|
|
if (cusolverDnDormqr(context_->cusolver_handle_,
|
|
CUBLAS_SIDE_LEFT,
|
|
CUBLAS_OP_T,
|
|
num_rows_,
|
|
1,
|
|
num_cols_,
|
|
lhs_.data(),
|
|
num_rows_,
|
|
tau_.data(),
|
|
rhs_.data(),
|
|
num_rows_,
|
|
reinterpret_cast<double*>(device_workspace_.data()),
|
|
device_workspace_.size(),
|
|
error_.data()) != CUSOLVER_STATUS_SUCCESS) {
|
|
*message = "cuSolverDN::cusolverDnDormqr failed.";
|
|
return LinearSolverTerminationType::FATAL_ERROR;
|
|
}
|
|
int error = 0;
|
|
error_.CopyToCpu(&error, 1);
|
|
if (error < 0) {
|
|
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
|
|
<< "Please report it."
|
|
<< "cuSolverDN::cusolverDnDormqr fatal error. "
|
|
<< "Argument: " << -error << " is invalid.";
|
|
}
|
|
// Compute the solution vector as x = R \ (Q^T * rhs). Since the previous step
|
|
// replaced rhs by (Q^T * rhs), this is just x = R \ rhs.
|
|
if (cublasDtrsv(context_->cublas_handle_,
|
|
CUBLAS_FILL_MODE_UPPER,
|
|
CUBLAS_OP_N,
|
|
CUBLAS_DIAG_NON_UNIT,
|
|
num_cols_,
|
|
lhs_.data(),
|
|
num_rows_,
|
|
rhs_.data(),
|
|
1) != CUBLAS_STATUS_SUCCESS) {
|
|
*message = "cuBLAS::cublasDtrsv failed.";
|
|
return LinearSolverTerminationType::FATAL_ERROR;
|
|
}
|
|
rhs_.CopyToCpu(solution, num_cols_);
|
|
*message = "Success";
|
|
return LinearSolverTerminationType::SUCCESS;
|
|
}
|
|
|
|
std::unique_ptr<CUDADenseQR> CUDADenseQR::Create(
|
|
const LinearSolver::Options& options) {
|
|
if (options.dense_linear_algebra_library_type != CUDA ||
|
|
options.context == nullptr || !options.context->IsCudaInitialized()) {
|
|
return nullptr;
|
|
}
|
|
return std::unique_ptr<CUDADenseQR>(new CUDADenseQR(options.context));
|
|
}
|
|
|
|
#endif // CERES_NO_CUDA
|
|
|
|
} // namespace ceres::internal
|