en/latest/bld__mult_8hpp_source.html

 #pragma once

 #include "alloc_model/alloc_model_params.hpp"
 #include "bld/util_details.hpp"
 #include "bld_mult_funcs.hpp"
 #include "defs.hpp"
 #include "util/util.hpp"
 #include <gsl/gsl_sf_psi.h>

 #ifdef USE_OPENMP
 #include <thread>
 #endif

 #ifdef USE_CUDA
 #include "bld_mult_cuda_funcs.hpp"
 #include "cuda/memory.hpp"
 #include "cuda/tensor_ops.hpp"
 #endif

 namespace bnmf_algs {
 namespace bld {
 /**
  * @brief Compute the tensor S, the solution of the BLD problem, from matrix X
  * using multiplicative update rules.
  *
  * Starting from details::bld_mult::init_S(X, z), the function runs exactly
  * max_iter update rounds (there is no convergence test). Each round
  * recomputes the three axis sums of S, the alpha/beta dependent terms, the
  * positive and negative gradient parts and the normalisation multipliers,
  * and finally updates S in place.
  *
  * When USE_OPENMP is defined the axis sums of S are evaluated on an Eigen
  * thread pool device; otherwise they are evaluated on the calling thread.
  *
  * @tparam T Element type of the input matrix and of the output tensor.
  * @tparam Scalar Element type of the allocation model parameters.
  *
  * @param X Input matrix with x rows and y columns.
  * @param z Size of the third axis of the output tensor.
  * @param model_params Allocation model parameters; alpha and beta are
  * derived from them through details::bld_mult::init_alpha_beta.
  * @param max_iter Number of update rounds to run.
  * @param use_psi_appr If true, util::psi_appr<T> is used as the psi function;
  * otherwise details::gsl_psi_wrapper<T> is used.
  * @param eps Small constant forwarded to details::bld_mult::X_reciprocal and
  * details::bld_mult::update_S (presumably a guard against division by zero;
  * confirm in bld_mult_funcs.hpp).
  *
  * @return Tensor S of shape x × y × z.
  *
  * @remark Parameters are validated by details::check_bld_params, which
  * raises an assertion error on invalid input.
  */
 template <typename T, typename Scalar>
 tensor_t<T, 3> bld_mult(const matrix_t<T>& X, const size_t z,
                         const alloc_model::Params<Scalar>& model_params,
                         size_t max_iter = 1000, bool use_psi_appr = false,
                         double eps = 1e-50) {
     details::check_bld_params(X, z, model_params);

     const auto x = static_cast<size_t>(X.rows());
     const auto y = static_cast<size_t>(X.cols());

     // computation variables
     tensor_t<T, 3> S = details::bld_mult::init_S(X, z);
     const matrix_t<T> X_reciprocal = details::bld_mult::X_reciprocal(X, eps);

     // work buffers, allocated once and overwritten in every iteration
     tensor_t<T, 3> grad_plus(x, y, z);
     matrix_t<T> nom_mult(x, y);
     matrix_t<T> denom_mult(x, y);
     matrix_t<T> grad_minus(x, z);
     matrix_t<T> alpha_eph(x, z);
     matrix_t<T> beta_eph(y, z);
     tensor_t<T, 2> S_pjk(y, z); // S summed over axis 0
     tensor_t<T, 2> S_ipk(x, z); // S summed over axis 1
     tensor_t<T, 2> S_ijp(x, y); // S summed over axis 2
     vector_t<T> alpha;
     vector_t<T> beta;

     std::tie(alpha, beta) = details::bld_mult::init_alpha_beta(model_params);

     // initialize threads
 #ifdef USE_OPENMP
     // hardware_concurrency() is allowed to return 0 when the number of
     // hardware threads cannot be determined; fall back to a single worker
     // instead of building an empty thread pool.
     const unsigned int hw_threads = std::thread::hardware_concurrency();
     const int num_threads =
         hw_threads == 0 ? 1 : static_cast<int>(hw_threads);
     Eigen::ThreadPool tp(num_threads);
     Eigen::ThreadPoolDevice thread_dev(&tp, num_threads);
 #endif

     // which psi function to use
     const auto psi_fn =
         use_psi_appr ? util::psi_appr<T> : details::gsl_psi_wrapper<T>;

     // iterations
     for (size_t eph = 0; eph < max_iter; ++eph) {
         // update S_pjk, S_ipk, S_ijp
 #ifdef USE_OPENMP
         S_pjk.device(thread_dev) = S.sum(shape<1>({0}));
         S_ipk.device(thread_dev) = S.sum(shape<1>({1}));
         S_ijp.device(thread_dev) = S.sum(shape<1>({2}));
 #else
         S_pjk = S.sum(shape<1>({0}));
         S_ipk = S.sum(shape<1>({1}));
         S_ijp = S.sum(shape<1>({2}));
 #endif

         details::bld_mult::update_alpha_eph(S_ipk, alpha, alpha_eph);
         details::bld_mult::update_beta_eph(S_pjk, beta, beta_eph);

         details::bld_mult::update_grad_plus(S, beta_eph, psi_fn, grad_plus);
         details::bld_mult::update_grad_minus(alpha_eph, psi_fn, grad_minus);

         details::bld_mult::update_nom_mult(X_reciprocal, grad_minus, S,
                                            nom_mult);
         details::bld_mult::update_denom_mult(X_reciprocal, grad_plus, S,
                                              denom_mult);
         // S is updated last: every quantity above was computed from the S of
         // the previous round.
         details::bld_mult::update_S(X, nom_mult, denom_mult, grad_minus,
                                     grad_plus, S_ijp, S, eps);
     }

     return S;
 }

 #ifdef USE_CUDA

 /**
  * @brief Compute the tensor S, the solution of the BLD problem, from matrix X
  * using multiplicative update rules, running the tensor-sized updates with
  * CUDA.
  *
  * S, grad_plus, the nominator/denominator multipliers and the axis sums of S
  * are kept in device memory for the whole run. In every one of the max_iter
  * rounds (there is no convergence test) two axis sums are copied back to the
  * host so that beta_eph, alpha_eph and grad_minus can be computed on the
  * CPU; beta_eph and grad_minus are then copied to the device for the
  * remaining updates. S is copied back to the host once, after the last
  * round.
  *
  * @tparam T Element type of the input matrix and of the output tensor.
  * @tparam Scalar Element type of the allocation model parameters.
  *
  * @param X Input matrix with x rows and y columns.
  * @param z Size of the third axis of the output tensor.
  * @param model_params Allocation model parameters; alpha and beta are
  * derived from them through details::bld_mult::init_alpha_beta.
  * @param max_iter Number of update rounds to run.
  * @param use_psi_appr If true, util::psi_appr<T> is used as the psi function;
  * otherwise details::gsl_psi_wrapper<T> is used.
  * @param eps Small constant forwarded to details::bld_mult::X_reciprocal
  * (presumably a guard against division by zero; confirm in
  * bld_mult_funcs.hpp). Unlike bld_mult, it is not passed to the S update.
  *
  * @return Tensor S of shape x × y × z.
  *
  * @remark Parameters are validated by details::check_bld_params, which
  * raises an assertion error on invalid input.
  */
 template <typename T, typename Scalar>
 tensor_t<T, 3> bld_mult_cuda(const matrix_t<T>& X, const size_t z,
                              const alloc_model::Params<Scalar>& model_params,
                              size_t max_iter = 1000, bool use_psi_appr = false,
                              double eps = 1e-50) {
     details::check_bld_params(X, z, model_params);

     const auto x = static_cast<size_t>(X.rows());
     const auto y = static_cast<size_t>(X.cols());

     // initial S
     tensor_t<T, 3> S = details::bld_mult::init_S(X, z);

     // X_reciprocal(i, j) = 1 / X(i, j)
     const matrix_t<T> X_reciprocal = details::bld_mult::X_reciprocal(X, eps);

     // host-side work buffers; the sum of S over its last axis is only ever
     // needed on the device, so it has no host counterpart
     matrix_t<T> grad_minus(x, z);
     matrix_t<T> alpha_eph(x, z);
     matrix_t<T> beta_eph(y, z);
     tensor_t<T, 2> S_pjk(y, z); // S summed over axis 0
     tensor_t<T, 2> S_ipk(x, z); // S summed over axis 1
     vector_t<T> alpha;
     vector_t<T> beta;

     std::tie(alpha, beta) = details::bld_mult::init_alpha_beta(model_params);

     // host memory wrappers to be used during copying between main memory and
     // GPU
     cuda::HostMemory3D<T> S_host(S.data(), x, y, z);
     cuda::HostMemory2D<T> S_pjk_host(S_pjk.data(), y, z);
     cuda::HostMemory2D<T> S_ipk_host(S_ipk.data(), x, z);
     cuda::HostMemory2D<T> beta_eph_host(beta_eph.data(), y, z);
     cuda::HostMemory2D<T> grad_minus_host(grad_minus.data(), x, z);

     // allocate memory on GPU
     cuda::DeviceMemory2D<T> X_device(x, y);
     cuda::DeviceMemory2D<T> X_reciprocal_device(x, y);
     cuda::DeviceMemory3D<T> S_device(x, y, z);
     // sums of S over axis 0, axis 1 and axis 2, in that order
     std::array<cuda::DeviceMemory2D<T>, 3> device_sums = {
         cuda::DeviceMemory2D<T>(y, z), cuda::DeviceMemory2D<T>(x, z),
         cuda::DeviceMemory2D<T>(x, y)};
     cuda::DeviceMemory3D<T> grad_plus_device(x, y, z);
     cuda::DeviceMemory2D<T> beta_eph_device(y, z);
     cuda::DeviceMemory2D<T> nom_device(x, y);
     cuda::DeviceMemory2D<T> denom_device(x, y);
     cuda::DeviceMemory2D<T> grad_minus_device(x, z);

     // send initial variables to GPU
     cuda::copy3D(S_device, S_host);
     {
         // X and X_reciprocal will be sent only once to device
         cuda::HostMemory2D<const T> X_host(X.data(), x, y);
         cuda::HostMemory2D<const T> X_reciprocal_host(X_reciprocal.data(), x,
                                                       y);
         cuda::copy2D(X_device, X_host);
         cuda::copy2D(X_reciprocal_device, X_reciprocal_host);
     }

     // which psi function to use; the T-typed GSL wrapper (the same one
     // bld_mult uses) keeps both branches of the conditional at one function
     // type, which the raw double-only gsl_sf_psi does not when T != double
     const auto psi_fn =
         use_psi_appr ? util::psi_appr<T> : details::gsl_psi_wrapper<T>;

     // iterations
     for (size_t eph = 0; eph < max_iter; ++eph) {
         // update S_pjk, S_ipk, S_ijp
         cuda::tensor_sums(S_device, device_sums);
         cuda::copy2D(S_pjk_host, device_sums[0]);
         cuda::copy2D(S_ipk_host, device_sums[1]);

         details::bld_mult::update_beta_eph(S_pjk, beta, beta_eph);

         // update grad_plus using CUDA
         cuda::copy2D(beta_eph_device, beta_eph_host);
         details::bld_mult::update_grad_plus_cuda(S_device, beta_eph_device,
                                                  grad_plus_device);
         // update denom using CUDA
         details::bld_mult::update_denom_cuda(
             X_reciprocal_device, grad_plus_device, S_device, denom_device);

         // above two cuda calls are asynchronous; immediately start working on
         // the CPU

         details::bld_mult::update_alpha_eph(S_ipk, alpha, alpha_eph);
         details::bld_mult::update_grad_minus(alpha_eph, psi_fn, grad_minus);

         // copy synchronizes
         cuda::copy2D(grad_minus_device, grad_minus_host);
         details::bld_mult::update_nom_cuda(
             X_reciprocal_device, grad_minus_device, S_device, nom_device);

         // update S using CUDA
         const auto& S_ijp_device = device_sums[2];
         details::bld_mult::update_S_cuda(X_device, nom_device, denom_device,
                                          grad_minus_device, grad_plus_device,
                                          S_ijp_device, S_device);
     }
     // S_host wraps S.data(), so this copy fills the tensor that is returned
     cuda::copy3D(S_host, S_device);
     return S;
 }

 #endif
 } // namespace bld
 } // namespace bnmf_algs
bnmf_algs::alloc_model::Params
Structure to hold the parameters for the Allocation Model.
Definition: alloc_model_params.hpp:25

bnmf_algs::cuda::DeviceMemory3D
A wrapper template class around 3D row-major pitched memory stored in device memory (GPU memory)...
Definition: device_memory_3d.hpp:30

bnmf_algs::details::bld_mult::update_denom_mult
void update_denom_mult(const matrix_t< T > &X_reciprocal, const tensor_t< T, 3 > &grad_plus, const tensor_t< T, 3 > &S, matrix_t< T > &denom_mult)
Update denom_mult matrix used in bld_mult.
Definition: bld_mult_funcs.hpp:272

bnmf_algs::details::bld_mult::update_nom_cuda
void update_nom_cuda(const cuda::DeviceMemory2D< Real > &X_reciprocal, const cuda::DeviceMemory2D< Real > &grad_minus, const cuda::DeviceMemory3D< Real > &S, cuda::DeviceMemory2D< Real > &nom)
Perform nom_mult update employed in bld_mult algorithm using CUDA.

bnmf_algs::details::bld_mult::update_grad_minus
void update_grad_minus(const matrix_t< T > &alpha_eph, PsiFunction psi_fn, matrix_t< T > &grad_minus)
Update grad_minus matrix used in bld_mult.
Definition: bld_mult_funcs.hpp:205

bnmf_algs::cuda::tensor_sums
void tensor_sums(const DeviceMemory3D< T > &tensor, std::array< DeviceMemory2D< T >, 3 > &result_arr)
Sum the given 3D input tensor along each of its axes and return all 2D sum tensors.

bnmf_algs::details::bld_mult::update_grad_plus
void update_grad_plus(const tensor_t< T, 3 > &S, const matrix_t< T > &beta_eph, PsiFunction psi_fn, tensor_t< T, 3 > &grad_plus)
Update grad_plus tensor used in bld_mult.
Definition: bld_mult_funcs.hpp:177

std::tie
T tie(T...args)

bnmf_algs::cuda::HostMemory3D
A wrapper template class around a row-major 3D tensor stored in main memory (host memory)...
Definition: host_memory_3d.hpp:37

bnmf_algs::details::bld_mult::update_alpha_eph
void update_alpha_eph(const tensor_t< T, 2 > &S_ipk, const vector_t< T > &alpha, matrix_t< T > &alpha_eph)
Update alpha_eph matrix used in bld_mult.
Definition: bld_mult_funcs.hpp:130

bnmf_algs::details::bld_mult::X_reciprocal
matrix_t< T > X_reciprocal(const matrix_t< T > &X, double eps)
Compute the element-wise reciprocal $1 / X_{ij}$ of the input matrix $X$.
Definition: bld_mult_funcs.hpp:87

bnmf_algs::tensor_t
Eigen::Tensor< Scalar, N, Eigen::RowMajor > tensor_t
Tensor type used in the computations.
Definition: defs.hpp:52

bnmf_algs::matrix_t
Eigen::Matrix< Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor > matrix_t
Matrix type used in the computations.
Definition: defs.hpp:41

bld_mult_funcs.hpp

bld_mult_cuda_funcs.hpp

bnmf_algs::bld::bld_mult
tensor_t< T, 3 > bld_mult(const matrix_t< T > &X, const size_t z, const alloc_model::Params< Scalar > &model_params, size_t max_iter=1000, bool use_psi_appr=false, double eps=1e-50)
Compute tensor $S$, the solution of the BLD problem, from matrix $X$ using multiplicative update rules...
Definition: bld_mult.hpp:83

bnmf_algs::details::bld_mult::update_denom_cuda
void update_denom_cuda(const cuda::DeviceMemory2D< Real > &X_reciprocal, const cuda::DeviceMemory3D< Real > &grad_plus, const cuda::DeviceMemory3D< Real > &S, cuda::DeviceMemory2D< Real > &denom)
Perform denom update employed in bld_mult algorithm using CUDA.

std::thread::hardware_concurrency
T hardware_concurrency(T...args)

bnmf_algs::shape
Eigen::array< size_t, N > shape
Shape of vectors, matrices, tensors, etc.
Definition: defs.hpp:66

bnmf_algs::details::bld_mult::init_S
tensor_t< T, 3 > init_S(const matrix_t< T > &X, size_t z)
Initialize S tensor using a Dirichlet sample of size z with all concentration parameters set to 1...
Definition: bld_mult_funcs.hpp:36

bnmf_algs::details::bld_mult::update_grad_plus_cuda
void update_grad_plus_cuda(const cuda::DeviceMemory3D< Real > &S, const cuda::DeviceMemory2D< Real > &beta_eph, cuda::DeviceMemory3D< Real > &grad_plus)
Perform grad_plus update employed in bld_mult algorithm using CUDA.

bnmf_algs::details::bld_mult::update_S
void update_S(const matrix_t< T > &X, const matrix_t< T > &nom, const matrix_t< T > &denom, const matrix_t< T > &grad_minus, const tensor_t< T, 3 > &grad_plus, const tensor_t< T, 2 > &S_ijp, tensor_t< T, 3 > &S, double eps)
Update S tensor (output of bld_mult algorithm).
Definition: bld_mult_funcs.hpp:307

alloc_model_params.hpp

util.hpp

bnmf_algs::cuda::copy3D
void copy3D(DstPitchedMemory3D &destination, const SrcPitchedMemory3D &source)
Copy a contiguous 3D pitched memory from a host/device memory to a host/device memory using CUDA func...
Definition: copy.hpp:169

util_details.hpp

bnmf_algs::cuda::DeviceMemory2D
A wrapper template class around 2D row-major pitched memory stored in device memory (GPU memory)...
Definition: device_memory_2d.hpp:30

bnmf_algs::details::bld_mult::update_beta_eph
void update_beta_eph(const tensor_t< T, 2 > &S_pjk, const vector_t< T > &beta, matrix_t< T > &beta_eph)
Update beta_eph matrix used in bld_mult.
Definition: bld_mult_funcs.hpp:152

bnmf_algs::cuda::HostMemory2D
A wrapper template class around a row-major matrix type stored in main memory (host memory)...
Definition: host_memory_2d.hpp:37

bnmf_algs::details::bld_mult::init_alpha_beta
std::pair< vector_t< Scalar >, vector_t< Scalar > > init_alpha_beta(const alloc_model::Params< Scalar > &params)
Initialize alpha and beta vectors used in bld_mult.
Definition: bld_mult_funcs.hpp:112

defs.hpp

bnmf_algs::cuda::copy2D
void copy2D(DstPitchedMemory2D &destination, const SrcPitchedMemory2D &source)
Copy a contiguous 2D pitched memory from a host/device memory to a host/device memory using CUDA func...
Definition: copy.hpp:131

bnmf_algs::details::bld_mult::update_nom_mult
void update_nom_mult(const matrix_t< T > &X_reciprocal, const matrix_t< T > &grad_minus, const tensor_t< T, 3 > &S, matrix_t< T > &nom_mult)
Update nom_mult matrix used in bld_mult.
Definition: bld_mult_funcs.hpp:242

bnmf_algs::details::check_bld_params
void check_bld_params(const matrix_t< T > &X, size_t z, const alloc_model::Params< Scalar > &model_params)
Do parameter checks on BLD computing function parameters and throw an assertion error if the paramete...
Definition: util_details.hpp:20

std::array

bnmf_algs::details::bld_mult::update_S_cuda
void update_S_cuda(const cuda::DeviceMemory2D< Real > &X, const cuda::DeviceMemory2D< Real > &nom, const cuda::DeviceMemory2D< Real > &denom, const cuda::DeviceMemory2D< Real > &grad_minus, const cuda::DeviceMemory3D< Real > &grad_plus, const cuda::DeviceMemory2D< Real > &S_ijp, cuda::DeviceMemory3D< Real > &S)
Perform S update employed in bld_mult algorithm using CUDA.

bnmf_algs::vector_t
Eigen::Matrix< Scalar, 1, Eigen::Dynamic, Eigen::RowMajor > vector_t
Vector type used in the computations.
Definition: defs.hpp:27

bnmf_algs
Main namespace for bnmf-algs library.
Definition: alloc_model_funcs.hpp:12

bnmf_algs::bld::bld_mult_cuda
tensor_t< T, 3 > bld_mult_cuda(const matrix_t< T > &X, const size_t z, const alloc_model::Params< Scalar > &model_params, size_t max_iter=1000, bool use_psi_appr=false, double eps=1e-50)
Compute tensor $S$, the solution of the BLD problem, from matrix $X$ using multiplicative update rules...
Definition: bld_mult.hpp:226

memory.hpp

tensor_ops.hpp