bnmf-algs
bld_mult.hpp
Go to the documentation of this file.
1 #pragma once
2 
4 #include "bld/util_details.hpp"
5 #include "bld_mult_funcs.hpp"
6 #include "defs.hpp"
7 #include "util/util.hpp"
8 #include <gsl/gsl_sf_psi.h>
9 
10 #ifdef USE_OPENMP
11 #include <thread>
12 #endif
13 
14 #ifdef USE_CUDA
15 #include "bld_mult_cuda_funcs.hpp"
16 #include "cuda/memory.hpp"
17 #include "cuda/tensor_ops.hpp"
18 #endif
19 
20 namespace bnmf_algs {
21 namespace bld {
82 template <typename T, typename Scalar>
83 tensor_t<T, 3> bld_mult(const matrix_t<T>& X, const size_t z,
84  const alloc_model::Params<Scalar>& model_params,
85  size_t max_iter = 1000, bool use_psi_appr = false,
86  double eps = 1e-50) {
87  details::check_bld_params(X, z, model_params);
88 
89  const auto x = static_cast<size_t>(X.rows());
90  const auto y = static_cast<size_t>(X.cols());
91 
92  // computation variables
95 
96  tensor_t<T, 3> grad_plus(x, y, z);
97  matrix_t<T> nom_mult(x, y);
98  matrix_t<T> denom_mult(x, y);
99  matrix_t<T> grad_minus(x, z);
100  matrix_t<T> alpha_eph(x, z);
101  vector_t<T> alpha_eph_sum(z);
102  matrix_t<T> beta_eph(y, z);
103  tensor_t<T, 2> S_pjk(y, z);
104  tensor_t<T, 2> S_ipk(x, z);
105  tensor_t<T, 2> S_ijp(x, y);
106  vector_t<T> alpha;
107  vector_t<T> beta;
108 
109  std::tie(alpha, beta) = details::bld_mult::init_alpha_beta(model_params);
110 
111  // initialize threads
112 #ifdef USE_OPENMP
113  Eigen::ThreadPool tp(std::thread::hardware_concurrency());
114  Eigen::ThreadPoolDevice thread_dev(&tp,
116 #endif
117 
118  // which psi function to use
119  const auto psi_fn =
120  use_psi_appr ? util::psi_appr<T> : details::gsl_psi_wrapper<T>;
121 
122  // iterations
123  for (size_t eph = 0; eph < max_iter; ++eph) {
124  // update S_pjk, S_ipk, S_ijp
125 #ifdef USE_OPENMP
126  S_pjk.device(thread_dev) = S.sum(shape<1>({0}));
127  S_ipk.device(thread_dev) = S.sum(shape<1>({1}));
128  S_ijp.device(thread_dev) = S.sum(shape<1>({2}));
129 #else
130  S_pjk = S.sum(shape<1>({0}));
131  S_ipk = S.sum(shape<1>({1}));
132  S_ijp = S.sum(shape<1>({2}));
133 #endif
134 
135  details::bld_mult::update_alpha_eph(S_ipk, alpha, alpha_eph);
136  details::bld_mult::update_beta_eph(S_pjk, beta, beta_eph);
137 
138  details::bld_mult::update_grad_plus(S, beta_eph, psi_fn, grad_plus);
139  details::bld_mult::update_grad_minus(alpha_eph, psi_fn, grad_minus);
140 
141  details::bld_mult::update_nom_mult(X_reciprocal, grad_minus, S,
142  nom_mult);
143  details::bld_mult::update_denom_mult(X_reciprocal, grad_plus, S,
144  denom_mult);
145  details::bld_mult::update_S(X, nom_mult, denom_mult, grad_minus,
146  grad_plus, S_ijp, S, eps);
147  }
148 
149  return S;
150 }
151 
152 #ifdef USE_CUDA
153 
225 template <typename T, typename Scalar>
226 tensor_t<T, 3> bld_mult_cuda(const matrix_t<T>& X, const size_t z,
227  const alloc_model::Params<Scalar>& model_params,
228  size_t max_iter = 1000, bool use_psi_appr = false,
229  double eps = 1e-50) {
230  details::check_bld_params(X, z, model_params);
231 
232  const auto x = static_cast<size_t>(X.rows());
233  const auto y = static_cast<size_t>(X.cols());
234 
235  // initial S
237 
238  // X_reciprocal(i, j) = 1 / X(i, j)
240 
241  matrix_t<T> grad_minus(x, z);
242  matrix_t<T> alpha_eph(x, z);
243  vector_t<T> alpha_eph_sum(z);
244  matrix_t<T> beta_eph(y, z);
245  tensor_t<T, 2> S_pjk(y, z);
246  tensor_t<T, 2> S_ipk(x, z);
247  tensor_t<T, 2> S_ijp(x, y);
248  vector_t<T> alpha;
249  vector_t<T> beta;
250 
251  std::tie(alpha, beta) = details::bld_mult::init_alpha_beta(model_params);
252 
253  // host memory wrappers to be used during copying between main memory and
254  // GPU
255  cuda::HostMemory3D<T> S_host(S.data(), x, y, z);
256  cuda::HostMemory2D<T> S_pjk_host(S_pjk.data(), y, z);
257  cuda::HostMemory2D<T> S_ipk_host(S_ipk.data(), x, z);
258  cuda::HostMemory2D<T> beta_eph_host(beta_eph.data(), y, z);
259  cuda::HostMemory2D<T> grad_minus_host(grad_minus.data(), x, z);
260 
261  // allocate memory on GPU
262  cuda::DeviceMemory2D<T> X_device(x, y);
263  cuda::DeviceMemory2D<T> X_reciprocal_device(x, y);
264  cuda::DeviceMemory3D<T> S_device(x, y, z);
265  std::array<cuda::DeviceMemory2D<T>, 3> device_sums = {
268  cuda::DeviceMemory3D<T> grad_plus_device(x, y, z);
269  cuda::DeviceMemory2D<T> beta_eph_device(y, z);
270  cuda::DeviceMemory2D<T> nom_device(x, y);
271  cuda::DeviceMemory2D<T> denom_device(x, y);
272  cuda::DeviceMemory2D<T> grad_minus_device(x, z);
273 
274  // send initial variables to GPU
275  cuda::copy3D(S_device, S_host);
276  {
277  // X and X_reciprocal will be sent only once to device
278  cuda::HostMemory2D<const T> X_host(X.data(), x, y);
279  cuda::HostMemory2D<const T> X_reciprocal_host(X_reciprocal.data(), x,
280  y);
281  cuda::copy2D(X_device, X_host);
282  cuda::copy2D(X_reciprocal_device, X_reciprocal_host);
283  }
284 
285  // which psi function to use
286  const auto psi_fn = use_psi_appr ? util::psi_appr<T> : gsl_sf_psi;
287 
288  // iterations
289  for (size_t eph = 0; eph < max_iter; ++eph) {
290  // update S_pjk, S_ipk, S_ijp
291  cuda::tensor_sums(S_device, device_sums);
292  cuda::copy2D(S_pjk_host, device_sums[0]);
293  cuda::copy2D(S_ipk_host, device_sums[1]);
294 
295  details::bld_mult::update_beta_eph(S_pjk, beta, beta_eph);
296 
297  // update grad_plus using CUDA
298  cuda::copy2D(beta_eph_device, beta_eph_host);
299  details::bld_mult::update_grad_plus_cuda(S_device, beta_eph_device,
300  grad_plus_device);
301  // update denom using CUDA
303  X_reciprocal_device, grad_plus_device, S_device, denom_device);
304 
305  // above two cuda calls are asynchronous; immediately start working on
306  // the CPU
307 
308  details::bld_mult::update_alpha_eph(S_ipk, alpha, alpha_eph);
309  details::bld_mult::update_grad_minus(alpha_eph, psi_fn, grad_minus);
310 
311  // copy synchronizes
312  cuda::copy2D(grad_minus_device, grad_minus_host);
314  X_reciprocal_device, grad_minus_device, S_device, nom_device);
315 
316  // update S using CUDA
317  const auto& S_ijp_device = device_sums[2];
318  details::bld_mult::update_S_cuda(X_device, nom_device, denom_device,
319  grad_minus_device, grad_plus_device,
320  S_ijp_device, S_device);
321  }
322  cuda::copy3D(S_host, S_device);
323  return S;
324 }
325 
326 #endif
327 } // namespace bld
328 } // namespace bnmf_algs
Structure to hold the parameters for the Allocation Model .
Definition: alloc_model_params.hpp:25
A wrapper template class around 3D row-major pitched memory stored in device memory (GPU memory)...
Definition: device_memory_3d.hpp:30
void update_denom_mult(const matrix_t< T > &X_reciprocal, const tensor_t< T, 3 > &grad_plus, const tensor_t< T, 3 > &S, matrix_t< T > &denom_mult)
Update denom_mult matrix used in bld_mult.
Definition: bld_mult_funcs.hpp:272
void update_nom_cuda(const cuda::DeviceMemory2D< Real > &X_reciprocal, const cuda::DeviceMemory2D< Real > &grad_minus, const cuda::DeviceMemory3D< Real > &S, cuda::DeviceMemory2D< Real > &nom)
Perform nom_mult update employed in bld_mult algorithm using CUDA.
void update_grad_minus(const matrix_t< T > &alpha_eph, PsiFunction psi_fn, matrix_t< T > &grad_minus)
Update grad_minus tensor used in bld_mult.
Definition: bld_mult_funcs.hpp:205
void tensor_sums(const DeviceMemory3D< T > &tensor, std::array< DeviceMemory2D< T >, 3 > &result_arr)
Sum the given 3D input tensor along each of its axes and return all 2D sum tensors.
void update_grad_plus(const tensor_t< T, 3 > &S, const matrix_t< T > &beta_eph, PsiFunction psi_fn, tensor_t< T, 3 > &grad_plus)
Update grad_plus tensor used in bld_mult.
Definition: bld_mult_funcs.hpp:177
T tie(T...args)
A wrapper template class around a row-major 3D tensor stored in main memory (host memory)...
Definition: host_memory_3d.hpp:37
void update_alpha_eph(const tensor_t< T, 2 > &S_ipk, const vector_t< T > &alpha, matrix_t< T > &alpha_eph)
Update alpha_eph matrix used in bld_mult.
Definition: bld_mult_funcs.hpp:130
matrix_t< T > X_reciprocal(const matrix_t< T > &X, double eps)
Compute the reciprocal of the input matrix .
Definition: bld_mult_funcs.hpp:87
Eigen::Tensor< Scalar, N, Eigen::RowMajor > tensor_t
Tensor type used in the computations.
Definition: defs.hpp:52
Eigen::Matrix< Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor > matrix_t
Matrix type used in the computations.
Definition: defs.hpp:41
tensor_t< T, 3 > bld_mult(const matrix_t< T > &X, const size_t z, const alloc_model::Params< Scalar > &model_params, size_t max_iter=1000, bool use_psi_appr=false, double eps=1e-50)
Compute tensor , the solution of BLD problem , from matrix using multiplicative update rules...
Definition: bld_mult.hpp:83
void update_denom_cuda(const cuda::DeviceMemory2D< Real > &X_reciprocal, const cuda::DeviceMemory3D< Real > &grad_plus, const cuda::DeviceMemory3D< Real > &S, cuda::DeviceMemory2D< Real > &denom)
Perform denom update employed in bld_mult algorithm using CUDA.
T hardware_concurrency(T...args)
Eigen::array< size_t, N > shape
Shape of vectors, matrices, tensors, etc.
Definition: defs.hpp:66
tensor_t< T, 3 > init_S(const matrix_t< T > &X, size_t z)
Initialize S tensor using a Dirichlet sample of size z with all concentration parameters set to 1...
Definition: bld_mult_funcs.hpp:36
void update_grad_plus_cuda(const cuda::DeviceMemory3D< Real > &S, const cuda::DeviceMemory2D< Real > &beta_eph, cuda::DeviceMemory3D< Real > &grad_plus)
Perform grad_plus update employed in bld_mult algorithm using CUDA.
void update_S(const matrix_t< T > &X, const matrix_t< T > &nom, const matrix_t< T > &denom, const matrix_t< T > &grad_minus, const tensor_t< T, 3 > &grad_plus, const tensor_t< T, 2 > &S_ijp, tensor_t< T, 3 > &S, double eps)
Update S tensor (output of bld_mult algorithm).
Definition: bld_mult_funcs.hpp:307
void copy3D(DstPitchedMemory3D &destination, const SrcPitchedMemory3D &source)
Copy a contiguous 3D pitched memory from a host/device memory to a host/device memory using CUDA func...
Definition: copy.hpp:169
A wrapper template class around 2D row-major pitched memory stored in device memory (GPU memory)...
Definition: device_memory_2d.hpp:30
void update_beta_eph(const tensor_t< T, 2 > &S_pjk, const vector_t< T > &beta, matrix_t< T > &beta_eph)
Update beta_eph matrix used in bld_mult.
Definition: bld_mult_funcs.hpp:152
A wrapper template class around a row-major matrix type stored in main memory (host memory)...
Definition: host_memory_2d.hpp:37
std::pair< vector_t< Scalar >, vector_t< Scalar > > init_alpha_beta(const alloc_model::Params< Scalar > &params)
Initialize alpha and beta vectors used in bld_mult.
Definition: bld_mult_funcs.hpp:112
void copy2D(DstPitchedMemory2D &destination, const SrcPitchedMemory2D &source)
Copy a contiguous 2D pitched memory from a host/device memory to a host/device memory using CUDA func...
Definition: copy.hpp:131
void update_nom_mult(const matrix_t< T > &X_reciprocal, const matrix_t< T > &grad_minus, const tensor_t< T, 3 > &S, matrix_t< T > &nom_mult)
Update nom_mult matrix used in bld_mult.
Definition: bld_mult_funcs.hpp:242
void check_bld_params(const matrix_t< T > &X, size_t z, const alloc_model::Params< Scalar > &model_params)
Do parameter checks on BLD computing function parameters and throw an assertion error if the paramete...
Definition: util_details.hpp:20
void update_S_cuda(const cuda::DeviceMemory2D< Real > &X, const cuda::DeviceMemory2D< Real > &nom, const cuda::DeviceMemory2D< Real > &denom, const cuda::DeviceMemory2D< Real > &grad_minus, const cuda::DeviceMemory3D< Real > &grad_plus, const cuda::DeviceMemory2D< Real > &S_ijp, cuda::DeviceMemory3D< Real > &S)
Perform S update employed in bld_mult algorithm using CUDA.
Eigen::Matrix< Scalar, 1, Eigen::Dynamic, Eigen::RowMajor > vector_t
Vector type used in the computations.
Definition: defs.hpp:27
Main namespace for bnmf-algs library.
Definition: alloc_model_funcs.hpp:12
tensor_t< T, 3 > bld_mult_cuda(const matrix_t< T > &X, const size_t z, const alloc_model::Params< Scalar > &model_params, size_t max_iter=1000, bool use_psi_appr=false, double eps=1e-50)
Compute tensor , the solution of BLD problem , from matrix using multiplicative update rules...
Definition: bld_mult.hpp:226