bnmf-algs
sampling.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include "defs.hpp"
4 #include "util/generator.hpp"
5 #include <algorithm>
6 #include <gsl/gsl_randist.h>
7 #include <tuple>
8 
9 namespace bnmf_algs {
10 namespace util {
41 template <typename RandomIterator>
42 RandomIterator choice(RandomIterator cum_prob_begin,
43  RandomIterator cum_prob_end,
44  const bnmf_algs::util::gsl_rng_wrapper& gsl_rng) {
45  // if the distribution is empty, return end iterator
46  if (cum_prob_begin == cum_prob_end) {
47  return cum_prob_end;
48  }
49 
50  // get a uniform random number in the range [0, max_cum_prob).
51  auto max_val = *(std::prev(cum_prob_end));
52  double p = gsl_ran_flat(gsl_rng.get(), 0, max_val);
53 
54  // find the iterator using binary search
55  return std::upper_bound(cum_prob_begin, cum_prob_end, p);
56 }
57 
87 template <typename Real, typename Integer>
88 void multinomial_mode(Integer num_trials, const vector_t<Real>& prob,
89  vector_t<Integer>& count, double eps = 1e-50) {
90  BNMF_ASSERT(prob.cols() == count.cols(),
91  "Number of event probabilities and counts differ in "
92  "util::multinomial_mode");
93  BNMF_ASSERT(
94  num_trials >= 0,
95  "Number of trials must be nonnegative in util::multinomial_mode");
96 
97  if (prob.cols() == 0) {
98  return;
99  }
100 
101  // initialize counts and differences
102  const auto num_events = prob.cols();
103  vector_t<Real> count_real;
104  vector_t<Real> diff;
105  {
106  vector_t<Real> normalized_probs = prob.array() + eps;
107  normalized_probs = normalized_probs.array() / normalized_probs.sum();
108 
109  vector_t<Real> freq =
110  (num_trials + 0.5 * num_events) * normalized_probs;
111 
112  count_real = freq.array().floor();
113  diff = freq - count_real;
114  count = count_real.template cast<Integer>();
115  }
116 
117  const Integer total_count = count.sum();
118  if (total_count == num_trials) {
119  return;
120  }
121 
122  // normalized differences will be used as a heap for the iterative alg
123  std::vector<std::pair<int, Real>> fraction_heap;
124  {
125  // compute normalized differences
126  vector_t<Real> fractions;
127  if (total_count < num_trials) {
128  fractions = (1 - diff.array()) / (count_real.array() + 1);
129  } else if (total_count > num_trials) {
130  fractions = diff.array() / (count_real.array() + eps);
131  }
132 
133  // store differences and their indices
134  for (int i = 0; i < num_events; ++i) {
135  fraction_heap.emplace_back(i, fractions(i));
136  }
137  }
138 
139  // min heap ordering function
140  auto ordering = [](const std::pair<int, Real>& elem_left,
141  const std::pair<int, Real>& elem_right) {
142  // min heap
143  return elem_left.second > elem_right.second;
144  };
145  std::make_heap(fraction_heap.begin(), fraction_heap.end(), ordering);
146 
147  // only one of the loops below execute
148 
149  // distribute the remaining balls
150  for (Integer i = total_count; i < num_trials; ++i) {
151  // get the smallest normalized difference and its index
152  std::pop_heap(fraction_heap.begin(), fraction_heap.end(), ordering);
153  int min_idx = fraction_heap.back().first;
154 
155  // update
156  ++count(min_idx);
157  --diff(min_idx);
158  fraction_heap.back().second =
159  (1 - diff(min_idx)) / (count(min_idx) + 1);
160 
161  // push back to the heap
162  std::push_heap(fraction_heap.begin(), fraction_heap.end(), ordering);
163  }
164 
165  // remove excess balls
166  for (Integer i = total_count; i > num_trials; --i) {
167  // get the smallest normalized difference and its index
168  std::pop_heap(fraction_heap.begin(), fraction_heap.end(), ordering);
169  int min_idx = fraction_heap.back().first;
170 
171  // update
172  --count(min_idx);
173  ++diff(min_idx);
174  fraction_heap.back().second = diff(min_idx) / (count(min_idx) + eps);
175 
176  // push back to the heap
177  std::push_heap(fraction_heap.begin(), fraction_heap.end(), ordering);
178  }
179 }
180 
204 template <typename T>
205 matrix_t<T> dirichlet_mat(const matrix_t<T>& gamma_shape_mat,
206  size_t norm_axis) {
207  BNMF_ASSERT(norm_axis == 0 || norm_axis == 1,
208  "Axis must be 0 or 1 in util::dirichlet_mat");
209 
210  matrix_t<T> result(gamma_shape_mat.rows(), gamma_shape_mat.cols());
211 
212  // sample each entry from Gamma
213  util::gsl_rng_wrapper rnd_gen(gsl_rng_alloc(gsl_rng_taus), gsl_rng_free);
214  for (long i = 0; i < result.rows(); ++i) {
215  for (long j = 0; j < result.cols(); ++j) {
216  result(i, j) =
217  gsl_ran_gamma(rnd_gen.get(), gamma_shape_mat(i, j), 1);
218  }
219  }
220 
221  // normalize
222  if (norm_axis == 0) {
223  result = result.array().rowwise() / result.colwise().sum().array();
224  } else {
225  result = result.array().colwise() / result.rowwise().sum().array();
226  }
227 
228  return result;
229 }
230 } // namespace util
231 
243 namespace details {
244 
279 template <typename Scalar> class SampleOnesNoReplaceComputer {
280  public:
295  : m_heap(),
296  no_repl_comp([](const HeapElem& left, const HeapElem& right) {
297  // min heap with respect to timestamp
298  return left.timestamp >= right.timestamp;
299  }),
300  rnd_gen(gsl_rng_alloc(gsl_rng_taus), gsl_rng_free) {
301 
302  for (int i = 0; i < X.rows(); ++i) {
303  for (int j = 0; j < X.cols(); ++j) {
304  Scalar entry = X(i, j);
305  if (entry > 0) {
306  double timestamp =
307  gsl_ran_beta(rnd_gen.get(), entry + 1, 1);
308 
309  m_heap.emplace_back(timestamp, entry, std::make_pair(i, j));
310  }
311  }
312  }
313  std::make_heap(m_heap.begin(), m_heap.end(), no_repl_comp);
314  }
338  void operator()(size_t curr_step, std::pair<int, int>& prev_val) {
339  // min heap sampling. Root contains the entry with the smallest
340  // beta value
341  if (m_heap.empty()) {
342  return;
343  }
344  std::pop_heap(m_heap.begin(), m_heap.end(), no_repl_comp);
345 
346  auto& heap_entry = m_heap.back();
347  int ii, jj;
348  std::tie(ii, jj) = heap_entry.idx;
349 
350  // decrement entry value and update beta value
351  --heap_entry.matrix_entry;
352  if (heap_entry.matrix_entry <= 0) {
353  m_heap.pop_back();
354  } else {
355  heap_entry.timestamp +=
356  gsl_ran_beta(rnd_gen.get(), heap_entry.matrix_entry + 1, 1);
357  std::push_heap(m_heap.begin(), m_heap.end(), no_repl_comp);
358  }
359 
360  // store indices of the sample
361  prev_val.first = ii;
362  prev_val.second = jj;
363  }
364 
365  private:
369  struct HeapElem {
370  HeapElem(double timestamp, Scalar matrix_entry, std::pair<int, int> idx)
371  : timestamp(timestamp), matrix_entry(matrix_entry),
372  idx(std::move(idx)) {}
373 
377  double timestamp;
378 
383  Scalar matrix_entry;
384 
389  };
390 
391  private:
395  std::vector<HeapElem> m_heap;
396 
402 
406  util::gsl_rng_wrapper rnd_gen;
407 };
408 
426 template <typename Scalar> class SampleOnesReplaceComputer {
427  public:
441  : cum_prob(), X_cols(X.cols()), X_sum(X.array().sum()),
442  rnd_gen(util::gsl_rng_wrapper(gsl_rng_alloc(gsl_rng_taus),
443  gsl_rng_free)) {
444 
445  cum_prob = vector_t<Scalar>(X.rows() * X.cols());
446  auto X_arr = X.array();
447  cum_prob(0) = X_arr(0);
448  for (int i = 1; i < X_arr.size(); ++i) {
449  cum_prob(i) = cum_prob(i - 1) + X_arr(i);
450  }
451  }
452 
474  void operator()(size_t curr_step, std::pair<int, int>& prev_val) {
475  // inverse CDF sampling
476  Scalar* begin = cum_prob.data();
477  Scalar* end = cum_prob.data() + cum_prob.cols();
478  auto it = util::choice(begin, end, rnd_gen);
479  long m = it - begin;
480 
481  prev_val.first = static_cast<int>(m / X_cols);
482  prev_val.second = static_cast<int>(m % X_cols);
483  }
484 
485  // computation variables
486  private:
487  vector_t<Scalar> cum_prob;
488  long X_cols;
489  double X_sum;
490  util::gsl_rng_wrapper rnd_gen;
491 };
492 
493 template <typename T> void check_sample_ones_params(const matrix_t<T>& X) {
494  BNMF_ASSERT(X.array().size() != 0,
495  "Matrix X must have at least one element");
496  BNMF_ASSERT((X.array() >= 0).all(), "Matrix X must be nonnegative");
497  BNMF_ASSERT((X.array() > std::numeric_limits<double>::epsilon()).any(),
498  "Matrix X must have at least one nonzero element");
499 }
500 } // namespace details
501 
502 namespace util {
524 template <typename T>
528 
529  auto num_samples = static_cast<size_t>(X.array().sum());
530  // don't know the initial value
531  std::pair<int, int> init_val;
532  util::Generator<std::pair<int, int>,
534  gen(init_val, num_samples + 1,
536 
537  // get rid of the first empty value
538  ++gen.begin();
539 
540  return gen;
541 }
542 
565 template <typename T>
566 util::Generator<std::pair<int, int>, details::SampleOnesReplaceComputer<T>>
567 sample_ones_replace(const matrix_t<T>& X, size_t num_samples) {
569 
570  // don't know the initial value
571  std::pair<int, int> init_val;
572  util::Generator<std::pair<int, int>, details::SampleOnesReplaceComputer<T>>
573  gen(init_val, num_samples + 1,
575 
576  // get rid of the first empty value
577  ++gen.begin();
578 
579  return gen;
580 }
581 } // namespace util
582 } // namespace bnmf_algs
SampleOnesNoReplaceComputer(const matrix_t< Scalar > &X)
Construct a new SampleOnesNoReplaceComputer.
Definition: sampling.hpp:294
A template generator that generates values from an initial value by applying a computation to the pre...
Definition: generator.hpp:279
T tie(T...args)
void operator()(size_t curr_step, std::pair< int, int > &prev_val)
Function call operator that will compute the next sample in-place.
Definition: sampling.hpp:474
util::Generator< std::pair< int, int >, details::SampleOnesNoReplaceComputer< T > > sample_ones_noreplace(const matrix_t< T > &X)
Return a bnmf_algs::util::Generator that will generate a sequence of samples from the given matrix wi...
Definition: sampling.hpp:526
T epsilon(T...args)
SampleOnesReplaceComputer(const matrix_t< Scalar > &X)
Construct a new SampleOnesComputer.
Definition: sampling.hpp:440
Eigen::Matrix< Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor > matrix_t
Matrix type used in the computations.
Definition: defs.hpp:41
T upper_bound(T...args)
T pop_heap(T...args)
T end(T...args)
RandomIterator choice(RandomIterator cum_prob_begin, RandomIterator cum_prob_end, const bnmf_algs::util::gsl_rng_wrapper &gsl_rng)
Choose a random sample according to the given cumulative probability distribution.
Definition: sampling.hpp:42
T prev(T...args)
util::Generator< std::pair< int, int >, details::SampleOnesReplaceComputer< T > > sample_ones_replace(const matrix_t< T > &X, size_t num_samples)
Return a bnmf_algs::util::Generator that will generate a sequence of samples from the given matrix wi...
Definition: sampling.hpp:567
iter_type begin()
Return the iterator pointing to the previously computed value.
Definition: generator.hpp:421
void check_sample_ones_params(const matrix_t< T > &X)
Definition: sampling.hpp:493
T make_pair(T...args)
void operator()(size_t curr_step, std::pair< int, int > &prev_val)
Function call operator that will compute the next sample without replacement in-place.
Definition: sampling.hpp:338
T move(T...args)
void multinomial_mode(Integer num_trials, const vector_t< Real > &prob, vector_t< Integer > &count, double eps=1e-50)
Find the mode of a multinomial distribution using Finucan&#39;s algorithm.
Definition: sampling.hpp:88
T push_heap(T...args)
T get(T...args)
std::unique_ptr< gsl_rng, decltype(&gsl_rng_free)> gsl_rng_wrapper
RAII wrapper around gsl_rng types.
Definition: defs.hpp:73
T begin(T...args)
T back(T...args)
T make_heap(T...args)
Computer type that will compute the next sample from a given matrix with replacement when its operato...
Definition: sampling.hpp:426
Eigen::Matrix< Scalar, 1, Eigen::Dynamic, Eigen::RowMajor > vector_t
Vector type used in the computations.
Definition: defs.hpp:27
Main namespace for bnmf-algs library.
Definition: alloc_model_funcs.hpp:12
Computer type that will compute the next sample from a given matrix without replacement when its oper...
Definition: sampling.hpp:279
matrix_t< T > dirichlet_mat(const matrix_t< T > &gamma_shape_mat, size_t norm_axis)
Construct a matrix containing Dirichlet distribution samples on its rows or columns.
Definition: sampling.hpp:205
T emplace_back(T...args)