piranha  0.10
base_series_multiplier.hpp
1 /* Copyright 2009-2017 Francesco Biscani (bluescarni@gmail.com)
2 
3 This file is part of the Piranha library.
4 
5 The Piranha library is free software; you can redistribute it and/or modify
6 it under the terms of either:
7 
8  * the GNU Lesser General Public License as published by the Free
9  Software Foundation; either version 3 of the License, or (at your
10  option) any later version.
11 
12 or
13 
14  * the GNU General Public License as published by the Free Software
15  Foundation; either version 3 of the License, or (at your option) any
16  later version.
17 
18 or both in parallel, as here.
19 
20 The Piranha library is distributed in the hope that it will be useful, but
21 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
22 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
23 for more details.
24 
25 You should have received copies of the GNU General Public License and the
26 GNU Lesser General Public License along with the Piranha library. If not,
27 see https://www.gnu.org/licenses/. */
28 
29 #ifndef PIRANHA_DETAIL_BASE_SERIES_MULTIPLIER_HPP
30 #define PIRANHA_DETAIL_BASE_SERIES_MULTIPLIER_HPP
31 
#include <algorithm>
#include <array>
#include <boost/numeric/conversion/cast.hpp>
#include <cmath>
#include <cstddef>
#include <iterator>
#include <limits>
#include <mutex>
#include <numeric>
#include <random>
#include <stdexcept>
#include <type_traits>
#include <utility>
#include <vector>
45 
46 #include <piranha/config.hpp>
47 #include <piranha/detail/atomic_flag_array.hpp>
48 #include <piranha/detail/atomic_lock_guard.hpp>
49 #include <piranha/exceptions.hpp>
50 #include <piranha/key_is_multipliable.hpp>
51 #include <piranha/math.hpp>
52 #include <piranha/mp_integer.hpp>
53 #include <piranha/mp_rational.hpp>
54 #include <piranha/safe_cast.hpp>
55 #include <piranha/series.hpp>
56 #include <piranha/settings.hpp>
57 #include <piranha/symbol_utils.hpp>
58 #include <piranha/thread_pool.hpp>
59 #include <piranha/tuning.hpp>
60 #include <piranha/type_traits.hpp>
61 
62 namespace piranha
63 {
64 
65 namespace detail
66 {
67 
// Default implementation of the term-pointer filling logic used by base_series_multiplier.
// It extracts const pointers to the terms stored in the two input hash containers into the
// flat vectors v1/v2. When the key type supports operator<(), the terms of each bucket are
// additionally sorted, which makes the pointer order deterministic.
template <typename Series, typename Derived, typename = void>
struct base_series_multiplier_impl {
    using term_type = typename Series::term_type;
    using container_type = typename std::decay<decltype(std::declval<Series>()._container())>::type;
    using c_size_type = typename container_type::size_type;
    using v_size_type = typename std::vector<term_type const *>::size_type;
    // Overload selected when the key type is NOT less-than comparable.
    template <typename Term,
              typename std::enable_if<!is_less_than_comparable<typename Term::key_type>::value, int>::type = 0>
    void fill_term_pointers(const container_type &c1, const container_type &c2, std::vector<Term const *> &v1,
                            std::vector<Term const *> &v2)
    {
        // If the key is not less-than comparable, we can only copy over the pointers as they are.
        std::transform(c1.begin(), c1.end(), std::back_inserter(v1), [](const term_type &t) { return &t; });
        std::transform(c2.begin(), c2.end(), std::back_inserter(v2), [](const term_type &t) { return &t; });
    }
    // Overload selected when the key type IS less-than comparable: terms are extracted
    // bucket-by-bucket and sorted within each bucket, possibly in parallel.
    template <typename Term,
              typename std::enable_if<is_less_than_comparable<typename Term::key_type>::value, int>::type = 0>
    void fill_term_pointers(const container_type &c1, const container_type &c2, std::vector<Term const *> &v1,
                            std::vector<Term const *> &v2)
    {
        // Fetch the number of threads from the derived class.
        const unsigned n_threads = static_cast<Derived *>(this)->m_n_threads;
        piranha_assert(n_threads > 0u);
        // Threading functor: thread thread_idx extracts (and sorts, per bucket) the terms of
        // its assigned bucket range of *c into *v. In the multithreaded case each thread
        // receives its own output vector, so no synchronisation is needed here.
        auto thread_func
            = [n_threads](unsigned thread_idx, const container_type *c, std::vector<term_type const *> *v) {
                  piranha_assert(thread_idx < n_threads);
                  // Total bucket count.
                  const auto b_count = c->bucket_count();
                  // Buckets per thread.
                  const auto bpt = b_count / n_threads;
                  // End index: the last thread also picks up the remainder buckets.
                  const auto end
                      = static_cast<c_size_type>((thread_idx == n_threads - 1u) ? b_count : (bpt * (thread_idx + 1u)));
                  // Sorter.
                  auto sorter = [](term_type const *p1, term_type const *p2) { return p1->m_key < p2->m_key; };
                  v_size_type j = 0u;
                  for (auto start = static_cast<c_size_type>(bpt * thread_idx); start < end; ++start) {
                      const auto &b = c->_get_bucket_list(start);
                      v_size_type tmp = 0u;
                      for (const auto &t : b) {
                          v->push_back(&t);
                          ++tmp;
                      }
                      // Sort only the pointers appended for this bucket ([j, j + tmp)).
                      std::stable_sort(v->data() + j, v->data() + j + tmp, sorter);
                      j += tmp;
                  }
              };
        if (n_threads == 1u) {
            thread_func(0u, &c1, &v1);
            thread_func(0u, &c2, &v2);
            return;
        }
        auto thread_wrapper = [&thread_func, n_threads](const container_type *c, std::vector<term_type const *> *v) {
            // In the multi-threaded case, each thread needs to work on a separate vector.
            // We will merge the vectors later.
            using vv_t = std::vector<std::vector<Term const *>>;
            using vv_size_t = typename vv_t::size_type;
            vv_t vv(safe_cast<vv_size_t>(n_threads));
            // Go with the threads.
            future_list<void> ff_list;
            try {
                for (unsigned i = 0u; i < n_threads; ++i) {
                    ff_list.push_back(thread_pool::enqueue(i, thread_func, i, c, &(vv[static_cast<vv_size_t>(i)])));
                }
                // First let's wait for everything to finish.
                ff_list.wait_all();
                // Then, let's handle the exceptions.
                ff_list.get_all();
            } catch (...) {
                // Make sure no task is still running before propagating.
                ff_list.wait_all();
                throw;
            }
            // Last, we need to merge everything into v.
            for (const auto &vi : vv) {
                v->insert(v->end(), vi.begin(), vi.end());
            }
        };
        thread_wrapper(&c1, &v1);
        thread_wrapper(&c2, &v2);
    }
};
150 
// Specialisation for series with rational coefficients. Instead of pointing at the original
// terms, it builds local copies of the terms whose coefficients are rescaled to a common
// denominator (the lcm of all denominators in both series, stored in m_lcm). This lets the
// multiplication operate on integral numerators only; the denominator (lcm squared) is put
// back by base_series_multiplier::finalise_impl() after the multiplication.
template <typename Series, typename Derived>
struct base_series_multiplier_impl<
    Series, Derived, typename std::enable_if<is_mp_rational<typename Series::term_type::cf_type>::value>::type> {
    // Useful shortcuts.
    using term_type = typename Series::term_type;
    using rat_type = typename term_type::cf_type;
    using int_type = typename std::decay<decltype(std::declval<rat_type>().num())>::type;
    using container_type = typename std::decay<decltype(std::declval<Series>()._container())>::type;
    void fill_term_pointers(const container_type &c1, const container_type &c2, std::vector<term_type const *> &v1,
                            std::vector<term_type const *> &v2)
    {
        // Compute the least common multiplier of all the denominators:
        // lcm(a, b) = a * b / gcd(a, b), folded over every coefficient of both series.
        m_lcm = 1;
        auto it_f = c1.end();
        int_type g;
        for (auto it = c1.begin(); it != it_f; ++it) {
            math::gcd3(g, m_lcm, it->m_cf.den());
            math::mul3(m_lcm, m_lcm, it->m_cf.den());
            divexact(m_lcm, m_lcm, g);
        }
        it_f = c2.end();
        for (auto it = c2.begin(); it != it_f; ++it) {
            math::gcd3(g, m_lcm, it->m_cf.den());
            math::mul3(m_lcm, m_lcm, it->m_cf.den());
            divexact(m_lcm, m_lcm, g);
        }
        // All these computations involve only positive numbers,
        // the GCD must always be positive.
        piranha_assert(m_lcm.sgn() == 1);
        // Copy over the terms and renormalise to lcm: each coefficient a/b becomes
        // (lcm / b * a) / 1, i.e. an integral numerator over a unitary denominator.
        it_f = c1.end();
        for (auto it = c1.begin(); it != it_f; ++it) {
            // NOTE: these divisions are exact, we could take advantage of that.
            m_terms1.push_back(term_type(rat_type(m_lcm / it->m_cf.den() * it->m_cf.num(), int_type(1)), it->m_key));
        }
        it_f = c2.end();
        for (auto it = c2.begin(); it != it_f; ++it) {
            m_terms2.push_back(term_type(rat_type(m_lcm / it->m_cf.den() * it->m_cf.num(), int_type(1)), it->m_key));
        }
        // Copy over the pointers (they point into the locally-owned m_terms1/m_terms2).
        std::transform(m_terms1.begin(), m_terms1.end(), std::back_inserter(v1), [](const term_type &t) { return &t; });
        std::transform(m_terms2.begin(), m_terms2.end(), std::back_inserter(v2), [](const term_type &t) { return &t; });
        piranha_assert(v1.size() == c1.size());
        piranha_assert(v2.size() == c2.size());
    }
    // Local storage for the rescaled terms pointed to by v1/v2.
    std::vector<term_type> m_terms1;
    std::vector<term_type> m_terms2;
    // The lcm of all denominators of both series.
    int_type m_lcm;
};
200 }
201 
203 
222 // Some performance ideas:
223 // - optimisation in case one series has 1 term with unitary key and both series same type: multiply directly
224 // coefficients;
225 // - optimisation for coefficient series that merges all args, similar to the rational optimisation;
226 // - optimisation for load balancing similar to the poly multiplier.
227 template <typename Series>
228 class base_series_multiplier : private detail::base_series_multiplier_impl<Series, base_series_multiplier<Series>>
229 {
230  PIRANHA_TT_CHECK(is_series, Series);
231  // Make friends with the base, so it can access protected/private members of this.
232  friend struct detail::base_series_multiplier_impl<Series, base_series_multiplier<Series>>;
233  // Alias for the series' container type.
234  using container_type = uncvref_t<decltype(std::declval<const Series &>()._container())>;
235 
236 public:
238  using v_ptr = std::vector<typename Series::term_type const *>;
240  using size_type = typename v_ptr::size_type;
242  using bucket_size_type = typename Series::size_type;
243 
244 private:
245  // The default limit functor: it will include all terms in the second series.
246  struct default_limit_functor {
247  default_limit_functor(const base_series_multiplier &m) : m_size2(m.m_v2.size()) {}
248  size_type operator()(const size_type &) const
249  {
250  return m_size2;
251  }
252  const size_type m_size2;
253  };
254  // The purpose of this helper is to move in a coefficient series during insertion. For series,
255  // we know that moves leave the series in a valid state, and series multiplications do not benefit
256  // from an already-constructed destination - hence it is convenient to move them rather than copy.
    // For non-series coefficients, insertion can use the term as-is (returned by reference).
    template <typename Term, typename std::enable_if<!is_series<typename Term::cf_type>::value, int>::type = 0>
    static Term &term_insertion(Term &t)
    {
        return t;
    }
    // For series coefficients, build a new term moving the coefficient in: moves leave a
    // series in a valid state and avoid a costly deep copy during insertion.
    template <typename Term, typename std::enable_if<is_series<typename Term::cf_type>::value, int>::type = 0>
    static Term term_insertion(Term &t)
    {
        return Term{std::move(t.m_cf), t.m_key};
    }
    // Implementation of finalise().
    // Rational-coefficient overload: the multiplication was carried out on integral
    // numerators only (the common denominator m_lcm was factored out when filling the term
    // pointers), so here the denominator lcm**2 is put back into every coefficient of the
    // result and the coefficients are re-canonicalised.
    template <typename T, typename std::enable_if<is_mp_rational<typename T::term_type::cf_type>::value, int>::type = 0>
    void finalise_impl(T &s) const
    {
        // Nothing to do if the lcm is unitary.
        if (math::is_unitary(this->m_lcm)) {
            return;
        }
        // NOTE: this has to be the square of the lcm, as in addition to uniformising
        // the denominators in each series we are also multiplying the two series.
        const auto l2 = this->m_lcm * this->m_lcm;
        auto &container = s._container();
        // Single thread implementation.
        if (m_n_threads == 1u) {
            for (const auto &t : container) {
                t.m_cf._set_den(l2);
                t.m_cf.canonicalise();
            }
            return;
        }
        // Multi-thread implementation: each thread fixes up a disjoint range of buckets,
        // so no synchronisation is needed.
        // Buckets per thread.
        const bucket_size_type bpt = static_cast<bucket_size_type>(container.bucket_count() / m_n_threads);
        auto thread_func = [l2, &container, this, bpt](unsigned t_idx) {
            bucket_size_type start_idx = static_cast<bucket_size_type>(t_idx * bpt);
            // Special handling for the last thread.
            const bucket_size_type end_idx = t_idx == (this->m_n_threads - 1u)
                                                 ? container.bucket_count()
                                                 : static_cast<bucket_size_type>((t_idx + 1u) * bpt);
            for (; start_idx != end_idx; ++start_idx) {
                auto &list = container._get_bucket_list(start_idx);
                for (const auto &t : list) {
                    t.m_cf._set_den(l2);
                    t.m_cf.canonicalise();
                }
            }
        };
        // Go with the threads.
        future_list<decltype(thread_func(0u))> ff_list;
        try {
            for (unsigned i = 0u; i < m_n_threads; ++i) {
                ff_list.push_back(thread_pool::enqueue(i, thread_func, i));
            }
            // First let's wait for everything to finish.
            ff_list.wait_all();
            // Then, let's handle the exceptions.
            ff_list.get_all();
        } catch (...) {
            ff_list.wait_all();
            throw;
        }
    }
    // No-op overload for non-rational coefficients.
    template <typename T,
              typename std::enable_if<!is_mp_rational<typename T::term_type::cf_type>::value, int>::type = 0>
    void finalise_impl(T &) const
    {
    }
324 
325 public:
327 
355  explicit base_series_multiplier(const Series &s1, const Series &s2) : m_ss(s1.get_symbol_set())
356  {
357  if (unlikely(s1.get_symbol_set() != s2.get_symbol_set())) {
358  piranha_throw(std::invalid_argument, "incompatible arguments sets");
359  }
360  // The largest series goes first.
361  const Series *p1 = &s1, *p2 = &s2;
362  if (s1.size() < s2.size()) {
363  std::swap(p1, p2);
364  }
365  // This is just an optimisation, no troubles if there is a truncation due to static_cast.
366  m_v1.reserve(static_cast<size_type>(p1->size()));
367  m_v2.reserve(static_cast<size_type>(p2->size()));
368  container_type const *ctr1 = &p1->_container(), *ctr2 = &p2->_container();
369  // NOTE: if the zero element of Series is not absorbing, we need to create a temporary zero series in place
370  // of any factor that is zero, and then use it in the multiplication. This ensures a correct series
371  // multiplication result for coefficient types (such as IEEE floats) for which 0 times x is not necessarily
372  // always 0. The temporary zero series is stored in the m_zero_f member as a collection of 1 term with
373  // zero coefficient.
375  using term_type = typename Series::term_type;
376  using cf_type = typename term_type::cf_type;
377  using key_type = typename term_type::key_type;
378  if (p1->empty()) {
379  m_zero_f1.insert(term_type{cf_type(0), key_type(s1.get_symbol_set())});
380  ctr1 = &m_zero_f1;
381  }
382  if (p2->empty()) {
383  m_zero_f2.insert(term_type{cf_type(0), key_type(s1.get_symbol_set())});
384  ctr2 = &m_zero_f2;
385  }
386  }
387  // Set the number of threads.
388  m_n_threads = (ctr1->size() && ctr2->size())
389  ? thread_pool::use_threads(integer(ctr1->size()) * ctr2->size(),
391  : 1u;
392  this->fill_term_pointers(*ctr1, *ctr2, m_v1, m_v2);
393  }
394 
395 private:
396  base_series_multiplier() = delete;
399  base_series_multiplier &operator=(const base_series_multiplier &) = delete;
400  base_series_multiplier &operator=(base_series_multiplier &&) = delete;
401 
402 protected:
404 
441  template <typename MultFunctor, typename LimitFunctor>
442  void blocked_multiplication(const MultFunctor &mf, const size_type &start1, const size_type &end1,
443  const LimitFunctor &lf) const
444  {
445  PIRANHA_TT_CHECK(is_function_object, MultFunctor, void, const size_type &, const size_type &);
446  if (unlikely(start1 > end1 || start1 > m_v1.size() || end1 > m_v1.size())) {
447  piranha_throw(std::invalid_argument, "invalid bounds in blocked_multiplication");
448  }
449  // Block size and number of regular blocks.
451  nblocks1 = static_cast<size_type>((end1 - start1) / bsize),
452  nblocks2 = static_cast<size_type>(m_v2.size() / bsize);
453  // Start and end of last (possibly irregular) blocks.
454  const size_type i_ir_start = static_cast<size_type>(nblocks1 * bsize + start1), i_ir_end = end1;
455  const size_type j_ir_start = static_cast<size_type>(nblocks2 * bsize), j_ir_end = m_v2.size();
456  for (size_type n1 = 0u; n1 < nblocks1; ++n1) {
457  const size_type i_start = static_cast<size_type>(n1 * bsize + start1),
458  i_end = static_cast<size_type>(i_start + bsize);
459  // regulars1 * regulars2
460  for (size_type n2 = 0u; n2 < nblocks2; ++n2) {
461  const size_type j_start = static_cast<size_type>(n2 * bsize),
462  j_end = static_cast<size_type>(j_start + bsize);
463  for (size_type i = i_start; i < i_end; ++i) {
464  const size_type limit = std::min<size_type>(lf(i), j_end);
465  for (size_type j = j_start; j < limit; ++j) {
466  mf(i, j);
467  }
468  }
469  }
470  // regulars1 * rem2
471  for (size_type i = i_start; i < i_end; ++i) {
472  const size_type limit = std::min<size_type>(lf(i), j_ir_end);
473  for (size_type j = j_ir_start; j < limit; ++j) {
474  mf(i, j);
475  }
476  }
477  }
478  // rem1 * regulars2
479  for (size_type n2 = 0u; n2 < nblocks2; ++n2) {
480  const size_type j_start = static_cast<size_type>(n2 * bsize),
481  j_end = static_cast<size_type>(j_start + bsize);
482  for (size_type i = i_ir_start; i < i_ir_end; ++i) {
483  const size_type limit = std::min<size_type>(lf(i), j_end);
484  for (size_type j = j_start; j < limit; ++j) {
485  mf(i, j);
486  }
487  }
488  }
489  // rem1 * rem2.
490  for (size_type i = i_ir_start; i < i_ir_end; ++i) {
491  const size_type limit = std::min<size_type>(lf(i), j_ir_end);
492  for (size_type j = j_ir_start; j < limit; ++j) {
493  mf(i, j);
494  }
495  }
496  }
498 
    // Blocked multiplication (convenience overload): no truncation, every term of the
    // second series participates in the multiplication.
    template <typename MultFunctor>
    void blocked_multiplication(const MultFunctor &mf, const size_type &start1, const size_type &end1) const
    {
        blocked_multiplication(mf, start1, end1, default_limit_functor{*this});
    }
514 
    // Estimate the size of the final series via random sampling: for a number of trials,
    // terms of the first series are visited in random order and each is multiplied by one
    // randomly-chosen admissible term of the second series, until a duplicate term is
    // generated in the temporary output. The trial results are averaged into the estimate.
    // MultArity is the number of terms produced by a single term-by-term multiplication.
    template <std::size_t MultArity, typename MultFunctor, typename LimitFunctor>
    bucket_size_type estimate_final_series_size(const LimitFunctor &lf) const
    {
        PIRANHA_TT_CHECK(is_function_object, MultFunctor, void, const size_type &, const size_type &);
        PIRANHA_TT_CHECK(std::is_constructible, MultFunctor, const base_series_multiplier &, Series &);
        PIRANHA_TT_CHECK(is_function_object, LimitFunctor, size_type, const size_type &);
        // Cache these.
        const size_type size1 = m_v1.size(), size2 = m_v2.size();
        constexpr std::size_t result_size = MultArity;
        // If one of the two series is empty, return 1: the estimate is never zero, as it
        // is used downstream as a bucket count for rehashing.
        if (unlikely(!size1 || !size2)) {
            return 1u;
        }
        // If either series has a size of 1, just return size1 * size2 * result_size.
        if (size1 == 1u || size2 == 1u) {
            return static_cast<bucket_size_type>(integer(size1) * size2 * result_size);
        }
        // NOTE: Hard-coded number of trials.
        // NOTE: here consider that in case of extremely sparse series with few terms this will incur in noticeable
        // overhead, since we will need many term-by-term before encountering the first duplicate.
        const unsigned n_trials = 15u;
        // NOTE: Hard-coded value for the estimation multiplier.
        // NOTE: This value should be tuned for performance/memory usage tradeoffs.
        const unsigned multiplier = 2u;
        // Number of threads to use. If there are more threads than trials, then reduce
        // the number of actual threads to use.
        // NOTE: this is a bit different from usual, where we do not care if the workload per thread is zero.
        // We do like this because n_trials is a small number and there still seems to be benefit in running
        // just 1 trial per thread.
        const unsigned n_threads = (n_trials >= m_n_threads) ? m_n_threads : n_trials;
        piranha_assert(n_threads > 0u);
        // Trials per thread. This will always be at least 1.
        const unsigned tpt = n_trials / n_threads;
        piranha_assert(tpt >= 1u);
        // The cumulative estimate.
        integer c_estimate(0);
        // Sync mutex - actually used only in multithreading.
        std::mutex mut;
        // The estimation functor.
        auto estimator = [&lf, size1, n_threads, tpt, this, &c_estimate, &mut](unsigned thread_idx) {
            piranha_assert(thread_idx < n_threads);
            // Vectors of indices into m_v1.
            std::vector<size_type> v_idx1(safe_cast<typename std::vector<size_type>::size_type>(size1));
            std::iota(v_idx1.begin(), v_idx1.end(), size_type(0));
            // Copy in order to reset to initial state later.
            const auto v_idx1_copy = v_idx1;
            // Random number engine.
            std::mt19937 engine;
            // Uniform int distribution.
            using dist_type = std::uniform_int_distribution<size_type>;
            dist_type dist;
            // Init the accumulated estimation for averaging later.
            integer acc(0);
            // Number of trials for this thread - usual special casing for the last thread.
            const unsigned cur_trials = (thread_idx == n_threads - 1u) ? (n_trials - thread_idx * tpt) : tpt;
            // This should always be guaranteed because tpt is never 0.
            piranha_assert(cur_trials > 0u);
            // Create and setup the temp series.
            Series tmp;
            tmp.set_symbol_set(m_ss);
            // Create the multiplier.
            MultFunctor mf(*this, tmp);
            // Go with the trials.
            for (auto n = 0u; n < cur_trials; ++n) {
                // Seed the engine. The seed should be the global trial number, accounting for multiple
                // threads. This way the estimation will not depend on the number of threads.
                engine.seed(static_cast<std::mt19937::result_type>(tpt * thread_idx + n));
                // Reset the indices vector and re-randomise it.
                // NOTE: we need to do this as every run inside this for loop must be completely independent
                // of any previous run, we cannot keep any state.
                v_idx1 = v_idx1_copy;
                std::shuffle(v_idx1.begin(), v_idx1.end(), engine);
                // The counter. This will be increased each time a term-by-term multiplication
                // does not generate a duplicate term.
                size_type count = 0u;
                // This will be used to determine the average number of terms in s2
                // that participate in the multiplication.
                integer acc_s2(0);
                auto it1 = v_idx1.begin();
                for (; it1 != v_idx1.end(); ++it1) {
                    // Get the limit idx in s2.
                    const size_type limit = lf(*it1);
                    // This is the upper limit of an open ended interval, so it needs
                    // to be decreased by one in order to be used in dist. If zero, it means
                    // there are no terms in v2 that can be multiplied by the current term in t1.
                    if (limit == 0u) {
                        continue;
                    }
                    acc_s2 += limit;
                    // Pick a random index in m_v2 within the limit.
                    const size_type idx2
                        = dist(engine, typename dist_type::param_type(static_cast<size_type>(0u),
                                                                      static_cast<size_type>(limit - 1u)));
                    // Perform term multiplication.
                    mf(*it1, idx2);
                    // Check for unlikely overflows when increasing count.
                    if (unlikely(result_size > std::numeric_limits<size_type>::max()
                                 || count > std::numeric_limits<size_type>::max() - result_size)) {
                        piranha_throw(std::overflow_error, "overflow error");
                    }
                    // A size mismatch means at least one of the just-inserted terms
                    // collided with an existing one: stop this trial.
                    if (tmp.size() != count + result_size) {
                        break;
                    }
                    // Increase cycle variables.
                    count = static_cast<size_type>(count + result_size);
                }
                integer add;
                if (it1 == v_idx1.end()) {
                    // We never found a duplicate. count is now the number of terms in s1
                    // which actually participate in the multiplication, while acc_s2 / count
                    // is the average number of terms in s2 that participate in the multiplication.
                    // The result will be then count * acc_s2 / count = acc_s2.
                    add = acc_s2;
                } else {
                    // If we found a duplicate, we use the heuristic.
                    add = integer(multiplier) * count * count;
                }
                // Fix if zero, so that the average later never results in zero.
                if (add.sgn() == 0) {
                    add = 1;
                }
                acc += add;
                // Reset tmp.
                tmp._container().clear();
            }
            // Accumulate in the shared variable.
            if (n_threads == 1u) {
                // No locking needed.
                c_estimate += acc;
            } else {
                std::lock_guard<std::mutex> lock(mut);
                c_estimate += acc;
            }
        };
        // Run the estimation functor.
        if (n_threads == 1u) {
            estimator(0u);
        } else {
            future_list<void> f_list;
            try {
                for (unsigned i = 0u; i < n_threads; ++i) {
                    f_list.push_back(thread_pool::enqueue(i, estimator, i));
                }
                // First let's wait for everything to finish.
                f_list.wait_all();
                // Then, let's handle the exceptions.
                f_list.get_all();
            } catch (...) {
                f_list.wait_all();
                throw;
            }
        }
        piranha_assert(c_estimate >= n_trials);
        // Return the mean.
        return static_cast<bucket_size_type>(c_estimate / n_trials);
    }
712 
718  template <std::size_t MultArity, typename MultFunctor>
720  {
721  return estimate_final_series_size<MultArity, MultFunctor>(default_limit_functor{*this});
722  }
724 
743  template <bool FastMode>
745  {
746  using term_type = typename Series::term_type;
747  using key_type = typename term_type::key_type;
748  PIRANHA_TT_CHECK(key_is_multipliable, typename term_type::cf_type, key_type);
749  using it_type = decltype(std::declval<Series &>()._container().end());
750  static constexpr std::size_t m_arity = key_type::multiply_arity;
751 
752  public:
754 
761  explicit plain_multiplier(const base_series_multiplier &bsm, Series &retval)
762  : m_v1(bsm.m_v1), m_v2(bsm.m_v2), m_retval(retval), m_c_end(retval._container().end())
763  {
764  }
765 
766  private:
767  plain_multiplier(const plain_multiplier &) = delete;
768  plain_multiplier(plain_multiplier &&) = delete;
769  plain_multiplier &operator=(const plain_multiplier &) = delete;
770  plain_multiplier &operator=(plain_multiplier &&) = delete;
771 
772  public:
774 
787  void operator()(const size_type &i, const size_type &j) const
788  {
789  // First perform the multiplication.
790  key_type::multiply(m_tmp_t, *m_v1[i], *m_v2[j], m_retval.get_symbol_set());
791  for (std::size_t n = 0u; n < m_arity; ++n) {
792  auto &tmp_term = m_tmp_t[n];
793  if (FastMode) {
794  auto &container = m_retval._container();
795  // Try to locate the term into retval.
796  auto bucket_idx = container._bucket(tmp_term);
797  const auto it = container._find(tmp_term, bucket_idx);
798  if (it == m_c_end) {
799  container._unique_insert(term_insertion(tmp_term), bucket_idx);
800  } else {
801  it->m_cf += tmp_term.m_cf;
802  }
803  } else {
804  m_retval.insert(term_insertion(tmp_term));
805  }
806  }
807  }
808 
809  private:
810  mutable std::array<term_type, m_arity> m_tmp_t;
811  const std::vector<term_type const *> &m_v1;
812  const std::vector<term_type const *> &m_v2;
813  Series &m_retval;
814  const it_type m_c_end;
815  };
817 
845  static void sanitise_series(Series &retval, unsigned n_threads)
846  {
847  using term_type = typename Series::term_type;
848  if (unlikely(n_threads == 0u)) {
849  piranha_throw(std::invalid_argument, "invalid number of threads");
850  }
851  auto &container = retval._container();
852  const auto &args = retval.get_symbol_set();
853  // Reset the size to zero before doing anything.
854  container._update_size(static_cast<bucket_size_type>(0u));
855  // Single-thread implementation.
856  if (n_threads == 1u) {
857  const auto it_end = container.end();
858  for (auto it = container.begin(); it != it_end;) {
859  if (unlikely(!it->is_compatible(args))) {
860  piranha_throw(std::invalid_argument, "incompatible term");
861  }
862  if (unlikely(container.size() == std::numeric_limits<bucket_size_type>::max())) {
863  piranha_throw(std::overflow_error, "overflow error in the number of terms of a series");
864  }
865  // First update the size, it will be scaled back in the erase() method if necessary.
866  container._update_size(static_cast<bucket_size_type>(container.size() + 1u));
867  if (unlikely(it->is_zero(args))) {
868  it = container.erase(it);
869  } else {
870  ++it;
871  }
872  }
873  return;
874  }
875  // Multi-thread implementation.
876  const auto b_count = container.bucket_count();
877  std::mutex m;
878  integer global_count(0);
879  auto eraser = [b_count, &container, &m, &args, &global_count](const bucket_size_type &start,
880  const bucket_size_type &end) {
881  piranha_assert(start <= end && end <= b_count);
882  (void)b_count;
883  bucket_size_type count = 0u;
884  std::vector<term_type> term_list;
885  // Examine and count the terms bucket-by-bucket.
886  for (bucket_size_type i = start; i != end; ++i) {
887  term_list.clear();
888  const auto &bl = container._get_bucket_list(i);
889  const auto it_f = bl.end();
890  for (auto it = bl.begin(); it != it_f; ++it) {
891  // Check first for compatibility.
892  if (unlikely(!it->is_compatible(args))) {
893  piranha_throw(std::invalid_argument, "incompatible term");
894  }
895  // Check for ignorability.
896  if (unlikely(it->is_zero(args))) {
897  term_list.push_back(*it);
898  }
899  // Update the count of terms.
900  if (unlikely(count == std::numeric_limits<bucket_size_type>::max())) {
901  piranha_throw(std::overflow_error, "overflow error in the number of terms of a series");
902  }
903  count = static_cast<bucket_size_type>(count + 1u);
904  }
905  for (auto it = term_list.begin(); it != term_list.end(); ++it) {
906  // NOTE: must use _erase to avoid concurrent modifications
907  // to the number of elements in the table.
908  container._erase(container._find(*it, i));
909  // Account for the erased term.
910  piranha_assert(count > 0u);
911  count = static_cast<bucket_size_type>(count - 1u);
912  }
913  }
914  // Update the global count.
915  std::lock_guard<std::mutex> lock(m);
916  global_count += count;
917  };
919  try {
920  for (unsigned i = 0u; i < n_threads; ++i) {
921  const auto start = static_cast<bucket_size_type>((b_count / n_threads) * i),
922  end = static_cast<bucket_size_type>(
923  (i == n_threads - 1u) ? b_count : (b_count / n_threads) * (i + 1u));
924  f_list.push_back(thread_pool::enqueue(i, eraser, start, end));
925  }
926  // First let's wait for everything to finish.
927  f_list.wait_all();
928  // Then, let's handle the exceptions.
929  f_list.get_all();
930  } catch (...) {
931  f_list.wait_all();
932  // NOTE: there's not need to clear retval here - it was already in an inconsistent
933  // state coming into this method. We rather need to make sure sanitise_series() is always
934  // called in a try/catch block that clears retval in case of errors.
935  throw;
936  }
937  // Final update of the total count.
938  container._update_size(static_cast<bucket_size_type>(global_count));
939  }
941 
    // A plain series multiplication routine: optionally estimates the final size and
    // rehashes the result container accordingly, then runs the blocked multiplication
    // either single-threaded or split across threads (with per-bucket spinlocks guarding
    // concurrent insertions), sanitises the result if needed, and finalises it.
    template <typename LimitFunctor>
    Series plain_multiplication(const LimitFunctor &lf) const
    {
        // Shortcuts.
        using term_type = typename Series::term_type;
        using cf_type = typename term_type::cf_type;
        using key_type = typename term_type::key_type;
        PIRANHA_TT_CHECK(key_is_multipliable, cf_type, key_type);
        constexpr std::size_t m_arity = key_type::multiply_arity;
        // Setup the return value with the merged symbol set.
        Series retval;
        retval.set_symbol_set(m_ss);
        // Do not do anything if one of the two series is empty.
        if (unlikely(m_v1.empty() || m_v2.empty())) {
            return retval;
        }
        const size_type size1 = m_v1.size(), size2 = m_v2.size();
        (void)size2;
        piranha_assert(size1 && size2);
        // Convert n_threads to size_type for convenience.
        const size_type n_threads = safe_cast<size_type>(m_n_threads);
        piranha_assert(n_threads);
        // Determine if we should estimate the size. We check the threshold, but we always
        // need to estimate in multithreaded mode.
        bool estimate = true;
        const auto e_thr = tuning::get_estimate_threshold();
        if (integer(m_v1.size()) * m_v2.size() < integer(e_thr) * e_thr && n_threads == 1u) {
            estimate = false;
        }
        if (estimate) {
            // Estimate and rehash.
            const auto est = estimate_final_series_size<m_arity, plain_multiplier<false>>(lf);
            // NOTE: use numeric cast here as safe_cast is expensive, going through an integer-double conversion,
            // and in this case the behaviour of numeric_cast is appropriate.
            const auto n_buckets = boost::numeric_cast<bucket_size_type>(
                std::ceil(static_cast<double>(est) / retval._container().max_load_factor()));
            piranha_assert(n_buckets > 0u);
            // Check if we want to use the parallel memory set.
            // NOTE: it is important here that we use the same n_threads for multiplication and memset as
            // we tie together pinned threads with potentially different NUMA regions.
            const unsigned n_threads_rehash = tuning::get_parallel_memory_set() ? static_cast<unsigned>(n_threads) : 1u;
            retval._container().rehash(n_buckets, n_threads_rehash);
        }
        if (n_threads == 1u) {
            try {
                // Single-thread case.
                if (estimate) {
                    // Fast mode: we sized the table ourselves, so we can use the unchecked
                    // insertion path, at the cost of a sanitisation pass afterwards.
                    blocked_multiplication(plain_multiplier<true>(*this, retval), 0u, size1, lf);
                    // If we estimated beforehand, we need to sanitise the series.
                    sanitise_series(retval, static_cast<unsigned>(n_threads));
                } else {
                    blocked_multiplication(plain_multiplier<false>(*this, retval), 0u, size1, lf);
                }
                finalise_series(retval);
                return retval;
            } catch (...) {
                retval._container().clear();
                throw;
            }
        }
        // Multi-threaded case.
        piranha_assert(estimate);
        // Init the vector of spinlocks, one per bucket of the (pre-sized) result container.
        detail::atomic_flag_array sl_array(safe_cast<std::size_t>(retval._container().bucket_count()));
        // Init the future list.
        future_list<void> f_list;
        // Thread block size.
        const auto block_size = size1 / n_threads;
        try {
            for (size_type idx = 0u; idx < n_threads; ++idx) {
                // Thread functor.
                auto tf = [idx, this, block_size, n_threads, &sl_array, &retval, &lf]() {
                    // Used to store the result of term multiplication.
                    std::array<term_type, key_type::multiply_arity> tmp_t;
                    // End of retval container (thread-safe).
                    const auto c_end = retval._container().end();
                    // Block functor.
                    // NOTE: this is very similar to the plain functor, but it does the bucket locking
                    // additionally.
                    auto f = [&c_end, &tmp_t, this, &retval, &sl_array](const size_type &i, const size_type &j) {
                        // Run the term multiplication.
                        key_type::multiply(tmp_t, *(this->m_v1[i]), *(this->m_v2[j]), retval.get_symbol_set());
                        for (std::size_t n = 0u; n < key_type::multiply_arity; ++n) {
                            auto &container = retval._container();
                            auto &tmp_term = tmp_t[n];
                            // Try to locate the term into retval.
                            auto bucket_idx = container._bucket(tmp_term);
                            // Lock the bucket.
                            detail::atomic_lock_guard alg(sl_array[static_cast<std::size_t>(bucket_idx)]);
                            const auto it = container._find(tmp_term, bucket_idx);
                            if (it == c_end) {
                                container._unique_insert(term_insertion(tmp_term), bucket_idx);
                            } else {
                                it->m_cf += tmp_term.m_cf;
                            }
                        }
                    };
                    // Thread block limit: the last thread also picks up the remainder.
                    const auto e1
                        = (idx == n_threads - 1u) ? this->m_v1.size() : static_cast<size_type>((idx + 1u) * block_size);
                    this->blocked_multiplication(f, static_cast<size_type>(idx * block_size), e1, lf);
                };
                f_list.push_back(thread_pool::enqueue(static_cast<unsigned>(idx), tf));
            }
            f_list.wait_all();
            f_list.get_all();
            sanitise_series(retval, static_cast<unsigned>(n_threads));
            finalise_series(retval);
        } catch (...) {
            f_list.wait_all();
            // Clean up retval as it might be in an inconsistent state.
            retval._container().clear();
            throw;
        }
        return retval;
    }
1094 
    // A plain series multiplication routine (convenience overload): no truncation,
    // every term of the second series participates.
    Series plain_multiplication() const
    {
        return plain_multiplication(default_limit_functor{*this});
    }
1105 
    // Finalise series: dispatch to the coefficient-type-specific implementation (for
    // rational coefficients this restores the common denominator; otherwise a no-op).
    void finalise_series(Series &s) const
    {
        finalise_impl(s);
    }
1124 
1125 protected:
1127  mutable v_ptr m_v1;
1129  mutable v_ptr m_v2;
1133 
1138  unsigned m_n_threads;
1139 
1140 private:
1141  // See the constructor for an explanation.
1142  container_type m_zero_f1;
1143  container_type m_zero_f2;
1144 };
1145 }
1146 
1147 #endif
typename Series::size_type bucket_size_type
The size type of Series.
void operator()(const size_type &i, const size_type &j) const
Call operator.
void wait_all()
Wait on all the futures.
Type trait for multipliable key.
void blocked_multiplication(const MultFunctor &mf, const size_type &start1, const size_type &end1, const LimitFunctor &lf) const
Blocked multiplication.
Function object type trait.
v_ptr m_v2
Vector of const pointers to the terms in the smaller series.
void blocked_multiplication(const MultFunctor &mf, const size_type &start1, const size_type &end1) const
Blocked multiplication (convenience overload).
Multiprecision integer class.
Definition: mp++.hpp:869
mp_integer< 1 > integer
Alias for piranha::mp_integer with 1 limb of static storage.
Definition: mp_integer.hpp:63
Series plain_multiplication(const LimitFunctor &lf) const
A plain series multiplication routine.
void finalise_series(Series &s) const
Finalise series.
bucket_size_type estimate_final_series_size() const
Estimate size of series multiplication (convenience overload)
static unsigned use_threads(const Int &work_size, const Int &min_work_per_thread)
Compute number of threads to use.
bucket_size_type estimate_final_series_size(const LimitFunctor &lf) const
Estimate size of series multiplication.
Exceptions.
Series plain_multiplication() const
A plain series multiplication routine (convenience overload).
STL namespace.
void get_all()
Get all the futures.
static enqueue_t< F &&, Args &&... > enqueue(unsigned n, F &&f, Args &&... args)
Enqueue task.
base_series_multiplier(const Series &s1, const Series &s2)
Constructor.
const symbol_fset m_ss
The symbol set of the series used during construction.
typename v_ptr::size_type size_type
The size type of base_series_multiplier::v_ptr.
#define piranha_throw(exception_type,...)
Exception-throwing macro.
Definition: exceptions.hpp:118
boost::container::flat_set< std::string > symbol_fset
Flat set of symbols.
Detect if zero is a multiplicative absorber.
v_ptr m_v1
Vector of const pointers to the terms in the larger series.
Class to store a list of futures.
static unsigned long get_estimate_threshold()
Get the series estimation threshold.
Definition: tuning.hpp:159
Root piranha namespace.
Definition: array_key.hpp:52
Type traits.
auto mul3(T &a, const T &b, const T &c) -> decltype(mul3_impl< T >()(a, b, c))
Ternary multiplication.
Definition: math.hpp:2726
plain_multiplier(const base_series_multiplier &bsm, Series &retval)
Constructor.
static unsigned long get_multiplication_block_size()
Get the multiplication block size.
Definition: tuning.hpp:117
static bool get_parallel_memory_set()
Get the parallel_memory_set flag.
Definition: tuning.hpp:81
static void sanitise_series(Series &retval, unsigned n_threads)
Sanitise series.
int sgn() const
Sign.
Definition: mp++.hpp:1611
Type trait to detect series types.
Definition: series_fwd.hpp:49
unsigned m_n_threads
Number of threads.
static unsigned long long get_min_work_per_thread()
Get the minimum work per thread.
Definition: settings.hpp:233
auto gcd3(T &out, const T &a, const T &b) -> decltype(gcd3_impl< T >()(out, a, b))
Ternary GCD.
Definition: math.hpp:2947
std::vector< typename Series::term_type const * > v_ptr
Alias for a vector of const pointers to series terms.
safe_cast_type< To, From > safe_cast(const From &x)
Safe cast.
Definition: safe_cast.hpp:219
bool is_unitary(const T &x)
Unitary test.
Definition: math.hpp:242
void push_back(std::future< T > &&f)
Move-insert a future.