source/libtomo/accel/common.hh
// Copyright (c) 2015, UChicago Argonne, LLC. All rights reserved.
// Copyright 2015. UChicago Argonne, LLC. This software was produced
// under U.S. Government contract DE-AC02-06CH11357 for Argonne National
// Laboratory (ANL), which is operated by UChicago Argonne, LLC for the
// U.S. Department of Energy. The U.S. Government has rights to use,
// reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR
// UChicago Argonne, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
// ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is
// modified to produce derivative works, such modified software should
// be clearly marked, so as not to confuse it with the version available
// from ANL.
// Additionally, redistribution and use in source and binary forms, with
// or without modification, are permitted provided that the following
// conditions are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
// * Neither the name of UChicago Argonne, LLC, Argonne National
// Laboratory, ANL, the U.S. Government, nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY UChicago Argonne, LLC AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UChicago
// Argonne, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
// ---------------------------------------------------------------
// TOMOPY header
#pragma once
#include "constants.hh"
#include "macros.hh"
#include "typedefs.hh"
#include <stdexcept>
//======================================================================================//
//
// The following section provides functions for the initialization of the tasking library
//
//======================================================================================//
// get a unique pointer to a thread-pool
//
inline void
CreateThreadPool(unique_thread_pool_t& tp, num_threads_t& pool_size)
{
    // Creates a tomopy::ThreadPool owned by `tp` and prepares the calling thread's
    // PTL thread-local data.
    //
    //  tp        : receives ownership of the newly created thread-pool
    //  pool_size : requested number of threads; when <= 0 a value is derived from
    //              the environment. On return it holds the size that was used
    //              (always at least one).

    // lower bound applied to every thread count computed in this function
    auto floor_threads = num_threads_t(1);

    if(pool_size <= 0)
    {
        // the number of python threads (TOMOPY_PYTHON_THREADS, defaulting to
        // HW_CONCURRENCY) divides the thread budget
        auto py_threads = PTL::GetEnv("TOMOPY_PYTHON_THREADS", HW_CONCURRENCY);
        auto divisor    = std::max(py_threads, floor_threads);
#if defined(TOMOPY_USE_CUDA)
        // general oversubscription when CUDA is enabled: budget is 2x HW_CONCURRENCY
        auto thread_cap = (HW_CONCURRENCY + HW_CONCURRENCY) / divisor;
#else
        // if known that CPU only, just try to use all cores
        auto thread_cap = HW_CONCURRENCY / divisor;
#endif
        // TOMOPY_NUM_THREADS takes precedence over the computed cap
        pool_size =
            std::max(PTL::GetEnv("TOMOPY_NUM_THREADS", thread_cap), floor_threads);
    }

    // always specify at least one thread even if not creating threads
    pool_size = std::max(pool_size, floor_threads);

#if defined(TOMOPY_USE_OPENCV)
    // explicitly set number of threads to 0 so OpenCV doesn't try to create threads
    cv::setNumThreads(0);
#endif

    // create the thread-pool instance from a config carrying the final size
    tomopy::ThreadPool::Config pool_cfg;
    pool_cfg.pool_size = pool_size;
    tp                 = unique_thread_pool_t(new tomopy::ThreadPool(pool_cfg));

    // ensure this thread is assigned an id; the value is reused in the message below
    auto this_tid = GetThisThreadID();

    // initialize the thread-local data if this thread does not have any yet
    auto*& tls_data = PTL::ThreadData::GetInstance();
    if(!tls_data)
    {
        tls_data = new PTL::ThreadData(tp.get());
    }

    // tell thread that initialized thread-pool to process tasks
    // (typically main thread will only wait for other threads)
    tls_data->is_main = true;
    // tell thread that it is not currently within task
    tls_data->within_task = false;

    // report the pool size while holding the std::cout type-mutex
    PTL::AutoLock lk(PTL::TypeMutex<decltype(std::cout)>());
    std::cout << "\n"
              << "[" << this_tid << "] Initialized tasking run manager with "
              << tp->size() << " threads..." << std::endl;
}
//======================================================================================//
// This class enables the selection of a device at runtime
//
struct DeviceOption
{
    // Describes one selectable execution device. An option can be matched either by
    // its numeric `index` or by its (lower-case) `key`; `description` is only used
    // when printing the table of options.
public:
    using string_t = std::string;

    int      index       = -1;
    string_t key         = "";
    string_t description = "";

    DeviceOption() {}

    DeviceOption(const int& _idx, const string_t& _key, const string_t& _desc)
    : index(_idx)
    , key(_key)
    , description(_desc)
    {
    }

    // write a 90-character wide separator line made of `c`, followed by a newline
    static void spacer(std::ostream& os, const char c = '-')
    {
        std::stringstream ss;
        ss.fill(c);
        ss << std::setw(90) << ""
           << "\n";
        os << ss.str();
    }

    // two options are equal when both key and index agree
    friend bool operator==(const DeviceOption& lhs, const DeviceOption& rhs)
    {
        return (lhs.key == rhs.key && lhs.index == rhs.index);
    }

    // compare against a user-supplied string: a numeric string is compared with the
    // index, anything else is compared (case-insensitively) with the key
    friend bool operator==(const DeviceOption& itr, const string_t& cmp)
    {
        return (!is_numeric(cmp)) ? (itr.key == tolower(cmp))
                                  : (itr.index == from_string<int>(cmp));
    }

    friend bool operator!=(const DeviceOption& lhs, const DeviceOption& rhs)
    {
        return !(lhs == rhs);
    }

    friend bool operator!=(const DeviceOption& itr, const string_t& cmp)
    {
        return !(itr == cmp);
    }

    // write the banner and column titles that precede the list of options
    static void header(std::ostream& os)
    {
        std::stringstream ss;
        ss << "\n";
        spacer(ss, '=');
        ss << "Available GPU options:\n";
        ss << "\t" << std::left << std::setw(5) << "INDEX"
           << "  \t" << std::left << std::setw(12) << "KEY"
           << "  " << std::left << std::setw(40) << "DESCRIPTION"
           << "\n";
        os << ss.str();
    }

    // write the usage hint and closing separator that follow the list of options
    static void footer(std::ostream& os)
    {
        std::stringstream ss;
        ss << "\nTo select an option for runtime, set 'device' parameter to an "
           << "INDEX or KEY above\n";
        spacer(ss, '=');
        os << ss.str();
    }

    // write one table row (no trailing newline); formatting is done in a local
    // stream so that the width/adjust manipulators do not leak into `os`
    friend std::ostream& operator<<(std::ostream& os, const DeviceOption& opt)
    {
        std::stringstream ss;
        ss << "\t" << std::right << std::setw(5) << opt.index << "  \t" << std::left
           << std::setw(12) << opt.key << "  " << std::left << std::setw(40)
           << opt.description;
        os << ss.str();
        return os;
    }

    // helper function for converting to lower-case
    inline static std::string tolower(std::string val)
    {
        // ::tolower requires a value representable as unsigned char (or EOF);
        // passing a negative plain char is undefined behaviour, hence the cast
        for(auto& itr : val)
            itr = static_cast<char>(::tolower(static_cast<unsigned char>(itr)));
        return val;
    }

    // helper function to convert string to another type; yields a
    // value-initialized _Tp (e.g. 0) when nothing could be extracted
    template <typename _Tp>
    static _Tp from_string(const std::string& val)
    {
        std::stringstream ss;
        // value-initialize: if the stream is already at EOF (empty input) the
        // extraction is skipped entirely and `ret` would otherwise be left unset
        _Tp ret{};
        ss << val;
        ss >> ret;
        return ret;
    }

    // helper function to determine if numeric represented as string.
    // Returns true only when the whole string (ignoring surrounding whitespace)
    // parses as a number, e.g. "1", "23", "1.0", "1e3". Strings that merely
    // contain digits ("a1", "v100") are not numeric and are therefore matched
    // against the key instead of being parsed to index 0.
    inline static bool is_numeric(const std::string& val)
    {
        std::istringstream iss(val);
        double             number   = 0.0;
        char               trailing = '\0';
        // the number must parse and no non-whitespace character may follow it
        return (iss >> number) && !(iss >> trailing);
    }
};
//======================================================================================//
// this function selects the device to run the reconstruction on
//
inline DeviceOption
GetDevice(const std::string& preferred)
{
    // Builds the list of devices this build can run on, prints the list and the
    // final choice, and returns a copy of the chosen option.
    //
    //  preferred : an INDEX or KEY identifying the desired option; when it matches
    //              none of the available options the default option is returned
    //  throws    : std::runtime_error when no option is available at all

    auto pythreads = PTL::GetEnv("TOMOPY_PYTHON_THREADS", HW_CONCURRENCY);
    // pythreads is the modulus of the print throttles below, so a value of zero
    // (e.g. TOMOPY_PYTHON_THREADS=0) would be a division by zero: clamp it to one
    if(pythreads < 1)
        pythreads = 1;

    using DeviceOptionList = std::deque<DeviceOption>;
    DeviceOptionList options     = {};
    std::string      default_key = "cpu";

#if defined(TOMOPY_USE_OPENCV)
    options.push_back(DeviceOption(0, "cpu", "Run on CPU (OpenCV)"));
#endif

#if defined(TOMOPY_USE_CUDA)
    auto num_devices = cuda_device_count();
    if(num_devices > 0)
    {
        // a usable GPU becomes the default choice
        options.push_back(DeviceOption(1, "gpu", "Run on GPU (CUDA NPP)"));
        default_key = "gpu";
#    if defined(TOMOPY_USE_NVTX)
        // initialize nvtx data
        init_nvtx();
#    endif
        // print device info
        cuda_device_query();
    }
    else
    {
        // NOTE(review): the message goes to std::cerr while the std::cout
        // type-mutex is held -- presumably deliberate so it does not interleave
        // with the std::cout messages; confirm
        PTL::AutoLock l(PTL::TypeMutex<decltype(std::cout)>());
        std::cerr << "\n##### No CUDA device(s) available #####\n" << std::endl;
    }
#endif

    if(options.empty())
    {
        throw std::runtime_error("No devices found! Check that TomoPy was "
                                 "compiled with OpenCV or CUDA.");
    }

    // find the default entry; default_key is only ever set to the key of an option
    // that was pushed above, so this lookup succeeds whenever options is non-empty
    auto default_itr =
        std::find_if(options.begin(), options.end(),
                     [&](const DeviceOption& itr) { return (itr == default_key); });

    //------------------------------------------------------------------------//
    // print the options the first time it is encountered
    auto print_options = [&]() {
        // call counter shared by all invocations: only the call for which the
        // counter is a multiple of pythreads prints, and the counter is reset
        // once it reaches pythreads - 1
        static std::atomic_uint _once;
        auto                    _count = _once++;
        if(_count % pythreads > 0)
        {
            if(_count + 1 == pythreads)
            {
                _once.store(0);
            }
            return;
        }

        // assemble the whole table first so it is emitted in a single write
        std::stringstream ss;
        DeviceOption::header(ss);
        for(const auto& itr : options)
        {
            ss << itr;
            if(itr == *default_itr)
                ss << "\t(default)";
            ss << "\n";
        }
        DeviceOption::footer(ss);

        PTL::AutoLock l(PTL::TypeMutex<decltype(std::cout)>());
        std::cout << "\n" << ss.str() << std::endl;
    };

    //------------------------------------------------------------------------//
    // print the option selection first time it is encountered
    auto print_selection = [&](const DeviceOption& selected_opt) {
        // same throttling scheme as print_options, with its own counter
        static std::atomic_uint _once;
        auto                    _count = _once++;
        if(_count % pythreads > 0)
        {
            if(_count + 1 == pythreads)
            {
                _once.store(0);
            }
            return;
        }

        std::stringstream ss;
        DeviceOption::spacer(ss, '-');
        ss << "Selected device: " << selected_opt << "\n";
        DeviceOption::spacer(ss, '-');

        PTL::AutoLock l(PTL::TypeMutex<decltype(std::cout)>());
        std::cout << ss.str() << std::endl;
    };

    //------------------------------------------------------------------------//
    // print the GPU execution type options
    print_options();

    // look for the option matching the caller's preference (by INDEX or KEY)
    auto selection =
        std::find_if(options.begin(), options.end(),
                     [&](const DeviceOption& itr) { return (itr == preferred); });

    // no match: fall back to the default entry located above
    if(selection == options.end())
        selection = default_itr;

    print_selection(*selection);

    return *selection;
}
//======================================================================================//
// synchronize a CUDA stream
inline void
stream_sync(cudaStream_t _stream)
{
    // Synchronizes `_stream` and checks it for errors, but only when compiled by
    // NVCC with TOMOPY_USE_CUDA defined.
#if !defined(__NVCC__) || !defined(TOMOPY_USE_CUDA)
    // CUDA path not compiled in: just hand the argument to PTL::ConsumeParameters
    // (presumably only to avoid an unused-parameter warning)
    PTL::ConsumeParameters(_stream);
#else
    // wait for the stream, then run the project's stream error-check macro
    cudaStreamSynchronize(_stream);
    CUDA_CHECK_LAST_STREAM_ERROR(_stream);
#endif
}
//======================================================================================//
// function for printing an array
//
// Prints the elements of a std::array separated by ", " (no brackets, no trailing
// separator, nothing at all for an empty array) and returns the stream.
//
// The elements are formatted in a local stream that starts from the format flags
// of `os`, and the finished text is then written to `os` in a single insertion.
template <typename _Tp, std::size_t _N>
std::ostream&
operator<<(std::ostream& os, const std::array<_Tp, _N>& arr)
{
    std::stringstream ss;
    // replace (not OR) the flags: setf() would leave the default `dec` bit set next
    // to a caller's `hex`/`oct`, and a mixed basefield is formatted as decimal
    ss.flags(os.flags());
    for(std::size_t i = 0; i < _N; ++i)
    {
        ss << arr[i];
        // separator only between elements
        if(i + 1 < _N)
        {
            ss << ", ";
        }
    }
    os << ss.str();
    return os;
}
//======================================================================================//
// for generic printing operations in a clean fashion
//
namespace internal
{
//
//----------------------------------------------------------------------------------//
/// Alias template for enable_if
template <bool B, typename T>
using enable_if_t = typename std::enable_if<B, T>::type;

/// Alias template for decay
template <class T>
using decay_t = typename std::decay<T>::type;

//======================================================================================//
// Implementation of the element-wise expansion used by `apply::unroll`.
//
// For every index i in [_N, _Nt] a temporary of type
// std::tuple_element<i, _Operator>::type is constructed from
// (get<i>(_tupA), get<i>(_tupB), _args...); all of the work happens in that
// constructor and the temporary is discarded immediately.
//
// Note that `_Nt` is the LAST index (inclusive), not the tuple size, and that the
// trailing arguments are forwarded again for every element.
//
struct apply_impl
{
    //----------------------------------------------------------------------------------//
    //  end of recursive expansion
    //
    template <std::size_t _N, std::size_t _Nt, typename _Operator, typename _TupleA,
              typename _TupleB, typename... _Args, enable_if_t<(_N == _Nt), int> = 0>
    static void unroll(_TupleA&& _tupA, _TupleB&& _tupB, _Args&&... _args)
    {
        // call constructor
        using TypeA        = decltype(std::get<_N>(_tupA));
        using TypeB        = decltype(std::get<_N>(_tupB));
        using OperatorType = typename std::tuple_element<_N, _Operator>::type;
        OperatorType(std::forward<TypeA>(std::get<_N>(_tupA)),
                     std::forward<TypeB>(std::get<_N>(_tupB)),
                     std::forward<_Args>(_args)...);
    }

    //----------------------------------------------------------------------------------//
    //  recursive expansion until _N == _Nt
    //
    template <std::size_t _N, std::size_t _Nt, typename _Operator, typename _TupleA,
              typename _TupleB, typename... _Args, enable_if_t<(_N < _Nt), int> = 0>
    static void unroll(_TupleA&& _tupA, _TupleB&& _tupB, _Args&&... _args)
    {
        // call constructor
        using TypeA        = decltype(std::get<_N>(_tupA));
        using TypeB        = decltype(std::get<_N>(_tupB));
        using OperatorType = typename std::tuple_element<_N, _Operator>::type;
        OperatorType(std::forward<TypeA>(std::get<_N>(_tupA)),
                     std::forward<TypeB>(std::get<_N>(_tupB)),
                     std::forward<_Args>(_args)...);
        // recursive call
        unroll<_N + 1, _Nt, _Operator, _TupleA, _TupleB, _Args...>(
            std::forward<_TupleA>(_tupA), std::forward<_TupleB>(_tupB),
            std::forward<_Args>(_args)...);
    }
};

//======================================================================================//
// Entry point: applies the i-th type of the tuple `_Operator` to the i-th elements
// of two equally sized tuples, passing `_args...` along to every operator.
//
struct apply
{
    //----------------------------------------------------------------------------------//
    //  invoke the recursive expansion (tuples with at least one element)
    template <typename _Operator, typename _TupleA, typename _TupleB, typename... _Args,
              std::size_t _N  = std::tuple_size<decay_t<_TupleA>>::value,
              std::size_t _Nb = std::tuple_size<decay_t<_TupleB>>::value,
              enable_if_t<(_N > 0), int> = 0>
    static void unroll(_TupleA&& _tupA, _TupleB&& _tupB, _Args&&... _args)
    {
        static_assert(_N == _Nb, "tuple_size A must match tuple_size B");
        // apply_impl expects the last index, hence _N - 1
        apply_impl::template unroll<0, _N - 1, _Operator, _TupleA, _TupleB, _Args...>(
            std::forward<_TupleA>(_tupA), std::forward<_TupleB>(_tupB),
            std::forward<_Args>(_args)...);
    }

    //----------------------------------------------------------------------------------//
    //  empty tuples: nothing to apply. Without this overload `_N - 1` would wrap
    //  around (std::size_t) and the expansion above would fail to compile.
    template <typename _Operator, typename _TupleA, typename _TupleB, typename... _Args,
              std::size_t _N  = std::tuple_size<decay_t<_TupleA>>::value,
              std::size_t _Nb = std::tuple_size<decay_t<_TupleB>>::value,
              enable_if_t<(_N == 0), int> = 0>
    static void unroll(_TupleA&&, _TupleB&&, _Args&&...)
    {
        static_assert(_N == _Nb, "tuple_size A must match tuple_size B");
    }
};

//--------------------------------------------------------------------------------------//
// generic operator for printing
//
// Constructing a GenericPrinter writes "<prefix> = <obj>" to `os`:
//  _prefix       : label, right-aligned in a field of _prefix_width
//  obj           : value, written in a field of _obj_width
//  format_flags  : flags set (in addition to the defaults) on the local stream
//                  used for formatting
//  endline       : when true a newline is appended
//
template <typename Type>
struct GenericPrinter
{
    GenericPrinter(const std::string& _prefix, const Type& obj, std::ostream& os,
                   intmax_t _prefix_width, intmax_t _obj_width,
                   std::ios_base::fmtflags format_flags, bool endline)
    {
        // format in a local stream so `os` receives the text in one insertion
        std::stringstream ss;
        ss.setf(format_flags);
        ss << std::setw(_prefix_width) << std::right << _prefix << " = "
           << std::setw(_obj_width) << obj;
        if(endline)
            ss << std::endl;
        os << ss.str();
    }
};

}  // end namespace internal