source/libtomo/accel/utils.hh
// Copyright (c) 2015, UChicago Argonne, LLC. All rights reserved.
// Copyright 2015. UChicago Argonne, LLC. This software was produced
// under U.S. Government contract DE-AC02-06CH11357 for Argonne National
// Laboratory (ANL), which is operated by UChicago Argonne, LLC for the
// U.S. Department of Energy. The U.S. Government has rights to use,
// reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR
// UChicago Argonne, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
// ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is
// modified to produce derivative works, such modified software should
// be clearly marked, so as not to confuse it with the version available
// from ANL.
// Additionally, redistribution and use in source and binary forms, with
// or without modification, are permitted provided that the following
// conditions are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation andwith the
// distribution.
// * Neither the name of UChicago Argonne, LLC, Argonne National
// Laboratory, ANL, the U.S. Government, nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY UChicago Argonne, LLC AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UChicago
// Argonne, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
// ---------------------------------------------------------------
// TOMOPY header
#pragma once
//--------------------------------------------------------------------------------------//
#include "macros.hh"
BEGIN_EXTERN_C
#include "libtomo/accel.h"
END_EXTERN_C
//--------------------------------------------------------------------------------------//
#if defined(WIN32)
# define _Complex
#endif
//--------------------------------------------------------------------------------------//
// a non-string environment option with a string identifier
template <typename Tp>
using EnvChoice = std::tuple<Tp, std::string, std::string>;
//--------------------------------------------------------------------------------------//
// list of environment choices with non-string and string identifiers
template <typename Tp>
using EnvChoiceList = std::set<EnvChoice<Tp>>;
//--------------------------------------------------------------------------------------//
template <typename Tp>
Tp
GetChoice(const EnvChoiceList<Tp>& _choices, const std::string& str_var)
{
auto asupper = [](std::string var) {
for(auto& itr : var)
itr = toupper(itr);
return var;
};
std::string upp_var = asupper(str_var);
Tp var = Tp();
// check to see if string matches a choice
for(const auto& itr : _choices)
{
if(asupper(std::get<1>(itr)) == upp_var)
{
// record value defined by environment
return std::get<0>(itr);
}
}
std::istringstream iss(str_var);
iss >> var;
// check to see if string matches a choice
for(const auto& itr : _choices)
{
if(var == std::get<0>(itr))
{
// record value defined by environment
return var;
}
}
// the value set in env did not match any choices
std::stringstream ss;
ss << "\n### Environment setting error @ " << __FUNCTION__ << " (line " << __LINE__
<< ")! Invalid selection \"" << str_var << "\". Valid choices are:\n";
for(const auto& itr : _choices)
ss << "\t\"" << std::get<0>(itr) << "\" or \"" << std::get<1>(itr) << "\" ("
<< std::get<2>(itr) << ")\n";
std::cerr << ss.str() << std::endl;
abort();
}
#if defined(TOMOPY_USE_OPENCV)
# define CPU_NN CV_INTER_NN
# define CPU_LINEAR CV_INTER_LINEAR
# define CPU_AREA CV_INTER_AREA
# define CPU_CUBIC CV_INTER_CUBIC
# define CPU_LANCZOS CV_INTER_LANCZOS4
//--------------------------------------------------------------------------------------//
template <typename _Tp>
struct OpenCVDataType
{
template <typename _Up = _Tp>
static constexpr int value()
{
static_assert(std::is_same<_Up, _Tp>::value, "OpenCV data type not overloaded");
return -1;
}
};
# define DEFINE_OPENCV_DATA_TYPE(pod_type, opencv_type) \
template <> \
struct OpenCVDataType<pod_type> \
{ \
template <typename _Up = pod_type> \
static constexpr int value() \
{ \
return opencv_type; \
} \
};
// floating point types
DEFINE_OPENCV_DATA_TYPE(float, CV_32F)
DEFINE_OPENCV_DATA_TYPE(double, CV_64F)
// signed integer types
DEFINE_OPENCV_DATA_TYPE(int8_t, CV_8S)
DEFINE_OPENCV_DATA_TYPE(int16_t, CV_16S)
DEFINE_OPENCV_DATA_TYPE(int32_t, CV_32S)
// unsigned integer types
DEFINE_OPENCV_DATA_TYPE(uint8_t, CV_8U)
DEFINE_OPENCV_DATA_TYPE(uint16_t, CV_16U)
# undef DEFINE_OPENCV_DATA_TYPE // don't pollute
//--------------------------------------------------------------------------------------//
inline int
GetOpenCVInterpolationMode(const std::string& preferred)
{
EnvChoiceList<int> choices = {
EnvChoice<int>(CPU_NN, "NN", "nearest neighbor interpolation"),
EnvChoice<int>(CPU_LINEAR, "LINEAR", "bilinear interpolation"),
EnvChoice<int>(CPU_CUBIC, "CUBIC", "bicubic interpolation")
};
return GetChoice<int>(choices, preferred);
}
//--------------------------------------------------------------------------------------//
inline cv::Mat
opencv_affine_transform(const cv::Mat& warp_src, double theta, const int& nx,
const int& ny, int eInterp, double scale = 1.0)
{
cv::Mat warp_dst = cv::Mat::zeros(nx, ny, warp_src.type());
double cx = 0.5 * ny + ((ny % 2 == 0) ? 0.5 : 0.0);
double cy = 0.5 * nx + ((nx % 2 == 0) ? 0.5 : 0.0);
cv::Point center = cv::Point(cx, cy);
cv::Mat rot = cv::getRotationMatrix2D(center, theta, scale);
cv::warpAffine(warp_src, warp_dst, rot, warp_src.size(), eInterp);
return warp_dst;
}
//--------------------------------------------------------------------------------------//
template <typename _Tp>
void
cxx_rotate_ip(array_t<_Tp>& dst, const _Tp* src, double theta, const int& nx,
const int& ny, int eInterp, double scale = 1.0)
{
cv::Mat warp_src = cv::Mat::zeros(nx, ny, OpenCVDataType<_Tp>::value());
memcpy(warp_src.ptr(), src, nx * ny * sizeof(float));
cv::Mat warp_rot =
opencv_affine_transform(warp_src, theta * degrees, nx, ny, eInterp, scale);
memcpy(dst.data(), warp_rot.ptr(), nx * ny * sizeof(float));
}
//--------------------------------------------------------------------------------------//
template <typename _Tp>
array_t<_Tp>
cxx_rotate(const _Tp* src, double theta, const intmax_t& nx, const intmax_t& ny,
int eInterp, double scale = 1.0)
{
array_t<_Tp> dst(nx * ny, _Tp());
cxx_rotate_ip(dst, src, theta, nx, ny, eInterp, scale);
return dst;
}
//--------------------------------------------------------------------------------------//
inline iarray_t
cxx_compute_sum_dist(int dy, int dt, int dx, int nx, int ny, const float* theta)
{
auto compute = [&](const iarray_t& ones, iarray_t& sum_dist, int p)
{
for(int s = 0; s < dy; ++s)
{
for(int d = 0; d < dx; ++d)
{
int32_t* _sum_dist = sum_dist.data() + (s * nx * ny) + (d * nx);
const int32_t* _ones = ones.data() + (d * nx);
for(int n = 0; n < nx; ++n)
{
_sum_dist[n] += (_ones[n] > 0) ? 1 : 0;
}
}
}
};
iarray_t rot(nx * ny, 0);
iarray_t tmp(nx * ny, 1);
iarray_t sum_dist(dy * nx * ny, 0);
for(int p = 0; p < dt; ++p)
{
float theta_p_rad = fmodf(theta[p] + halfpi, twopi);
cxx_rotate_ip(rot, tmp.data(), -theta_p_rad, nx, ny, CPU_NN);
compute(rot, sum_dist, p);
}
return sum_dist;
}
#endif // TOMOPY_USE_OPENCV
//======================================================================================//
//
#if defined(TOMOPY_USE_CUDA)
//======================================================================================//
// interpolation types
# define GPU_NN NPPI_INTER_NN
# define GPU_LINEAR NPPI_INTER_LINEAR
# define GPU_CUBIC NPPI_INTER_CUBIC
//======================================================================================//
inline int
GetNppInterpolationMode(const std::string& preferred)
{
EnvChoiceList<int> choices = {
EnvChoice<int>(GPU_NN, "NN", "nearest neighbor interpolation"),
EnvChoice<int>(GPU_LINEAR, "LINEAR", "bilinear interpolation"),
EnvChoice<int>(GPU_CUBIC, "CUBIC", "bicubic interpolation")
};
return GetChoice<int>(choices, preferred);
}
//======================================================================================//
//
# if defined(__NVCC__)
//
//======================================================================================//
inline int
GetBlockSize(const int& init = 32)
{
static thread_local int _instance = PTL::GetEnv<int>("TOMOPY_BLOCK_SIZE", init);
return _instance;
}
//======================================================================================//
inline int
GetGridSize(const int& init = 0)
{
// default value of zero == calculated according to block and loop size
static thread_local int _instance = PTL::GetEnv<int>("TOMOPY_GRID_SIZE", init);
return _instance;
}
//======================================================================================//
inline int
ComputeGridSize(const int& size, const int& block_size = GetBlockSize())
{
return (size + block_size - 1) / block_size;
}
//======================================================================================//
inline dim3
GetBlockDims(const dim3& init = dim3(32, 32, 1))
{
int _x = PTL::GetEnv<int>("TOMOPY_BLOCK_SIZE_X", init.x);
int _y = PTL::GetEnv<int>("TOMOPY_BLOCK_SIZE_Y", init.y);
int _z = PTL::GetEnv<int>("TOMOPY_BLOCK_SIZE_Z", init.z);
return dim3(_x, _y, _z);
}
//======================================================================================//
inline dim3
GetGridDims(const dim3& init = dim3(0, 0, 0))
{
// default value of zero == calculated according to block and loop size
int _x = PTL::GetEnv<int>("TOMOPY_GRID_SIZE_X", init.x);
int _y = PTL::GetEnv<int>("TOMOPY_GRID_SIZE_Y", init.y);
int _z = PTL::GetEnv<int>("TOMOPY_GRID_SIZE_Z", init.z);
return dim3(_x, _y, _z);
}
//======================================================================================//
inline dim3
ComputeGridDims(const dim3& dims, const dim3& blocks = GetBlockDims())
{
return dim3(ComputeGridSize(dims.x, blocks.x), ComputeGridSize(dims.y, blocks.y),
ComputeGridSize(dims.z, blocks.z));
}
//======================================================================================//
inline int&
this_thread_device()
{
// this creates a globally accessible function for determining the device
// the thread is assigned to
//
# if defined(TOMOPY_USE_CUDA)
static std::atomic<int> _ntid(0);
static thread_local int _instance =
(cuda_device_count() > 0) ? ((_ntid++) % cuda_device_count()) : 0;
return _instance;
# else
static thread_local int _instance = 0;
return _instance;
# endif
}
//======================================================================================//
inline void
event_sync(cudaEvent_t _event)
{
cudaEventSynchronize(_event);
CUDA_CHECK_LAST_ERROR();
}
//======================================================================================//
template <typename _Tp>
_Tp*
gpu_malloc(uintmax_t size)
{
_Tp* _gpu;
CUDA_CHECK_CALL(cudaMalloc(&_gpu, size * sizeof(_Tp)));
if(_gpu == nullptr)
{
int _device = 0;
cudaGetDevice(&_device);
std::stringstream ss;
ss << "Error allocating memory on GPU " << _device << " of size "
<< (size * sizeof(_Tp)) << " and type " << typeid(_Tp).name()
<< " (type size = " << sizeof(_Tp) << ")";
throw std::runtime_error(ss.str().c_str());
}
return _gpu;
}
//--------------------------------------------------------------------------------------//
template <typename _Tp>
void
cpu2gpu_memcpy(_Tp* _gpu, const _Tp* _cpu, uintmax_t size, cudaStream_t stream)
{
cudaMemcpyAsync(_gpu, _cpu, size * sizeof(_Tp), cudaMemcpyHostToDevice, stream);
CUDA_CHECK_LAST_STREAM_ERROR(stream);
}
//--------------------------------------------------------------------------------------//
template <typename _Tp>
void
gpu2cpu_memcpy(_Tp* _cpu, const _Tp* _gpu, uintmax_t size, cudaStream_t stream)
{
cudaMemcpyAsync(_cpu, _gpu, size * sizeof(_Tp), cudaMemcpyDeviceToHost, stream);
CUDA_CHECK_LAST_STREAM_ERROR(stream);
}
//--------------------------------------------------------------------------------------//
template <typename _Tp>
void
gpu2gpu_memcpy(_Tp* _dst, const _Tp* _src, uintmax_t size, cudaStream_t stream)
{
cudaMemcpyAsync(_dst, _src, size * sizeof(_Tp), cudaMemcpyDeviceToDevice, stream);
CUDA_CHECK_LAST_STREAM_ERROR(stream);
}
//--------------------------------------------------------------------------------------//
template <typename _Tp>
void
gpu_memset(_Tp* _gpu, int value, uintmax_t size, cudaStream_t stream)
{
cudaMemsetAsync(_gpu, value, size * sizeof(_Tp), stream);
CUDA_CHECK_LAST_STREAM_ERROR(stream);
}
//======================================================================================//
template <typename _Tp>
_Tp*
gpu_malloc_and_memcpy(const _Tp* _cpu, uintmax_t size, cudaStream_t stream)
{
_Tp* _gpu = gpu_malloc<_Tp>(size);
cudaMemcpyAsync(_gpu, _cpu, size * sizeof(_Tp), cudaMemcpyHostToDevice, stream);
CUDA_CHECK_LAST_STREAM_ERROR(stream);
return _gpu;
}
//--------------------------------------------------------------------------------------//
template <typename _Tp>
_Tp*
gpu_malloc_and_memset(uintmax_t size, int value, cudaStream_t stream)
{
_Tp* _gpu = gpu_malloc<_Tp>(size);
cudaMemsetAsync(_gpu, value, size * sizeof(_Tp), stream);
CUDA_CHECK_LAST_STREAM_ERROR(stream);
return _gpu;
}
//--------------------------------------------------------------------------------------//
template <typename _Tp>
void
gpu2cpu_memcpy_and_free(_Tp* _cpu, _Tp* _gpu, uintmax_t size, cudaStream_t stream)
{
cudaMemcpyAsync(_cpu, _gpu, size * sizeof(_Tp), cudaMemcpyDeviceToHost, stream);
CUDA_CHECK_LAST_STREAM_ERROR(stream);
cudaFree(_gpu);
CUDA_CHECK_LAST_ERROR();
}
//======================================================================================//
inline cudaStream_t*
create_streams(const int nstreams, unsigned int flag = cudaStreamDefault)
{
cudaStream_t* streams = new cudaStream_t[nstreams];
for(int i = 0; i < nstreams; ++i)
{
cudaStreamCreateWithFlags(&streams[i], flag);
CUDA_CHECK_LAST_STREAM_ERROR(streams[i]);
}
return streams;
}
//======================================================================================//
inline void
destroy_streams(cudaStream_t* streams, const int nstreams)
{
for(int i = 0; i < nstreams; ++i)
{
cudaStreamSynchronize(streams[i]);
CUDA_CHECK_LAST_STREAM_ERROR(streams[i]);
cudaStreamDestroy(streams[i]);
CUDA_CHECK_LAST_ERROR();
}
delete[] streams;
}
//======================================================================================//
// compute the sum_dist for the rotations
uint32_t*
cuda_compute_sum_dist(int dy, int dt, int dx, int nx, int ny, const float* theta);
//======================================================================================//
// warm up
//======================================================================================//
template <typename _Tp>
__global__ void
cuda_warmup_kernel(_Tp* _dst, uintmax_t size, const _Tp factor)
{
auto i0 = blockIdx.x * blockDim.x + threadIdx.x;
auto istride = blockDim.x * gridDim.x;
for(auto i = i0; i < size; i += istride)
*_dst += static_cast<_Tp>(factor);
}
//======================================================================================//
// sum kernels
//======================================================================================//
template <typename _Tp, typename _Up = _Tp>
__global__ void
cuda_sum_kernel(_Tp* dst, const _Up* src, uintmax_t size, const _Tp factor)
{
auto i0 = blockIdx.x * blockDim.x + threadIdx.x;
auto istride = blockDim.x * gridDim.x;
for(auto i = i0; i < size; i += istride)
dst[i] += static_cast<_Tp>(factor * src[i]);
}
//--------------------------------------------------------------------------------------//
template <typename _Tp, typename _Up = _Tp>
__global__ void
cuda_atomic_sum_kernel(_Tp* dst, const _Up* src, uintmax_t size, const _Tp factor)
{
auto i0 = blockIdx.x * blockDim.x + threadIdx.x;
auto istride = blockDim.x * gridDim.x;
for(auto i = i0; i < size; i += istride)
atomicAdd(&dst[i], static_cast<_Tp>(factor * src[i]));
}
//======================================================================================//
// reduction
//======================================================================================//
__global__ void
deviceReduceKernel(const float* in, float* out, int N);
//--------------------------------------------------------------------------------------//
__global__ void
sum_kernel_block(float* sum, const float* input, int n);
//--------------------------------------------------------------------------------------//
DLL float
deviceReduce(const float* in, float* out, int N);
//--------------------------------------------------------------------------------------//
DLL float
reduce(float* _in, float* _out, int size);
//======================================================================================//
// rotate
//======================================================================================//
DLL int32_t*
cuda_rotate(const int32_t* src, const float theta_rad, const float theta_deg,
const int nx, const int ny, cudaStream_t stream, const int eInterp);
//--------------------------------------------------------------------------------------//
DLL float*
cuda_rotate(const float* src, const float theta_rad, const float theta_deg, const int nx,
const int ny, cudaStream_t stream, const int eInterp);
//--------------------------------------------------------------------------------------//
DLL void
cuda_rotate_ip(int32_t* dst, const int32_t* src, const float theta_rad,
const float theta_deg, const int nx, const int ny, cudaStream_t stream,
const int eInterp);
//--------------------------------------------------------------------------------------//
DLL void
cuda_rotate_ip(float* dst, const float* src, const float theta_rad, const float theta_deg,
const int nx, const int ny, cudaStream_t stream, const int eInterp);
//======================================================================================//
# endif // NVCC
#endif // TOMOPY_USE_CUDA