source/libtomo/accel/gpu/utils.cu from tomopy/tomopy

source/libtomo/accel/gpu/utils.cu
Summary

Maintainability

Test Coverage

Issues
//  Copyright (c) 2015, UChicago Argonne, LLC. All rights reserved.
//  Copyright 2015. UChicago Argonne, LLC. This software was produced
//  under U.S. Government contract DE-AC02-06CH11357 for Argonne National
//  Laboratory (ANL), which is operated by UChicago Argonne, LLC for the
//  U.S. Department of Energy. The U.S. Government has rights to use,
//  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
//  UChicago Argonne, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
//  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is
//  modified to produce derivative works, such modified software should
//  be clearly marked, so as not to confuse it with the version available
//  from ANL.
//  Additionally, redistribution and use in source and binary forms, with
//  or without modification, are permitted provided that the following
//  conditions are met:
//      * Redistributions of source code must retain the above copyright
//        notice, this list of conditions and the following disclaimer.
//      * Redistributions in binary form must reproduce the above copyright
//        notice, this list of conditions and the following disclaimer in
//        the documentation andwith the
//        distribution.
//      * Neither the name of UChicago Argonne, LLC, Argonne National
//        Laboratory, ANL, the U.S. Government, nor the names of its
//        contributors may be used to endorse or promote products derived
//        from this software without specific prior written permission.
//  THIS SOFTWARE IS PROVIDED BY UChicago Argonne, LLC AND CONTRIBUTORS
//  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
//  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
//  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UChicago
//  Argonne, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
//  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
//  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
//  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
//  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
//  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
//  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
//  POSSIBILITY OF SUCH DAMAGE.
//  ---------------------------------------------------------------
//   TOMOPY CUDA implementation

//======================================================================================//

#include "common.hh"
#include "data.hh"
#include "utils.hh"

#include <atomic>

//======================================================================================//

#if defined(TOMOPY_USE_NVTX)
extern nvtxEventAttributes_t nvtx_total;
extern nvtxEventAttributes_t nvtx_iteration;
extern nvtxEventAttributes_t nvtx_slice;
extern nvtxEventAttributes_t nvtx_projection;
extern nvtxEventAttributes_t nvtx_update;
extern nvtxEventAttributes_t nvtx_rotate;
#endif

namespace
{
std::atomic<int> _npp_stream_sync{ 0 };
}

//======================================================================================//

//  gridDim:    This variable contains the dimensions of the grid.
//  blockIdx:   This variable contains the block index within the grid.
//  blockDim:   This variable and contains the dimensions of the block.
//  threadIdx:  This variable contains the thread index within the block.

//======================================================================================//
//
//  compute sum_dist
//
//======================================================================================//

__global__ void
cuda_sum_dist_compute(int dy, int dx, int nx, int ny, const int32_t* ones,
                      uint32_t* sum_dist, int p)
{
    int nx0      = blockIdx.x * blockDim.x + threadIdx.x;
    int nxstride = blockDim.x * gridDim.x;
    int dx0      = blockIdx.y * blockDim.y + threadIdx.y;
    int dxstride = blockDim.y * gridDim.y;
    int dy0      = blockIdx.z * blockDim.z + threadIdx.z;
    int dystride = blockDim.z * gridDim.z;

    for(int s = dy0; s < dy; s += dystride)
    {
        for(int d = dx0; d < dx; d += dxstride)
        {
            uint32_t*      _sum_dist = sum_dist + (s * nx * ny) + (d * nx);
            const int32_t* _ones     = ones + (d * nx);
            for(int n = nx0; n < nx; n += nxstride)
            {
                atomicAdd(&_sum_dist[n], (_ones[n] > 0) ? 1 : 0);
            }
        }
    }
}

//======================================================================================//

uint32_t*
cuda_compute_sum_dist(int dy, int dt, int dx, int nx, int ny, const float* theta)
{
    // due to some really strange issue with streams, we use the default stream here
    // because after this has been executed more than once (i.e. we do SIRT and then
    // MLEM or MLEM and then SIRT), NPP returns error code -1000.
    // it has nothing to do with algorithm strangely... and only occurs here
    // where we rotate integers. This does not affect floats...

    auto block = GetBlockDims(dim3(32, 32, 1));
    auto grid  = ComputeGridDims(dim3(nx, dt, dy), block);

    int32_t*  rot      = gpu_malloc<int32_t>(nx * ny);
    int32_t*  tmp      = gpu_malloc_and_memset<int32_t>(nx * ny, 1, 0);
    uint32_t* sum_dist = gpu_malloc_and_memset<uint32_t>(dy * nx * ny, 0, 0);
    CUDA_CHECK_LAST_ERROR();

    assert(rot != nullptr);
    assert(tmp != nullptr);
    assert(sum_dist != nullptr);

    for(int p = 0; p < dt; ++p)
    {
        float theta_p_rad = fmodf(theta[p] + halfpi, twopi);
        float theta_p_deg = theta_p_rad * degrees;

        gpu_memset<int32_t>(rot, 0, nx * nx, 0);
        CUDA_CHECK_LAST_ERROR();  // debug mode only

        cuda_rotate_ip(rot, tmp, -theta_p_rad, -theta_p_deg, nx, ny, 0, GPU_NN);
        CUDA_CHECK_LAST_ERROR();  // debug mode only

        cuda_sum_dist_compute<<<grid, block, 0, 0>>>(dy, dx, nx, ny, rot, sum_dist, p);
        CUDA_CHECK_LAST_ERROR();  // debug mode only

        stream_sync(0);
    }

    // destroy
    cudaFree(tmp);
    cudaFree(rot);

    return sum_dist;
}

//======================================================================================//
//
//  rotate
//
//======================================================================================//

template <typename _Tp>
void
print_array(const _Tp* data, int nx, int ny, const std::string& desc)
{
    std::stringstream ss;
    ss << desc << "\n\n";
    ss << std::fixed;
    ss.precision(3);
    for(int j = 0; j < ny; ++j)
    {
        ss << "  ";
        for(int i = 0; i < nx; ++i)
        {
            ss << std::setw(8) << data[j * nx + i] << " ";
        }
        ss << std::endl;
    }
    std::cout << ss.str() << std::endl;
}

//======================================================================================//
//
//  rotate
//
//======================================================================================//

void
cuda_rotate_kernel(int32_t* dst, const int32_t* src, const float theta_rad,
                   const float theta_deg, const int nx, const int ny,
                   int eInterp = GPU_NN, cudaStream_t stream = 0)
{
    // use static + comma operator to set the stream the first time
    static bool _first = (nppSetStream(stream), false);
    (void) _first;

    bool _decrement = false;
    // if stream is current stream continue
    while(nppGetStream() != stream ||
          _npp_stream_sync.load(std::memory_order_relaxed) == 0)
    {
        // when this hits zero we update stream
        if(++_npp_stream_sync == 1)
        {
            _decrement = true;
            nppSetStream(stream);
            break;
        }
        else
        {
            --_npp_stream_sync;
        }
    }

    if(nppGetStream() != stream)
        throw std::runtime_error("Error! Wrong stream!");
    NVTX_RANGE_PUSH(&nvtx_rotate);
    CUDA_CHECK_LAST_STREAM_ERROR(stream);

    auto getRotationMatrix2D = [&](double m[2][3], double scale) {
        double alpha    = scale * cos(theta_rad);
        double beta     = scale * sin(theta_rad);
        double center_x = (0.5 * nx) - 0.5;
        double center_y = (0.5 * ny) - 0.5;

        m[0][0] = alpha;
        m[0][1] = beta;
        m[0][2] = (1.0 - alpha) * center_x - beta * center_y;
        m[1][0] = -beta;
        m[1][1] = alpha;
        m[1][2] = beta * center_x + (1.0 - alpha) * center_y;
    };

    NppiSize siz;
    siz.width  = nx;
    siz.height = ny;

    NppiRect roi;
    roi.x      = 0;
    roi.y      = 0;
    roi.width  = nx;
    roi.height = ny;

    int    step = nx * sizeof(int32_t);
    double rot[2][3];
    getRotationMatrix2D(rot, 1.0);

    NppStatus ret =
        nppiWarpAffine_32s_C1R(src, siz, step, roi, dst, step, roi, rot, eInterp);

    CUDA_CHECK_LAST_STREAM_ERROR(stream);

    if(ret != NPP_SUCCESS)
        fprintf(stderr, "[%lu] %s returned non-zero NPP status: %i @ %s:%i. src = %p\n",
                GetMessageID(), __FUNCTION__, ret, __FILE__, __LINE__, (void*) src);

    NVTX_RANGE_POP(stream);
    CUDA_CHECK_LAST_STREAM_ERROR(stream);

    if(_decrement)
        --_npp_stream_sync;
}

//======================================================================================//
//
//  rotate
//
//======================================================================================//

void
cuda_rotate_kernel(float* dst, const float* src, const float theta_rad,
                   const float theta_deg, const int nx, const int ny,
                   int eInterp = GPU_CUBIC, cudaStream_t stream = 0)
{
    // use static + comma operator to set the stream the first time
    static bool _first = (nppSetStream(stream), false);
    (void) _first;

    bool _decrement = false;
    // if stream is current stream continue
    while(nppGetStream() != stream ||
          _npp_stream_sync.load(std::memory_order_relaxed) == 0)
    {
        // when this hits zero we update stream
        if(++_npp_stream_sync == 1)
        {
            _decrement = true;
            nppSetStream(stream);
            break;
        }
        else
        {
            --_npp_stream_sync;
        }
    }

    if(nppGetStream() != stream)
        throw std::runtime_error("Error! Wrong stream!");
    // stream_sync(stream);
    // nppSetStream(stream);
    NVTX_RANGE_PUSH(&nvtx_rotate);
    CUDA_CHECK_LAST_STREAM_ERROR(stream);

    auto getRotationMatrix2D = [&](double m[2][3], double scale) {
        double alpha    = scale * cos(theta_rad);
        double beta     = scale * sin(theta_rad);
        double center_x = (0.5 * nx) - 0.5;
        double center_y = (0.5 * ny) - 0.5;

        m[0][0] = alpha;
        m[0][1] = beta;
        m[0][2] = (1.0 - alpha) * center_x - beta * center_y;
        m[1][0] = -beta;
        m[1][1] = alpha;
        m[1][2] = beta * center_x + (1.0 - alpha) * center_y;
    };

    NppiSize siz;
    siz.width  = nx;
    siz.height = ny;

    NppiRect roi;
    roi.x      = 0;
    roi.y      = 0;
    roi.width  = nx;
    roi.height = ny;

    int    step = nx * sizeof(float);
    double rot[2][3];
    getRotationMatrix2D(rot, 1.0);

#define USE_NPPI_ROTATE
#if defined(USE_NPPI_ROTATE)
    NppStatus ret = nppiRotate_32f_C1R(src, siz, step, roi, dst, step, roi, theta_deg,
                                       rot[0][2], rot[1][2], eInterp);
#else
    NppStatus ret =
        nppiWarpAffine_32f_C1R(src, siz, step, roi, dst, step, roi, rot, eInterp);
#endif

    CUDA_CHECK_LAST_STREAM_ERROR(stream);

    if(ret != NPP_SUCCESS)
        fprintf(stderr, "[%lu] %s returned non-zero NPP status: %i @ %s:%i. src = %p\n",
                GetMessageID(), __FUNCTION__, ret, __FILE__, __LINE__, (void*) src);

    NVTX_RANGE_POP(stream);
    CUDA_CHECK_LAST_STREAM_ERROR(stream);

    if(_decrement)
        --_npp_stream_sync;
}

//======================================================================================//

__global__ void
cuda_rotate_internal_kernel(float* dst, const float* src, float theta, const int nx,
                            const int ny)
{
    // this is flawed and should not be production
    int   src_size = nx * ny;
    float xoff     = (0.5f * nx) - 0.5f;
    float yoff     = (0.5f * ny) - 0.5f;

    int j0      = blockIdx.x * blockDim.x + threadIdx.x;
    int jstride = blockDim.x * gridDim.x;

    for(int j = j0; j < ny; j += jstride)
    {
        for(int i = 0; i < nx; ++i)
        {
            // indices in 2D
            float rx = float(i) - xoff;
            float ry = float(j) - yoff;
            // transformation
            float tx = rx * cosf(theta) + -ry * sinf(theta);
            float ty = rx * sinf(theta) + ry * cosf(theta);
            // indices in 2D
            float x = (tx + xoff);
            float y = (ty + yoff);
            // index in 1D array
            int  rz    = j * nx + i;
            auto index = [&](int _x, int _y) { return _y * nx + _x; };
            // within bounds
            int   x1    = floorf(tx + xoff);
            int   y1    = floorf(ty + yoff);
            int   x2    = x1 + 1;
            int   y2    = y1 + 1;
            float fxy1  = 0.0f;
            float fxy2  = 0.0f;
            int   ixy11 = index(x1, y1);
            int   ixy21 = index(x2, y1);
            int   ixy12 = index(x1, y2);
            int   ixy22 = index(x2, y2);
            if(ixy11 >= 0 && ixy11 < src_size)
                fxy1 += (x2 - x) * src[ixy11];
            if(ixy21 >= 0 && ixy21 < src_size)
                fxy1 += (x - x1) * src[ixy21];
            if(ixy12 >= 0 && ixy12 < src_size)
                fxy2 += (x2 - x) * src[ixy12];
            if(ixy22 >= 0 && ixy22 < src_size)
                fxy2 += (x - x1) * src[ixy22];
            dst[rz] += (y2 - y) * fxy1 + (y - y1) * fxy2;
        }
    }
}

//======================================================================================//

int32_t*
cuda_rotate(const int32_t* src, const float theta_rad, const float theta_deg,
            const int nx, const int ny, cudaStream_t stream, const int eInterp)
{
    int32_t* _dst = gpu_malloc<int32_t>(nx * ny);
    cuda_rotate_kernel(_dst, src, theta_rad, theta_deg, nx, ny, eInterp, stream);
    CUDA_CHECK_LAST_STREAM_ERROR(stream);
    return _dst;
}

//======================================================================================//

void
cuda_rotate_ip(int32_t* dst, const int32_t* src, const float theta_rad,
               const float theta_deg, const int nx, const int ny, cudaStream_t stream,
               const int eInterp)
{
    cuda_rotate_kernel(dst, src, theta_rad, theta_deg, nx, ny, eInterp, stream);
    CUDA_CHECK_LAST_STREAM_ERROR(stream);
}

//======================================================================================//

float*
cuda_rotate(const float* src, const float theta_rad, const float theta_deg, const int nx,
            const int ny, cudaStream_t stream, const int eInterp)
{
    float* _dst = gpu_malloc<float>(nx * ny);
    cuda_rotate_kernel(_dst, src, theta_rad, theta_deg, nx, ny, eInterp, stream);
    CUDA_CHECK_LAST_STREAM_ERROR(stream);
    return _dst;
}

//======================================================================================//

void
cuda_rotate_ip(float* dst, const float* src, const float theta_rad, const float theta_deg,
               const int nx, const int ny, cudaStream_t stream, const int eInterp)
{
    cuda_rotate_kernel(dst, src, theta_rad, theta_deg, nx, ny, eInterp, stream);
    CUDA_CHECK_LAST_STREAM_ERROR(stream);
}

//======================================================================================//