source/libtomo/accel/gpu/mlem.cu from tomopy/tomopy

source/libtomo/accel/gpu/mlem.cu
Summary

Maintainability

Test Coverage

Issues
//  Copyright (c) 2015, UChicago Argonne, LLC. All rights reserved.
//  Copyright 2015. UChicago Argonne, LLC. This software was produced
//  under U.S. Government contract DE-AC02-06CH11357 for Argonne National
//  Laboratory (ANL), which is operated by UChicago Argonne, LLC for the
//  U.S. Department of Energy. The U.S. Government has rights to use,
//  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
//  UChicago Argonne, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
//  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is
//  modified to produce derivative works, such modified software should
//  be clearly marked, so as not to confuse it with the version available
//  from ANL.
//  Additionally, redistribution and use in source and binary forms, with
//  or without modification, are permitted provided that the following
//  conditions are met:
//      * Redistributions of source code must retain the above copyright
//        notice, this list of conditions and the following disclaimer.
//      * Redistributions in binary form must reproduce the above copyright
//        notice, this list of conditions and the following disclaimer in
//        the documentation andwith the
//        distribution.
//      * Neither the name of UChicago Argonne, LLC, Argonne National
//        Laboratory, ANL, the U.S. Government, nor the names of its
//        contributors may be used to endorse or promote products derived
//        from this software without specific prior written permission.
//  THIS SOFTWARE IS PROVIDED BY UChicago Argonne, LLC AND CONTRIBUTORS
//  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
//  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
//  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UChicago
//  Argonne, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
//  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
//  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
//  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
//  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
//  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
//  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
//  POSSIBILITY OF SUCH DAMAGE.
//  ---------------------------------------------------------------
//   TOMOPY CUDA implementation

#include "common.hh"
#include "constants.hh"
#include "data.hh"
#include "utils.hh"

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdlib>
#include <memory>
#include <numeric>

//======================================================================================//

#if defined(TOMOPY_USE_NVTX)
extern nvtxEventAttributes_t nvtx_total;
extern nvtxEventAttributes_t nvtx_iteration;
extern nvtxEventAttributes_t nvtx_slice;
extern nvtxEventAttributes_t nvtx_projection;
extern nvtxEventAttributes_t nvtx_update;
extern nvtxEventAttributes_t nvtx_rotate;
#endif

//======================================================================================//

typedef GpuData::init_data_t  init_data_t;
typedef GpuData::data_array_t data_array_t;

//======================================================================================//

__global__ void
cuda_mlem_pixels_kernel(int p, int nx, int dx, float* recon, const float* data)
{
    int d0      = blockIdx.x * blockDim.x + threadIdx.x;
    int dstride = blockDim.x * gridDim.x;

    for(int d = d0; d < dx; d += dstride)
    {
        float sum = 0.0f;
        for(int i = 0; i < nx; ++i)
            sum += recon[d * nx + i];
        if(sum != 0.0f)
        {
            float upd = data[p * dx + d] / sum;
            if(upd == upd)
                for(int i = 0; i < nx; ++i)
                    recon[d * nx + i] += upd;
        }
    }
}

//======================================================================================//

__global__ void
cuda_mlem_update_kernel(float* recon, const float* update, const uint32_t* sum_dist,
                        int dx, int size)
{
    int i0      = blockIdx.x * blockDim.x + threadIdx.x;
    int istride = blockDim.x * gridDim.x;

    for(int i = i0; i < size; i += istride)
    {
        if(sum_dist[i] != 0 && dx != 0 && update[i] == update[i])
            recon[i] *= update[i] / scast<float>(sum_dist[i]) / scast<float>(dx);
    }
}

//======================================================================================//

void
mlem_gpu_compute_projection(data_array_t& gpu_data, int p, int dy, int dt, int dx, int nx,
                            int ny, const float* theta)
{
    auto cache = gpu_data[GetThisThreadID() % gpu_data.size()];

    // ensure running on proper device
    cuda_set_device(cache->device());

    // calculate some values
    float        theta_p_rad = fmodf(theta[p] + halfpi, twopi);
    float        theta_p_deg = theta_p_rad * degrees;
    int          block       = cache->block();
    int          grid        = cache->compute_grid(dx);
    cudaStream_t stream      = cache->stream();

    // synchronize the stream (do this frequently to avoid backlog)
    stream_sync(stream);
    CUDA_FAST_CHECK_LAST_ERROR();

    // reset destination arrays (NECESSARY! or will cause NaNs)
    // only do once bc for same theta, same pixels get overwritten
    cache->reset();

    for(int s = 0; s < dy; ++s)
    {
        const float* recon  = cache->recon() + s * nx * ny;
        const float* data   = cache->data() + s * dt * dx;
        float*       update = cache->update() + s * nx * ny;
        float*       rot    = cache->rot() + s * nx * ny;
        float*       tmp    = cache->tmp() + s * nx * ny;

        // forward-rotate
        cuda_rotate_ip(rot, recon, -theta_p_rad, -theta_p_deg, nx, ny, stream,
                       cache->interpolation());
        CUDA_CHECK_LAST_STREAM_ERROR(stream);

        // compute simdata
        cuda_mlem_pixels_kernel<<<grid, block, 0, stream>>>(p, nx, dx, rot, data);
        CUDA_CHECK_LAST_STREAM_ERROR(stream);

        // back-rotate
        cuda_rotate_ip(tmp, rot, theta_p_rad, theta_p_deg, nx, ny, stream,
                       cache->interpolation());
        CUDA_CHECK_LAST_STREAM_ERROR(stream);

        // update shared update array
        cuda_atomic_sum_kernel<<<grid, block, 0, stream>>>(update, tmp, nx * ny, 1.0f);
        CUDA_CHECK_LAST_STREAM_ERROR(stream);

        // synchronize the stream (do this frequently to avoid backlog)
        // stream_sync(stream);
    }
}

//======================================================================================//

void
mlem_cuda(const float* cpu_data, int dy, int dt, int dx, const float*, const float* theta,
          float* cpu_recon, int ngridx, int ngridy, int num_iter, RuntimeOptions* opts)
{
    printf("[%lu]> %s : nitr = %i, dy = %i, dt = %i, dx = %i, nx = %i, ny = %i\n",
           GetThisThreadID(), __FUNCTION__, num_iter, dy, dt, dx, ngridx, ngridy);

    // thread counter for device assignment
    static std::atomic<int> ntid;

    // compute some properties (expected python threads, max threads, device assignment)
    int pythread_num = ntid++;
    int device       = pythread_num % cuda_device_count();  // assign to device
    device           = PTL::GetEnv("TOMOPY_DEVICE_NUM", device) % cuda_device_count();

    TIMEMORY_AUTO_TIMER("");

    // GPU allocated copies
    cuda_set_device(device);
    printf("[%lu] Running on device %i...\n", GetThisThreadID(), device);

    uintmax_t    recon_pixels = scast<uintmax_t>(dy * ngridx * ngridy);
    auto         block        = GetBlockSize();
    auto         grid         = ComputeGridSize(recon_pixels, block);
    auto         main_stream  = create_streams(1);
    float*       update    = gpu_malloc_and_memset<float>(recon_pixels, 0, *main_stream);
    init_data_t  init_data = GpuData::initialize(opts, device, dy, dt, dx, ngridx, ngridy,
                                                cpu_recon, cpu_data, update);
    data_array_t gpu_data  = std::get<0>(init_data);
    float*       recon     = std::get<1>(init_data);
    float*       data      = std::get<2>(init_data);
    uint32_t*    sum_dist  = cuda_compute_sum_dist(dy, dt, dx, ngridx, ngridy, theta);

    NVTX_RANGE_PUSH(&nvtx_total);

    for(int i = 0; i < num_iter; i++)
    {
        // timing and profiling
        TIMEMORY_AUTO_TIMER("");
        NVTX_RANGE_PUSH(&nvtx_iteration);
        START_TIMER(t_start);

        // sync the main stream
        stream_sync(*main_stream);

        // reset global update and sum_dist
        gpu_memset(update, 0, recon_pixels, *main_stream);

        // sync
        GpuData::sync(gpu_data);

        // execute the loop over slices and projection angles
        execute<data_array_t>(opts, dt, std::ref(gpu_data), mlem_gpu_compute_projection,
                              dy, dt, dx, ngridx, ngridy, theta);

        // sync the thread streams
        GpuData::sync(gpu_data);

        // sync the main stream
        stream_sync(*main_stream);

        // update the global recon with global update and sum_dist
        cuda_mlem_update_kernel<<<grid, block, 0, *main_stream>>>(recon, update, sum_dist,
                                                                  dx, recon_pixels);

        // stop profile range and report timing
        NVTX_RANGE_POP(0);
        REPORT_TIMER(t_start, "iteration", i, num_iter);
    }

    // copy to cpu
    gpu2cpu_memcpy<float>(cpu_recon, recon, recon_pixels, *main_stream);

    // sync and destroy main stream
    destroy_streams(main_stream, 1);

    // cleanup
    cudaFree(recon);
    cudaFree(data);
    cudaFree(update);
    cudaFree(sum_dist);

    NVTX_RANGE_POP(0);

    // sync the device
    cudaDeviceSynchronize();
}

//======================================================================================//