deeplearning4j/deeplearning4j

View on GitHub
contrib/performance/benchmarking/impl/LightBenchmarkSuit.cpp

Summary

Maintainability
Test Coverage
/* ******************************************************************************
 *
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 *  See the NOTICE file distributed with this work for additional
 *  information regarding copyright ownership.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

//
// @author raver119@gmail.com
//

#include <ops/declarable/CustomOperations.h>
#include "performance/benchmarking/LightBenchmarkSuit.h"

#ifdef RELEASE_BUILD
#define WARMUP 5
#define NUM_ITER 100

#else

#define WARMUP 5
#define NUM_ITER 100

#endif

namespace sd {

    template <typename T>
    static std::string transformBenchmark() {
        std::string output;
        output += "transformBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());

        BenchmarkHelper helper(WARMUP, NUM_ITER);
        IntPowerParameters length("length", 2, 8, 20, 4);      //2^8, 2^12, 2^16, 2^20 - 4MB
        BoolParameters inplace("inplace");

        ParametersBatch batch({&length, &inplace});

        auto generator = PARAMETRIC_XZ() {
            auto arr = NDArrayFactory::create_<T>('c', {p.getIntParam("length")});
            arr->assign(1.0);
            x.push_back(arr);
            if(p.getIntParam("inplace") == 1){
                z.push_back(arr);
            } else {
                z.push_back(NDArrayFactory::create_<T>('c', {p.getIntParam("length")}));
            }
        };

        ScalarBenchmark sbRelu(scalar::Ops::RELU, "RELU");
        sbRelu.setY(NDArrayFactory::create_<T>(0.0));

        TransformBenchmark tbSigmoid(transform::StrictOps::Sigmoid, "sigmoid");
        //TransformBenchmark tbSoftmax(transform::StrictOps::SoftMax, "softmax");

        output += helper.runOperationSuit(&sbRelu, generator, batch, "RELU");
        output += helper.runOperationSuit(&tbSigmoid, generator, batch, "Sigmoid");
        //output += helper.runOperationSuit(&tbSigmoid, generator, batch, "Softmax");

        return output;
    }

    template <typename T>
    static std::string scalarBenchmark() {
        std::string output;
        output += "scalarBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());

        BenchmarkHelper helper(WARMUP, NUM_ITER);

        IntPowerParameters length("length", 2, 8, 20, 4);      //2^8, 2^12, 2^16, 2^20
        BoolParameters inplace("inplace");

        ParametersBatch batch({&length, &inplace});

        auto generator = PARAMETRIC_XZ() {
            auto arr = NDArrayFactory::create_<T>('c', {p.getIntParam("length")});
            arr->assign(1.0);
            x.push_back(arr);
            if(p.getIntParam("inplace") == 1){
                z.push_back(arr);
            } else {
                z.push_back(NDArrayFactory::create_<T>('c', {p.getIntParam("length")}));
            }
        };

        ScalarBenchmark sbAdd(scalar::Ops::Add, "sAdd");
        ScalarBenchmark sbDiv(scalar::Ops::Divide, "sDiv");
        ScalarBenchmark sbPow(scalar::Ops::Pow, "sPow");


        sbAdd.setY(NDArrayFactory::create_<T>(3.14159265359));
        sbDiv.setY(NDArrayFactory::create_<T>(3.14159265359));
        sbPow.setY(NDArrayFactory::create_<T>(3.14159265359));


        output += helper.runOperationSuit(&sbAdd, generator, batch, "Scalar Addition - x.add(3.14159265359)");
        output += helper.runOperationSuit(&sbDiv, generator, batch, "Scalar Division - x.div(3.14159265359)");
        output += helper.runOperationSuit(&sbPow, generator, batch, "Scalar Power - x.pow(3.14159265359)");

        return output;
    }


    template <typename T>
    static std::string pairwiseBenchmark() {
        std::string output;
        output += "pairwiseBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());

        BenchmarkHelper helper(WARMUP, NUM_ITER);
        IntPowerParameters length("length", 2, 8, 20, 4);      //2^4 to 2^20 in steps of 4 - 2^4, 2^8, 2^16, 2^20
        BoolParameters inplace("inplace");

        ParametersBatch batch({&length, &inplace});

        auto generator = PARAMETRIC_XYZ() {
            auto arr1 = NDArrayFactory::create_<T>('c', {p.getIntParam("length")});
            auto arr2 = NDArrayFactory::create_<T>('c', {p.getIntParam("length")});
            x.push_back(arr1);
            y.push_back(arr2);
            if(p.getIntParam("inplace") == 1){
                z.push_back(arr1);
            } else {
                z.push_back(NDArrayFactory::create_<T>('c', {p.getIntParam("length")}));
            }
        };

        PairwiseBenchmark pb1(pairwise::Ops::Add, "Add");
        output += helper.runOperationSuit(&pb1, generator, batch, "Pairwise Add");

        PairwiseBenchmark pb2(pairwise::Ops::Divide, "Divide");
        output += helper.runOperationSuit(&pb2, generator, batch, "Pairwise Divide");

        return output;
    }

    static std::string mismatchedOrderAssign() {
        std::string output;
        BenchmarkHelper helper(WARMUP, NUM_ITER);

        IntPowerParameters rows("rows", 2, 8, 20, 4);      //2^8, 2^12, 2^16, 2^20
        BoolParameters cf("cf");

        ParametersBatch batch({&rows, &cf});

        auto generator = PARAMETRIC_XZ() {
            int numElements = 4194304;    //2^24
            int rows = p.getIntParam("rows");
            int cols = numElements / rows;
            bool c = p.getIntParam("cf");

            auto arr = NDArrayFactory::create_<float>(c ? 'c' : 'f', {rows, cols});
            auto arr2 = NDArrayFactory::create_<float>(c ? 'f' : 'c', {rows, cols});
            x.push_back(arr);
            z.push_back(arr2);
        };

        TransformBenchmark tb(transform::AnyOps::Assign, "assign");
        output += helper.runOperationSuit(&tb, generator, batch, "C->F and F->C Assign F32");

        //Also test: NCHW to NHWC and back
        BoolParameters nchw("nchw");
        int mb = 8;
        int hw = 64;
        int c = 3;
        ParametersBatch batch2({&nchw});
        auto generator2 = PARAMETRIC_XZ() {
            bool nchw = p.getIntParam("nchw");

            if(nchw) {
                auto orig = NDArrayFactory::create_<float>('c', {mb, c, hw, hw});
                orig->permutei({0,2,3,1});
                x.push_back(orig);
                z.push_back(NDArrayFactory::create_<float>('c', {mb, hw, hw, c}));
            } else {
                auto orig = NDArrayFactory::create_<float>('c', {mb, hw, hw, c});
                orig->permutei({0,3,1,2});
                x.push_back(orig);
                z.push_back(NDArrayFactory::create_<float>('c', {mb, c, hw, hw}));
            }
        };

        TransformBenchmark tb2(transform::AnyOps::Assign, "assign_nchw");
        output += helper.runOperationSuit(&tb2, generator2, batch2, "nchw->nhwc and nhwc->nchw Assign FP32");
        return output;
    }

    template <typename T>
    static std::string gemmBenchmark() {
        std::string output;
        output += "gemm " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());
        BenchmarkHelper helper(WARMUP, NUM_ITER);

        for (int o = 0; o <= 1; o++) {
            char resultOrder = (o == 0 ? 'f' : 'c');
            IntPowerParameters sz("sz", 2, 4, 10, 2);          //2^4=16, ..., 2^10=1024   ->  4 elements

            ParametersBatch b({&sz});

            auto generator = PARAMETRIC_XYZ() {
                auto a = p.getIntParam("sz");
                auto b = p.getIntParam("sz");
                auto c = p.getIntParam("sz");
                std::vector<sd::LongType> shapeA;
                std::vector<sd::LongType> shapeB;
                shapeA = {a, b};
                shapeB = {b, c};
                auto A = NDArrayFactory::create_<T>('c', shapeA);
                auto B = NDArrayFactory::create_<T>('c', shapeB);
                auto C = NDArrayFactory::create_<T>(resultOrder, {a, c});

                x.push_back(A);
                y.push_back(B);
                z.push_back(C);
            };

            std::string n;
            n += "Gemm - cOrder=";
            n += resultOrder;

            MatrixBenchmark mb(1.0, 0.0, false, false, n);

            output += helper.runOperationSuit(&mb, generator, b, n.c_str());
        }

        return output;
    }

    template <typename T>
    static std::string reduceFullBenchmark() {
        std::string output;
        output += "reduceFullBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());

        BenchmarkHelper helper(WARMUP, NUM_ITER);

        IntPowerParameters length("length", 2, 8, 20, 4);      //2^8, 2^12, 2^16, 2^20

        ParametersBatch batch({&length});

        auto generator = PARAMETRIC_XYZ() {
            auto arr = NDArrayFactory::create_<T>('c', {p.getIntParam("length")});

            x.push_back(arr);
            y.push_back(nullptr);
            z.push_back(NDArrayFactory::create_<T>(0.0f));
        };

        ReductionBenchmark rbSum(reduce::SameOps::Sum, "sum");
        ReductionBenchmark rbProd(reduce::SameOps::Prod, "prod");
        ReductionBenchmark rbMax(reduce::SameOps::Max, "max");

        output += helper.runOperationSuit(&rbSum, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, "Sum - Full Array Reduction");
        output += helper.runOperationSuit(&rbProd, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, "Product - Full Array Reduction");
        output += helper.runOperationSuit(&rbMax, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, "Maximum - Full Array Reduction");

        //Index reduction
        sd::ops::argmax opArgmax;
        DeclarableBenchmark dbArgmax(opArgmax, "Argmax");
        auto generator3 = PARAMETRIC_D(){
            auto ctx = new Context(1);

            ctx->setInputArray(0, NDArrayFactory::create_<T>('c', {p.getIntParam("length")}), true);
            ctx->setInputArray(1, NDArrayFactory::create_<sd::LongType>((sd::LongType)0), true);
            ctx->setOutputArray(0, NDArrayFactory::create_<sd::LongType>(0), true);

            return ctx;
        };
        output += helper.runOperationSuit(&dbArgmax, generator3, batch, "Argmax Full Array Reduction");
        return output;
    }

    template <typename T>
    static std::string reduceDimBenchmark(){
        std::string output;
        output += "reduceDimBenchmark " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());

        BenchmarkHelper helper(WARMUP, NUM_ITER);

        int length[] = {1024*1024};
        int pow[] = {10};

        for( int i=0; i<1; i++ ){
            IntPowerParameters rows("rows", 2, 0, pow[i], 2);
            BoolParameters dim("dim");


            ParametersBatch batch({&rows, &dim});

            auto generator = PARAMETRIC_XYZ() {
                int rows = p.getIntParam("rows");
                int cols = length[i] / rows;
                int dim = p.getIntParam("dim");
                auto arr = NDArrayFactory::create_<T>('c', {rows, cols});


                x.push_back(arr);
                y.push_back(NDArrayFactory::create_<sd::LongType>(dim));

                NDArray* result;
                if(dim == 0){
                    result = NDArrayFactory::create_<T>('c', {cols});
                } else {
                    result = NDArrayFactory::create_<T>('c', {rows});
                }
                z.push_back(result);
            };

            ReductionBenchmark rbSum(reduce::SameOps::Sum, "sum");
            ReductionBenchmark rbMax(reduce::SameOps::Max, "max");

            std::string s1("Sum Along Dimension - ");
            s1 += std::to_string(length[i]);
            std::string s3("Maximum Along Dimension - ");
            s3 += std::to_string(length[i]);

            output += helper.runOperationSuit(&rbSum, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, s1.c_str());
            output += helper.runOperationSuit(&rbMax, (const std::function<void (Parameters &, ResultSet &, ResultSet &, ResultSet &)>)(generator), batch, s3.c_str());



            auto generator3 = PARAMETRIC_D(){
                auto ctx = new Context(1);
                int rows = p.getIntParam("rows");
                int cols = length[i] / rows;
                int dim = p.getIntParam("dim");
                auto arr = NDArrayFactory::create_<T>('c', {rows, cols});

                auto dimArg = new sd::LongType[1];
                dimArg[0] = dim;
                ctx->setIArguments(dimArg, 1);
                delete[] dimArg;

                ctx->setInputArray(0, arr, true);

                NDArray* result;
                if(dim == 0){
                    result = NDArrayFactory::create_<sd::LongType>('c', {cols});
                } else {
                    result = NDArrayFactory::create_<sd::LongType>('c', {rows});
                }
                ctx->setOutputArray(0, result, true);
                return ctx;
            };

            std::string s5("Argmax Along Dimension - ");
            s5 += std::to_string(length[i]);

            sd::ops::argmax opArgmax;
            DeclarableBenchmark dbArgmax(opArgmax, "Argmax");
            output += helper.runOperationSuit(&dbArgmax, generator3, batch, s5.c_str());
        }
        return output;
    }

    template <typename T>
    static std::string conv2d(){
        std::string output;
        output += "conv2d " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());
        BenchmarkHelper helper(WARMUP, NUM_ITER);

        //Convolution2D op
        BoolParameters nhwc("nhwc");
        PredefinedParameters k("k", {2, 3});

        ParametersBatch batch({&nhwc, &k});
        sd::ops::conv2d conv2d;
        DeclarableBenchmark benchmark(conv2d, "conv2d");

        int hw = 64;

        auto generator = PARAMETRIC_D() {
            auto ctx = new Context(1);
            int n = p.getIntParam("nhwc");
            int khw = p.getIntParam("k");

            if (n == 0) {
                auto input = NDArrayFactory::create_<T>('c', {8, 3, hw, hw});
                auto output = NDArrayFactory::create_<T>('c', {8, 3, hw, hw});
                ctx->setInputArray(0, input, true);
                ctx->setOutputArray(0, output, true);
            } else {
                auto input = NDArrayFactory::create_<T>('c', {8, hw, hw, 3});
                auto output = NDArrayFactory::create_<T>('c', {8, hw, hw, 3});
                ctx->setInputArray(0, input, true);
                ctx->setOutputArray(0, output, true);
            }

            auto b = NDArrayFactory::create_<T>('c', {3});
            auto w = NDArrayFactory::create_<T>('c', {khw, khw, 3, 3});   // [kH, kW, iC, oC] always

            ctx->setInputArray(1, w, true);
            ctx->setInputArray(2, b, true);

            auto args = new sd::LongType[10];
            args[0] = args[1] = khw; //Kernel
            args[2] = args[3] = 1;//Stride
            args[4] = args[5] = 0;  //Pad
            args[6] = args[7] = 1;  //Dilation
            args[8] = 1;     //SAME
            args[9] = n;//0-nchw, 1=nhwc
            ctx->setIArguments(args, 10);
            delete[] args;

            return ctx;
        };

        output += helper.runOperationSuit(&benchmark, generator, batch, "Conv2d");
        return output;
    }

    template <typename T>
    static std::string pool2d() {
        std::string output;
        output += "pool2d " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());
        BenchmarkHelper helper(WARMUP, NUM_ITER);

        //Convolution2D op
        BoolParameters nhwc("nhwc");
        PredefinedParameters k("k", {2, 3});

        ParametersBatch batch({&nhwc, &k});

        int c = 3;
        int hw = 64;

        auto generator = PARAMETRIC_D() {
            auto ctx = new Context(1);
            int n = p.getIntParam("nhwc");
            int khw = p.getIntParam("k");

            if (n == 0) {
                auto input = NDArrayFactory::create_<T>('c', {8, c, hw, hw});
                auto output = NDArrayFactory::create_<T>('c', {8, c, hw, hw});
                ctx->setInputArray(0, input, true);
                ctx->setOutputArray(0, output, true);
            } else {
                auto input = NDArrayFactory::create_<T>('c', {8, hw, hw, c});
                auto output = NDArrayFactory::create_<T>('c', {8, hw, hw, c});
                ctx->setInputArray(0, input, true);
                ctx->setOutputArray(0, output, true);
            }

            auto args = new sd::LongType[11];
            args[0] = args[1] = khw; //Kernel
            args[2] = args[3] = 1;//Stride
            args[4] = args[5] = 0;  //Pad
            args[6] = args[7] = 1;  //Dilation
            args[8] = 1;     //SAME
            args[9] = 0;     //Divisor mode - 0 = exclude padding in divisor
            args[10] = n;//0-nchw, 1=nhwc
            ctx->setIArguments(args, 11);
            delete[] args;

            return ctx;
        };

        sd::ops::avgpool2d avgpool2d;
        DeclarableBenchmark benchmark1(avgpool2d, "avgpool");
        output += helper.runOperationSuit(&benchmark1, generator, batch, "Average Pool 2d");

        sd::ops::maxpool2d maxpool2d;
        DeclarableBenchmark benchmark2(maxpool2d, "maxpool");
        output += helper.runOperationSuit(&benchmark2, generator, batch, "Max Pool 2d");
        return output;
    }

    template <typename T>
    static std::string lstmBenchmark() {
        std::string output;
        output += "lstm " + DataTypeUtils::asString(DataTypeUtils::fromT<T>());
        BenchmarkHelper helper(WARMUP, NUM_ITER);

        BoolParameters format("format");    //0=TNS=[seqLen,mb,size]; 1=NST=[mb,size,seqLen]
        PredefinedParameters mb("mb", {1, 8});
        int n = 128;

        ParametersBatch batch({&format, &mb});
        sd::ops::lstmBlock lstmBlock;
        DeclarableBenchmark benchmark(lstmBlock, "lstm");

        int seqLength = 8;

        auto generator = PARAMETRIC_D() {
            auto ctx = new Context(1);
            int f = p.getIntParam("format");
            int m = p.getIntParam("mb");

            sd::LongType l = 0;
            ctx->setInputArray(0, NDArrayFactory::create_<sd::LongType>(l), true);  //Max TS length (unused)


            if (f == 0) {
                //TNS format
                ctx->setInputArray(1, NDArrayFactory::create_<T>('c', {seqLength, m, n}), true);     //x
                ctx->setOutputArray(0, NDArrayFactory::create_<T>('c', {seqLength, m, n}), true);    //i
                ctx->setOutputArray(1, NDArrayFactory::create_<T>('c', {seqLength, m, n}), true);    //c
                ctx->setOutputArray(2, NDArrayFactory::create_<T>('c', {seqLength, m, n}), true);    //f
                ctx->setOutputArray(3, NDArrayFactory::create_<T>('c', {seqLength, m, n}), true);    //o
                ctx->setOutputArray(4, NDArrayFactory::create_<T>('c', {seqLength, m, n}), true);    //z
                ctx->setOutputArray(5, NDArrayFactory::create_<T>('c', {seqLength, m, n}), true);    //h
                ctx->setOutputArray(6, NDArrayFactory::create_<T>('c', {seqLength, m, n}), true);    //y
            } else {
                //NST format
                ctx->setInputArray(1, NDArrayFactory::create_<T>('f', {m, n, seqLength}), true);     //x
                ctx->setOutputArray(0, NDArrayFactory::create_<T>('f', {m, n, seqLength}), true);    //i
                ctx->setOutputArray(1, NDArrayFactory::create_<T>('f', {m, n, seqLength}), true);    //c
                ctx->setOutputArray(2, NDArrayFactory::create_<T>('f', {m, n, seqLength}), true);    //f
                ctx->setOutputArray(3, NDArrayFactory::create_<T>('f', {m, n, seqLength}), true);    //o
                ctx->setOutputArray(4, NDArrayFactory::create_<T>('f', {m, n, seqLength}), true);    //z
                ctx->setOutputArray(5, NDArrayFactory::create_<T>('f', {m, n, seqLength}), true);    //h
                ctx->setOutputArray(6, NDArrayFactory::create_<T>('f', {m, n, seqLength}), true);    //y
            }

            auto cLast = NDArrayFactory::create_<T>('c', {m, n});
            auto yLast = NDArrayFactory::create_<T>('c', {m, n});
            auto W = NDArrayFactory::create_<T>('c', {2 * n, 4 * n});
            auto Wci = NDArrayFactory::create_<T>('c', {n});
            auto Wcf = NDArrayFactory::create_<T>('c', {n});
            auto Wco = NDArrayFactory::create_<T>('c', {n});
            auto b = NDArrayFactory::create_<T>('c', {4 * n});

            ctx->setInputArray(2, cLast, true);
            ctx->setInputArray(3, yLast, true);
            ctx->setInputArray(4, W, true);
            ctx->setInputArray(5, Wci, true);
            ctx->setInputArray(6, Wcf, true);
            ctx->setInputArray(7, Wco, true);
            ctx->setInputArray(8, b, true);

            auto iargs = new sd::LongType[2];
            iargs[0] = 0;   //No peephole
            iargs[1] = f;
            ctx->setIArguments(iargs, 2);
            delete[] iargs;

            auto targs = new double[2];
            targs[0] = 1.0; //forget bias
            targs[1] = 0.0; //cell clipping value
            ctx->setTArguments(targs, 2);
            delete[] targs;
            return ctx;
        };

        output += helper.runOperationSuit(&benchmark, generator, batch, "LSTMBlock");
        return output;
    }

    static std::string broadcast2d() {
        std::string output;
        BenchmarkHelper helper(WARMUP, NUM_ITER);

        int rows = 65536;
        IntPowerParameters cols("cols", 2, 2, 12, 4);      //2^2 to 2^12 in steps of 2 - 2^1=2, ..., 2^10=1024
        BoolParameters axis("axis");
        BoolParameters inplace("inplace");

        ParametersBatch batch({&cols, &axis, &inplace});

        auto generator = PARAMETRIC_D() {
            auto a = p.getIntParam("axis");
            auto arr = NDArrayFactory::create_<float>('c', {rows, p.getIntParam("cols")});

            auto ctx = new Context(1);
            ctx->setInputArray(0, arr, true);
            if(a == 0){
                ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {rows, 1}), true);
            } else {
                ctx->setInputArray(1, NDArrayFactory::create_<float>('c', {1, p.getIntParam("cols")}), true);
            }
            if (p.getIntParam("inplace") == 1) {
                ctx->setOutputArray(0, arr);
                ctx->markInplace(true);
            } else {
                ctx->setOutputArray(0, NDArrayFactory::create_<float>('c', {rows, p.getIntParam("cols")}), true);
            }
            return ctx;
        };

        std::string s("add");
        sd::ops::add op;
        DeclarableBenchmark benchmark(op, "add");
        output += helper.runOperationSuit(&benchmark, generator, batch, "Broadcast (Custom) Add - 2d");
        return output;
    }

    std::string LightBenchmarkSuit::runSuit() {
#ifdef RELEASE_BUILD
        std::vector<sd::DataType> dtypes({sd::DataType::FLOAT32, sd::DataType::HALF});
#else
        std::vector<sd::DataType> dtypes({sd::DataType::FLOAT32});
#endif

        std::string result;

        for (auto t:dtypes) {
            nd4j_printf("Running LightBenchmarkSuite.transformBenchmark [%s]\n", DataTypeUtils::asString(t).c_str());
            BUILD_SINGLE_SELECTOR(t, result += transformBenchmark, (), LIBND4J_TYPES);

            nd4j_printf("Running LightBenchmarkSuite.scalarBenchmark [%s]\n", DataTypeUtils::asString(t).c_str());
            BUILD_SINGLE_SELECTOR(t, result += scalarBenchmark, (), LIBND4J_TYPES);

            nd4j_printf("Running LightBenchmarkSuite.pairwiseBenchmark [%s]\n", DataTypeUtils::asString(t).c_str());
            BUILD_SINGLE_SELECTOR(t, result += pairwiseBenchmark, (), LIBND4J_TYPES);

            nd4j_printf("Running LightBenchmarkSuite.reduceFullBenchmark [%s]\n", DataTypeUtils::asString(t).c_str());
            BUILD_SINGLE_SELECTOR(t, result += reduceFullBenchmark, (), LIBND4J_TYPES);

            nd4j_printf("Running LightBenchmarkSuite.reduceDimBenchmark [%s]\n", DataTypeUtils::asString(t).c_str());
            BUILD_SINGLE_SELECTOR(t, result += reduceDimBenchmark, (), LIBND4J_TYPES);

            nd4j_printf("Running LightBenchmarkSuite.gemmBenchmark [%s]\n", DataTypeUtils::asString(t).c_str());
            BUILD_SINGLE_SELECTOR(t, result += gemmBenchmark, (), LIBND4J_TYPES);

            nd4j_printf("Running LightBenchmarkSuite.conv2d [%s]\n", DataTypeUtils::asString(t).c_str());
            BUILD_SINGLE_SELECTOR(t, result += conv2d, (), LIBND4J_TYPES);

            nd4j_printf("Running LightBenchmarkSuite.pool2d [%s]\n", DataTypeUtils::asString(t).c_str());
            BUILD_SINGLE_SELECTOR(t, result += pool2d, (), LIBND4J_TYPES);

            nd4j_printf("Running LightBenchmarkSuite.lstmBenchmark [%s]\n", DataTypeUtils::asString(t).c_str());
            BUILD_SINGLE_SELECTOR(t, result += lstmBenchmark, (), LIBND4J_TYPES);

        }

        nd4j_printf("Running LightBenchmarkSuite.broadcast2d\n", "");
        result += broadcast2d();
        nd4j_printf("Running LightBenchmarkSuite.mismatchedOrderAssign\n", "");
        result += mismatchedOrderAssign();

        return result;
    }
}