cuML C++ API: src_prims/linalg/custom_accum.h Source File

 /*

  * SPDX-FileCopyrightText: Copyright (c) 2018-2021, NVIDIA CORPORATION.

  * SPDX-License-Identifier: Apache-2.0

  */


 #pragma once


 #include <cutlass/cutlass.h>

 #include <cutlass/fragment.h>


 namespace MLCommon {

 namespace LinAlg {


 /**
  * @brief Template performing matrix diff-squared-add operation within a thread.
  *
  * For every pair (j, i) with j in [0, kH) and i in [0, kW) of
  * AccumulatorsPerThread, the functor evaluates
  * d[j * kW + i] = (a[i] - b[j])^2 + c[j * kW + i].
  *
  * @tparam AccumulatorsPerThread_ shape type; its kH and kW members bound the loops
  * @tparam ThreadsPerWarp_        shape type describing the threads per warp
  * @tparam ScalarA_               element type of the A fragment
  * @tparam ScalarB_               element type of the B fragment
  * @tparam ScalarC_               element type of the C and D accumulators
  */
 template <typename AccumulatorsPerThread_,
           typename ThreadsPerWarp_,
           typename ScalarA_,
           typename ScalarB_,
           typename ScalarC_>
 struct ThreadDiffSquaredAdd {
   /// The shape of the instruction.
   using InstructionShape = cutlass::Shape<1, 1, 1, 1>;
   /// The number of accumulators per thread.
   using AccumulatorsPerThread = AccumulatorsPerThread_;
   /// The number of threads per warp.
   using ThreadsPerWarp = ThreadsPerWarp_;
   /// The number of accumulators per warp.
   using AccumulatorsPerWarp =
     typename cutlass::ShapeMul<AccumulatorsPerThread, ThreadsPerWarp>::Shape;
   /// The type for A.
   using ScalarA = ScalarA_;
   /// The fragment for A (kW elements).
   using FragmentA = cutlass::Fragment<ScalarA, AccumulatorsPerThread::kW>;
   /// The type for B.
   using ScalarB = ScalarB_;
   /// The fragment for B (kH elements).
   using FragmentB = cutlass::Fragment<ScalarB, AccumulatorsPerThread::kH>;
   /// The type for C and D.
   using ScalarC = ScalarC_;
   /// The accumulators (kH * kW elements).
   using Accumulators =
     cutlass::Fragment<ScalarC, AccumulatorsPerThread::kH * AccumulatorsPerThread::kW, 16>;

   /// Ctor.
   CUTLASS_DEVICE ThreadDiffSquaredAdd() {}

   /**
    * @brief Accumulate squared differences: d = (a-b)^2 + c.
    *
    * @param[in]  a fragment indexed by the inner (kW) loop
    * @param[in]  b fragment indexed by the outer (kH) loop
    * @param[in]  c input accumulators, read at index j * kW + i
    * @param[out] d output accumulators, written at index j * kW + i
    */
   CUTLASS_DEVICE void multiply_add(FragmentA const& a,
                                    FragmentB const& b,
                                    Accumulators const& c,
                                    Accumulators& d)
   {
     for (int row = 0; row < AccumulatorsPerThread::kH; ++row) {
       // First accumulator slot belonging to this row of the kH x kW tile.
       const auto row_offset = row * AccumulatorsPerThread::kW;
       for (int col = 0; col < AccumulatorsPerThread::kW; ++col) {
         const auto slot = row_offset + col;
         auto delta      = a[col] - b[row];
         d[slot]         = delta * delta + c[slot];
       }
     }
   }
 };


 /**
  * @brief Template performing matrix L1-norm operation within a thread.
  *
  * For every pair (j, i) with j in [0, kH) and i in [0, kW) of
  * AccumulatorsPerThread, the functor evaluates
  * d[j * kW + i] = |a[i] - b[j]| + c[j * kW + i].
  *
  * @tparam AccumulatorsPerThread_ shape type; its kH and kW members bound the loops
  * @tparam ThreadsPerWarp_        shape type describing the threads per warp
  * @tparam ScalarA_               element type of the A fragment
  * @tparam ScalarB_               element type of the B fragment
  * @tparam ScalarC_               element type of the C and D accumulators
  */
 template <typename AccumulatorsPerThread_,
           typename ThreadsPerWarp_,
           typename ScalarA_,
           typename ScalarB_,
           typename ScalarC_>
 struct ThreadL1NormAdd {
   /// The shape of the instruction.
   using InstructionShape = cutlass::Shape<1, 1, 1, 1>;
   /// The number of accumulators per thread.
   using AccumulatorsPerThread = AccumulatorsPerThread_;
   /// The number of threads per warp.
   using ThreadsPerWarp = ThreadsPerWarp_;
   /// The number of accumulators per warp.
   using AccumulatorsPerWarp =
     typename cutlass::ShapeMul<AccumulatorsPerThread, ThreadsPerWarp>::Shape;
   /// The type for A.
   using ScalarA = ScalarA_;
   /// The fragment for A (kW elements).
   using FragmentA = cutlass::Fragment<ScalarA, AccumulatorsPerThread::kW>;
   /// The type for B.
   using ScalarB = ScalarB_;
   /// The fragment for B (kH elements).
   using FragmentB = cutlass::Fragment<ScalarB, AccumulatorsPerThread::kH>;
   /// The type for C and D.
   using ScalarC = ScalarC_;
   /// The accumulators (kH * kW elements).
   using Accumulators =
     cutlass::Fragment<ScalarC, AccumulatorsPerThread::kH * AccumulatorsPerThread::kW, 16>;

   /// Ctor.
   CUTLASS_DEVICE ThreadL1NormAdd() {}

   /**
    * @brief Accumulate absolute differences: d = |a-b| + c.
    *
    * @param[in]  a fragment indexed by the inner (kW) loop
    * @param[in]  b fragment indexed by the outer (kH) loop
    * @param[in]  c input accumulators, read at index j * kW + i
    * @param[out] d output accumulators, written at index j * kW + i
    */
   CUTLASS_DEVICE void multiply_add(FragmentA const& a,
                                    FragmentB const& b,
                                    Accumulators const& c,
                                    Accumulators& d)
   {
     for (int row = 0; row < AccumulatorsPerThread::kH; ++row) {
       // First accumulator slot belonging to this row of the kH x kW tile.
       const auto row_offset = row * AccumulatorsPerThread::kW;
       for (int col = 0; col < AccumulatorsPerThread::kW; ++col) {
         const auto slot = row_offset + col;
         // Subtract the smaller operand from the larger one so the result is
         // the absolute difference without calling an abs function.
         auto magnitude = (a[col] < b[row]) ? (b[row] - a[col]) : (a[col] - b[row]);
         d[slot]        = magnitude + c[slot];
       }
     }
   }
 };


 };  // end namespace LinAlg

 };  // end namespace MLCommon

MLCommon
Definition: Timer.h:9

MLCommon::LinAlg::ThreadDiffSquaredAdd
Template performing matrix diff-squared-add operation within a thread.
Definition: custom_accum.h:20

MLCommon::LinAlg::ThreadDiffSquaredAdd::multiply_add
CUTLASS_DEVICE void multiply_add(FragmentA const &a, FragmentB const &b, Accumulators const &c, Accumulators &d)
Squared-difference accumulate: d = (a - b)^2 + c.
Definition: custom_accum.h:48

MLCommon::LinAlg::ThreadDiffSquaredAdd::ThreadsPerWarp
ThreadsPerWarp_ ThreadsPerWarp
The number of threads per warp.
Definition: custom_accum.h:26

MLCommon::LinAlg::ThreadDiffSquaredAdd::ThreadDiffSquaredAdd
CUTLASS_DEVICE ThreadDiffSquaredAdd()
Ctor.
Definition: custom_accum.h:45

MLCommon::LinAlg::ThreadDiffSquaredAdd::InstructionShape
cutlass::Shape< 1, 1, 1, 1 > InstructionShape
The shape of the instruction.
Definition: custom_accum.h:22

MLCommon::LinAlg::ThreadDiffSquaredAdd::ScalarC
ScalarC_ ScalarC
The type for C and D.
Definition: custom_accum.h:39

MLCommon::LinAlg::ThreadDiffSquaredAdd::ScalarA
ScalarA_ ScalarA
The type for A.
Definition: custom_accum.h:31

MLCommon::LinAlg::ThreadDiffSquaredAdd::AccumulatorsPerWarp
cutlass::ShapeMul< AccumulatorsPerThread, ThreadsPerWarp >::Shape AccumulatorsPerWarp
The number of accumulators per warp.
Definition: custom_accum.h:29

MLCommon::LinAlg::ThreadDiffSquaredAdd::FragmentB
cutlass::Fragment< ScalarB, AccumulatorsPerThread::kH > FragmentB
The fragment for B.
Definition: custom_accum.h:37

MLCommon::LinAlg::ThreadDiffSquaredAdd::Accumulators
cutlass::Fragment< ScalarC, AccumulatorsPerThread::kH *AccumulatorsPerThread::kW, 16 > Accumulators
The accumulators.
Definition: custom_accum.h:42

MLCommon::LinAlg::ThreadDiffSquaredAdd::AccumulatorsPerThread
AccumulatorsPerThread_ AccumulatorsPerThread
The number of accumulators per thread.
Definition: custom_accum.h:24

MLCommon::LinAlg::ThreadDiffSquaredAdd::ScalarB
ScalarB_ ScalarB
The type for B.
Definition: custom_accum.h:35

MLCommon::LinAlg::ThreadDiffSquaredAdd::FragmentA
cutlass::Fragment< ScalarA, AccumulatorsPerThread::kW > FragmentA
The fragment for A.
Definition: custom_accum.h:33

MLCommon::LinAlg::ThreadL1NormAdd
Template performing matrix L1-norm operation within a thread.
Definition: custom_accum.h:69

MLCommon::LinAlg::ThreadL1NormAdd::FragmentA
cutlass::Fragment< ScalarA, AccumulatorsPerThread::kW > FragmentA
The fragment for A.
Definition: custom_accum.h:82

MLCommon::LinAlg::ThreadL1NormAdd::ScalarB
ScalarB_ ScalarB
The type for B.
Definition: custom_accum.h:84

MLCommon::LinAlg::ThreadL1NormAdd::AccumulatorsPerWarp
cutlass::ShapeMul< AccumulatorsPerThread, ThreadsPerWarp >::Shape AccumulatorsPerWarp
The number of accumulators per warp.
Definition: custom_accum.h:78

MLCommon::LinAlg::ThreadL1NormAdd::ThreadsPerWarp
ThreadsPerWarp_ ThreadsPerWarp
The number of threads per warp.
Definition: custom_accum.h:75

MLCommon::LinAlg::ThreadL1NormAdd::Accumulators
cutlass::Fragment< ScalarC, AccumulatorsPerThread::kH *AccumulatorsPerThread::kW, 16 > Accumulators
The accumulators.
Definition: custom_accum.h:91

MLCommon::LinAlg::ThreadL1NormAdd::FragmentB
cutlass::Fragment< ScalarB, AccumulatorsPerThread::kH > FragmentB
The fragment for B.
Definition: custom_accum.h:86

MLCommon::LinAlg::ThreadL1NormAdd::ScalarC
ScalarC_ ScalarC
The type for C and D.
Definition: custom_accum.h:88

MLCommon::LinAlg::ThreadL1NormAdd::AccumulatorsPerThread
AccumulatorsPerThread_ AccumulatorsPerThread
The number of accumulators per thread.
Definition: custom_accum.h:73

MLCommon::LinAlg::ThreadL1NormAdd::ScalarA
ScalarA_ ScalarA
The type for A.
Definition: custom_accum.h:80

MLCommon::LinAlg::ThreadL1NormAdd::multiply_add
CUTLASS_DEVICE void multiply_add(FragmentA const &a, FragmentB const &b, Accumulators const &c, Accumulators &d)
Absolute-difference accumulate: d = |a - b| + c.
Definition: custom_accum.h:97

MLCommon::LinAlg::ThreadL1NormAdd::ThreadL1NormAdd
CUTLASS_DEVICE ThreadL1NormAdd()
Ctor.
Definition: custom_accum.h:94

MLCommon::LinAlg::ThreadL1NormAdd::InstructionShape
cutlass::Shape< 1, 1, 1, 1 > InstructionShape
The shape of the instruction.
Definition: custom_accum.h:71