19 #include <cutlass/cutlass.h>
20 #include <cutlass/fragment.h>
26 template <
typename AccumulatorsPerThread_,
27 typename ThreadsPerWarp_,
44 typedef cutlass::Fragment<ScalarA, AccumulatorsPerThread::kW>
FragmentA;
48 typedef cutlass::Fragment<ScalarB, AccumulatorsPerThread::kH>
FragmentB;
52 typedef cutlass::Fragment<ScalarC, AccumulatorsPerThread::kH * AccumulatorsPerThread::kW, 16>
64 for (
int j = 0; j < AccumulatorsPerThread::kH; ++j) {
65 for (
int i = 0; i < AccumulatorsPerThread::kW; ++i) {
66 auto diff = a[i] - b[j];
67 const auto idx = j * AccumulatorsPerThread::kW + i;
68 d[idx] = diff * diff + c[idx];
75 template <
typename AccumulatorsPerThread_,
76 typename ThreadsPerWarp_,
93 typedef cutlass::Fragment<ScalarA, AccumulatorsPerThread::kW>
FragmentA;
97 typedef cutlass::Fragment<ScalarB, AccumulatorsPerThread::kH>
FragmentB;
101 typedef cutlass::Fragment<ScalarC, AccumulatorsPerThread::kH * AccumulatorsPerThread::kW, 16>
113 for (
int j = 0; j < AccumulatorsPerThread::kH; ++j) {
114 for (
int i = 0; i < AccumulatorsPerThread::kW; ++i) {
115 auto diff = a[i] < b[j] ? b[j] - a[i] : a[i] - b[j];
116 const auto idx = j * AccumulatorsPerThread::kW + i;
117 d[idx] = diff + c[idx];
Definition: kernelparams.h:21
Template performing a matrix squared-difference-add operation, d = (a - b)^2 + c, within a thread.
Definition: custom_accum.h:31
CUTLASS_DEVICE void multiply_add(FragmentA const &a, FragmentB const &b, Accumulators const &c, Accumulators &d)
Squared-difference accumulate (used in place of multiply-add): d[j * kW + i] = (a[i] - b[j])^2 + c[j * kW + i].
Definition: custom_accum.h:59
ThreadsPerWarp_ ThreadsPerWarp
The number of threads per warp.
Definition: custom_accum.h:37
CUTLASS_DEVICE ThreadDiffSquaredAdd()
Constructor.
Definition: custom_accum.h:56
cutlass::Shape< 1, 1, 1, 1 > InstructionShape
The shape of the instruction.
Definition: custom_accum.h:33
ScalarC_ ScalarC
The type for C and D.
Definition: custom_accum.h:50
ScalarA_ ScalarA
The type for A.
Definition: custom_accum.h:42
cutlass::ShapeMul< AccumulatorsPerThread, ThreadsPerWarp >::Shape AccumulatorsPerWarp
The number of accumulators per warp.
Definition: custom_accum.h:40
cutlass::Fragment< ScalarB, AccumulatorsPerThread::kH > FragmentB
The fragment for B.
Definition: custom_accum.h:48
cutlass::Fragment< ScalarC, AccumulatorsPerThread::kH *AccumulatorsPerThread::kW, 16 > Accumulators
The accumulators.
Definition: custom_accum.h:53
AccumulatorsPerThread_ AccumulatorsPerThread
The number of accumulators per thread.
Definition: custom_accum.h:35
ScalarB_ ScalarB
The type for B.
Definition: custom_accum.h:46
cutlass::Fragment< ScalarA, AccumulatorsPerThread::kW > FragmentA
The fragment for A.
Definition: custom_accum.h:44
Template performing a matrix L1-norm (absolute-difference) add operation, d = |a - b| + c, within a thread.
Definition: custom_accum.h:80
cutlass::Fragment< ScalarA, AccumulatorsPerThread::kW > FragmentA
The fragment for A.
Definition: custom_accum.h:93
ScalarB_ ScalarB
The type for B.
Definition: custom_accum.h:95
cutlass::ShapeMul< AccumulatorsPerThread, ThreadsPerWarp >::Shape AccumulatorsPerWarp
The number of accumulators per warp.
Definition: custom_accum.h:89
ThreadsPerWarp_ ThreadsPerWarp
The number of threads per warp.
Definition: custom_accum.h:86
cutlass::Fragment< ScalarC, AccumulatorsPerThread::kH *AccumulatorsPerThread::kW, 16 > Accumulators
The accumulators.
Definition: custom_accum.h:102
cutlass::Fragment< ScalarB, AccumulatorsPerThread::kH > FragmentB
The fragment for B.
Definition: custom_accum.h:97
ScalarC_ ScalarC
The type for C and D.
Definition: custom_accum.h:99
AccumulatorsPerThread_ AccumulatorsPerThread
The number of accumulators per thread.
Definition: custom_accum.h:84
ScalarA_ ScalarA
The type for A.
Definition: custom_accum.h:91
CUTLASS_DEVICE void multiply_add(FragmentA const &a, FragmentB const &b, Accumulators const &c, Accumulators &d)
Absolute-difference accumulate (used in place of multiply-add): d[j * kW + i] = |a[i] - b[j]| + c[j * kW + i].
Definition: custom_accum.h:108
CUTLASS_DEVICE ThreadL1NormAdd()
Constructor.
Definition: custom_accum.h:105
cutlass::Shape< 1, 1, 1, 1 > InstructionShape
The shape of the instruction.
Definition: custom_accum.h:82