8 #include <cutlass/cutlass.h>
9 #include <cutlass/fragment.h>
15 template <
typename AccumulatorsPerThread_,
16 typename ThreadsPerWarp_,
33 typedef cutlass::Fragment<ScalarA, AccumulatorsPerThread::kW>
FragmentA;
37 typedef cutlass::Fragment<ScalarB, AccumulatorsPerThread::kH>
FragmentB;
41 typedef cutlass::Fragment<ScalarC, AccumulatorsPerThread::kH * AccumulatorsPerThread::kW, 16>
53 for (
int j = 0; j < AccumulatorsPerThread::kH; ++j) {
54 for (
int i = 0; i < AccumulatorsPerThread::kW; ++i) {
55 auto diff = a[i] - b[j];
56 const auto idx = j * AccumulatorsPerThread::kW + i;
57 d[idx] = diff * diff + c[idx];
64 template <
typename AccumulatorsPerThread_,
65 typename ThreadsPerWarp_,
82 typedef cutlass::Fragment<ScalarA, AccumulatorsPerThread::kW>
FragmentA;
86 typedef cutlass::Fragment<ScalarB, AccumulatorsPerThread::kH>
FragmentB;
90 typedef cutlass::Fragment<ScalarC, AccumulatorsPerThread::kH * AccumulatorsPerThread::kW, 16>
102 for (
int j = 0; j < AccumulatorsPerThread::kH; ++j) {
103 for (
int i = 0; i < AccumulatorsPerThread::kW; ++i) {
104 auto diff = a[i] < b[j] ? b[j] - a[i] : a[i] - b[j];
105 const auto idx = j * AccumulatorsPerThread::kW + i;
106 d[idx] = diff + c[idx];
Template performing a matrix diff-squared-add operation within a thread.
Definition: custom_accum.h:20
CUTLASS_DEVICE void multiply_add(FragmentA const &a, FragmentB const &b, Accumulators const &c, Accumulators &d)
Squared-difference accumulate: d[j*kW + i] = (a[i] - b[j])^2 + c[j*kW + i].
Definition: custom_accum.h:48
ThreadsPerWarp_ ThreadsPerWarp
The number of threads per warp.
Definition: custom_accum.h:26
CUTLASS_DEVICE ThreadDiffSquaredAdd()
Ctor.
Definition: custom_accum.h:45
cutlass::Shape< 1, 1, 1, 1 > InstructionShape
The shape of the instruction.
Definition: custom_accum.h:22
ScalarC_ ScalarC
The type for C and D.
Definition: custom_accum.h:39
ScalarA_ ScalarA
The type for A.
Definition: custom_accum.h:31
cutlass::ShapeMul< AccumulatorsPerThread, ThreadsPerWarp >::Shape AccumulatorsPerWarp
The number of accumulators per warp.
Definition: custom_accum.h:29
cutlass::Fragment< ScalarB, AccumulatorsPerThread::kH > FragmentB
The fragment for B.
Definition: custom_accum.h:37
cutlass::Fragment< ScalarC, AccumulatorsPerThread::kH *AccumulatorsPerThread::kW, 16 > Accumulators
The accumulators.
Definition: custom_accum.h:42
AccumulatorsPerThread_ AccumulatorsPerThread
The number of accumulators per thread.
Definition: custom_accum.h:24
ScalarB_ ScalarB
The type for B.
Definition: custom_accum.h:35
cutlass::Fragment< ScalarA, AccumulatorsPerThread::kW > FragmentA
The fragment for A.
Definition: custom_accum.h:33
Template performing a matrix L1-norm-add operation within a thread.
Definition: custom_accum.h:69
cutlass::Fragment< ScalarA, AccumulatorsPerThread::kW > FragmentA
The fragment for A.
Definition: custom_accum.h:82
ScalarB_ ScalarB
The type for B.
Definition: custom_accum.h:84
cutlass::ShapeMul< AccumulatorsPerThread, ThreadsPerWarp >::Shape AccumulatorsPerWarp
The number of accumulators per warp.
Definition: custom_accum.h:78
ThreadsPerWarp_ ThreadsPerWarp
The number of threads per warp.
Definition: custom_accum.h:75
cutlass::Fragment< ScalarC, AccumulatorsPerThread::kH *AccumulatorsPerThread::kW, 16 > Accumulators
The accumulators.
Definition: custom_accum.h:91
cutlass::Fragment< ScalarB, AccumulatorsPerThread::kH > FragmentB
The fragment for B.
Definition: custom_accum.h:86
ScalarC_ ScalarC
The type for C and D.
Definition: custom_accum.h:88
AccumulatorsPerThread_ AccumulatorsPerThread
The number of accumulators per thread.
Definition: custom_accum.h:73
ScalarA_ ScalarA
The type for A.
Definition: custom_accum.h:80
CUTLASS_DEVICE void multiply_add(FragmentA const &a, FragmentB const &b, Accumulators const &c, Accumulators &d)
Absolute-difference accumulate: d[j*kW + i] = |a[i] - b[j]| + c[j*kW + i].
Definition: custom_accum.h:97
CUTLASS_DEVICE ThreadL1NormAdd()
Ctor.
Definition: custom_accum.h:94
cutlass::Shape< 1, 1, 1, 1 > InstructionShape
The shape of the instruction.
Definition: custom_accum.h:71