floating_conversion.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/utilities/export.hpp>
21 
22 #include <cuda/std/cmath>
23 #include <cuda/std/limits>
24 #include <cuda/std/type_traits>
25 
26 #include <cstring>
27 
28 namespace CUDF_EXPORT numeric {
29 
37 namespace detail {
38 
46 template <typename T,
47  CUDF_ENABLE_IF(std::is_same_v<T, uint32_t> || std::is_same_v<T, uint64_t> ||
48  std::is_same_v<T, __uint128_t>)>
50 {
51 #ifdef __CUDA_ARCH__
52  if constexpr (std::is_same_v<T, uint64_t>) {
53  return 64 - __clzll(static_cast<int64_t>(value));
54  } else if constexpr (std::is_same_v<T, uint32_t>) {
55  return 32 - __clz(static_cast<int32_t>(value));
56  } else if constexpr (std::is_same_v<T, __uint128_t>) {
57  // 128 bit type, must break up into high and low components
58  auto const high_bits = static_cast<int64_t>(value >> 64);
59  auto const low_bits = static_cast<int64_t>(value);
60  return 128 - (__clzll(high_bits) + static_cast<int>(high_bits == 0) * __clzll(low_bits));
61  }
62 #else
63  // Undefined behavior to call __builtin_clzll() with zero in gcc and clang
64  if (value == 0) { return 0; }
65 
66  if constexpr (std::is_same_v<T, uint64_t>) {
67  return 64 - __builtin_clzll(value);
68  } else if constexpr (std::is_same_v<T, uint32_t>) {
69  return 32 - __builtin_clz(value);
70  } else if constexpr (std::is_same_v<T, __uint128_t>) {
71  // 128 bit type, must break up into high and low components
72  auto const high_bits = static_cast<uint64_t>(value >> 64);
73  if (high_bits == 0) {
74  return 64 - __builtin_clzll(static_cast<uint64_t>(value));
75  } else {
76  return 128 - __builtin_clzll(high_bits);
77  }
78  }
79 #endif
80 }
81 
87 template <typename FloatingType, CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
89  // This struct assumes we're working with IEEE 754 floating-point values.
90  // Details on the IEEE-754 floating-point format:
91  // Format: https://learn.microsoft.com/en-us/cpp/build/ieee-floating-point-representation
92  // Float Visualizer: https://www.h-schmidt.net/FloatConverter/IEEE754.html
93  static_assert(cuda::std::numeric_limits<FloatingType>::is_iec559, "Assumes IEEE 754");
94 
96  using IntegralType =
97  cuda::std::conditional_t<cuda::std::is_same_v<FloatingType, float>, uint32_t, uint64_t>;
98 
99  // The high bit is the sign bit (0 for positive, 1 for negative).
101  static constexpr int num_floating_bits = sizeof(FloatingType) * CHAR_BIT;
103  static constexpr int sign_bit_index = num_floating_bits - 1;
105  static constexpr IntegralType sign_mask = (IntegralType(1) << sign_bit_index);
106 
107  // The low 23 / 52 bits (for float / double) are the mantissa.
108  // The mantissa is normalized. There is an understood 1 bit to the left of the binary point.
109  // The value of the mantissa is in the range [1, 2).
111  static constexpr int num_significand_bits = cuda::std::numeric_limits<FloatingType>::digits;
113  static constexpr int num_stored_mantissa_bits = num_significand_bits - 1;
115  static constexpr IntegralType understood_bit_mask = (IntegralType(1) << num_stored_mantissa_bits);
117  static constexpr IntegralType mantissa_mask = understood_bit_mask - 1;
118 
119  // And in between are the bits used to store the biased power-of-2 exponent.
121  static constexpr int num_exponent_bits = num_floating_bits - num_stored_mantissa_bits - 1;
123  static constexpr IntegralType unshifted_exponent_mask =
124  (IntegralType(1) << num_exponent_bits) - 1;
126  static constexpr IntegralType exponent_mask = unshifted_exponent_mask << num_stored_mantissa_bits;
127 
128  // To store positive and negative exponents as unsigned values, the stored value for
129  // the power-of-2 is exponent + bias. The bias is 127 for floats and 1023 for doubles.
131  static constexpr int exponent_bias = cuda::std::numeric_limits<FloatingType>::max_exponent - 1;
132 
139  CUDF_HOST_DEVICE inline static IntegralType bit_cast_to_integer(FloatingType floating)
140  {
141  // Convert floating to integer
142  IntegralType integer_rep;
143  memcpy(&integer_rep, &floating, sizeof(floating));
144  return integer_rep;
145  }
146 
153  CUDF_HOST_DEVICE inline static FloatingType bit_cast_to_floating(IntegralType integer)
154  {
155  // Convert back to float
156  FloatingType floating;
157  memcpy(&floating, &integer, sizeof(floating));
158  return floating;
159  }
160 
167  CUDF_HOST_DEVICE inline static bool is_zero(IntegralType integer_rep)
168  {
169  // It's a zero if every non-sign bit is zero
170  return ((integer_rep & ~sign_mask) == 0);
171  }
172 
179  CUDF_HOST_DEVICE inline static bool get_is_negative(IntegralType integer_rep)
180  {
181  // Extract the sign bit:
182  return static_cast<bool>(sign_mask & integer_rep);
183  }
184 
194  CUDF_HOST_DEVICE inline static std::pair<IntegralType, int> get_significand_and_pow2(
195  IntegralType integer_rep)
196  {
197  // Extract the significand
198  auto significand = (integer_rep & mantissa_mask);
199 
200  // Extract the exponent bits.
201  auto const exponent_bits = integer_rep & exponent_mask;
202 
203  // Notes on special values of exponent_bits:
204  // bits = exponent_mask is +/-inf or NaN, but those are handled prior to input.
205  // bits = 0 is either a denormal (handled below) or a zero (handled earlier by caller).
206  int floating_pow2;
207  if (exponent_bits == 0) {
208  // Denormal values are 2^(1 - exponent_bias) * Sum_i(B_i * 2^-i)
209  // Where i is the i-th mantissa bit (counting from the LEFT, starting at 1),
210  // and B_i is the value of that bit (0 or 1)
211  // So e.g. for the minimum denormal, only the lowest bit is set:
212  // FLT_TRUE_MIN = 2^(1 - 127) * 2^-23 = 2^-149
213  // DBL_TRUE_MIN = 2^(1 - 1023) * 2^-52 = 2^-1074
214  floating_pow2 = 1 - exponent_bias;
215 
216  // Line-up denormal to same (understood) bit as normal numbers
217  // This is so bit-shifting starts at the same bit index
218  auto const lineup_shift = num_significand_bits - count_significant_bits(significand);
219  significand <<= lineup_shift;
220  floating_pow2 -= lineup_shift;
221  } else {
222  // Extract the exponent value: shift the bits down and subtract the bias.
223  auto const shifted_exponent_bits = exponent_bits >> num_stored_mantissa_bits;
224  floating_pow2 = static_cast<int>(shifted_exponent_bits) - exponent_bias;
225 
226  // Set the high bit for the understood 1/2
227  significand |= understood_bit_mask;
228  }
229 
230  // To convert the mantissa to an integer, we effectively applied #-mantissa-bits
231  // powers of 2 to convert the fractional value to an integer, so subtract them off here
232  int const pow2 = floating_pow2 - num_stored_mantissa_bits;
233 
234  return {significand, pow2};
235  }
236 
244  CUDF_HOST_DEVICE inline static FloatingType set_is_negative(FloatingType floating,
245  bool is_negative)
246  {
247  // Convert floating to integer
248  IntegralType integer_rep = bit_cast_to_integer(floating);
249 
250  // Set the sign bit. Note that the input floating-point number must be positive (bit = 0).
251  integer_rep |= (IntegralType(is_negative) << sign_bit_index);
252 
253  // Convert back to float
254  return bit_cast_to_floating(integer_rep);
255  }
256 
266  CUDF_HOST_DEVICE inline static FloatingType add_pow2(FloatingType floating, int pow2)
267  {
268  // Note that the input floating-point number is positive (& whole), so we don't have to
269  // worry about the sign here; the sign will be set later in set_is_negative()
270 
271  // Convert floating to integer
272  auto integer_rep = bit_cast_to_integer(floating);
273 
274  // Extract the currently stored (biased) exponent
275  using SignedType = std::make_signed_t<IntegralType>;
276  auto exponent_bits = integer_rep & exponent_mask;
277  auto stored_pow2 = static_cast<SignedType>(exponent_bits >> num_stored_mantissa_bits);
278 
279  // Add the additional power-of-2
280  stored_pow2 += pow2;
281 
282  // Check for exponent over/under-flow.
283  if (stored_pow2 <= 0) {
284  // Denormal (zero handled prior to input)
285 
286  // Early out if bit shift will zero it anyway.
287  // Note: We must handle this explicitly, as too-large a bit-shift is UB
288  auto const bit_shift = -stored_pow2 + 1; //+1 due to understood bit set below
289  if (bit_shift > num_stored_mantissa_bits) { return 0.0; }
290 
291  // Clear the exponent bits (zero means 2^-126/2^-1022 w/ no understood bit)
292  integer_rep &= (~exponent_mask);
293 
294  // The input floating-point number has an "understood" bit that we need to set
295  // prior to bit-shifting. Set the understood bit.
296  integer_rep |= understood_bit_mask;
297 
298  // Convert to denormal: bit shift off the low bits
299  integer_rep >>= bit_shift;
300  } else if (stored_pow2 >= static_cast<SignedType>(unshifted_exponent_mask)) {
301  // Overflow: Set infinity
302  return cuda::std::numeric_limits<FloatingType>::infinity();
303  } else {
304  // Normal number: Clear existing exponent bits and set new ones
305  exponent_bits = static_cast<IntegralType>(stored_pow2) << num_stored_mantissa_bits;
306  integer_rep &= (~exponent_mask);
307  integer_rep |= exponent_bits;
308  }
309 
310  // Convert back to float
311  return bit_cast_to_floating(integer_rep);
312  }
313 };
314 
/**
 * @brief Compile-time computation of a power of 10 of at least 10^19, as a 128-bit integer
 *
 * @tparam Pow10 The power of 10 to compute; must be >= 19
 * @return 10^Pow10 as a __uint128_t
 */
template <int Pow10>
constexpr __uint128_t large_power_of_10()
{
  // The recursion bottoms out at 10^19 to keep compile times down;
  // smaller powers of 10 can simply be written as literals.
  static_assert(Pow10 >= 19);
  if constexpr (Pow10 > 19) {
    // Peel off one factor of 10 and recurse on the next-smaller power
    return __uint128_t(10) * large_power_of_10<Pow10 - 1>();
  } else {
    return __uint128_t(10000000000000000000ULL);
  }
}
334 
343 template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
344 CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int pow10)
345 {
346  // Computing division this way is much faster than the alternatives.
347  // Division is not implemented in GPU hardware, and the compiler will often implement it as a
348  // multiplication of the reciprocal of the denominator, requiring a conversion to floating point.
349  // Ths is especially slow for larger divides that have to use the FP64 pipeline, where threads
350  // bottleneck.
351 
352  // Instead, if the compiler can see exactly what number it is dividing by, it can
353  // produce much more optimal assembly, doing bit shifting, multiplies by a constant, etc.
354  // For the compiler to see the value though, array lookup (with pow10 as the index)
355  // is not sufficient: We have to use a switch statement. Although this introduces a branch,
356  // it is still much faster than doing the divide any other way.
357  // Perhaps an array can be used in C++23 with the assume attribute?
358 
359  // Since we're optimizing division this way, we have to do this for multiplication as well.
360  // That's because doing them in different ways (switch, array, runtime-computation, etc.)
361  // increases the register pressure on all kernels that use fixed_point types, specifically slowing
362  // down some of the PYMOD and join benchmarks.
363 
364  // This is split up into separate functions for 32-, 64-, and 128-bit denominators.
365  // That way we limit the templated, inlined code generation to the exponents that are
366  // capable of being represented. Combining them together into a single function again
367  // introduces too much pressure on the kernels that use this code, slowing down their benchmarks.
368  // It also dramatically slows down the compile time.
369 
370  switch (pow10) {
371  case 0: return value;
372  case 1: return value / 10U;
373  case 2: return value / 100U;
374  case 3: return value / 1000U;
375  case 4: return value / 10000U;
376  case 5: return value / 100000U;
377  case 6: return value / 1000000U;
378  case 7: return value / 10000000U;
379  case 8: return value / 100000000U;
380  case 9: return value / 1000000000U;
381  default: return 0;
382  }
383 }
384 
393 template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
394 CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int pow10)
395 {
396  return value / ipow<uint64_t, Radix::BASE_10>(pow10);
397 }
398 
407 template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
408 CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int pow10)
409 {
410  return value / ipow<__uint128_t, Radix::BASE_10>(pow10);
411 }
412 
421 template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
422 CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int pow10)
423 {
424  // See comments in divide_power10_32bit() for discussion.
425  switch (pow10) {
426  case 0: return value;
427  case 1: return value * 10U;
428  case 2: return value * 100U;
429  case 3: return value * 1000U;
430  case 4: return value * 10000U;
431  case 5: return value * 100000U;
432  case 6: return value * 1000000U;
433  case 7: return value * 10000000U;
434  case 8: return value * 100000000U;
435  case 9: return value * 1000000000U;
436  default: return 0;
437  }
438 }
439 
448 template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
449 CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int pow10)
450 {
451  return value * ipow<uint64_t, Radix::BASE_10>(pow10);
452 }
453 
462 template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
463 CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int pow10)
464 {
465  return value * ipow<__uint128_t, Radix::BASE_10>(pow10);
466 }
467 
480 template <typename Rep, typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
481 CUDF_HOST_DEVICE inline constexpr T multiply_power10(T value, int pow10)
482 {
483  // Use this function if you have no knowledge of what pow10 might be
484  // If you do, prefer calling the bit-size-specific versions
485  if constexpr (sizeof(Rep) <= 4) {
486  return multiply_power10_32bit(value, pow10);
487  } else if constexpr (sizeof(Rep) <= 8) {
488  return multiply_power10_64bit(value, pow10);
489  } else {
490  return multiply_power10_128bit(value, pow10);
491  }
492 }
493 
506 template <typename Rep, typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
507 CUDF_HOST_DEVICE inline constexpr T divide_power10(T value, int pow10)
508 {
509  // Use this function if you have no knowledge of what pow10 might be
510  // If you do, prefer calling the bit-size-specific versions
511  if constexpr (sizeof(Rep) <= 4) {
512  return divide_power10_32bit(value, pow10);
513  } else if constexpr (sizeof(Rep) <= 8) {
514  return divide_power10_64bit(value, pow10);
515  } else {
516  return divide_power10_128bit(value, pow10);
517  }
518 }
519 
528 template <typename IntegerType, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<IntegerType>)>
529 CUDF_HOST_DEVICE inline IntegerType guarded_left_shift(IntegerType value, int bit_shift)
530 {
531  // Bit shifts larger than this are undefined behavior
532  constexpr int max_safe_bit_shift = cuda::std::numeric_limits<IntegerType>::digits - 1;
533  return (bit_shift <= max_safe_bit_shift) ? value << bit_shift
534  : cuda::std::numeric_limits<IntegerType>::max();
535 }
536 
545 template <typename IntegerType, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<IntegerType>)>
546 CUDF_HOST_DEVICE inline IntegerType guarded_right_shift(IntegerType value, int bit_shift)
547 {
548  // Bit shifts larger than this are undefined behavior
549  constexpr int max_safe_bit_shift = cuda::std::numeric_limits<IntegerType>::digits - 1;
550  return (bit_shift <= max_safe_bit_shift) ? value >> bit_shift : 0;
551 }
552 
556 template <typename FloatingType>
559  static constexpr bool is_double = cuda::std::is_same_v<FloatingType, double>;
560 
562  using IntegerRep = std::conditional_t<is_double, uint64_t, uint32_t>;
563 
565  static constexpr auto num_significand_bits = cuda::std::numeric_limits<FloatingType>::digits;
566 
568  using ShiftingRep = std::conditional_t<is_double, __uint128_t, uint64_t>;
569 
570  // The significand of a float / double is 24 / 53 bits
571  // However, to uniquely represent each double / float as different #'s in decimal
572  // you need 17 / 9 digits (from std::numeric_limits<T>::max_digits10)
573  // To represent 10^17 / 10^9, you need 57 / 30 bits
574  // So we need to keep track of at least this # of bits during shifting to ensure no info is lost
575 
576  // We will be alternately shifting our data back and forth by powers of 2 and 10 to convert
577  // between floating and decimal (see shifting functions for details).
578 
579  // To iteratively shift back and forth, our 2's (bit-) and 10's (divide-/multiply-) shifts must
580  // be of nearly the same magnitude, or else we'll over-/under-flow our shifting integer
581 
582  // 2^10 is approximately 10^3, so the largest shifts will have a 10/3 ratio
583  // The difference between 2^10 and 10^3 is 1024/1000: 2.4%
584  // So every time we shift by 10 bits and 3 decimal places, the 2s shift is an extra 2.4%
585 
586  // This 2.4% error compounds each time we do an iteration.
587  // The min (normal) float is 2^-126.
588  // Min denormal: 2^-126 * 2^-23 (mantissa bits): 2^-149 = ~1.4E-45
589  // With our 10/3 shifting ratio, 149 (bit-shifts) * (3 / 10) = 44.7 (10s-shifts)
590  // 10^(-44.7) = 2E-45, which is off by ~1.4x from 1.4E-45
591 
592  // Similarly, the min (normal) double is 2^-1022.
593  // Min denormal: 2^-1022 * 2^-52 (mantissa bits): 2^-1074 = 4.94E-324
594  // With our 10/3 shifting ratio, 1074 (bit-shifts) * (3 / 10) = 322.2 (10s-shifts)
595  // 10^(-322.2) = 6.4E-323, which is off by ~13.2x from 4.94E-324
596 
597  // To account for this compounding error, we can either complicate our loop code (slow),
598  // or use extra bits (in the direction we're shifting the 2s!) to compensate:
599  // 4 extra bits for doubles (2^4 = 16 > 13.2x error), 1 extra for floats (2 > 1.4x error)
601  static constexpr int num_2s_shift_buffer_bits = is_double ? 4 : 1;
602 
603  // How much room do we have for shifting?
604  // Float: 64-bit ShiftingRep - 31 (rep + buffer) = 33 bits. 2^33 = 8.6E9
605  // Double: 128-bit ShiftingRep - 61 (rep + buffer) = 67 bits. 2^67 = 1.5E20
606  // Thus for double / float we can shift up to 20 / 9 decimal places at once
607 
608  // But, we need to stick to our 10-bits / 3-decimals shift ratio to not over/under-flow.
609  // To simplify our loop code, we'll keep to this ratio by instead shifting a max of
610  // 18 / 9 decimal places, for double / float (60 / 30 bits)
612  static constexpr int max_digits_shift = is_double ? 18 : 9;
614  static constexpr int max_bits_shift = max_digits_shift * 10 / 3;
615 
616  // Pre-calculate 10^max_digits_shift. Note that 10^18 / 10^9 fits within IntegerRep
618  static constexpr auto max_digits_shift_pow =
619  multiply_power10<IntegerRep>(IntegerRep(1), max_digits_shift);
620 };
621 
634 template <typename FloatingType, CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
635 CUDF_HOST_DEVICE cuda::std::pair<typename floating_converter<FloatingType>::IntegralType, int>
636 add_half_if_truncates(FloatingType floating,
638  int pow2,
639  int pow10)
640 {
641  // The user-supplied scale may truncate information, so we need to talk about rounding.
642  // We have chosen not to round, so we want 1.23456f with scale -4 to be decimal 12345
643 
644  // But if we don't round at all, 1.2 (double) with scale -1 is 11 instead of 12!
645  // Why? Because 1.2 (double) is actually stored as 1.1999999... which we truncate to 1.1
646  // While correct (given our choice to truncate), this is surprising and undesirable.
647  // This problem happens because 1.2 is not perfectly representable in floating point,
648  // and the value 1.199999... happened to be closer to 1.2 than the next value (1.2000...1...)
649 
650  // If the scale truncates information (we didn't choose to keep exactly 1.1999...), how
651  // do we make sure we store 1.2? We'll add half an ulp! (unit in the last place)
652  // Then 1.1999... becomes 1.2000...1... which truncates to 1.2.
653  // And if it had been 1.2000...1..., adding half an ulp still truncates to 1.2
654 
655  // Why 1/2 an ulp? Because that's all that is needed. The reason we have this problem in the
656  // first place is because the compiler rounded (e.g.) 1.2 to the nearest floating point number.
657  // The distance of this rounding is at most 1/2 ulp, otherwise we'd have rounded the other way.
658 
659  // How do we add 1/2 an ulp? Just shift the bits left (updating pow2) and add 1.
660  // We'll always shift up so every input to the conversion algorithm is aligned the same way.
661 
662  // If we add a full ulp we run into issues where we add too much and get the wrong result.
663  // This is because (e.g.) 2^23 = 8.4E6 which is not quite 7 digits of precision.
664  // So if we want 7 digits, that may "barely" truncate information; adding a 1 ulp is overkill.
665 
666  // So when does the user-supplied scale truncate info?
667  // For powers > 0: When the 10s (scale) shift is larger than the corresponding bit-shift.
668  // For powers < 0: When the 10s shift is less than the corresponding bit-shift.
669 
670  // Corresponding bit-shift:
671  // 2^10 is approximately 10^3, but this is off by 1.024%
672  // 1.024^30 is 2.03704, so this is high by one bit for every 30*3 = 90 powers of 10
673  // So 10^N = 2^(10*N/3 - N/90) = 2^(299*N/90)
674  // Do comparison without dividing, which loses information:
675  // Note: if shift is "equal," still truncates if pow2 < 0 (shifting UP by 2s, 2^10 > 10^3)
676  int const pow2_term = 90 * pow2;
677  int const pow10_term = 299 * pow10;
678  bool const conversion_truncates =
679  (pow10_term > pow2_term) || ((pow2_term == pow10_term) && (pow2 < 0));
680 
681  // However, don't add a half-bit if the input is a whole number!
682  // This is only for errors introduced by rounding decimal fractions!
683  bool const is_whole_number = (cuda::std::floor(floating) == floating);
684  bool const add_half_bit = conversion_truncates && !is_whole_number;
685 
686  // Add half a bit on truncation (shift to make room and update pow2)
687  integer_rep <<= 1;
688  --pow2;
689  integer_rep += static_cast<decltype(integer_rep)>(add_half_bit);
690 
691  return {integer_rep, pow2};
692 }
693 
704 template <typename Rep,
705  typename FloatingType,
706  CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
707 CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t<Rep> shift_to_decimal_pospow(
708  typename shifting_constants<FloatingType>::IntegerRep const base2_value, int pow2, int pow10)
709 {
710  // To convert to decimal, we need to apply the input powers of 2 and 10
711  // The result will be (integer) base2_value * (2^pow2) / (10^pow10)
712  // Output type is ShiftingRep
713 
714  // Here pow10 > 0 and pow2 > 0, so we need to shift left by 2s and divide by 10s.
715  // We'll iterate back and forth between them, shifting up by 2s
716  // and down by 10s until all of the powers have been applied.
717 
718  // However the input base2_value type has virtually no spare room to shift our data
719  // without over- or under-flowing and losing precision.
720  // So we'll cast up to ShiftingRep: uint64 for float's, __uint128_t for double's
721  using Constants = shifting_constants<FloatingType>;
722  using ShiftingRep = typename Constants::ShiftingRep;
723  auto shifting_rep = static_cast<ShiftingRep>(base2_value);
724 
725  // We want to start with our significand bits at the top of the shifting range,
726  // so that we don't lose information we need on intermediary right-shifts.
727  // Note that since we're shifting 2s up, we need num_2s_shift_buffer_bits space on the high side,
728  // For all numbers this bit shift is a fixed distance, due to the understood 2^0 bit.
729  // Note that shift_from is +1 due to shift in add_half_if_truncates()
730  static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::num_2s_shift_buffer_bits;
731  static constexpr int shift_from = Constants::num_significand_bits + 1;
732  static constexpr int max_init_shift = shift_up_to - shift_from;
733 
734  // If our total bit shift is less than this, we don't need to iterate
735  using UnsignedRep = cuda::std::make_unsigned_t<Rep>;
736  if (pow2 <= max_init_shift) {
737  // Shift bits left, divide by 10s to apply the scale factor, and we're done.
738  shifting_rep = divide_power10<ShiftingRep>(shifting_rep << pow2, pow10);
739  // NOTE: Cast can overflow!
740  return static_cast<UnsignedRep>(shifting_rep);
741  }
742 
743  // We need to iterate. Do the combined initial shift
744  shifting_rep <<= max_init_shift;
745  pow2 -= max_init_shift;
746 
747  // Iterate, dividing by 10s and shifting up by 2s until we're almost done
748  while (pow10 > Constants::max_digits_shift) {
749  // More decimal places to shift than we have room: Divide the max number of 10s
750  shifting_rep /= Constants::max_digits_shift_pow;
751  pow10 -= Constants::max_digits_shift;
752 
753  // If our remaining bit shift is less than the max, we're finished iterating
754  if (pow2 <= Constants::max_bits_shift) {
755  // Shift bits left, divide by 10s to apply the scale factor, and we're done.
756  shifting_rep = divide_power10<ShiftingRep>(shifting_rep << pow2, pow10);
757 
758  // NOTE: Cast can overflow!
759  return static_cast<UnsignedRep>(shifting_rep);
760  }
761 
762  // Shift the max number of bits left again
763  shifting_rep <<= Constants::max_bits_shift;
764  pow2 -= Constants::max_bits_shift;
765  }
766 
767  // Last 10s-shift: Divide all remaining decimal places, shift all remaining bits, then bail
768  // Note: This divide result may not fit in the low half of the bit range
769  // But the divisor is less than the max-shift, and thus fits within 64 / 32 bits
770  if constexpr (Constants::is_double) {
771  shifting_rep = divide_power10_64bit(shifting_rep, pow10);
772  } else {
773  shifting_rep = divide_power10_32bit(shifting_rep, pow10);
774  }
775 
776  // Final bit shift: Shift may be large, guard against UB
777  // NOTE: This can overflow (both cast and shift)!
778  return guarded_left_shift(static_cast<UnsignedRep>(shifting_rep), pow2);
779 }
780 
791 template <typename Rep,
792  typename FloatingType,
793  CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
794 CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t<Rep> shift_to_decimal_negpow(
795  typename shifting_constants<FloatingType>::IntegerRep base2_value, int pow2, int pow10)
796 {
797  // This is similar to shift_to_decimal_pospow(), except pow10 < 0 & pow2 < 0
798  // See comments in that function for details.
799  // Instead here we need to multiply by 10s and shift right by 2s
800 
801  // ShiftingRep: uint64 for float's, __uint128_t for double's
802  using Constants = shifting_constants<FloatingType>;
803  using ShiftingRep = typename Constants::ShiftingRep;
804  auto shifting_rep = static_cast<ShiftingRep>(base2_value);
805 
806  // Convert to using positive values so we don't have keep negating
807  int pow10_mag = -pow10;
808  int pow2_mag = -pow2;
809 
810  // For performing final 10s-shift
811  using UnsignedRep = cuda::std::make_unsigned_t<Rep>;
812  auto final_shifts_low10s = [&]() {
813  // Last 10s-shift: multiply all remaining decimal places, shift all remaining bits, then bail
814  // The multiplier is less than the max-shift, and thus fits within 64 / 32 bits
815  if constexpr (Constants::is_double) {
816  shifting_rep = multiply_power10_64bit(shifting_rep, pow10_mag);
817  } else {
818  shifting_rep = multiply_power10_32bit(shifting_rep, pow10_mag);
819  }
820 
821  // Final bit shifting: Shift may be large, guard against UB
822  return static_cast<UnsignedRep>(guarded_right_shift(shifting_rep, pow2_mag));
823  };
824 
825  // If our total decimal shift is less than the max, we don't need to iterate
826  if (pow10_mag <= Constants::max_digits_shift) { return final_shifts_low10s(); }
827 
828  // We want to start by lining up our bits to the top of the shifting range,
829  // except our first operation is a multiply, so not quite that far
830  // We are bit-shifting down, so we need extra bits on the low-side, which this has.
831  // Note that shift_from is +1 due to shift in add_half_if_truncates()
832  static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::max_bits_shift;
833  static constexpr int shift_from = Constants::num_significand_bits + 1;
834  static constexpr int num_init_bit_shift = shift_up_to - shift_from;
835 
836  // Perform initial shift
837  shifting_rep <<= num_init_bit_shift;
838  pow2_mag += num_init_bit_shift;
839 
840  // Iterate, multiplying by 10s and shifting down by 2s until we're almost done
841  do {
842  // More decimal places to shift than we have room: Multiply the max number of 10s
843  shifting_rep *= Constants::max_digits_shift_pow;
844  pow10_mag -= Constants::max_digits_shift;
845 
846  // If our remaining bit shift is less than the max, we're finished iterating
847  if (pow2_mag <= Constants::max_bits_shift) {
848  // Last bit-shift: Shift all remaining bits, apply the remaining scale, then bail
849  shifting_rep >>= pow2_mag;
850 
851  // We need to convert to the output rep for the final scale-factor multiply, because if (e.g.)
852  // float -> dec128 and some large pow10_mag, it might overflow the 64bit shifting rep.
853  // It's not needed for pow10 > 0 because we're dividing by 10s there instead of multiplying.
854  // NOTE: This can overflow! (Both multiply and cast)
855  return multiply_power10<UnsignedRep>(static_cast<UnsignedRep>(shifting_rep), pow10_mag);
856  }
857 
858  // More bits to shift than we have room: Shift the max number of 2s
859  shifting_rep >>= Constants::max_bits_shift;
860  pow2_mag -= Constants::max_bits_shift;
861  } while (pow10_mag > Constants::max_digits_shift);
862 
863  // Do our final shifts
864  return final_shifts_low10s();
865 }
866 
877 template <typename Rep,
878  typename FloatingType,
879  CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
880 CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t<Rep> convert_floating_to_integral_shifting(
881  typename floating_converter<FloatingType>::IntegralType base2_value, int pow10, int pow2)
882 {
883  // Apply the powers of 2 and 10 to convert to decimal.
884  // The result will be base2_value * (2^pow2) / (10^pow10)
885 
886  // Note that while this code is branchy, the decimal scale factor is part of the
887  // column type itself, so every thread will take the same branches on pow10.
888  // Also data within a column tends to be similar, so they will often take the
889  // same branches on pow2 as well.
890 
891  // NOTE: some returns here can overflow (e.g. ShiftingRep -> UnsignedRep)
892  using UnsignedRep = cuda::std::make_unsigned_t<Rep>;
893  if (pow10 == 0) {
894  // NOTE: Left Bit-shift can overflow! As can cast! (e.g. double -> decimal32)
895  // Bit shifts may be large, guard against UB
896  if (pow2 >= 0) {
897  return guarded_left_shift(static_cast<UnsignedRep>(base2_value), pow2);
898  } else {
899  return static_cast<UnsignedRep>(guarded_right_shift(base2_value, -pow2));
900  }
901  } else if (pow10 > 0) {
902  if (pow2 <= 0) {
903  // Power-2/10 shifts both downward: order doesn't matter, apply and bail.
904  // Guard against shift being undefined behavior
905  auto const shifted = guarded_right_shift(base2_value, -pow2);
906  return static_cast<UnsignedRep>(divide_power10<decltype(shifted)>(shifted, pow10));
907  }
908  return shift_to_decimal_pospow<Rep, FloatingType>(base2_value, pow2, pow10);
909  } else { // pow10 < 0
910  if (pow2 >= 0) {
911  // Power-2/10 shifts both upward: order doesn't matter, apply and bail.
912  // NOTE: Either shift, multiply, or cast (e.g. double -> decimal32) can overflow!
913  auto const shifted = guarded_left_shift(static_cast<UnsignedRep>(base2_value), pow2);
914  return multiply_power10<UnsignedRep>(shifted, -pow10);
915  }
916  return shift_to_decimal_negpow<Rep, FloatingType>(base2_value, pow2, pow10);
917  }
918 }
919 
929 template <typename Rep,
930  typename FloatingType,
931  CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
932 CUDF_HOST_DEVICE inline Rep convert_floating_to_integral(FloatingType const& floating,
933  scale_type const& scale)
934 {
935  // Extract components of the floating point number
936  using converter = floating_converter<FloatingType>;
937  auto const integer_rep = converter::bit_cast_to_integer(floating);
938  if (converter::is_zero(integer_rep)) { return 0; }
939 
940  // Note that the significand here is an unsigned integer with sizeof(FloatingType)
941  auto const is_negative = converter::get_is_negative(integer_rep);
942  auto const [significand, floating_pow2] = converter::get_significand_and_pow2(integer_rep);
943 
944  // Add half a bit if truncating to yield expected value, see function for discussion.
945  auto const pow10 = static_cast<int>(scale);
946  auto const [base2_value, pow2] =
947  add_half_if_truncates(floating, significand, floating_pow2, pow10);
948 
949  // Apply the powers of 2 and 10 to convert to decimal.
950  auto const magnitude =
951  convert_floating_to_integral_shifting<Rep, FloatingType>(base2_value, pow10, pow2);
952 
953  // Reapply the sign and return
954  // NOTE: Cast can overflow!
955  auto const signed_magnitude = static_cast<Rep>(magnitude);
956  return is_negative ? -signed_magnitude : signed_magnitude;
957 }
958 
968 template <typename FloatingType,
969  typename DecimalRep,
970  CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
971 CUDF_HOST_DEVICE inline auto shift_to_binary_pospow(DecimalRep decimal_rep, int pow10)
972 {
973  // This is the reverse of shift_to_decimal_pospow(), see that for more details.
974 
975  // ShiftingRep: uint64 for float's, __uint128_t for double's
976  using Constants = shifting_constants<FloatingType>;
977  using ShiftingRep = typename Constants::ShiftingRep;
978 
979  // We want to start by lining up our bits to the top of the shifting range,
980  // except our first operation is a multiply, so not quite that far
981  // We are bit-shifting down, so we need extra bits on the low-side, which this has.
982  static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::max_bits_shift;
983  int const shift_from = count_significant_bits(decimal_rep);
984  int const num_init_bit_shift = shift_up_to - shift_from;
985  int pow2 = -num_init_bit_shift;
986 
987  // Perform the initial bit shift
988  ShiftingRep shifting_rep;
989  if constexpr (sizeof(ShiftingRep) < sizeof(DecimalRep)) {
990  // Shift within DecimalRep before dropping to the smaller ShiftingRep
991  decimal_rep = (pow2 >= 0) ? (decimal_rep >> pow2) : (decimal_rep << -pow2);
992  shifting_rep = static_cast<ShiftingRep>(decimal_rep);
993  } else {
994  // Scale up to ShiftingRep before shifting
995  shifting_rep = static_cast<ShiftingRep>(decimal_rep);
996  shifting_rep = (pow2 >= 0) ? (shifting_rep >> pow2) : (shifting_rep << -pow2);
997  }
998 
999  // Iterate, multiplying by 10s and shifting down by 2s until we're almost done
1000  while (pow10 > Constants::max_digits_shift) {
1001  // More decimal places to shift than we have room: Multiply the max number of 10s
1002  shifting_rep *= Constants::max_digits_shift_pow;
1003  pow10 -= Constants::max_digits_shift;
1004 
1005  // Then make more room by bit shifting down by the max # of 2s
1006  shifting_rep >>= Constants::max_bits_shift;
1007  pow2 += Constants::max_bits_shift;
1008  }
1009 
1010  // Last 10s-shift: multiply all remaining decimal places
1011  // The multiplier is less than the max-shift, and thus fits within 64 / 32 bits
1012  if constexpr (Constants::is_double) {
1013  shifting_rep = multiply_power10_64bit(shifting_rep, pow10);
1014  } else {
1015  shifting_rep = multiply_power10_32bit(shifting_rep, pow10);
1016  }
1017 
1018  // Our shifting_rep is now the integer mantissa, return it and the powers of 2
1019  return std::pair{shifting_rep, pow2};
1020 }
1021 
1031 template <typename FloatingType,
1032  typename DecimalRep,
1033  CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
1034 CUDF_HOST_DEVICE inline auto shift_to_binary_negpow(DecimalRep decimal_rep, int const pow10)
1035 {
1036  // This is the reverse of shift_to_decimal_negpow(), see that for more details.
1037 
1038  // ShiftingRep: uint64 for float's, __uint128_t for double's
1039  using Constants = shifting_constants<FloatingType>;
1040  using ShiftingRep = typename Constants::ShiftingRep;
1041 
1042  // We want to start with our significand bits at the top of the shifting range,
1043  // so that we lose minimal information we need on intermediary right-shifts.
1044  // Note that since we're shifting 2s up, we need num_2s_shift_buffer_bits space on the high side
1045  static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::num_2s_shift_buffer_bits;
1046  int const shift_from = count_significant_bits(decimal_rep);
1047  int const num_init_bit_shift = shift_up_to - shift_from;
1048  int pow2 = -num_init_bit_shift;
1049 
1050  // Perform the initial bit shift
1051  ShiftingRep shifting_rep;
1052  if constexpr (sizeof(ShiftingRep) < sizeof(DecimalRep)) {
1053  // Shift within DecimalRep before dropping to the smaller ShiftingRep
1054  decimal_rep = (pow2 >= 0) ? (decimal_rep >> pow2) : (decimal_rep << -pow2);
1055  shifting_rep = static_cast<ShiftingRep>(decimal_rep);
1056  } else {
1057  // Scale up to ShiftingRep before shifting
1058  shifting_rep = static_cast<ShiftingRep>(decimal_rep);
1059  shifting_rep = (pow2 >= 0) ? (shifting_rep >> pow2) : (shifting_rep << -pow2);
1060  }
1061 
1062  // Convert to using positive values upfront, simpler than doing later.
1063  int pow10_mag = -pow10;
1064 
1065  // Iterate, dividing by 10s and shifting up by 2s until we're almost done
1066  while (pow10_mag > Constants::max_digits_shift) {
1067  // More decimal places to shift than we have room: Divide the max number of 10s
1068  shifting_rep /= Constants::max_digits_shift_pow;
1069  pow10_mag -= Constants::max_digits_shift;
1070 
1071  // Then make more room by bit shifting up by the max # of 2s
1072  shifting_rep <<= Constants::max_bits_shift;
1073  pow2 -= Constants::max_bits_shift;
1074  }
1075 
1076  // Last 10s-shift: Divdie all remaining decimal places.
1077  // This divide result may not fit in the low half of the bit range
1078  // But the divisor is less than the max-shift, and thus fits within 64 / 32 bits
1079  if constexpr (Constants::is_double) {
1080  shifting_rep = divide_power10_64bit(shifting_rep, pow10_mag);
1081  } else {
1082  shifting_rep = divide_power10_32bit(shifting_rep, pow10_mag);
1083  }
1084 
1085  // Our shifting_rep is now the integer mantissa, return it and the powers of 2
1086  return std::pair{shifting_rep, pow2};
1087 }
1088 
1098 template <typename FloatingType,
1099  typename Rep,
1100  CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
1101 CUDF_HOST_DEVICE inline FloatingType convert_integral_to_floating(Rep const& value,
1102  scale_type const& scale)
1103 {
1104  // Check the sign of the input
1105  bool const is_negative = (value < 0);
1106 
1107  // Convert to unsigned for bit counting/shifting
1108  using UnsignedType = cuda::std::make_unsigned_t<Rep>;
1109  auto const unsigned_value = [&]() -> UnsignedType {
1110  // Must guard against minimum value, as we can't just negate it: not representable.
1111  if (value == cuda::std::numeric_limits<Rep>::min()) { return static_cast<UnsignedType>(value); }
1112 
1113  // No abs function for 128bit types, so have to do it manually.
1114  if constexpr (cuda::std::is_same_v<Rep, __int128_t>) {
1115  return static_cast<UnsignedType>(is_negative ? -value : value);
1116  } else {
1117  return cuda::std::abs(value);
1118  }
1119  }();
1120 
1121  // Shift by powers of 2 and 10 to get our integer mantissa
1122  auto const [mantissa, pow2] = [&]() {
1123  auto const pow10 = static_cast<int32_t>(scale);
1124  if (pow10 >= 0) {
1125  return shift_to_binary_pospow<FloatingType>(unsigned_value, pow10);
1126  } else { // pow10 < 0
1127  return shift_to_binary_negpow<FloatingType>(unsigned_value, pow10);
1128  }
1129  }();
1130 
1131  // Zero has special exponent bits, just handle it here
1132  if (mantissa == 0) { return FloatingType(0.0f); }
1133 
1134  // Cast our integer mantissa to floating point
1135  auto const floating = static_cast<FloatingType>(mantissa); // IEEE-754 rounds to even
1136 
1137  // Apply the sign and the remaining powers of 2
1138  using converter = floating_converter<FloatingType>;
1139  auto const magnitude = converter::add_pow2(floating, pow2);
1140  return converter::set_is_negative(magnitude, is_negative);
1141 }
1142 
1143 } // namespace detail
1144  // end of group
1146 } // namespace CUDF_EXPORT numeric
CUDF_HOST_DEVICE cuda::std::make_unsigned_t< Rep > shift_to_decimal_pospow(typename shifting_constants< FloatingType >::IntegerRep const base2_value, int pow2, int pow10)
Perform base-2 -> base-10 fixed-point conversion for pow10 > 0.
constexpr CUDF_HOST_DEVICE T multiply_power10_64bit(T value, int pow10)
Multiply by a power of 10 that fits within a 64bit integer.
CUDF_HOST_DEVICE auto shift_to_binary_pospow(DecimalRep decimal_rep, int pow10)
Perform base-10 -> base-2 fixed-point conversion for pow10 > 0.
CUDF_HOST_DEVICE IntegerType guarded_right_shift(IntegerType value, int bit_shift)
Perform a bit-shift right, guarding against undefined behavior.
CUDF_HOST_DEVICE T divide_power10_32bit(T value, int pow10)
Divide by a power of 10 that fits within a 32bit integer.
constexpr CUDF_HOST_DEVICE T multiply_power10(T value, int pow10)
Multiply an integer by a power of 10.
constexpr CUDF_HOST_DEVICE T divide_power10_128bit(T value, int pow10)
Divide by a power of 10 that fits within a 128bit integer.
CUDF_HOST_DEVICE cuda::std::make_unsigned_t< Rep > convert_floating_to_integral_shifting(typename floating_converter< FloatingType >::IntegralType base2_value, int pow10, int pow2)
Perform base-2 -> base-10 fixed-point conversion.
CUDF_HOST_DEVICE auto shift_to_binary_negpow(DecimalRep decimal_rep, int const pow10)
Perform base-10 -> base-2 fixed-point conversion for pow10 < 0.
constexpr CUDF_HOST_DEVICE T multiply_power10_128bit(T value, int pow10)
Multiply by a power of 10 that fits within a 128bit integer.
CUDF_HOST_DEVICE FloatingType convert_integral_to_floating(Rep const &value, scale_type const &scale)
Perform integer decimal -> floating-point conversion.
CUDF_HOST_DEVICE IntegerType guarded_left_shift(IntegerType value, int bit_shift)
Perform a bit-shift left, guarding against undefined behavior.
CUDF_HOST_DEVICE cuda::std::pair< typename floating_converter< FloatingType >::IntegralType, int > add_half_if_truncates(FloatingType floating, typename floating_converter< FloatingType >::IntegralType integer_rep, int pow2, int pow10)
Add half a bit to integer rep of floating point if conversion causes truncation.
constexpr __uint128_t large_power_of_10()
Recursively calculate a signed large power of 10 (>= 10^19) that can only be stored in a 128-bit integer.
constexpr CUDF_HOST_DEVICE T divide_power10(T value, int pow10)
Divide an integer by a power of 10.
constexpr CUDF_HOST_DEVICE T multiply_power10_32bit(T value, int pow10)
Multiply by a power of 10 that fits within a 32bit integer.
CUDF_HOST_DEVICE int count_significant_bits(T value)
Determine the number of significant bits in an integer.
CUDF_HOST_DEVICE cuda::std::make_unsigned_t< Rep > shift_to_decimal_negpow(typename shifting_constants< FloatingType >::IntegerRep base2_value, int pow2, int pow10)
Perform base-2 -> base-10 fixed-point conversion for pow10 < 0.
CUDF_HOST_DEVICE T divide_power10_64bit(T value, int pow10)
Divide by a power of 10 that fits within a 64bit integer.
CUDF_HOST_DEVICE Rep convert_floating_to_integral(FloatingType const &floating, scale_type const &scale)
Perform floating-point -> integer decimal conversion.
scale_type
The scale type for fixed_point.
Definition: fixed_point.hpp:43
#define CUDF_ENABLE_IF(...)
Convenience macro for SFINAE as an unnamed template parameter.
Definition: traits.hpp:50
fixed_point and supporting types
Definition: fixed_point.hpp:33
Helper struct for getting and setting the components of a floating-point value.
static CUDF_HOST_DEVICE FloatingType bit_cast_to_floating(IntegralType integer)
Reinterpret the bits of an integer as floating-point value.
static CUDF_HOST_DEVICE FloatingType add_pow2(FloatingType floating, int pow2)
Adds to the base-2 exponent of a floating-point number.
static CUDF_HOST_DEVICE std::pair< IntegralType, int > get_significand_and_pow2(IntegralType integer_rep)
Extracts the significand and exponent of a bit-casted floating-point number, shifted for denormals.
static CUDF_HOST_DEVICE bool is_zero(IntegralType integer_rep)
Checks whether the bit-casted floating-point value is +/-0.
static CUDF_HOST_DEVICE IntegralType bit_cast_to_integer(FloatingType floating)
Reinterpret the bits of a floating-point value as an integer.
static CUDF_HOST_DEVICE FloatingType set_is_negative(FloatingType floating, bool is_negative)
Sets the sign bit of a floating-point number.
cuda::std::conditional_t< cuda::std::is_same_v< FloatingType, float >, uint32_t, uint64_t > IntegralType
Unsigned int type with same size as floating type.
static CUDF_HOST_DEVICE bool get_is_negative(IntegralType integer_rep)
Extracts the sign bit of a bit-casted floating-point number.
Helper struct with common constants needed by the floating <--> decimal conversions.
std::conditional_t< is_double, __uint128_t, uint64_t > ShiftingRep
Shift data back and forth in space of a type with 2x the starting bits, to give us enough room.
std::conditional_t< is_double, uint64_t, uint32_t > IntegerRep
Integer type that can hold the value of the significand.
#define CUDF_HOST_DEVICE
Indicates that the function or method is usable on host and device.
Definition: types.hpp:32