8 #include <cudf/strings/detail/utf8.hpp>
10 #include <cudf/utilities/export.hpp>
17 #ifndef __CUDACC_RTC__
18 #include <thrust/count.h>
19 #include <thrust/execution_policy.h>
22 #include <cuda/std/functional>
23 #include <cuda/std/utility>
28 namespace CUDF_EXPORT
cudf {
41 if ((str ==
nullptr) || (bytes == 0))
return 0;
42 auto ptr =
reinterpret_cast<uint8_t const*
>(str);
43 #ifndef __CUDACC_RTC__
44 return thrust::count_if(
45 thrust::seq, ptr, ptr + bytes, [](uint8_t chr) {
return is_begin_utf8_char(chr); });
48 auto const end = ptr + bytes;
50 chars += is_begin_utf8_char(*ptr++);
67 __device__
inline cuda::std::pair<size_type, size_type> bytes_to_character_position(
71 auto ptr = d_str.data();
72 auto const end_ptr = ptr + d_str.size_bytes();
73 while ((pos > 0) && (ptr < end_ptr)) {
74 auto const width = strings::detail::bytes_in_utf8_byte(
static_cast<uint8_t
>(*ptr));
// Five-char array in CUDA constant memory holding the bytes F7 BF BF BF followed by
// the string literal's terminating NUL. A later fragment in this listing takes the
// address of element 0 and returns it paired with a length of 4, so the NUL is not
// part of the value handed out.
91 static __constant__
char max_string_sentinel[5]{
"\xF7\xBF\xBF\xBF"};
116 char const* psentinel{
nullptr};
117 #if defined(__CUDA_ARCH__)
118 psentinel = &cudf::strings::detail::max_string_sentinel[0];
121 cudaGetSymbolAddress((
void**)&psentinel, cudf::strings::detail::max_string_sentinel));
123 return {psentinel, 4};
128 if (_length == UNKNOWN_STRING_LENGTH)
129 _length = strings::detail::characters_in_string(_data, _bytes);
135 __device__
inline string_view::const_iterator::const_iterator(
string_view const& str,
size_type pos)
136 : p{str.data()}, bytes{str.size_bytes()}, char_pos{pos}, byte_pos{str.byte_offset(pos)}
140 __device__
inline string_view::const_iterator::const_iterator(string_view
const& str,
143 : p{str.data()}, bytes{str.size_bytes()}, char_pos{pos}, byte_pos{offset}
147 __device__
inline string_view::const_iterator& string_view::const_iterator::operator++()
149 if (byte_pos < bytes) {
152 cuda::std::max(1, strings::detail::bytes_in_utf8_byte(
static_cast<uint8_t
>(p[byte_pos])));
158 __device__
inline string_view::const_iterator string_view::const_iterator::operator++(
int)
160 string_view::const_iterator tmp(*
this);
166 string_view::const_iterator::difference_type offset)
const
168 const_iterator tmp(*
this);
171 offset > 0 ? ++tmp : --tmp;
175 __device__
inline string_view::const_iterator& string_view::const_iterator::operator+=(
176 string_view::const_iterator::difference_type offset)
180 offset > 0 ? operator++() : operator--();
184 __device__
inline string_view::const_iterator& string_view::const_iterator::operator--()
187 if (byte_pos == char_pos) {
190 while (strings::detail::bytes_in_utf8_byte(
static_cast<uint8_t
>(p[--byte_pos])) == 0)
198 __device__
inline string_view::const_iterator string_view::const_iterator::operator--(
int)
200 string_view::const_iterator tmp(*
this);
205 __device__
inline string_view::const_iterator& string_view::const_iterator::operator-=(
206 string_view::const_iterator::difference_type offset)
210 offset > 0 ? operator--() : operator++();
215 string_view::const_iterator::difference_type offset)
const
217 const_iterator tmp(*
this);
220 offset > 0 ? --tmp : ++tmp;
224 __device__
inline string_view::const_iterator& string_view::const_iterator::move_to(
227 *
this += (new_pos - char_pos);
232 string_view::const_iterator
const& rhs)
const
234 return (p == rhs.p) && (char_pos == rhs.char_pos);
238 string_view::const_iterator
const& rhs)
const
240 return (p != rhs.p) || (char_pos != rhs.char_pos);
244 string_view::const_iterator
const& rhs)
const
246 return (p == rhs.p) && (char_pos < rhs.char_pos);
250 string_view::const_iterator
const& rhs)
const
252 return (p == rhs.p) && (char_pos <= rhs.char_pos);
256 string_view::const_iterator
const& rhs)
const
258 return (p == rhs.p) && (char_pos > rhs.char_pos);
262 string_view::const_iterator
const& rhs)
const
264 return (p == rhs.p) && (char_pos >= rhs.char_pos);
270 strings::detail::to_char_utf8(p +
byte_offset(), chr);
// Device-side accessor: returns the iterator's stored character position (char_pos).
274 __device__
inline size_type string_view::const_iterator::position()
const {
return char_pos; }
// Device-side accessor: returns the iterator's stored byte position (byte_pos).
276 __device__
inline size_type string_view::const_iterator::byte_offset()
const {
return byte_pos; }
// Returns a new iterator pointing to the beginning of this string.
// Brace-initializes a const_iterator from *this with 0 for both remaining arguments.
// NOTE(review): these presumably map to (character position, byte offset) of the
// three-argument constructor, whose full signature is not visible here — confirm.
278 __device__
inline string_view::const_iterator
string_view::begin()
const {
return {*
this, 0, 0}; }
289 if (offset >= _bytes)
return 0;
291 strings::detail::to_char_utf8(
data() + offset, chr);
298 return cuda::std::get<0>(strings::detail::bytes_to_character_position(*
this, pos));
309 auto const* ptr1 =
reinterpret_cast<unsigned char const*
>(this->
data());
310 auto const* ptr2 =
reinterpret_cast<unsigned char const*
>(
data);
311 if ((ptr1 == ptr2) && (bytes == len1))
return 0;
313 for (; (idx < len1) && (idx < bytes); ++idx) {
314 if (*ptr1 != *ptr2)
return static_cast<int32_t
>(*ptr1) -
static_cast<int32_t
>(*ptr2);
318 if (idx < len1)
return 1;
319 if (idx < bytes)
return -1;
346 return (rc == 0) || (rc < 0);
352 return (rc == 0) || (rc > 0);
362 template <
bool forward>
363 __device__
inline size_type string_view::find_impl(
char const* str,
368 if (!str || pos < 0) {
return npos; }
369 if (pos > 0 && pos >
length()) {
return npos; }
372 auto const itr =
begin() + pos;
373 auto const spos = itr.byte_offset();
377 auto const find_length = (epos - spos) - bytes + 1;
380 auto ptr =
data() + (forward ? spos : (epos - bytes));
381 for (
size_type idx = 0; idx < find_length; ++idx) {
382 if (d_target.compare(ptr, bytes) == 0) {
383 return forward ? pos : character_offset(epos - bytes - idx);
386 pos += strings::detail::is_begin_utf8_char(*ptr);
387 forward ? ++ptr : --ptr;
397 return find_impl<true>(str, bytes, pos,
count);
403 size_type chwidth = strings::detail::from_char_utf8(chr, str);
419 return find_impl<false>(str, bytes, pos,
count);
425 size_type chwidth = strings::detail::from_char_utf8(chr, str);
433 auto const spos =
begin() + pos;
435 auto ss =
string_view{
data() + spos.byte_offset(), epos.byte_offset() - spos.byte_offset()};
437 if (_length != UNKNOWN_STRING_LENGTH) { ss._length = epos.position() - spos.position(); }
444 return strings::detail::characters_in_string(
data(), bytepos);
Handy iterator for navigating through encoded characters.
A non-owning, immutable view of device data that is a variable length char array representing a UTF-8...
CUDF_HOST_DEVICE size_type size_bytes() const
Return the number of bytes in this string.
size_type rfind(string_view const &str, size_type pos=0, size_type count=-1) const
Returns the character position of the last occurrence where the argument str is found in this string ...
CUDF_HOST_DEVICE string_view()
Default constructor represents an empty string.
size_type length() const
Return the number of characters in this string.
string_view substr(size_type start, size_type length) const
Return a sub-string of this string. The original string and device memory must still be maintained fo...
bool operator==(string_view const &rhs) const
Returns true if rhs matches this string exactly.
const_iterator end() const
Return new iterator pointing past the end of this string.
int compare(string_view const &str) const
Compares the target string with this string. Each character is compared as a UTF-8 code-point value.
bool operator>=(string_view const &rhs) const
Returns true if rhs matches or is ordered before this string.
CUDF_HOST_DEVICE char const * data() const
Return a pointer to the internal device array.
size_type byte_offset(size_type pos) const
Return the byte offset from data() for a given character position.
const_iterator begin() const
Return new iterator pointing to the beginning of this string.
char_utf8 operator[](size_type pos) const
Return single UTF-8 character at the given character position.
bool operator!=(string_view const &rhs) const
Returns true if rhs does not match this string.
size_type find(string_view const &str, size_type pos=0, size_type count=-1) const
Returns the character position of the first occurrence where the argument str is found in this string...
bool operator<=(string_view const &rhs) const
Returns true if this string matches or is ordered before rhs.
bool operator<(string_view const &rhs) const
Returns true if this string is ordered before rhs.
bool operator>(string_view const &rhs) const
Returns true if rhs is ordered before this string.
static cudf::size_type const npos
No-position value.
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator-(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator>=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator<=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator*(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator>(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator+(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator<(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
bool operator==(polymorphic_allocator< T > const &lhs, polymorphic_allocator< U > const &rhs)
bool operator!=(polymorphic_allocator< T > const &lhs, polymorphic_allocator< U > const &rhs)
std::unique_ptr< column > count(strings_column_view const &input, string_scalar const &target, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns the number of times the given target string matches in each string.
#define CUDF_CUDA_TRY(call)
Error checking macro for CUDA runtime API functions.
int32_t size_type
Row index type for columns and tables.
uint32_t char_utf8
UTF-8 characters are 1-4 bytes.
Class definition for cudf::string_view.
#define CUDF_HOST_DEVICE
Indicates that the function or method is usable on host and device.