19 #include <cudf/strings/detail/utf8.hpp>
29 #include <thrust/count.h>
30 #include <thrust/execution_policy.h>
// NOTE(review): the enclosing signature is not visible in this excerpt; these statements
// presumably belong to characters_in_string(str, bytes) -- confirm against the full file.
// A null pointer or a zero byte count produces a result of 0.
51 if ((str ==
nullptr) || (bytes == 0))
return 0;
// View the input as uint8_t so each byte can be handed to is_begin_utf8_char.
52 auto ptr =
reinterpret_cast<uint8_t const*
>(str);
// Count the bytes in [ptr, ptr + bytes) for which is_begin_utf8_char returns true,
// using thrust's sequential (thrust::seq) execution policy.
54 return thrust::count_if(
55 thrust::seq, ptr, ptr + bytes, [](uint8_t chr) {
return is_begin_utf8_char(chr); });
// NOTE(review): the two lines below look like an alternative hand-written counting path;
// the loop around them and any preprocessor guard selecting it are not visible here.
58 auto const end = ptr + bytes;
// Add the is_begin_utf8_char result for the current byte to chars, then advance ptr.
60 chars += is_begin_utf8_char(*ptr++);
// Device-side helper returning a std::pair of size_type values for the string d_str.
// NOTE(review): the remaining parameters (including pos, used below), the loop body and
// the return statement are not visible in this excerpt.
77 __device__
inline std::pair<size_type, size_type> bytes_to_character_position(string_view d_str,
// ptr starts at the first byte of d_str; end_ptr is one past its last byte.
81 auto ptr = d_str.data();
82 auto const end_ptr = ptr + d_str.size_bytes();
// Keep going only while pos is still positive and ptr has not reached end_ptr.
83 while ((pos > 0) && (ptr < end_ptr)) {
// Ask bytes_in_utf8_byte about the byte currently at ptr (read as uint8_t).
84 auto const width = strings::detail::bytes_in_utf8_byte(
static_cast<uint8_t
>(*ptr));
// Sentinel byte sequence placed in __constant__ memory: the four bytes F7 BF BF BF
// followed by the string literal's terminating NUL (hence the array size of 5).
// Its address is what the psentinel lookup elsewhere in this file resolves.
101 static __constant__
char max_string_sentinel[5]{
"\xF7\xBF\xBF\xBF"};
// Pointer that will be set to the address of max_string_sentinel; starts as nullptr.
126 char const* psentinel{
nullptr};
// When __CUDA_ARCH__ is defined, take the address of the symbol's first byte directly.
127 #if defined(__CUDA_ARCH__)
128 psentinel = &cudf::strings::detail::max_string_sentinel[0];
// NOTE(review): presumably the branch used when __CUDA_ARCH__ is not defined; the
// #else/#endif lines and the call that wraps cudaGetSymbolAddress (note the extra
// closing parenthesis) are not visible in this excerpt.
// cudaGetSymbolAddress stores the address of the sentinel symbol into psentinel.
131 cudaGetSymbolAddress((
void**)&psentinel, cudf::strings::detail::max_string_sentinel));
// Lazy computation: only when _length still holds UNKNOWN_STRING_LENGTH is the value
// obtained from characters_in_string(_data, _bytes) and stored in _length.
// NOTE(review): the enclosing member function is not visible in this excerpt.
138 if (_length == UNKNOWN_STRING_LENGTH)
139 _length = strings::detail::characters_in_string(_data, _bytes);
// const_iterator constructor taking the string_view and a position pos.
// NOTE(review): its initializer list and body are not visible in this excerpt.
145 __device__
inline string_view::const_iterator::const_iterator(
string_view const& str,
size_type pos)
// const_iterator constructor overload; the parameters after str are not visible here,
// but the initializer list below uses both pos and offset.
150 __device__
inline string_view::const_iterator::const_iterator(string_view
const& str,
// p and bytes come from the string's data pointer and byte size; char_pos is set from
// pos and byte_pos from offset.
153 : p{str.data()}, bytes{str.size_bytes()}, char_pos{pos}, byte_pos{offset}
// Pre-increment operator; returns a reference to a const_iterator.
// NOTE(review): any char_pos update and the return statement are not visible here.
157 __device__
inline string_view::const_iterator& string_view::const_iterator::operator++()
// Only advance while byte_pos is still inside the string (byte_pos < bytes).
159 if (byte_pos < bytes)
// Advance byte_pos by the value bytes_in_utf8_byte reports for the byte at byte_pos.
160 byte_pos += strings::detail::bytes_in_utf8_byte(
static_cast<uint8_t
>(p[byte_pos]));
// Post-increment operator; returns a const_iterator by value.
// NOTE(review): the increment and return statements are not visible in this excerpt.
165 __device__
inline string_view::const_iterator string_view::const_iterator::operator++(
int)
// Copy of the current iterator state, taken before any modification.
167 string_view::const_iterator tmp(*
this);
// NOTE(review): the header line of this const member is not visible; presumably
// operator+(difference_type) -- confirm against the full file.
173 string_view::const_iterator::difference_type offset)
const
// Work on a copy so *this is left unchanged (the member is const).
175 const_iterator tmp(*
this);
// Step the copy forward when offset is positive, otherwise backward.
// NOTE(review): the surrounding loop that repeats this step is not visible here.
178 offset > 0 ? ++tmp : --tmp;
// Compound addition: moves this iterator itself and returns a reference to a const_iterator.
182 __device__
inline string_view::const_iterator& string_view::const_iterator::operator+=(
183 string_view::const_iterator::difference_type offset)
// A positive offset steps forward via operator++(), otherwise backward via operator--().
// NOTE(review): the surrounding loop that repeats this step is not visible here.
187 offset > 0 ? operator++() : operator--();
// Pre-decrement operator; returns a reference to a const_iterator.
// NOTE(review): any guard before the loop, the loop body, the char_pos update and the
// return statement are not visible in this excerpt.
191 __device__
inline string_view::const_iterator& string_view::const_iterator::operator--()
// Decrement byte_pos, then repeat for as long as bytes_in_utf8_byte returns 0 for the
// byte at the new byte_pos (presumably a UTF-8 continuation byte -- confirm).
194 while (strings::detail::bytes_in_utf8_byte(
static_cast<uint8_t
>(p[--byte_pos])) == 0)
// Post-decrement operator; returns a const_iterator by value.
// NOTE(review): the decrement and return statements are not visible in this excerpt.
200 __device__
inline string_view::const_iterator string_view::const_iterator::operator--(
int)
// Copy of the current iterator state, taken before any modification.
202 string_view::const_iterator tmp(*
this);
// Compound subtraction: moves this iterator itself and returns a reference to a const_iterator.
207 __device__
inline string_view::const_iterator& string_view::const_iterator::operator-=(
208 string_view::const_iterator::difference_type offset)
// Mirror of operator+=: a positive offset steps backward via operator--(), otherwise
// forward via operator++().
// NOTE(review): the surrounding loop that repeats this step is not visible here.
212 offset > 0 ? operator--() : operator++();
// NOTE(review): the header line of this const member is not visible; presumably
// operator-(difference_type) -- confirm against the full file.
217 string_view::const_iterator::difference_type offset)
const
// Work on a copy so *this is left unchanged (the member is const).
219 const_iterator tmp(*
this);
// Step the copy backward when offset is positive, otherwise forward.
// NOTE(review): the surrounding loop that repeats this step is not visible here.
222 offset > 0 ? --tmp : ++tmp;
// Repositions this iterator and returns a reference to a const_iterator.
// NOTE(review): the parameter list (which declares new_pos) and the return statement are
// not visible in this excerpt.
226 __device__
inline string_view::const_iterator& string_view::const_iterator::move_to(
// Apply the signed difference between new_pos and the current char_pos via operator+=.
229 *
this += (new_pos - char_pos);
// NOTE(review): the header lines naming the six comparison members below are not visible
// in this excerpt; each one is const and takes another const_iterator as rhs.

// True only when both iterators share the same data pointer p and the same char_pos.
234 string_view::const_iterator
const& rhs)
const
236 return (p == rhs.p) && (char_pos == rhs.char_pos);
// True when the data pointers differ or the character positions differ.
240 string_view::const_iterator
const& rhs)
const
242 return (p != rhs.p) || (char_pos != rhs.char_pos);
// The ordering checks below require the same data pointer p on both sides; when the
// pointers differ every one of them returns false.
// Same pointer and char_pos strictly less than rhs.char_pos.
246 string_view::const_iterator
const& rhs)
const
248 return (p == rhs.p) && (char_pos < rhs.char_pos);
// Same pointer and char_pos less than or equal to rhs.char_pos.
252 string_view::const_iterator
const& rhs)
const
254 return (p == rhs.p) && (char_pos <= rhs.char_pos);
// Same pointer and char_pos strictly greater than rhs.char_pos.
258 string_view::const_iterator
const& rhs)
const
260 return (p == rhs.p) && (char_pos > rhs.char_pos);
// Same pointer and char_pos greater than or equal to rhs.char_pos.
264 string_view::const_iterator
const& rhs)
const
266 return (p == rhs.p) && (char_pos >= rhs.char_pos);
// NOTE(review): the enclosing member (presumably the dereference operator) and the
// declaration of chr are not visible in this excerpt.
// Passes the address of the byte at the iterator's current byte offset, together with
// chr, to to_char_utf8.
272 strings::detail::to_char_utf8(p +
byte_offset(), chr);
276 __device__
inline size_type string_view::const_iterator::position()
const {
return char_pos; }
278 __device__
inline size_type string_view::const_iterator::byte_offset()
const {
return byte_pos; }
// NOTE(review): the headers of the three members these fragments belong to are not
// visible in this excerpt.

// Builds a const_iterator over this string with both trailing arguments equal to 0
// (compare the constructor whose initializer list sets char_pos{pos}, byte_pos{offset}).
282 return const_iterator(*
this, 0, 0);
// An offset at or beyond _bytes yields 0 instead of reading past the stored bytes.
294 if (offset >= _bytes)
return 0;
// Passes the address of the byte at data() + offset, together with chr, to to_char_utf8.
296 strings::detail::to_char_utf8(
data() + offset, chr);
// Returns the first element of the pair produced by bytes_to_character_position for pos.
303 return std::get<0>(strings::detail::bytes_to_character_position(*
this, pos));
// NOTE(review): the enclosing signature and the declarations of len1 and idx are not
// visible in this excerpt; presumably string_view::compare(data, bytes) with len1 being
// this string's byte length -- confirm against the full file.
// Both inputs are viewed as unsigned char so byte differences are computed on
// non-negative values.
314 auto const* ptr1 =
reinterpret_cast<unsigned char const*
>(this->
data());
315 auto const* ptr2 =
reinterpret_cast<unsigned char const*
>(
data);
// Shortcut: identical pointer and identical length means equal, so return 0.
316 if ((ptr1 == ptr2) && (bytes == len1))
return 0;
// Iterate while idx is below both lengths (the shared prefix).
318 for (; (idx < len1) && (idx < bytes); ++idx) {
// At the first differing byte, return the signed difference of the two byte values.
// NOTE(review): the pointer advances inside the loop are not visible here.
319 if (*ptr1 != *ptr2)
return static_cast<int32_t
>(*ptr1) -
static_cast<int32_t
>(*ptr2);
// No difference found in the shared prefix: idx still below len1 returns 1.
323 if (idx < len1)
return 1;
// idx still below bytes returns -1.
324 if (idx < bytes)
return -1;
// NOTE(review): the enclosing operators and the computation of rc are not visible in
// this excerpt.
// True when rc is zero or negative.
351 return (rc == 0) || (rc < 0);
// True when rc is zero or positive.
357 return (rc == 0) || (rc > 0);
// Search helper shared by the forward and backward searches; the template flag selects
// the scan direction.
// NOTE(review): the parameters after str (bytes, pos and count are used below), the
// definition of epos, the declaration of match and the final return are not visible in
// this excerpt.
367 template <
bool forward>
368 __device__
inline size_type string_view::find_impl(
char const* str,
373 auto const nchars =
length();
// A null str, or a pos outside [0, nchars], yields npos.
374 if (!str || pos < 0 || pos > nchars)
return npos;
// A negative count is replaced by nchars.
375 if (count < 0) count = nchars;
// Advance an iterator by pos characters to obtain the starting byte offset spos.
378 auto itr =
begin() + pos;
379 auto const spos = itr.byte_offset();
// Number of candidate starting bytes for a bytes-long pattern between spos and epos.
382 auto const find_length = (epos - spos) - bytes + 1;
// Forward scans start at spos; backward scans start at the last position where the
// pattern still fits before epos.
384 auto ptr =
data() + (forward ? spos : (epos - bytes));
385 for (
size_type idx = 0; idx < find_length; ++idx) {
// Compare byte by byte, stopping at the first mismatch.
387 for (
size_type jdx = 0; match && (jdx < bytes); ++jdx) {
388 match = (ptr[jdx] == str[jdx]);
// On a match, a forward scan returns pos; a backward scan converts the byte offset of
// the match to a character offset with character_offset.
390 if (match) {
return forward ? pos : character_offset(epos - bytes - idx); }
// pos is increased by the is_begin_utf8_char result for the byte at ptr.
392 pos += strings::detail::is_begin_utf8_char(*ptr);
// Move one byte in the scan direction.
393 forward ? ++ptr : --ptr;
// NOTE(review): the headers of the four members these fragments belong to, and the
// declaration of the str buffer used by the character overloads, are not visible here.

// Forward search: delegates to find_impl with forward == true.
403 return find_impl<true>(str, bytes, pos, count);
// Character overload: from_char_utf8(chr, str) supplies chwidth (presumably the number
// of bytes written to str -- confirm), which is then passed to find as the byte count.
409 size_type chwidth = strings::detail::from_char_utf8(chr, str);
410 return find(str, chwidth, pos, count);
// Backward search: delegates to find_impl with forward == false.
425 return find_impl<false>(str, bytes, pos, count);
// Character overload of the backward search, built the same way as the forward one.
431 size_type chwidth = strings::detail::from_char_utf8(chr, str);
432 return rfind(str, chwidth, pos, count);
// NOTE(review): the enclosing member functions for these fragments are not visible in
// this excerpt.

// Advance an iterator by pos characters and read the byte offset it lands on.
439 auto const itr =
begin() + pos;
440 auto const spos = itr.byte_offset();
// Passes the string's data pointer and bytepos to characters_in_string and returns
// the result.
448 return strings::detail::characters_in_string(
data(), bytepos);