19 #include <cudf/strings/detail/utf8.hpp>
29 #include <thrust/count.h>
30 #include <thrust/execution_policy.h>
51 if ((str ==
nullptr) || (bytes == 0))
return 0;
52 auto ptr =
reinterpret_cast<uint8_t const*
>(str);
54 return thrust::count_if(
55 thrust::seq, ptr, ptr + bytes, [](uint8_t chr) {
return is_begin_utf8_char(chr); });
58 auto const end = ptr + bytes;
60 chars += is_begin_utf8_char(*ptr++);
// Five-element char array declared with the CUDA `__constant__` qualifier.
// It holds the bytes 0xF7 0xBF 0xBF 0xBF followed by the string literal's
// terminating NUL (hence the size of 5).
// NOTE(review): judging by the name, this presumably acts as a sentinel that
// compares greater than any valid UTF-8 string -- confirm against its users.
75 static __constant__
char max_string_sentinel[5]{
"\xF7\xBF\xBF\xBF"};
100 const char* psentinel{
nullptr};
101 #if defined(__CUDA_ARCH__)
102 psentinel = &cudf::strings::detail::max_string_sentinel[0];
105 cudaGetSymbolAddress((
void**)&psentinel, cudf::strings::detail::max_string_sentinel));
112 if (_length == UNKNOWN_STRING_LENGTH)
113 _length = strings::detail::characters_in_string(_data, _bytes);
119 __device__
inline string_view::const_iterator::const_iterator(
const string_view& str,
size_type pos)
124 __device__
inline string_view::const_iterator& string_view::const_iterator::operator++()
126 if (byte_pos < bytes)
127 byte_pos += strings::detail::bytes_in_utf8_byte(
static_cast<uint8_t
>(p[byte_pos]));
132 __device__
inline string_view::const_iterator string_view::const_iterator::operator++(
int)
134 string_view::const_iterator tmp(*
this);
140 string_view::const_iterator::difference_type offset)
142 const_iterator tmp(*
this);
145 offset > 0 ? ++tmp : --tmp;
149 __device__
inline string_view::const_iterator& string_view::const_iterator::operator+=(
150 string_view::const_iterator::difference_type offset)
154 offset > 0 ? operator++() : operator--();
158 __device__
inline string_view::const_iterator& string_view::const_iterator::operator--()
161 while (strings::detail::bytes_in_utf8_byte(
static_cast<uint8_t
>(p[--byte_pos])) == 0)
167 __device__
inline string_view::const_iterator string_view::const_iterator::operator--(
int)
169 string_view::const_iterator tmp(*
this);
174 __device__
inline string_view::const_iterator& string_view::const_iterator::operator-=(
175 string_view::const_iterator::difference_type offset)
179 offset > 0 ? operator--() : operator++();
184 string_view::const_iterator::difference_type offset)
186 const_iterator tmp(*
this);
189 offset > 0 ? --tmp : ++tmp;
194 const string_view::const_iterator& rhs)
const
196 return (p == rhs.p) && (char_pos == rhs.char_pos);
200 const string_view::const_iterator& rhs)
const
202 return (p != rhs.p) || (char_pos != rhs.char_pos);
206 const string_view::const_iterator& rhs)
const
208 return (p == rhs.p) && (char_pos < rhs.char_pos);
212 const string_view::const_iterator& rhs)
const
214 return (p == rhs.p) && (char_pos <= rhs.char_pos);
218 const string_view::const_iterator& rhs)
const
220 return (p == rhs.p) && (char_pos > rhs.char_pos);
224 const string_view::const_iterator& rhs)
const
226 return (p == rhs.p) && (char_pos >= rhs.char_pos);
232 strings::detail::to_char_utf8(p +
byte_offset(), chr);
// Device-side accessor: returns the iterator's current `char_pos` member
// without modifying the iterator (const member function).
// `char_pos` is the value the iterator comparison operators in this file
// compare between two iterators.
// NOTE(review): presumably a character index (not a byte index) into the
// underlying string -- confirm against the class declaration.
236 __device__
inline size_type string_view::const_iterator::position()
const {
return char_pos; }
// Device-side accessor: returns the iterator's current `byte_pos` member
// without modifying the iterator (const member function).
// `byte_pos` is the index used to read from the iterator's character
// pointer `p`; operator++ advances it by the byte width reported by
// bytes_in_utf8_byte for the byte at that index.
238 __device__
inline size_type string_view::const_iterator::byte_offset()
const {
return byte_pos; }
242 return const_iterator(*
this, 0);
247 return const_iterator(*
this,
length());
254 if (offset >= _bytes)
return 0;
256 strings::detail::to_char_utf8(
data() + offset, chr);
263 const char* sptr = _data;
264 const char* eptr = sptr + _bytes;
266 while ((pos > 0) && (sptr < eptr)) {
267 size_type charbytes = strings::detail::bytes_in_utf8_byte(
static_cast<uint8_t
>(*sptr++));
268 if (charbytes) --pos;
282 const auto* ptr1 =
reinterpret_cast<const unsigned char*
>(this->
data());
283 const auto* ptr2 =
reinterpret_cast<const unsigned char*
>(
data);
284 if ((ptr1 == ptr2) && (bytes == len1))
return 0;
286 for (; (idx < len1) && (idx < bytes); ++idx) {
287 if (*ptr1 != *ptr2)
return static_cast<int32_t
>(*ptr1) -
static_cast<int32_t
>(*ptr2);
291 if (idx < len1)
return 1;
292 if (idx < bytes)
return -1;
319 return (rc == 0) || (rc < 0);
325 return (rc == 0) || (rc > 0);
335 template <
bool forward>
336 __device__
inline size_type string_view::find_impl(
const char* str,
341 if (!str || pos < 0)
return npos;
342 auto const nchars =
length();
343 if (count < 0) count = nchars;
345 auto const epos =
byte_offset(std::min(pos + count, nchars));
347 auto const find_length = (epos - spos) - bytes + 1;
349 auto ptr =
data() + (forward ? spos : (epos - bytes));
350 for (
size_type idx = 0; idx < find_length; ++idx) {
352 for (
size_type jdx = 0; match && (jdx < bytes); ++jdx) {
353 match = (ptr[jdx] == str[jdx]);
355 if (match) {
return character_offset(forward ? (idx + spos) : (epos - bytes - idx)); }
356 forward ? ++ptr : --ptr;
366 return find_impl<true>(str, bytes, pos, count);
372 size_type chwidth = strings::detail::from_char_utf8(chr, str);
373 return find(str, chwidth, pos, count);
388 return find_impl<false>(str, bytes, pos, count);
394 size_type chwidth = strings::detail::from_char_utf8(chr, str);
395 return rfind(str, chwidth, pos, count);
411 return strings::detail::characters_in_string(
data(), bytepos);