string_view.cuh
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/strings/detail/utf8.hpp>
10 #include <cudf/utilities/export.hpp>
11 
12 #ifndef __CUDA_ARCH__
13 #include <cudf/utilities/error.hpp>
14 #endif
15 
16 // This is defined when including this header in a https://github.com/NVIDIA/jitify
17 // or jitify2 source file. The jitify cannot include thrust headers at this time.
18 #ifndef CUDF_RUNTIME_JIT
19 #include <thrust/count.h>
20 #include <thrust/execution_policy.h>
21 #endif
22 
23 #include <cuda/std/functional>
24 #include <cuda/std/utility>
25 
26 // This file should only include device code logic.
27 // Host-only or host/device code should be defined in the string_view.hpp header file.
28 
29 namespace CUDF_EXPORT cudf {
30 namespace strings {
31 namespace detail {
32 
/**
 * @brief Count the characters in a UTF-8 encoded byte array.
 *
 * Every byte for which `is_begin_utf8_char` reports true contributes one
 * character to the total.
 *
 * @param str Pointer to the first byte; may be null
 * @param bytes Number of bytes to inspect
 * @return Number of characters found; 0 if `str` is null or `bytes` is 0
 */
__device__ inline size_type characters_in_string(char const* str, size_type bytes)
{
  if (str == nullptr || bytes == 0) { return 0; }
  auto const first = reinterpret_cast<uint8_t const*>(str);
  auto const last  = first + bytes;
#ifndef CUDF_RUNTIME_JIT
  auto const starts_character = [](uint8_t byte) { return is_begin_utf8_char(byte); };
  return thrust::count_if(thrust::seq, first, last, starts_character);
#else
  // thrust headers are not included in this configuration so count manually
  size_type total = 0;
  for (auto itr = first; itr < last; ++itr) {
    total += is_begin_utf8_char(*itr);
  }
  return total;
#endif
}
56 
/**
 * @brief Walk a string byte-by-byte to locate a character position.
 *
 * Each byte whose `bytes_in_utf8_byte` width is non-zero consumes one unit of
 * `pos` and adds that width to the running byte total. The walk ends when
 * `pos` reaches zero or the end of the string is reached.
 *
 * @param d_str String to walk
 * @param pos Character position to locate
 * @return Pair of (accumulated byte count, remaining unconsumed `pos`)
 */
__device__ inline cuda::std::pair<size_type, size_type> bytes_to_character_position(
  string_view d_str, size_type pos)
{
  auto const first = d_str.data();
  auto const last  = first + d_str.size_bytes();
  size_type nbytes = 0;
  for (auto itr = first; (pos > 0) && (itr < last); ++itr) {
    auto const width = strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(*itr));
    nbytes += width;
    if (width != 0) { --pos; }
  }
  return {nbytes, pos};
}
82 
// Bytes 0xF7 0xBF 0xBF 0xBF followed by the literal's terminating null.
// string_view::max() returns a 4-byte view over this array.
static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"};  // NOLINT
93 } // namespace detail
94 } // namespace strings
95 
/**
 * @brief Return a default-constructed string_view.
 */
CUDF_HOST_DEVICE inline string_view string_view::min()
{
  return string_view{};
}
105 
/**
 * @brief Return a 4-byte string_view over the `max_string_sentinel` bytes.
 *
 * In device code the `__constant__` array is addressed directly. In host code
 * the device address of the symbol is looked up with `cudaGetSymbolAddress`;
 * the call is wrapped in `CUDF_CUDA_TRY` so a failed lookup is reported
 * instead of silently returning a view over a null pointer.
 *
 * @return string_view referencing the sentinel bytes
 */
CUDF_HOST_DEVICE inline string_view string_view::max()
{
  char const* psentinel{nullptr};
#if defined(__CUDA_ARCH__)
  psentinel = &cudf::strings::detail::max_string_sentinel[0];
#else
  CUDF_CUDA_TRY(
    cudaGetSymbolAddress((void**)&psentinel, cudf::strings::detail::max_string_sentinel));
#endif
  return {psentinel, 4};
}
126 
/**
 * @brief Return the number of characters in this string.
 *
 * The count is computed with `characters_in_string` only while `_length`
 * still holds `UNKNOWN_STRING_LENGTH`; the result is stored in `_length`.
 */
__device__ inline size_type string_view::length() const
{
  if (_length != UNKNOWN_STRING_LENGTH) { return _length; }
  _length = strings::detail::characters_in_string(_data, _bytes);
  return _length;
}
133 
134 // @cond
135 // this custom iterator knows about UTF8 encoding
// Builds an iterator at character position `pos`; the matching byte position
// is obtained from str.byte_offset(pos).
__device__ inline string_view::const_iterator::const_iterator(string_view const& str, size_type pos)
  : p{str.data()},
    bytes{str.size_bytes()},
    char_pos{pos},
    byte_pos{str.byte_offset(pos)}
{
}
140 
// Builds an iterator from a character position and a caller-supplied byte
// offset; `offset` is stored as given with no recomputation.
__device__ inline string_view::const_iterator::const_iterator(string_view const& str,
                                                              size_type pos,
                                                              size_type offset)
  : p{str.data()},
    bytes{str.size_bytes()},
    char_pos{pos},
    byte_pos{offset}
{
}
147 
// Pre-increment: moves forward one character. The byte position only moves
// while it is still inside the string; the character position always moves.
__device__ inline string_view::const_iterator& string_view::const_iterator::operator++()
{
  if (byte_pos < bytes) {
    auto const width = strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[byte_pos]));
    // always step at least one byte so invalid UTF-8 data cannot stall the iterator
    byte_pos += cuda::std::max(1, width);
  }
  ++char_pos;
  return *this;
}
158 
// Post-increment: advances this iterator and returns a copy of its prior state.
__device__ inline string_view::const_iterator string_view::const_iterator::operator++(int)
{
  string_view::const_iterator previous = *this;
  ++(*this);
  return previous;
}
165 
// Returns a copy of this iterator moved by `offset` characters, one character
// at a time. A negative offset moves the copy backward.
__device__ inline string_view::const_iterator string_view::const_iterator::operator+(
  string_view::const_iterator::difference_type offset) const
{
  const_iterator result(*this);
  bool const forward = offset > 0;
  for (size_type steps = abs(offset); steps > 0; --steps) {
    if (forward) {
      ++result;
    } else {
      --result;
    }
  }
  return result;
}
175 
// Moves this iterator by `offset` characters, one character at a time.
// A negative offset moves backward.
__device__ inline string_view::const_iterator& string_view::const_iterator::operator+=(
  string_view::const_iterator::difference_type offset)
{
  bool const forward = offset > 0;
  for (size_type steps = abs(offset); steps > 0; --steps) {
    if (forward) {
      operator++();
    } else {
      operator--();
    }
  }
  return *this;
}
184 
// Pre-decrement: moves back one character. The byte position only moves while
// it is greater than zero; the character position always moves.
__device__ inline string_view::const_iterator& string_view::const_iterator::operator--()
{
  if (byte_pos > 0) {
    if (byte_pos == char_pos) {
      // byte and character positions agree so every preceding character
      // occupied a single byte; step back exactly one byte
      --byte_pos;
    } else {
      // Step back over bytes reporting a zero width until a byte with a
      // non-zero width is found. The byte_pos > 0 check stops the scan at the
      // first byte of the string so invalid UTF-8 data cannot cause a read
      // before p[0].
      while ((byte_pos > 0) &&
             (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[--byte_pos])) == 0)) {}
    }
  }
  --char_pos;
  return *this;
}
198 
// Post-decrement: moves this iterator back and returns a copy of its prior state.
__device__ inline string_view::const_iterator string_view::const_iterator::operator--(int)
{
  string_view::const_iterator previous = *this;
  --(*this);
  return previous;
}
205 
// Moves this iterator back by `offset` characters, one character at a time.
// A negative offset moves forward.
__device__ inline string_view::const_iterator& string_view::const_iterator::operator-=(
  string_view::const_iterator::difference_type offset)
{
  bool const backward = offset > 0;
  for (size_type steps = abs(offset); steps > 0; --steps) {
    if (backward) {
      operator--();
    } else {
      operator++();
    }
  }
  return *this;
}
214 
// Returns a copy of this iterator moved back by `offset` characters, one
// character at a time. A negative offset moves the copy forward.
__device__ inline string_view::const_iterator string_view::const_iterator::operator-(
  string_view::const_iterator::difference_type offset) const
{
  const_iterator result(*this);
  bool const backward = offset > 0;
  for (size_type steps = abs(offset); steps > 0; --steps) {
    if (backward) {
      --result;
    } else {
      ++result;
    }
  }
  return result;
}
224 
// Repositions this iterator at character position `new_pos` by stepping
// relative to the current position rather than recounting from the start.
__device__ inline string_view::const_iterator& string_view::const_iterator::move_to(
  size_type new_pos)
{
  return operator+=(new_pos - char_pos);
}
231 
// Equal when both iterators share the same data pointer and character position.
__device__ inline bool string_view::const_iterator::operator==(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos == rhs.char_pos;
}
237 
// Not equal when either the data pointer or the character position differs.
__device__ inline bool string_view::const_iterator::operator!=(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return true; }
  return char_pos != rhs.char_pos;
}
243 
// Ordered before rhs only when both iterators share the same data pointer;
// iterators over different data always compare false.
__device__ inline bool string_view::const_iterator::operator<(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos < rhs.char_pos;
}
249 
// Ordered before or equal to rhs only when both iterators share the same data
// pointer; iterators over different data always compare false.
__device__ inline bool string_view::const_iterator::operator<=(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos <= rhs.char_pos;
}
255 
// Ordered after rhs only when both iterators share the same data pointer;
// iterators over different data always compare false.
__device__ inline bool string_view::const_iterator::operator>(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos > rhs.char_pos;
}
261 
// Ordered after or equal to rhs only when both iterators share the same data
// pointer; iterators over different data always compare false.
__device__ inline bool string_view::const_iterator::operator>=(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos >= rhs.char_pos;
}
267 
// Decodes and returns the character at the iterator's current byte position.
__device__ inline char_utf8 string_view::const_iterator::operator*() const
{
  char_utf8 decoded = 0;
  auto const current = p + byte_pos;
  strings::detail::to_char_utf8(current, decoded);
  return decoded;
}
274 
// Returns the iterator's current character position.
__device__ inline size_type string_view::const_iterator::position() const
{
  return char_pos;
}
276 
// Returns the iterator's current byte position.
__device__ inline size_type string_view::const_iterator::byte_offset() const
{
  return byte_pos;
}
278 
// Returns an iterator at character position 0 and byte offset 0.
__device__ inline string_view::const_iterator string_view::begin() const
{
  return const_iterator{*this, 0, 0};
}
280 
// Returns an iterator positioned at length() characters and size_bytes() bytes.
__device__ inline string_view::const_iterator string_view::end() const
{
  auto const nchars = length();
  auto const nbytes = size_bytes();
  return const_iterator{*this, nchars, nbytes};
}
285 // @endcond
286 
/**
 * @brief Return the character at the given character position.
 *
 * @param pos Character position to read
 * @return The decoded character, or 0 when the computed byte offset is not
 *         less than the size of the string in bytes
 */
__device__ inline char_utf8 string_view::operator[](size_type pos) const
{
  auto const offset = byte_offset(pos);
  char_utf8 decoded = 0;
  if (offset < _bytes) { strings::detail::to_char_utf8(data() + offset, decoded); }
  return decoded;
}
295 
/**
 * @brief Return the byte offset for a given character position.
 *
 * @param pos Character position
 * @return `pos` itself when the character count equals the byte count;
 *         otherwise the byte count from `bytes_to_character_position`
 */
__device__ inline size_type string_view::byte_offset(size_type pos) const
{
  bool const one_byte_per_char = (length() == size_bytes());
  if (one_byte_per_char) { return pos; }
  auto const located = strings::detail::bytes_to_character_position(*this, pos);
  return located.first;
}
301 
// Compares this string with `in` by forwarding its data pointer and byte size
// to the (pointer, bytes) overload.
__device__ inline int string_view::compare(string_view const& in) const
{
  auto const* other_data = in.data();
  auto const other_bytes = in.size_bytes();
  return compare(other_data, other_bytes);
}
306 
/**
 * @brief Compare this string with a byte array, byte by byte as unsigned values.
 *
 * @param data Pointer to the bytes to compare against
 * @param bytes Number of bytes in `data`
 * @return 0 if both have the same bytes and size; the difference of the first
 *         pair of differing bytes; otherwise 1 if this string is longer or
 *         -1 if it is shorter
 */
__device__ inline int string_view::compare(char const* data, size_type bytes) const
{
  size_type const lhs_bytes = size_bytes();
  auto const* lhs           = reinterpret_cast<unsigned char const*>(this->data());
  auto const* rhs           = reinterpret_cast<unsigned char const*>(data);
  // same memory and same size: nothing to compare
  if ((lhs == rhs) && (bytes == lhs_bytes)) { return 0; }
  size_type idx = 0;
  while ((idx < lhs_bytes) && (idx < bytes)) {
    auto const left  = lhs[idx];
    auto const right = rhs[idx];
    if (left != right) { return static_cast<int32_t>(left) - static_cast<int32_t>(right); }
    ++idx;
  }
  // all shared bytes matched; the longer string orders last
  if (idx < lhs_bytes) { return 1; }
  return (idx < bytes) ? -1 : 0;
}
323 
// True when both strings have the same byte size and compare() reports 0.
// The size check runs first so strings of different size skip the byte compare.
__device__ inline bool string_view::operator==(string_view const& rhs) const
{
  if (size_bytes() != rhs.size_bytes()) { return false; }
  return compare(rhs) == 0;
}
328 
// True when compare() reports a non-zero result.
__device__ inline bool string_view::operator!=(string_view const& rhs) const
{
  int const order = compare(rhs);
  return order != 0;
}
333 
// True when compare() reports a negative result.
__device__ inline bool string_view::operator<(string_view const& rhs) const
{
  int const order = compare(rhs);
  return order < 0;
}
338 
// True when compare() reports a positive result.
__device__ inline bool string_view::operator>(string_view const& rhs) const
{
  int const order = compare(rhs);
  return order > 0;
}
343 
// True when compare() reports zero or a negative result.
__device__ inline bool string_view::operator<=(string_view const& rhs) const
{
  return compare(rhs) <= 0;
}
349 
// True when compare() reports zero or a positive result.
__device__ inline bool string_view::operator>=(string_view const& rhs) const
{
  return compare(rhs) >= 0;
}
355 
// Forwards the target's data pointer and byte size to the
// (pointer, bytes, pos, count) overload of find.
__device__ inline size_type string_view::find(string_view const& str,
                                              size_type pos,
                                              size_type count) const
{
  auto const* target      = str.data();
  auto const target_bytes = str.size_bytes();
  return find(target, target_bytes, pos, count);
}
362 
/**
 * @brief Shared implementation for find and rfind.
 *
 * Searches the character range [pos, pos + count) of this string for the
 * target bytes. The range runs to the end of the string when `count` is
 * negative or reaches past the end.
 *
 * @tparam forward True to scan from the start of the range toward the end;
 *         false to scan from the end of the range toward the start
 * @param str Target bytes to search for
 * @param bytes Number of bytes in `str`
 * @param pos Character position where the search range begins
 * @param count Number of characters in the search range
 * @return Character position of the match, or `npos` if `str` is null, `pos`
 *         is negative or greater than length(), or no match is found
 */
template <bool forward>
__device__ inline size_type string_view::find_impl(char const* str,
                                                   size_type bytes,
                                                   size_type pos,
                                                   size_type count) const
{
  if (!str || pos < 0) { return npos; }
  // pos == 0 is always in range so length() is only called when needed
  if (pos > 0 && pos > length()) { return npos; }

  // use iterator to help reduce character/byte counting
  auto const itr  = begin() + pos;
  auto const spos = itr.byte_offset();
  // Compare count against the characters remaining after pos rather than
  // computing (pos + count), which can overflow size_type for a large count.
  // pos <= length() is guaranteed here so the subtraction cannot go negative.
  auto const epos =
    (count >= 0) && (count < (length() - pos)) ? (itr + count).byte_offset() : size_bytes();

  // number of byte positions in the range where the target could start
  auto const find_length = (epos - spos) - bytes + 1;
  auto const d_target    = string_view{str, bytes};

  auto ptr = data() + (forward ? spos : (epos - bytes));
  for (size_type idx = 0; idx < find_length; ++idx) {
    if (d_target.compare(ptr, bytes) == 0) {
      // forward: pos has tracked the character position of ptr
      // backward: convert the matching byte offset to a character position
      return forward ? pos : character_offset(epos - bytes - idx);
    }
    // use pos to record the current find position
    pos += strings::detail::is_begin_utf8_char(*ptr);
    forward ? ++ptr : --ptr;
  }
  return npos;
}
392 
// Searches for the target bytes using the forward instantiation of find_impl.
__device__ inline size_type string_view::find(char const* str,
                                              size_type bytes,
                                              size_type pos,
                                              size_type count) const
{
  constexpr bool search_forward = true;
  return find_impl<search_forward>(str, bytes, pos, count);
}
400 
// Encodes `chr` into a local byte buffer with from_char_utf8 and searches for
// those bytes with the (pointer, bytes, pos, count) overload of find.
__device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size_type count) const
{
  char encoded[sizeof(char_utf8)];  // NOLINT
  size_type const width = strings::detail::from_char_utf8(chr, encoded);
  return find(encoded, width, pos, count);
}
407 
// Forwards the target's data pointer and byte size to the
// (pointer, bytes, pos, count) overload of rfind.
__device__ inline size_type string_view::rfind(string_view const& str,
                                               size_type pos,
                                               size_type count) const
{
  auto const* target      = str.data();
  auto const target_bytes = str.size_bytes();
  return rfind(target, target_bytes, pos, count);
}
414 
// Searches for the target bytes using the backward instantiation of find_impl.
__device__ inline size_type string_view::rfind(char const* str,
                                               size_type bytes,
                                               size_type pos,
                                               size_type count) const
{
  constexpr bool search_forward = false;
  return find_impl<search_forward>(str, bytes, pos, count);
}
422 
// Encodes `chr` into a local byte buffer with from_char_utf8 and searches for
// those bytes with the (pointer, bytes, pos, count) overload of rfind.
__device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, size_type count) const
{
  char encoded[sizeof(char_utf8)];  // NOLINT
  size_type const width = strings::detail::from_char_utf8(chr, encoded);
  return rfind(encoded, width, pos, count);
}
429 
/**
 * @brief Return a sub-string of this string.
 *
 * Both parameters are character position values, not byte values.
 *
 * @param pos Character position where the sub-string begins
 * @param count Number of characters to include; a negative value, or a value
 *        reaching past the end of this string, selects everything from `pos`
 *        to the end
 * @return View over the selected characters, or an empty string_view when
 *         `pos` is negative or not less than length()
 */
__device__ inline string_view string_view::substr(size_type pos, size_type count) const
{
  auto const nchars = length();
  if (pos < 0 || pos >= nchars) { return string_view{}; }
  auto const spos = begin() + pos;
  // Only walk the iterator when the requested range ends inside the string.
  // Walking past the end keeps incrementing the iterator's character position
  // even though no characters remain, which would record a wrong length below
  // and iterate needlessly for a large count.
  auto const ends_inside = (count >= 0) && (count < (nchars - pos));
  auto const epos = ends_inside ? (spos + count) : const_iterator{*this, nchars, size_bytes()};
  auto ss = string_view{data() + spos.byte_offset(), epos.byte_offset() - spos.byte_offset()};
  // this potentially saves redundant character counting downstream
  if (_length != UNKNOWN_STRING_LENGTH) { ss._length = epos.position() - spos.position(); }
  return ss;
}
441 
/**
 * @brief Return the character position for a given byte offset.
 *
 * @param bytepos Byte offset from data()
 * @return `bytepos` itself when the character count equals the byte count;
 *         otherwise the number of characters in the first `bytepos` bytes
 */
__device__ inline size_type string_view::character_offset(size_type bytepos) const
{
  bool const one_byte_per_char = (length() == size_bytes());
  return one_byte_per_char ? bytepos
                           : strings::detail::characters_in_string(data(), bytepos);
}
447 
448 } // namespace CUDF_EXPORT cudf
Handy iterator for navigating through encoded characters.
Definition: string_view.hpp:64
A non-owning, immutable view of device data that is a variable length char array representing a UTF-8...
Definition: string_view.hpp:33
CUDF_HOST_DEVICE size_type size_bytes() const
Return the number of bytes in this string.
Definition: string_view.hpp:40
size_type rfind(string_view const &str, size_type pos=0, size_type count=-1) const
Returns the character position of the last occurrence where the argument str is found in this string ...
CUDF_HOST_DEVICE string_view()
Default constructor represents an empty string.
size_type length() const
Return the number of characters in this string.
string_view substr(size_type start, size_type length) const
Return a sub-string of this string. The original string and device memory must still be maintained fo...
bool operator==(string_view const &rhs) const
Returns true if rhs matches this string exactly.
const_iterator end() const
Return new iterator pointing past the end of this string.
int compare(string_view const &str) const
Comparing target string with this string. Each character is compared as a UTF-8 code-point value.
bool operator>=(string_view const &rhs) const
Returns true if rhs matches or is ordered before this string.
CUDF_HOST_DEVICE char const * data() const
Return a pointer to the internal device array.
Definition: string_view.hpp:52
size_type byte_offset(size_type pos) const
Return the byte offset from data() for a given character position.
const_iterator begin() const
Return new iterator pointing to the beginning of this string.
char_utf8 operator[](size_type pos) const
Return single UTF-8 character at the given character position.
bool operator!=(string_view const &rhs) const
Returns true if rhs does not match this string.
size_type find(string_view const &str, size_type pos=0, size_type count=-1) const
Returns the character position of the first occurrence where the argument str is found in this string...
bool operator<=(string_view const &rhs) const
Returns true if this string matches or is ordered before rhs.
bool operator<(string_view const &rhs) const
Returns true if this string is ordered before rhs.
bool operator>(string_view const &rhs) const
Returns true if rhs is ordered before this string.
static cudf::size_type const npos
No-position value.
bool operator==(polymorphic_allocator< T > const &lhs, polymorphic_allocator< U > const &rhs)
bool operator!=(polymorphic_allocator< T > const &lhs, polymorphic_allocator< U > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator-(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator>=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator<=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator*(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator>(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator+(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator<(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
#define CUDF_CUDA_TRY(call)
Error checking macro for CUDA runtime API functions.
Definition: error.hpp:229
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:84
uint32_t char_utf8
UTF-8 characters are 1-4 bytes.
Definition: string_view.hpp:20
cuDF interfaces
Definition: host_udf.hpp:26
Class definition for cudf::string_view.
#define CUDF_HOST_DEVICE
Indicates that the function or method is usable on host and device.
Definition: types.hpp:21