string_view.cuh
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/strings/detail/utf8.hpp>
10 #include <cudf/utilities/export.hpp>
11 
12 #ifndef __CUDA_ARCH__
13 #include <cudf/utilities/error.hpp>
14 #endif
15 
16 // This is defined when compiling with NVRTC. NVRTC cannot include thrust headers at this time.
17 #ifndef __CUDACC_RTC__
18 #include <thrust/count.h>
19 #include <thrust/execution_policy.h>
20 #endif
21 
22 #include <cuda/std/functional>
23 #include <cuda/std/utility>
24 
25 // This file should only include device code logic.
26 // Host-only or host/device code should be defined in the string_view.hpp header file.
27 
28 namespace CUDF_EXPORT cudf {
29 namespace strings {
30 namespace detail {
31 
/**
 * @brief Counts the UTF-8 characters contained in a range of bytes.
 *
 * Each byte for which `is_begin_utf8_char` reports true contributes one
 * character to the total.
 *
 * @param str Pointer to the first byte to inspect; may be null
 * @param bytes Number of bytes to inspect
 * @return The character count, or 0 when `str` is null or `bytes` is 0
 */
__device__ inline size_type characters_in_string(char const* str, size_type bytes)
{
  if (!str || !bytes) { return 0; }
  auto const first = reinterpret_cast<uint8_t const*>(str);
  auto const last  = first + bytes;
#ifndef __CUDACC_RTC__
  return thrust::count_if(
    thrust::seq, first, last, [](uint8_t byte) { return is_begin_utf8_char(byte); });
#else
  // thrust headers are not included for NVRTC builds, so count with a plain loop
  size_type total = 0;
  for (auto itr = first; itr < last; ++itr) {
    total += is_begin_utf8_char(*itr);
  }
  return total;
#endif
}
55 
/**
 * @brief Walks the string byte by byte to locate a character position.
 *
 * Scanning stops once `pos` characters have been consumed or the end of the
 * string has been reached, whichever comes first.
 *
 * @param d_str String to scan
 * @param pos Number of characters to step over
 * @return Pair of (sum of the byte widths of the characters stepped over,
 *         number of requested characters that were not consumed)
 */
__device__ inline cuda::std::pair<size_type, size_type> bytes_to_character_position(
  string_view d_str, size_type pos)
{
  size_type nbytes = 0;
  auto const first = d_str.data();
  auto const last  = first + d_str.size_bytes();
  for (auto itr = first; (pos > 0) && (itr < last); ++itr) {
    auto const width = strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(*itr));
    nbytes += width;
    // a zero width byte does not start a character, so it consumes no position
    if (width != 0) { --pos; }
  }
  return {nbytes, pos};
}
81 
// Constant-memory bytes F7 BF BF BF (plus terminating NUL) that back string_view::max().
static __constant__ char max_string_sentinel[5] = "\xF7\xBF\xBF\xBF";  // NOLINT
92 } // namespace detail
93 } // namespace strings
94 
/**
 * @brief Returns the minimum string_view value.
 *
 * @return A default-constructed (empty) string_view
 */
CUDF_HOST_DEVICE inline string_view string_view::min()
{
  return string_view{};
}
104 
/**
 * @brief Returns the maximum string_view value.
 *
 * The returned view refers to the 4 sentinel bytes stored in the
 * `max_string_sentinel` constant-memory symbol.
 *
 * In device code the symbol is addressed directly. In host code the device
 * address is obtained with `cudaGetSymbolAddress`; a failure of that call is
 * reported through `CUDF_CUDA_TRY`.
 *
 * @return A string_view over the 4-byte sentinel
 */
CUDF_HOST_DEVICE inline string_view string_view::max()
{
  char const* psentinel{nullptr};
#if defined(__CUDA_ARCH__)
  psentinel = &cudf::strings::detail::max_string_sentinel[0];
#else
  // host side: resolve the device address of the constant-memory symbol
  CUDF_CUDA_TRY(
    cudaGetSymbolAddress((void**)&psentinel, cudf::strings::detail::max_string_sentinel));
#endif
  return {psentinel, 4};
}
125 
// Returns the number of characters in this string.
// The count is computed on first use and cached in _length for later calls.
__device__ inline size_type string_view::length() const
{
  if (_length != UNKNOWN_STRING_LENGTH) { return _length; }
  _length = strings::detail::characters_in_string(_data, _bytes);
  return _length;
}
132 
133 // @cond
134 // this custom iterator knows about UTF8 encoding
// Builds an iterator at character position `pos`; the matching byte offset
// is looked up from the string itself.
__device__ inline string_view::const_iterator::const_iterator(string_view const& str, size_type pos)
  : const_iterator(str, pos, str.byte_offset(pos))
{
}
139 
// Builds an iterator from an already known character position / byte offset pair.
// No validation is done here: `offset` is stored as given.
__device__ inline string_view::const_iterator::const_iterator(string_view const& str,
                                                              size_type pos,
                                                              size_type offset)
  : p(str.data()), bytes(str.size_bytes()), char_pos(pos), byte_pos(offset)
{
}
146 
// Pre-increment: advance to the next character.
// The byte offset only moves while it is inside the string; the character
// position is always incremented.
__device__ inline string_view::const_iterator& string_view::const_iterator::operator++()
{
  if (byte_pos < bytes) {
    auto const lead  = static_cast<uint8_t>(p[byte_pos]);
    auto const width = strings::detail::bytes_in_utf8_byte(lead);
    // step at least one byte so invalid UTF-8 data cannot stall the iterator
    byte_pos += cuda::std::max(1, width);
  }
  ++char_pos;
  return *this;
}
157 
// Post-increment: advance this iterator and return its previous value.
__device__ inline string_view::const_iterator string_view::const_iterator::operator++(int)
{
  string_view::const_iterator previous(*this);
  ++(*this);
  return previous;
}
164 
// Returns a copy of this iterator moved by `offset` characters.
// A positive offset steps forward, a negative one steps backward; the copy is
// moved one character at a time.
__device__ inline string_view::const_iterator string_view::const_iterator::operator+(
  string_view::const_iterator::difference_type offset) const
{
  const_iterator result(*this);
  bool const forward = offset > 0;
  for (size_type steps = abs(offset); steps > 0; --steps) {
    if (forward) {
      ++result;
    } else {
      --result;
    }
  }
  return result;
}
174 
// Moves this iterator by `offset` characters in place.
// A positive offset steps forward, a negative one steps backward.
__device__ inline string_view::const_iterator& string_view::const_iterator::operator+=(
  string_view::const_iterator::difference_type offset)
{
  bool const forward = offset > 0;
  for (size_type steps = abs(offset); steps > 0; --steps) {
    if (forward) {
      operator++();
    } else {
      operator--();
    }
  }
  return *this;
}
183 
// Pre-decrement: move back to the previous character.
// The byte offset only moves while it is greater than 0; the character
// position is always decremented.
__device__ inline string_view::const_iterator& string_view::const_iterator::operator--()
{
  if (byte_pos > 0) {
    if (byte_pos == char_pos) {
      // byte and character positions agree, so a single byte step is taken
      // (presumably every preceding character is one byte wide)
      --byte_pos;
    } else {
      // back up until reaching a byte whose reported width is non-zero
      do {
        --byte_pos;
      } while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[byte_pos])) == 0);
    }
  }
  --char_pos;
  return *this;
}
197 
// Post-decrement: step this iterator back and return its previous value.
__device__ inline string_view::const_iterator string_view::const_iterator::operator--(int)
{
  string_view::const_iterator previous(*this);
  --(*this);
  return previous;
}
204 
// Moves this iterator back by `offset` characters in place.
// A positive offset steps backward, a negative one steps forward.
__device__ inline string_view::const_iterator& string_view::const_iterator::operator-=(
  string_view::const_iterator::difference_type offset)
{
  bool const backward = offset > 0;
  for (size_type steps = abs(offset); steps > 0; --steps) {
    if (backward) {
      operator--();
    } else {
      operator++();
    }
  }
  return *this;
}
213 
// Returns a copy of this iterator moved back by `offset` characters.
// A positive offset steps backward, a negative one steps forward.
__device__ inline string_view::const_iterator string_view::const_iterator::operator-(
  string_view::const_iterator::difference_type offset) const
{
  const_iterator result(*this);
  bool const backward = offset > 0;
  for (size_type steps = abs(offset); steps > 0; --steps) {
    if (backward) {
      --result;
    } else {
      ++result;
    }
  }
  return result;
}
223 
// Repositions this iterator at character `new_pos`.
// The move is relative to the current position, which avoids recounting
// from the start of the string.
__device__ inline string_view::const_iterator& string_view::const_iterator::move_to(
  size_type new_pos)
{
  return operator+=(new_pos - char_pos);
}
230 
// Iterators are equal when they share the same data pointer and character position.
__device__ inline bool string_view::const_iterator::operator==(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos == rhs.char_pos;
}
236 
// Iterators differ when either the data pointer or the character position differs.
__device__ inline bool string_view::const_iterator::operator!=(
  string_view::const_iterator const& rhs) const
{
  return !((p == rhs.p) && (char_pos == rhs.char_pos));
}
242 
// True only when both iterators share the same data pointer and this one
// is at an earlier character position.
__device__ inline bool string_view::const_iterator::operator<(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos < rhs.char_pos;
}
248 
// True only when both iterators share the same data pointer and this one
// is at the same or an earlier character position.
__device__ inline bool string_view::const_iterator::operator<=(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos <= rhs.char_pos;
}
254 
// True only when both iterators share the same data pointer and this one
// is at a later character position.
__device__ inline bool string_view::const_iterator::operator>(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos > rhs.char_pos;
}
260 
// True only when both iterators share the same data pointer and this one
// is at the same or a later character position.
__device__ inline bool string_view::const_iterator::operator>=(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos >= rhs.char_pos;
}
266 
// Decodes and returns the character found at the iterator's current byte offset.
__device__ inline char_utf8 string_view::const_iterator::operator*() const
{
  char_utf8 decoded{0};
  strings::detail::to_char_utf8(p + byte_pos, decoded);
  return decoded;
}
273 
// Returns the iterator's current character position.
__device__ inline size_type string_view::const_iterator::position() const
{
  return char_pos;
}
275 
// Returns the iterator's current byte offset from the start of the string data.
__device__ inline size_type string_view::const_iterator::byte_offset() const
{
  return byte_pos;
}
277 
// Returns an iterator at character 0 / byte 0 of this string.
__device__ inline string_view::const_iterator string_view::begin() const
{
  return const_iterator{*this, 0, 0};
}
279 
// Returns an iterator positioned one past the last character:
// character position length() and byte offset size_bytes().
__device__ inline string_view::const_iterator string_view::end() const
{
  auto const nchars = length();
  auto const nbytes = size_bytes();
  return const_iterator{*this, nchars, nbytes};
}
284 // @endcond
285 
// Returns the character at character position `pos`.
//
// Returns 0 when the position does not map to a byte inside the string.
// A negative `pos` is rejected too: for a string made of single-byte
// characters byte_offset() returns `pos` unchanged, which would otherwise
// lead to a read in front of the string data.
__device__ inline char_utf8 string_view::operator[](size_type pos) const
{
  size_type const offset = byte_offset(pos);
  if ((offset < 0) || (offset >= _bytes)) { return 0; }
  char_utf8 chr = 0;
  strings::detail::to_char_utf8(data() + offset, chr);
  return chr;
}
294 
// Converts a character position into a byte offset from data().
__device__ inline size_type string_view::byte_offset(size_type pos) const
{
  // only scan the bytes when the character count differs from the byte count;
  // otherwise the character index is used directly as the byte index
  if (length() != size_bytes()) {
    return strings::detail::bytes_to_character_position(*this, pos).first;
  }
  return pos;
}
300 
// Compares this string with `in`; forwards to the pointer/size overload.
__device__ inline int string_view::compare(string_view const& in) const
{
  auto const* other_data  = in.data();
  auto const other_nbytes = in.size_bytes();
  return compare(other_data, other_nbytes);
}
305 
// Byte-wise comparison of this string against `bytes` bytes starting at `data`.
//
// Returns 0 when both are equal, the difference of the first mismatching
// bytes (as unsigned values) when they differ inside the common prefix, and
// otherwise 1 if this string is longer or -1 if it is shorter.
__device__ inline int string_view::compare(char const* data, size_type bytes) const
{
  auto const lhs_bytes = size_bytes();
  auto const* lhs      = reinterpret_cast<unsigned char const*>(this->data());
  auto const* rhs      = reinterpret_cast<unsigned char const*>(data);
  // same memory and same size: nothing to compare
  if ((lhs == rhs) && (lhs_bytes == bytes)) { return 0; }

  size_type idx = 0;
  while ((idx < lhs_bytes) && (idx < bytes)) {
    auto const left  = lhs[idx];
    auto const right = rhs[idx];
    if (left != right) { return static_cast<int32_t>(left) - static_cast<int32_t>(right); }
    ++idx;
  }
  // common prefix matched; the longer string orders last
  if (idx < lhs_bytes) { return 1; }
  if (idx < bytes) { return -1; }
  return 0;
}
322 
// True when `rhs` has the same byte size and compares equal to this string.
__device__ inline bool string_view::operator==(string_view const& rhs) const
{
  // differing sizes can never match, so skip the byte comparison
  if (size_bytes() != rhs.size_bytes()) { return false; }
  return compare(rhs) == 0;
}
327 
// True when this string does not compare equal to `rhs`.
__device__ inline bool string_view::operator!=(string_view const& rhs) const
{
  auto const rc = compare(rhs);
  return rc != 0;
}
332 
// True when compare() orders this string before `rhs`.
__device__ inline bool string_view::operator<(string_view const& rhs) const
{
  auto const rc = compare(rhs);
  return rc < 0;
}
337 
// True when compare() orders this string after `rhs`.
__device__ inline bool string_view::operator>(string_view const& rhs) const
{
  auto const rc = compare(rhs);
  return rc > 0;
}
342 
// True when this string compares equal to or orders before `rhs`.
__device__ inline bool string_view::operator<=(string_view const& rhs) const
{
  return compare(rhs) <= 0;
}
348 
// True when this string compares equal to or orders after `rhs`.
__device__ inline bool string_view::operator>=(string_view const& rhs) const
{
  return compare(rhs) >= 0;
}
354 
// Searches for `str` in this string; forwards to the pointer/size overload.
// `pos` and `count` are passed through unchanged.
__device__ inline size_type string_view::find(string_view const& str,
                                              size_type pos,
                                              size_type count) const
{
  auto const* target_data  = str.data();
  auto const target_nbytes = str.size_bytes();
  return find(target_data, target_nbytes, pos, count);
}
361 
// Shared implementation of find (forward == true) and rfind (forward == false).
//
// str    Bytes of the target to search for
// bytes  Number of bytes in `str`
// pos    Character position where the search range starts
// count  Number of characters in the search range; a negative value, or a
//        range reaching past length(), means "through the end of the string"
//
// Returns the character position of the match or npos if there is none.
// npos is also returned when `str` is null, `pos` is negative, or `pos` is
// greater than length().
template <bool forward>
__device__ inline size_type string_view::find_impl(char const* str,
                                                   size_type bytes,
                                                   size_type pos,
                                                   size_type count) const
{
  if ((str == nullptr) || (pos < 0)) { return npos; }
  if ((pos > 0) && (pos > length())) { return npos; }

  // the iterator converts character positions to byte offsets with less recounting
  auto const start_itr  = begin() + pos;
  auto const start_byte = start_itr.byte_offset();
  bool const bounded    = (count >= 0) && ((pos + count) < length());
  auto const end_byte   = bounded ? (start_itr + count).byte_offset() : size_bytes();

  // number of byte positions at which the target could still fit in the range
  auto const attempts = (end_byte - start_byte) - bytes + 1;
  auto const target   = string_view{str, bytes};

  // forward searches start at the front of the range, reverse ones at the last fit
  auto cursor = data() + (forward ? start_byte : (end_byte - bytes));
  for (size_type idx = 0; idx < attempts; ++idx) {
    if (target.compare(cursor, bytes) == 0) {
      if (forward) { return pos; }
      return character_offset(end_byte - bytes - idx);
    }
    // pos doubles as the running character position of the cursor
    pos += strings::detail::is_begin_utf8_char(*cursor);
    if (forward) {
      ++cursor;
    } else {
      --cursor;
    }
  }
  return npos;
}
391 
// Forward search for the `bytes` bytes at `str` within this string.
// All arguments are handed unchanged to find_impl<true>.
__device__ inline size_type string_view::find(char const* str,
                                              size_type bytes,
                                              size_type pos,
                                              size_type count) const
{
  constexpr bool search_forward = true;
  return find_impl<search_forward>(str, bytes, pos, count);
}
399 
// Forward search for a single character.
// The character is first written out as bytes and then searched for through
// the pointer/size overload.
__device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size_type count) const
{
  char encoded[sizeof(char_utf8)];  // NOLINT
  size_type const width = strings::detail::from_char_utf8(chr, encoded);
  return find(encoded, width, pos, count);
}
406 
// Reverse search for `str` in this string; forwards to the pointer/size overload.
// `pos` and `count` are passed through unchanged.
__device__ inline size_type string_view::rfind(string_view const& str,
                                               size_type pos,
                                               size_type count) const
{
  auto const* target_data  = str.data();
  auto const target_nbytes = str.size_bytes();
  return rfind(target_data, target_nbytes, pos, count);
}
413 
// Reverse search for the `bytes` bytes at `str` within this string.
// All arguments are handed unchanged to find_impl<false>.
__device__ inline size_type string_view::rfind(char const* str,
                                               size_type bytes,
                                               size_type pos,
                                               size_type count) const
{
  constexpr bool search_forward = false;
  return find_impl<search_forward>(str, bytes, pos, count);
}
421 
// Reverse search for a single character.
// The character is first written out as bytes and then searched for through
// the pointer/size overload.
__device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, size_type count) const
{
  char encoded[sizeof(char_utf8)];  // NOLINT
  size_type const width = strings::detail::from_char_utf8(chr, encoded);
  return rfind(encoded, width, pos, count);
}
428 
// Returns the sub-string starting at character `pos` and spanning `count` characters.
//
// Parameters are character position values.
// - An empty string_view is returned when `pos` is negative or not less than length().
// - A negative `count`, or a `count` reaching past the last character, selects
//   everything from `pos` through the end of the string.
//
// The returned view points into this string's data; no bytes are copied.
__device__ inline string_view string_view::substr(size_type pos, size_type count) const
{
  auto const nchars = length();
  if (pos < 0 || pos >= nchars) { return string_view{}; }
  auto const spos = begin() + pos;
  // Clamp to the end iterator when the request runs past the string. Stepping
  // an iterator beyond the end keeps incrementing its character position, which
  // would record an inflated length in the result (and iterate `count` times).
  bool const to_end = (count < 0) || (count >= (nchars - pos));
  auto const epos   = to_end ? const_iterator{*this, nchars, size_bytes()} : (spos + count);
  auto ss = string_view{data() + spos.byte_offset(), epos.byte_offset() - spos.byte_offset()};
  // this potentially saves redundant character counting downstream
  if (_length != UNKNOWN_STRING_LENGTH) { ss._length = epos.position() - spos.position(); }
  return ss;
}
440 
// Converts a byte offset from data() into a character position.
__device__ inline size_type string_view::character_offset(size_type bytepos) const
{
  // only count characters when the character count differs from the byte
  // count; otherwise the byte index is used directly as the character index
  if (length() != size_bytes()) {
    return strings::detail::characters_in_string(data(), bytepos);
  }
  return bytepos;
}
446 
447 } // namespace CUDF_EXPORT cudf
Handy iterator for navigating through encoded characters.
Definition: string_view.hpp:66
A non-owning, immutable view of device data that is a variable length char array representing a UTF-8 string.
Definition: string_view.hpp:35
CUDF_HOST_DEVICE size_type size_bytes() const
Return the number of bytes in this string.
Definition: string_view.hpp:42
size_type rfind(string_view const &str, size_type pos=0, size_type count=-1) const
Returns the character position of the last occurrence where the argument str is found in this string ...
CUDF_HOST_DEVICE string_view()
Default constructor represents an empty string.
size_type length() const
Return the number of characters in this string.
string_view substr(size_type start, size_type length) const
Return a sub-string of this string. The original string and device memory must still be maintained for the lifetime of the returned instance.
bool operator==(string_view const &rhs) const
Returns true if rhs matches this string exactly.
const_iterator end() const
Return new iterator pointing past the end of this string.
int compare(string_view const &str) const
Comparing target string with this string. Each character is compared as a UTF-8 code-point value.
bool operator>=(string_view const &rhs) const
Returns true if rhs matches or is ordered before this string.
CUDF_HOST_DEVICE char const * data() const
Return a pointer to the internal device array.
Definition: string_view.hpp:54
size_type byte_offset(size_type pos) const
Return the byte offset from data() for a given character position.
const_iterator begin() const
Return new iterator pointing to the beginning of this string.
char_utf8 operator[](size_type pos) const
Return single UTF-8 character at the given character position.
bool operator!=(string_view const &rhs) const
Returns true if rhs does not match this string.
size_type find(string_view const &str, size_type pos=0, size_type count=-1) const
Returns the character position of the first occurrence where the argument str is found in this string...
bool operator<=(string_view const &rhs) const
Returns true if this string matches or is ordered before rhs.
bool operator<(string_view const &rhs) const
Returns true if this string is ordered before rhs.
bool operator>(string_view const &rhs) const
Returns true if rhs is ordered before this string.
static cudf::size_type const npos
No-position value.
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator-(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator>=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator<=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator*(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator>(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator+(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator<(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
bool operator==(polymorphic_allocator< T > const &lhs, polymorphic_allocator< U > const &rhs)
bool operator!=(polymorphic_allocator< T > const &lhs, polymorphic_allocator< U > const &rhs)
std::unique_ptr< column > count(strings_column_view const &input, string_scalar const &target, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns the number of times the given target string matches in each string.
#define CUDF_CUDA_TRY(call)
Error checking macro for CUDA runtime API functions.
Definition: error.hpp:243
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:85
uint32_t char_utf8
UTF-8 characters are 1-4 bytes.
Definition: string_view.hpp:22
cuDF interfaces
Definition: host_udf.hpp:26
Class definition for cudf::string_view.
#define CUDF_HOST_DEVICE
Indicates that the function or method is usable on host and device.
Definition: types.hpp:21