All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
string_view.cuh
1 /*
2  * Copyright (c) 2019-2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/strings/detail/utf8.hpp>
21 #include <cudf/utilities/export.hpp>
22 
23 #ifndef __CUDA_ARCH__
24 #include <cudf/utilities/error.hpp>
25 #endif
26 
27 // This is defined when including this header in a https://github.com/NVIDIA/jitify
28 // or jitify2 source file. The jitify cannot include thrust headers at this time.
29 #ifndef CUDF_JIT_UDF
30 #include <thrust/count.h>
31 #include <thrust/execution_policy.h>
32 #endif
33 
34 #include <cuda/std/utility>
35 
36 #include <algorithm>
37 
38 // This file should only include device code logic.
39 // Host-only or host/device code should be defined in the string_view.hpp header file.
40 
41 namespace CUDF_EXPORT cudf {
42 namespace strings {
43 namespace detail {
44 
/**
 * @brief Count the UTF-8 characters held in a byte array.
 *
 * Each byte for which `is_begin_utf8_char` is true contributes one character.
 *
 * @param str Pointer to the first byte to inspect; may be null
 * @param bytes Number of bytes to inspect starting at `str`
 * @return Number of character-start bytes found; 0 when `str` is null or `bytes` is 0
 */
__device__ inline size_type characters_in_string(char const* str, size_type bytes)
{
  if (!str || (bytes == 0)) { return 0; }
  auto const first = reinterpret_cast<uint8_t const*>(str);
  auto const last  = first + bytes;
#ifndef CUDF_JIT_UDF
  auto const starts_char = [](uint8_t byte) { return is_begin_utf8_char(byte); };
  return thrust::count_if(thrust::seq, first, last, starts_char);
#else
  // thrust headers are not included for JIT builds, so count with a plain loop
  size_type total = 0;
  for (auto cur = first; cur < last; ++cur) {
    total += is_begin_utf8_char(*cur);
  }
  return total;
#endif
}
68 
/**
 * @brief Scan a string byte by byte while consuming a character count.
 *
 * Scanning stops once `pos` characters have been consumed or the end of the
 * string is reached. Every byte with a non-zero `bytes_in_utf8_byte` width
 * consumes one character, and its width is added to the running byte total.
 *
 * @param d_str String to scan
 * @param pos Number of characters to consume
 * @return Pair of {accumulated byte widths, characters left unconsumed}
 */
__device__ inline cuda::std::pair<size_type, size_type> bytes_to_character_position(
  string_view d_str, size_type pos)
{
  size_type byte_total = 0;
  auto const first     = d_str.data();
  auto const last      = first + d_str.size_bytes();
  for (auto cur = first; (cur < last) && (pos > 0); ++cur) {
    auto const width = strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(*cur));
    if (width != 0) { --pos; }
    byte_total += width;
  }
  return {byte_total, pos};
}
94 
/**
 * @brief Bytes 0xF7 0xBF 0xBF 0xBF followed by a null terminator, placed in
 * `__constant__` memory. `string_view::max()` returns a 4-byte view over it.
 */
static __constant__ char max_string_sentinel[5] = "\xF7\xBF\xBF\xBF";  // NOLINT
105 } // namespace detail
106 } // namespace strings
107 
/**
 * @brief Return the minimum string value, which is a default-constructed (empty) string_view.
 */
CUDF_HOST_DEVICE inline string_view string_view::min()
{
  return string_view{};
}
117 
/**
 * @brief Return the maximum string value: a 4-byte view over `max_string_sentinel`.
 *
 * In device compilation the `__constant__` symbol is addressed directly. On the
 * host its device address is looked up with `cudaGetSymbolAddress`, and a
 * failure of that call is reported through `CUDF_CUDA_TRY`.
 *
 * @return string_view of 4 bytes pointing at the sentinel
 */
CUDF_HOST_DEVICE inline string_view string_view::max()
{
  char const* psentinel{nullptr};
#if defined(__CUDA_ARCH__)
  psentinel = &cudf::strings::detail::max_string_sentinel[0];
#else
  // the symbol lives in device memory; ask the runtime for its device address
  CUDF_CUDA_TRY(
    cudaGetSymbolAddress((void**)&psentinel, cudf::strings::detail::max_string_sentinel));
#endif
  return {psentinel, 4};
}
138 
/**
 * @brief Return the number of characters in this string.
 *
 * The count is computed by `characters_in_string` the first time it is needed
 * and stored in `_length` for later calls.
 */
__device__ inline size_type string_view::length() const
{
  if (_length != UNKNOWN_STRING_LENGTH) { return _length; }
  _length = strings::detail::characters_in_string(_data, _bytes);
  return _length;
}
145 
146 // @cond
147 // this custom iterator knows about UTF8 encoding
// Build an iterator at character position `pos`; the matching byte position
// is obtained from str.byte_offset(pos).
__device__ inline string_view::const_iterator::const_iterator(string_view const& str, size_type pos)
  : p{str.data()},
    bytes{str.size_bytes()},
    char_pos{pos},
    byte_pos{str.byte_offset(pos)}
{
}
152 
// Build an iterator from a character position and a byte offset supplied by
// the caller; the two values are stored as given without being cross-checked.
__device__ inline string_view::const_iterator::const_iterator(string_view const& str,
                                                              size_type pos,
                                                              size_type offset)
  : p{str.data()},
    bytes{str.size_bytes()},
    char_pos{pos},
    byte_pos{offset}
{
}
159 
// Pre-increment: advance to the next character. The character position always
// increases; the byte position only moves while it is inside the string.
__device__ inline string_view::const_iterator& string_view::const_iterator::operator++()
{
  if (byte_pos < bytes) {
    auto const width = strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[byte_pos]));
    // step at least one byte so invalid UTF-8 data cannot cause an infinite loop
    byte_pos += cuda::std::max(1, width);
  }
  ++char_pos;
  return *this;
}
170 
// Post-increment: advance this iterator and return a copy of its previous state.
__device__ inline string_view::const_iterator string_view::const_iterator::operator++(int)
{
  auto previous = *this;
  ++(*this);
  return previous;
}
177 
// Return a copy of this iterator moved by `offset` characters: forward for a
// positive offset, backward for a negative one.
__device__ inline string_view::const_iterator string_view::const_iterator::operator+(
  string_view::const_iterator::difference_type offset) const
{
  const_iterator result(*this);
  for (size_type steps = abs(offset); steps > 0; --steps) {
    if (offset > 0) {
      ++result;
    } else {
      --result;
    }
  }
  return result;
}
187 
// Move this iterator by `offset` characters in place: forward for a positive
// offset, backward for a negative one.
__device__ inline string_view::const_iterator& string_view::const_iterator::operator+=(
  string_view::const_iterator::difference_type offset)
{
  for (size_type steps = abs(offset); steps > 0; --steps) {
    if (offset > 0) {
      operator++();
    } else {
      operator--();
    }
  }
  return *this;
}
196 
// Pre-decrement: step back to the previous character. The character position
// always decreases; the byte position only moves while it is greater than 0.
__device__ inline string_view::const_iterator& string_view::const_iterator::operator--()
{
  if (byte_pos > 0) {
    if (byte_pos == char_pos) {
      // byte offset equals the character index, so the previous character is one byte back
      --byte_pos;
    } else {
      // Walk back over bytes whose reported width is 0 until a character start
      // is found. The lower bound prevents reading before p[0] when the data
      // is not valid UTF-8 (no lead byte at the start of the string).
      do {
        --byte_pos;
      } while ((byte_pos > 0) &&
               (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[byte_pos])) == 0));
    }
  }
  --char_pos;
  return *this;
}
210 
// Post-decrement: step this iterator back and return a copy of its previous state.
__device__ inline string_view::const_iterator string_view::const_iterator::operator--(int)
{
  auto previous = *this;
  --(*this);
  return previous;
}
217 
// Move this iterator by `offset` characters in place: backward for a positive
// offset, forward for a negative one.
__device__ inline string_view::const_iterator& string_view::const_iterator::operator-=(
  string_view::const_iterator::difference_type offset)
{
  for (size_type steps = abs(offset); steps > 0; --steps) {
    if (offset > 0) {
      operator--();
    } else {
      operator++();
    }
  }
  return *this;
}
226 
// Return a copy of this iterator moved by `offset` characters: backward for a
// positive offset, forward for a negative one.
__device__ inline string_view::const_iterator string_view::const_iterator::operator-(
  string_view::const_iterator::difference_type offset) const
{
  const_iterator result(*this);
  for (size_type steps = abs(offset); steps > 0; --steps) {
    if (offset > 0) {
      --result;
    } else {
      ++result;
    }
  }
  return result;
}
236 
// Reposition this iterator at character position `new_pos` by stepping
// relative to the current position instead of recounting from the start.
__device__ inline string_view::const_iterator& string_view::const_iterator::move_to(
  size_type new_pos)
{
  auto const delta = new_pos - char_pos;
  return (*this += delta);
}
243 
// Equal when both iterators share the same data pointer and character position.
__device__ inline bool string_view::const_iterator::operator==(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos == rhs.char_pos;
}
249 
// Not equal when the data pointers differ or the character positions differ.
__device__ inline bool string_view::const_iterator::operator!=(
  string_view::const_iterator const& rhs) const
{
  return !((p == rhs.p) && (char_pos == rhs.char_pos));
}
255 
// True only for iterators over the same data pointer where this character
// position is before rhs; different data pointers always yield false.
__device__ inline bool string_view::const_iterator::operator<(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos < rhs.char_pos;
}
261 
// True only for iterators over the same data pointer where this character
// position is at or before rhs; different data pointers always yield false.
__device__ inline bool string_view::const_iterator::operator<=(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos <= rhs.char_pos;
}
267 
// True only for iterators over the same data pointer where this character
// position is after rhs; different data pointers always yield false.
__device__ inline bool string_view::const_iterator::operator>(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos > rhs.char_pos;
}
273 
// True only for iterators over the same data pointer where this character
// position is at or after rhs; different data pointers always yield false.
__device__ inline bool string_view::const_iterator::operator>=(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos >= rhs.char_pos;
}
279 
// Dereference: decode the character starting at the current byte position.
__device__ inline char_utf8 string_view::const_iterator::operator*() const
{
  char_utf8 decoded = 0;
  strings::detail::to_char_utf8(p + byte_pos, decoded);
  return decoded;
}
286 
// Character position this iterator currently refers to.
__device__ inline size_type string_view::const_iterator::position() const
{
  return char_pos;
}
288 
// Byte position, relative to the start of the data, this iterator currently refers to.
__device__ inline size_type string_view::const_iterator::byte_offset() const
{
  return byte_pos;
}
290 
// Iterator at character position 0 / byte offset 0.
__device__ inline string_view::const_iterator string_view::begin() const
{
  return const_iterator{*this, 0, 0};
}
292 
// Iterator one past the last character: character position length(), byte
// offset size_bytes().
__device__ inline string_view::const_iterator string_view::end() const
{
  return const_iterator{*this, length(), size_bytes()};
}
297 // @endcond
298 
/**
 * @brief Return the character at the given character position.
 *
 * @param pos Character position to read
 * @return The decoded character, or 0 when the byte offset for `pos` is at or
 *         beyond the end of the string
 */
__device__ inline char_utf8 string_view::operator[](size_type pos) const
{
  auto const offset = byte_offset(pos);
  char_utf8 decoded = 0;
  if (offset < _bytes) { strings::detail::to_char_utf8(data() + offset, decoded); }
  return decoded;
}
307 
/**
 * @brief Return the byte offset from data() for a given character position.
 *
 * When the character count equals the byte count, `pos` is returned
 * unchanged; otherwise the offset comes from `bytes_to_character_position`.
 *
 * @param pos Character position
 * @return Byte offset corresponding to `pos`
 */
__device__ inline size_type string_view::byte_offset(size_type pos) const
{
  if (length() != size_bytes()) {
    return strings::detail::bytes_to_character_position(*this, pos).first;
  }
  return pos;
}
313 
/**
 * @brief Compare this string with another string_view.
 *
 * Forwards the other string's data pointer and byte size to the
 * pointer/size overload.
 *
 * @param in String to compare against
 * @return 0 if equal, negative if this string orders first, positive otherwise
 */
__device__ inline int string_view::compare(string_view const& in) const
{
  auto const other_data  = in.data();
  auto const other_bytes = in.size_bytes();
  return compare(other_data, other_bytes);
}
318 
/**
 * @brief Compare this string with a character array, byte by byte as unsigned values.
 *
 * @param data Pointer to the bytes to compare against
 * @param bytes Number of bytes in `data`
 * @return 0 when both are identical; the difference of the first mismatching
 *         bytes otherwise; if one is a prefix of the other, 1 when this string
 *         is longer and -1 when it is shorter
 */
__device__ inline int string_view::compare(char const* data, size_type bytes) const
{
  auto const* lhs           = reinterpret_cast<unsigned char const*>(this->data());
  auto const* rhs           = reinterpret_cast<unsigned char const*>(data);
  size_type const lhs_bytes = size_bytes();
  // same memory and same size: nothing to compare
  if ((lhs_bytes == bytes) && (lhs == rhs)) { return 0; }
  size_type idx = 0;
  while ((idx < lhs_bytes) && (idx < bytes)) {
    auto const diff = static_cast<int32_t>(lhs[idx]) - static_cast<int32_t>(rhs[idx]);
    if (diff != 0) { return diff; }
    ++idx;
  }
  // all shared bytes matched; the longer string orders last
  if (idx < lhs_bytes) { return 1; }
  return (idx < bytes) ? -1 : 0;
}
335 
/**
 * @brief Return true if `rhs` matches this string exactly.
 *
 * Strings of different byte sizes are rejected before any bytes are compared.
 */
__device__ inline bool string_view::operator==(string_view const& rhs) const
{
  if (size_bytes() != rhs.size_bytes()) { return false; }
  return compare(rhs) == 0;
}
340 
/**
 * @brief Return true if `rhs` does not match this string.
 */
__device__ inline bool string_view::operator!=(string_view const& rhs) const
{
  auto const order = compare(rhs);
  return order != 0;
}
345 
/**
 * @brief Return true if this string is ordered before `rhs`.
 */
__device__ inline bool string_view::operator<(string_view const& rhs) const
{
  auto const order = compare(rhs);
  return order < 0;
}
350 
/**
 * @brief Return true if `rhs` is ordered before this string.
 */
__device__ inline bool string_view::operator>(string_view const& rhs) const
{
  auto const order = compare(rhs);
  return order > 0;
}
355 
/**
 * @brief Return true if this string matches or is ordered before `rhs`.
 */
__device__ inline bool string_view::operator<=(string_view const& rhs) const
{
  return compare(rhs) <= 0;
}
361 
/**
 * @brief Return true if `rhs` matches or is ordered before this string.
 */
__device__ inline bool string_view::operator>=(string_view const& rhs) const
{
  return compare(rhs) >= 0;
}
367 
/**
 * @brief Find the first occurrence of `str` in this string.
 *
 * Forwards the target's data pointer and byte size to the pointer/size overload.
 *
 * @param str Target string to search for
 * @param pos Character position at which to start searching
 * @param count Number of characters from `pos` to search
 * @return Result of the pointer/size `find` overload
 */
__device__ inline size_type string_view::find(string_view const& str,
                                              size_type pos,
                                              size_type count) const
{
  auto const target       = str.data();
  auto const target_bytes = str.size_bytes();
  return find(target, target_bytes, pos, count);
}
374 
/**
 * @brief Shared implementation of find (forward) and rfind (reverse).
 *
 * The search window starts at character position `pos` and spans `count`
 * characters; a negative `count`, or one that reaches the end of the string,
 * extends the window to the end of the string.
 *
 * @tparam forward True to scan from the start of the window, false to scan
 *                 from its end
 * @param str Target bytes to search for
 * @param bytes Number of bytes in `str`
 * @param pos Character position where the search window starts
 * @param count Number of characters in the search window
 * @return Character position of the match, or `npos` if `str` is null, `pos`
 *         is negative or greater than length(), or no match is found
 */
template <bool forward>
__device__ inline size_type string_view::find_impl(char const* str,
                                                   size_type bytes,
                                                   size_type pos,
                                                   size_type count) const
{
  if (!str || pos < 0) { return npos; }
  // length() is only evaluated when needed since it may have to count characters
  if (pos > 0 && pos > length()) { return npos; }

  // use iterator to help reduce character/byte counting
  auto const itr  = begin() + pos;
  auto const spos = itr.byte_offset();
  // pos <= length() holds here, so (length() - pos) cannot overflow, whereas the
  // equivalent test (pos + count) < length() overflows size_type for a very large count
  auto const epos =
    (count >= 0) && (count < (length() - pos)) ? (itr + count).byte_offset() : size_bytes();

  // number of byte positions at which the target could start inside the window
  auto const find_length = (epos - spos) - bytes + 1;
  auto const d_target    = string_view{str, bytes};

  auto ptr = data() + (forward ? spos : (epos - bytes));
  for (size_type idx = 0; idx < find_length; ++idx) {
    if (d_target.compare(ptr, bytes) == 0) {
      return forward ? pos : character_offset(epos - bytes - idx);
    }
    // use pos to record the current find position
    pos += strings::detail::is_begin_utf8_char(*ptr);
    forward ? ++ptr : --ptr;
  }
  return npos;
}
404 
/**
 * @brief Find the first occurrence of a character array in this string.
 *
 * @param str Target bytes to search for
 * @param bytes Number of bytes in `str`
 * @param pos Character position at which to start searching
 * @param count Number of characters from `pos` to search
 * @return Result of the forward `find_impl` search
 */
__device__ inline size_type string_view::find(char const* str,
                                              size_type bytes,
                                              size_type pos,
                                              size_type count) const
{
  constexpr bool search_forward = true;
  return find_impl<search_forward>(str, bytes, pos, count);
}
412 
/**
 * @brief Find the first occurrence of a single character in this string.
 *
 * The character is written to a local byte buffer with `from_char_utf8`, and
 * the search is delegated to the pointer/size overload.
 *
 * @param chr Character to search for
 * @param pos Character position at which to start searching
 * @param count Number of characters from `pos` to search
 * @return Result of the pointer/size `find` overload
 */
__device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size_type count) const
{
  char encoded[sizeof(char_utf8)];  // NOLINT
  size_type const width = strings::detail::from_char_utf8(chr, encoded);
  return find(encoded, width, pos, count);
}
419 
/**
 * @brief Find the last occurrence of `str` in this string.
 *
 * Forwards the target's data pointer and byte size to the pointer/size overload.
 *
 * @param str Target string to search for
 * @param pos Character position where the search window starts
 * @param count Number of characters from `pos` to search
 * @return Result of the pointer/size `rfind` overload
 */
__device__ inline size_type string_view::rfind(string_view const& str,
                                               size_type pos,
                                               size_type count) const
{
  auto const target       = str.data();
  auto const target_bytes = str.size_bytes();
  return rfind(target, target_bytes, pos, count);
}
426 
/**
 * @brief Find the last occurrence of a character array in this string.
 *
 * @param str Target bytes to search for
 * @param bytes Number of bytes in `str`
 * @param pos Character position where the search window starts
 * @param count Number of characters from `pos` to search
 * @return Result of the reverse `find_impl` search
 */
__device__ inline size_type string_view::rfind(char const* str,
                                               size_type bytes,
                                               size_type pos,
                                               size_type count) const
{
  constexpr bool search_forward = false;
  return find_impl<search_forward>(str, bytes, pos, count);
}
434 
/**
 * @brief Find the last occurrence of a single character in this string.
 *
 * The character is written to a local byte buffer with `from_char_utf8`, and
 * the search is delegated to the pointer/size overload.
 *
 * @param chr Character to search for
 * @param pos Character position where the search window starts
 * @param count Number of characters from `pos` to search
 * @return Result of the pointer/size `rfind` overload
 */
__device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, size_type count) const
{
  char encoded[sizeof(char_utf8)];  // NOLINT
  size_type const width = strings::detail::from_char_utf8(chr, encoded);
  return rfind(encoded, width, pos, count);
}
441 
442 // parameters are character position values
/**
 * @brief Return a sub-string of this string; parameters are character position values.
 *
 * The returned view points into this string's data and does not copy it.
 *
 * @param pos Character position where the sub-string starts
 * @param count Number of characters to include; a negative value, or a value
 *              larger than the characters remaining after `pos`, selects
 *              everything up to the end of the string
 * @return The sub-string, or an empty string_view when `pos` is negative or
 *         not less than length()
 */
__device__ inline string_view string_view::substr(size_type pos, size_type count) const
{
  if (pos < 0 || pos >= length()) { return string_view{}; }
  auto const spos = begin() + pos;
  // Clamp to the end of the string. Stepping an iterator past the end keeps
  // increasing its character position, which would record a wrong cached
  // length for the result and iterate `count` times for nothing.
  auto const remaining = length() - pos;
  auto const epos      = (count >= 0) && (count < remaining) ? (spos + count) : end();
  auto ss = string_view{data() + spos.byte_offset(), epos.byte_offset() - spos.byte_offset()};
  // this potentially saves redundant character counting downstream
  if (_length != UNKNOWN_STRING_LENGTH) { ss._length = epos.position() - spos.position(); }
  return ss;
}
453 
/**
 * @brief Return the character position for a given byte offset.
 *
 * When the character count equals the byte count, `bytepos` is returned
 * unchanged; otherwise the characters in the first `bytepos` bytes are counted.
 *
 * @param bytepos Byte offset from data()
 * @return Character position corresponding to `bytepos`
 */
__device__ inline size_type string_view::character_offset(size_type bytepos) const
{
  if (length() != size_bytes()) {
    return strings::detail::characters_in_string(data(), bytepos);
  }
  return bytepos;
}
459 
460 } // namespace CUDF_EXPORT cudf
Handy iterator for navigating through encoded characters.
Definition: string_view.hpp:75
A non-owning, immutable view of device data that is a variable length char array representing a UTF-8...
Definition: string_view.hpp:44
CUDF_HOST_DEVICE size_type size_bytes() const
Return the number of bytes in this string.
Definition: string_view.hpp:51
size_type rfind(string_view const &str, size_type pos=0, size_type count=-1) const
Returns the character position of the last occurrence where the argument str is found in this string ...
CUDF_HOST_DEVICE string_view()
Default constructor represents an empty string.
size_type length() const
Return the number of characters in this string.
string_view substr(size_type start, size_type length) const
Return a sub-string of this string. The original string and device memory must still be maintained fo...
bool operator==(string_view const &rhs) const
Returns true if rhs matches this string exactly.
const_iterator end() const
Return new iterator pointing past the end of this string.
int compare(string_view const &str) const
Comparing target string with this string. Each character is compared as a UTF-8 code-point value.
bool operator>=(string_view const &rhs) const
Returns true if rhs matches or is ordered before this string.
CUDF_HOST_DEVICE char const * data() const
Return a pointer to the internal device array.
Definition: string_view.hpp:63
size_type byte_offset(size_type pos) const
Return the byte offset from data() for a given character position.
const_iterator begin() const
Return new iterator pointing to the beginning of this string.
char_utf8 operator[](size_type pos) const
Return single UTF-8 character at the given character position.
bool operator!=(string_view const &rhs) const
Returns true if rhs does not match this string.
size_type find(string_view const &str, size_type pos=0, size_type count=-1) const
Returns the character position of the first occurrence where the argument str is found in this string...
bool operator<=(string_view const &rhs) const
Returns true if this string matches or is ordered before rhs.
bool operator<(string_view const &rhs) const
Returns true if this string is ordered before rhs.
bool operator>(string_view const &rhs) const
Returns true if rhs is ordered before this string.
static cudf::size_type const npos
No-position value.
bool operator==(polymorphic_allocator< T > const &lhs, polymorphic_allocator< U > const &rhs)
bool operator!=(polymorphic_allocator< T > const &lhs, polymorphic_allocator< U > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator-(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator>=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator<=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator*(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator>(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator+(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator<(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
#define CUDF_CUDA_TRY(call)
Error checking macro for CUDA runtime API functions.
Definition: error.hpp:264
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
uint32_t char_utf8
UTF-8 characters are 1-4 bytes.
Definition: string_view.hpp:31
cuDF interfaces
Definition: host_udf.hpp:37
Class definition for cudf::string_view.
#define CUDF_HOST_DEVICE
Indicates that the function or method is usable on host and device.
Definition: types.hpp:32