string_view.cuh
1 /*
2  * Copyright (c) 2019-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/strings/detail/utf8.hpp>
21 
22 #ifndef __CUDA_ARCH__
23 #include <cudf/utilities/error.hpp>
24 #endif
25 
26 // This is defined when including this header in a https://github.com/NVIDIA/jitify
27 // or jitify2 source file. The jitify cannot include thrust headers at this time.
28 #ifndef CUDF_JIT_UDF
29 #include <thrust/count.h>
30 #include <thrust/execution_policy.h>
31 #endif
32 
33 #include <algorithm>
34 
35 // This file should only include device code logic.
36 // Host-only or host/device code should be defined in the string_view.hpp header file.
37 
38 namespace cudf {
39 namespace strings {
40 namespace detail {
41 
/**
 * @brief Count the UTF-8 characters held in a byte range.
 *
 * A byte contributes to the count when `is_begin_utf8_char` reports it as the
 * first byte of a character.
 *
 * @param str   Pointer to the first byte; may be null
 * @param bytes Number of bytes to inspect
 * @return Number of character-start bytes found; 0 when `str` is null or `bytes` is 0
 */
__device__ inline size_type characters_in_string(char const* str, size_type bytes)
{
  bool const nothing_to_count = (str == nullptr) || (bytes == 0);
  if (nothing_to_count) { return 0; }

  auto const first = reinterpret_cast<uint8_t const*>(str);
  auto const last  = first + bytes;
#ifndef CUDF_JIT_UDF
  // sequential (single-thread) count using thrust
  auto const starts_character = [](uint8_t byte) { return is_begin_utf8_char(byte); };
  return thrust::count_if(thrust::seq, first, last, starts_character);
#else
  // thrust is not included in this configuration, so count by hand
  size_type total = 0;
  for (auto cursor = first; cursor < last; ++cursor) {
    total += is_begin_utf8_char(*cursor);
  }
  return total;
#endif
}
65 
/**
 * @brief Walk a string from its start and accumulate the byte widths of up to
 * `pos` characters.
 *
 * @param d_str String to scan
 * @param pos   Number of characters to step over
 * @return Pair of {accumulated byte count, number of characters from `pos` that
 *         were not consumed because the end of the string was reached}
 */
__device__ inline std::pair<size_type, size_type> bytes_to_character_position(string_view d_str,
                                                                              size_type pos)
{
  auto const* const chars = d_str.data();
  auto const total_bytes  = d_str.size_bytes();

  size_type byte_count = 0;
  for (size_type idx = 0; (pos > 0) && (idx < total_bytes); ++idx) {
    auto const width = strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(chars[idx]));
    // a non-zero width marks the first byte of a character
    if (width != 0) { --pos; }
    byte_count += width;
  }
  return {byte_count, pos};
}
91 
/**
 * @brief Four bytes 0xF7 0xBF 0xBF 0xBF followed by a null terminator, stored in
 * `__constant__` memory.
 */
static __constant__ char max_string_sentinel[5] = "\xF7\xBF\xBF\xBF";
102 } // namespace detail
103 } // namespace strings
104 
114 
125 {
126  char const* psentinel{nullptr};
127 #if defined(__CUDA_ARCH__)
128  psentinel = &cudf::strings::detail::max_string_sentinel[0];
129 #else
131  cudaGetSymbolAddress((void**)&psentinel, cudf::strings::detail::max_string_sentinel));
132 #endif
133  return {psentinel, 4};
134 }
135 
/**
 * @brief Return the number of characters in this string.
 *
 * The count is computed on first use and cached in `_length`; later calls
 * return the cached value.
 *
 * @return Character count of this string
 */
__device__ inline size_type string_view::length() const
{
  // fast path: already counted
  if (_length != UNKNOWN_STRING_LENGTH) { return _length; }

  _length = strings::detail::characters_in_string(_data, _bytes);
  return _length;
}
142 
143 // @cond
144 // this custom iterator knows about UTF8 encoding
/**
 * @brief Build an iterator at character position `pos` of `str`.
 *
 * The byte offset is obtained from `str.byte_offset(pos)`.
 *
 * @param str String to iterate over
 * @param pos Character position for the new iterator
 */
__device__ inline string_view::const_iterator::const_iterator(string_view const& str, size_type pos)
  : const_iterator(str, pos, str.byte_offset(pos))
{
}
149 
/**
 * @brief Build an iterator from an already known character position and byte offset.
 *
 * No validation is done here; the values are stored as given.
 *
 * @param str    String to iterate over
 * @param pos    Character position for the new iterator
 * @param offset Byte offset to store alongside `pos`
 */
__device__ inline string_view::const_iterator::const_iterator(string_view const& str,
                                                              size_type pos,
                                                              size_type offset)
  : p(str.data()), bytes(str.size_bytes()), char_pos(pos), byte_pos(offset)
{
}
156 
/**
 * @brief Pre-increment: step forward by one character.
 *
 * The byte offset only moves while it is inside the buffer; the character
 * position is always incremented.
 *
 * @return Reference to this iterator
 */
__device__ inline string_view::const_iterator& string_view::const_iterator::operator++()
{
  if (byte_pos < bytes) {
    auto const lead_byte = static_cast<uint8_t>(p[byte_pos]);
    byte_pos += strings::detail::bytes_in_utf8_byte(lead_byte);
  }
  ++char_pos;
  return *this;
}
164 
/**
 * @brief Post-increment: step forward by one character.
 *
 * @return Copy of the iterator as it was before the step
 */
__device__ inline string_view::const_iterator string_view::const_iterator::operator++(int)
{
  auto previous = *this;
  ++(*this);
  return previous;
}
171 
/**
 * @brief Return a copy of this iterator moved by `offset` characters.
 *
 * A positive offset moves forward and a negative one moves backward.
 *
 * @param offset Number of characters to move
 * @return The moved copy; this iterator is unchanged
 */
__device__ inline string_view::const_iterator string_view::const_iterator::operator+(
  string_view::const_iterator::difference_type offset) const
{
  const_iterator moved(*this);
  // operator+= performs the same one-character-at-a-time walk on the copy
  moved += offset;
  return moved;
}
181 
/**
 * @brief Move this iterator by `offset` characters.
 *
 * A positive offset moves forward and a negative one moves backward, one
 * character per step.
 *
 * @param offset Number of characters to move
 * @return Reference to this iterator
 */
__device__ inline string_view::const_iterator& string_view::const_iterator::operator+=(
  string_view::const_iterator::difference_type offset)
{
  bool const go_forward = offset > 0;
  for (size_type remaining = abs(offset); remaining > 0; --remaining) {
    if (go_forward) {
      operator++();
    } else {
      operator--();
    }
  }
  return *this;
}
190 
/**
 * @brief Pre-decrement: step backward by one character.
 *
 * The byte offset is moved back until it lands on a byte for which
 * `bytes_in_utf8_byte` is non-zero (the first byte of a character) or until it
 * reaches the start of the buffer. The character position is always decremented.
 *
 * @return Reference to this iterator
 */
__device__ inline string_view::const_iterator& string_view::const_iterator::operator--()
{
  if (byte_pos > 0) {
    do {
      --byte_pos;
      // stop at offset 0 even if that byte is a continuation byte, so a malformed
      // buffer can never drive the scan in front of the data pointer
    } while ((byte_pos > 0) &&
             (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[byte_pos])) == 0));
  }
  --char_pos;
  return *this;
}
199 
/**
 * @brief Post-decrement: step backward by one character.
 *
 * @return Copy of the iterator as it was before the step
 */
__device__ inline string_view::const_iterator string_view::const_iterator::operator--(int)
{
  auto previous = *this;
  --(*this);
  return previous;
}
206 
/**
 * @brief Move this iterator back by `offset` characters.
 *
 * A positive offset moves backward and a negative one moves forward, one
 * character per step.
 *
 * @param offset Number of characters to move
 * @return Reference to this iterator
 */
__device__ inline string_view::const_iterator& string_view::const_iterator::operator-=(
  string_view::const_iterator::difference_type offset)
{
  bool const go_backward = offset > 0;
  for (size_type remaining = abs(offset); remaining > 0; --remaining) {
    if (go_backward) {
      operator--();
    } else {
      operator++();
    }
  }
  return *this;
}
215 
/**
 * @brief Return a copy of this iterator moved back by `offset` characters.
 *
 * A positive offset moves backward and a negative one moves forward.
 *
 * @param offset Number of characters to move
 * @return The moved copy; this iterator is unchanged
 */
__device__ inline string_view::const_iterator string_view::const_iterator::operator-(
  string_view::const_iterator::difference_type offset) const
{
  const_iterator moved(*this);
  // operator-= performs the same one-character-at-a-time walk on the copy
  moved -= offset;
  return moved;
}
225 
/**
 * @brief Reposition this iterator at character position `new_pos`.
 *
 * The move is relative to the current position rather than a recount from the
 * start of the string.
 *
 * @param new_pos Target character position
 * @return Reference to this iterator
 */
__device__ inline string_view::const_iterator& string_view::const_iterator::move_to(
  size_type new_pos)
{
  auto const delta = new_pos - char_pos;
  return operator+=(delta);
}
232 
/**
 * @brief Equality: same data pointer and same character position.
 *
 * @param rhs Iterator to compare with
 * @return true when both iterators reference the same pointer and position
 */
__device__ inline bool string_view::const_iterator::operator==(
  string_view::const_iterator const& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos == rhs.char_pos;
}
238 
/**
 * @brief Inequality: different data pointer or different character position.
 *
 * @param rhs Iterator to compare with
 * @return true when the pointers or the positions differ
 */
__device__ inline bool string_view::const_iterator::operator!=(
  string_view::const_iterator const& rhs) const
{
  bool const same_string   = (p == rhs.p);
  bool const same_position = (char_pos == rhs.char_pos);
  return !(same_string && same_position);
}
244 
/**
 * @brief Ordering by character position.
 *
 * @param rhs Iterator to compare with
 * @return true only when both iterators share the same data pointer and this
 *         position is less than `rhs`'s
 */
__device__ inline bool string_view::const_iterator::operator<(
  string_view::const_iterator const& rhs) const
{
  // iterators over different data pointers never compare as ordered
  if (p != rhs.p) { return false; }
  return char_pos < rhs.char_pos;
}
250 
/**
 * @brief Ordering by character position.
 *
 * @param rhs Iterator to compare with
 * @return true only when both iterators share the same data pointer and this
 *         position is less than or equal to `rhs`'s
 */
__device__ inline bool string_view::const_iterator::operator<=(
  string_view::const_iterator const& rhs) const
{
  // iterators over different data pointers never compare as ordered
  if (p != rhs.p) { return false; }
  return char_pos <= rhs.char_pos;
}
256 
/**
 * @brief Ordering by character position.
 *
 * @param rhs Iterator to compare with
 * @return true only when both iterators share the same data pointer and this
 *         position is greater than `rhs`'s
 */
__device__ inline bool string_view::const_iterator::operator>(
  string_view::const_iterator const& rhs) const
{
  // iterators over different data pointers never compare as ordered
  if (p != rhs.p) { return false; }
  return char_pos > rhs.char_pos;
}
262 
/**
 * @brief Ordering by character position.
 *
 * @param rhs Iterator to compare with
 * @return true only when both iterators share the same data pointer and this
 *         position is greater than or equal to `rhs`'s
 */
__device__ inline bool string_view::const_iterator::operator>=(
  string_view::const_iterator const& rhs) const
{
  // iterators over different data pointers never compare as ordered
  if (p != rhs.p) { return false; }
  return char_pos >= rhs.char_pos;
}
268 
/**
 * @brief Decode the character at the iterator's current byte offset.
 *
 * @return The value produced by `to_char_utf8` for the bytes at the current offset
 */
__device__ inline char_utf8 string_view::const_iterator::operator*() const
{
  char_utf8 decoded       = 0;
  char const* const where = p + byte_pos;
  strings::detail::to_char_utf8(where, decoded);
  return decoded;
}
275 
/**
 * @brief Return the character position stored in this iterator.
 *
 * @return Current character position
 */
__device__ inline size_type string_view::const_iterator::position() const
{
  return char_pos;
}
277 
/**
 * @brief Return the byte offset stored in this iterator.
 *
 * @return Current byte offset
 */
__device__ inline size_type string_view::const_iterator::byte_offset() const
{
  return byte_pos;
}
279 
/**
 * @brief Return an iterator at character position 0 and byte offset 0.
 *
 * @return Iterator for the start of this string
 */
__device__ inline string_view::const_iterator string_view::begin() const
{
  size_type const first_char = 0;
  size_type const first_byte = 0;
  return const_iterator(*this, first_char, first_byte);
}
284 
/**
 * @brief Return an iterator positioned one past the last character.
 *
 * Its character position is `length()` and its byte offset is `size_bytes()`.
 *
 * @return Iterator for the end of this string
 */
__device__ inline string_view::const_iterator string_view::end() const
{
  auto const char_count = length();
  auto const byte_count = size_bytes();
  return const_iterator(*this, char_count, byte_count);
}
289 // @endcond
290 
/**
 * @brief Return the character at character position `pos`.
 *
 * @param pos Character position to read
 * @return The decoded character, or 0 when the byte offset computed for `pos`
 *         falls outside this string's bytes
 */
__device__ inline char_utf8 string_view::operator[](size_type pos) const
{
  size_type const offset = byte_offset(pos);
  // Reject offsets on either side of the buffer. When every character is one
  // byte wide, byte_offset() returns pos unchanged, so a negative pos would
  // otherwise become a read in front of data().
  if ((offset < 0) || (offset >= _bytes)) return 0;
  char_utf8 chr = 0;
  strings::detail::to_char_utf8(data() + offset, chr);
  return chr;
}
299 
/**
 * @brief Convert a character position into a byte offset from `data()`.
 *
 * @param pos Character position
 * @return `pos` itself when the character count equals the byte count;
 *         otherwise the byte count reported by `bytes_to_character_position`
 */
__device__ inline size_type string_view::byte_offset(size_type pos) const
{
  // every character is one byte wide, so positions and offsets coincide
  bool const single_byte_chars = (length() == size_bytes());
  if (single_byte_chars) { return pos; }

  auto const scanned = strings::detail::bytes_to_character_position(*this, pos);
  return scanned.first;
}
305 
/**
 * @brief Compare this string with another string_view.
 *
 * Forwards to the pointer/size overload.
 *
 * @param in String to compare against
 * @return Result of `compare(in.data(), in.size_bytes())`
 */
__device__ inline int string_view::compare(string_view const& in) const
{
  char const* const other_data = in.data();
  size_type const other_bytes  = in.size_bytes();
  return compare(other_data, other_bytes);
}
310 
/**
 * @brief Compare this string with a byte array, byte by byte as unsigned values.
 *
 * @param data  Pointer to the bytes to compare against
 * @param bytes Number of bytes at `data`
 * @return 0 when both have the same bytes and length;
 *         the difference of the first mismatching pair of bytes otherwise;
 *         1 when `data` is a proper prefix of this string;
 *         -1 when this string is a proper prefix of `data`
 */
__device__ inline int string_view::compare(char const* data, size_type bytes) const
{
  auto const* const lhs     = reinterpret_cast<unsigned char const*>(this->data());
  auto const* const rhs     = reinterpret_cast<unsigned char const*>(data);
  size_type const lhs_bytes = size_bytes();

  // identical pointer and size: nothing to scan
  if ((lhs == rhs) && (lhs_bytes == bytes)) { return 0; }

  size_type idx = 0;
  while ((idx < lhs_bytes) && (idx < bytes)) {
    auto const left  = lhs[idx];
    auto const right = rhs[idx];
    if (left != right) { return static_cast<int32_t>(left) - static_cast<int32_t>(right); }
    ++idx;
  }

  // the common prefix matched; whichever side has bytes left orders later
  if (idx < lhs_bytes) { return 1; }
  return (idx < bytes) ? -1 : 0;
}
327 
/**
 * @brief Equality test.
 *
 * @param rhs String to compare against
 * @return true when both strings have the same byte size and `compare` returns 0
 */
__device__ inline bool string_view::operator==(string_view const& rhs) const
{
  // different sizes can never be equal; skip the byte scan
  if (size_bytes() != rhs.size_bytes()) { return false; }
  return compare(rhs) == 0;
}
332 
/**
 * @brief Inequality test.
 *
 * @param rhs String to compare against
 * @return true when `compare(rhs)` is non-zero
 */
__device__ inline bool string_view::operator!=(string_view const& rhs) const
{
  int const order = compare(rhs);
  return order != 0;
}
337 
/**
 * @brief Less-than test.
 *
 * @param rhs String to compare against
 * @return true when `compare(rhs)` is negative
 */
__device__ inline bool string_view::operator<(string_view const& rhs) const
{
  int const order = compare(rhs);
  return order < 0;
}
342 
/**
 * @brief Greater-than test.
 *
 * @param rhs String to compare against
 * @return true when `compare(rhs)` is positive
 */
__device__ inline bool string_view::operator>(string_view const& rhs) const
{
  int const order = compare(rhs);
  return order > 0;
}
347 
/**
 * @brief Less-than-or-equal test.
 *
 * @param rhs String to compare against
 * @return true when `compare(rhs)` is zero or negative
 */
__device__ inline bool string_view::operator<=(string_view const& rhs) const
{
  return compare(rhs) <= 0;
}
353 
/**
 * @brief Greater-than-or-equal test.
 *
 * @param rhs String to compare against
 * @return true when `compare(rhs)` is zero or positive
 */
__device__ inline bool string_view::operator>=(string_view const& rhs) const
{
  return compare(rhs) >= 0;
}
359 
/**
 * @brief Forward search for another string_view inside this string.
 *
 * Forwards to the pointer/size overload of `find`.
 *
 * @param str   String to search for
 * @param pos   Character position at which the search range starts
 * @param count Number of characters in the search range
 * @return Result of the pointer/size overload
 */
__device__ inline size_type string_view::find(string_view const& str,
                                              size_type pos,
                                              size_type count) const
{
  char const* const target     = str.data();
  size_type const target_bytes = str.size_bytes();
  return find(target, target_bytes, pos, count);
}
366 
/**
 * @brief Byte-wise search for `str` inside a character range of this string.
 *
 * The range starts at character `pos` and spans `count` characters; a negative
 * `count` is replaced by the string's character count, and a range that would
 * run past the end stops at the end of the string.
 *
 * @tparam forward true to scan from the start of the range toward its end,
 *                 false to scan from the end of the range toward its start
 * @param str   Bytes to search for
 * @param bytes Number of bytes at `str`
 * @param pos   Character position at which the range starts
 * @param count Number of characters in the range
 * @return Character position of the match, or `npos` when `str` is null, `pos`
 *         is outside [0, length()], or no match exists
 */
template <bool forward>
__device__ inline size_type string_view::find_impl(char const* str,
                                                   size_type bytes,
                                                   size_type pos,
                                                   size_type count) const
{
  auto const nchars = length();
  if (!str || pos < 0 || pos > nchars) return npos;
  if (count < 0) count = nchars;

  // use iterator to help reduce character/byte counting
  auto const itr  = begin() + pos;
  auto const spos = itr.byte_offset();
  // `count < nchars - pos` is the overflow-safe form of `pos + count < nchars`:
  // count >= 0 and 0 <= pos <= nchars hold here, so the subtraction cannot wrap,
  // whereas `pos + count` overflows for a large count with a non-zero pos
  auto const epos = (count < (nchars - pos)) ? (itr + count).byte_offset() : size_bytes();

  // number of candidate start bytes; non-positive when the target cannot fit
  auto const find_length = (epos - spos) - bytes + 1;
  // nothing to scan; returning here also avoids forming `data() + (epos - bytes)`
  // with a negative offset in the reverse search
  if (find_length <= 0) return npos;

  auto ptr = data() + (forward ? spos : (epos - bytes));
  for (size_type idx = 0; idx < find_length; ++idx) {
    bool match = true;
    for (size_type jdx = 0; match && (jdx < bytes); ++jdx) {
      match = (ptr[jdx] == str[jdx]);
    }
    if (match) { return forward ? pos : character_offset(epos - bytes - idx); }
    // use pos to record the current find position
    pos += strings::detail::is_begin_utf8_char(*ptr);
    forward ? ++ptr : --ptr;
  }
  return npos;
}
397 
/**
 * @brief Forward search for a byte array inside this string.
 *
 * @param str   Bytes to search for
 * @param bytes Number of bytes at `str`
 * @param pos   Character position at which the search range starts
 * @param count Number of characters in the search range
 * @return Result of `find_impl<true>`
 */
__device__ inline size_type string_view::find(char const* str,
                                              size_type bytes,
                                              size_type pos,
                                              size_type count) const
{
  constexpr bool scan_forward = true;
  return find_impl<scan_forward>(str, bytes, pos, count);
}
405 
/**
 * @brief Forward search for a single character inside this string.
 *
 * The character is first written out as bytes with `from_char_utf8`, then the
 * pointer/size overload of `find` is used.
 *
 * @param chr   Character to search for
 * @param pos   Character position at which the search range starts
 * @param count Number of characters in the search range
 * @return Result of the pointer/size overload
 */
__device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size_type count) const
{
  char encoded[sizeof(char_utf8)];
  size_type const encoded_bytes = strings::detail::from_char_utf8(chr, encoded);
  return find(encoded, encoded_bytes, pos, count);
}
412 
/**
 * @brief Reverse search for another string_view inside this string.
 *
 * Forwards to the pointer/size overload of `rfind`.
 *
 * @param str   String to search for
 * @param pos   Character position at which the search range starts
 * @param count Number of characters in the search range
 * @return Result of the pointer/size overload
 */
__device__ inline size_type string_view::rfind(string_view const& str,
                                               size_type pos,
                                               size_type count) const
{
  char const* const target     = str.data();
  size_type const target_bytes = str.size_bytes();
  return rfind(target, target_bytes, pos, count);
}
419 
/**
 * @brief Reverse search for a byte array inside this string.
 *
 * @param str   Bytes to search for
 * @param bytes Number of bytes at `str`
 * @param pos   Character position at which the search range starts
 * @param count Number of characters in the search range
 * @return Result of `find_impl<false>`
 */
__device__ inline size_type string_view::rfind(char const* str,
                                               size_type bytes,
                                               size_type pos,
                                               size_type count) const
{
  constexpr bool scan_forward = false;
  return find_impl<scan_forward>(str, bytes, pos, count);
}
427 
/**
 * @brief Reverse search for a single character inside this string.
 *
 * The character is first written out as bytes with `from_char_utf8`, then the
 * pointer/size overload of `rfind` is used.
 *
 * @param chr   Character to search for
 * @param pos   Character position at which the search range starts
 * @param count Number of characters in the search range
 * @return Result of the pointer/size overload
 */
__device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, size_type count) const
{
  char encoded[sizeof(char_utf8)];
  size_type const encoded_bytes = strings::detail::from_char_utf8(chr, encoded);
  return rfind(encoded, encoded_bytes, pos, count);
}
434 
435 // parameters are character position values
/**
 * @brief Return a view over `count` characters starting at character `pos`.
 *
 * The returned view points into this string's bytes; nothing is copied.
 *
 * @param pos   Character position of the first character to include
 * @param count Number of characters to include; a negative value, or a value
 *              reaching past the last character, selects everything through
 *              the end of the string
 * @return The sub-view, or an empty `string_view{}` when `pos` is negative or
 *         not less than `length()`
 */
__device__ inline string_view string_view::substr(size_type pos, size_type count) const
{
  auto const nchars = length();
  if (pos < 0 || pos >= nchars) { return string_view{}; }
  auto const itr  = begin() + pos;
  auto const spos = itr.byte_offset();
  // Walk the iterator only when the range ends inside the string. Otherwise the
  // end is simply size_bytes(); this avoids stepping the iterator `count` times
  // for an oversized count and keeps epos within the buffer.
  auto const ends_inside = (count >= 0) && (count < (nchars - pos));
  auto const epos        = ends_inside ? (itr + count).byte_offset() : size_bytes();
  return {data() + spos, epos - spos};
}
444 
/**
 * @brief Convert a byte offset from `data()` into a character position.
 *
 * @param bytepos Byte offset
 * @return `bytepos` itself when the character count equals the byte count;
 *         otherwise the number of characters counted in the first `bytepos` bytes
 */
__device__ inline size_type string_view::character_offset(size_type bytepos) const
{
  // every character is one byte wide, so offsets and positions coincide
  bool const single_byte_chars = (length() == size_bytes());
  if (single_byte_chars) { return bytepos; }

  return strings::detail::characters_in_string(data(), bytepos);
}
450 
451 } // namespace cudf
A non-owning, immutable view of device data that is a variable length char array representing a UTF-8...
Definition: string_view.hpp:44
CUDF_HOST_DEVICE size_type size_bytes() const
Return the number of bytes in this string.
Definition: string_view.hpp:51
size_type rfind(string_view const &str, size_type pos=0, size_type count=-1) const
Returns the character position of the last occurrence where the argument str is found in this string ...
size_type length() const
Return the number of characters in this string.
string_view substr(size_type start, size_type length) const
Return a sub-string of this string. The original string and device memory must still be maintained fo...
bool operator==(string_view const &rhs) const
Returns true if rhs matches this string exactly.
const_iterator end() const
Return new iterator pointing past the end of this string.
int compare(string_view const &str) const
Comparing target string with this string. Each character is compared as a UTF-8 code-point value.
bool operator>=(string_view const &rhs) const
Returns true if rhs matches or is ordered before this string.
CUDF_HOST_DEVICE char const * data() const
Return a pointer to the internal device array.
Definition: string_view.hpp:63
size_type byte_offset(size_type pos) const
Return the byte offset from data() for a given character position.
const_iterator begin() const
Return new iterator pointing to the beginning of this string.
char_utf8 operator[](size_type pos) const
Return single UTF-8 character at the given character position.
bool operator!=(string_view const &rhs) const
Returns true if rhs does not match this string.
size_type find(string_view const &str, size_type pos=0, size_type count=-1) const
Returns the character position of the first occurrence where the argument str is found in this string...
static CUDF_HOST_DEVICE string_view max()
Return maximum value associated with the string type.
bool operator<=(string_view const &rhs) const
Returns true if this string matches or is ordered before rhs.
bool operator<(string_view const &rhs) const
Returns true if this string is ordered before rhs.
bool operator>(string_view const &rhs) const
Returns true if rhs is ordered before this string.
static cudf::size_type const npos
No-position value.
static CUDF_HOST_DEVICE string_view min()
Return minimum value associated with the string type.
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator-(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator>=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator<=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator==(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator*(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator>(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator+(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator!=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator<(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
#define CUDF_CUDA_TRY(call)
Error checking macro for CUDA runtime API functions.
Definition: error.hpp:263
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:93
uint32_t char_utf8
UTF-8 characters are 1-4 bytes.
Definition: string_view.hpp:31
cuDF interfaces
Definition: aggregation.hpp:34
Class definition for cudf::string_view.
#define CUDF_HOST_DEVICE
Indicates that the function or method is usable on host and device.
Definition: types.hpp:32