string_view.cuh
1 /*
2  * Copyright (c) 2019-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/strings/detail/utf8.hpp>
21 #include <cudf/utilities/export.hpp>
22 
23 #ifndef __CUDA_ARCH__
24 #include <cudf/utilities/error.hpp>
25 #endif
26 
// CUDF_JIT_UDF is defined when this header is included in a jitify or jitify2
// (https://github.com/NVIDIA/jitify) source file. Jitify cannot include thrust headers at this time.
29 #ifndef CUDF_JIT_UDF
30 #include <thrust/count.h>
31 #include <thrust/execution_policy.h>
32 #endif
33 
34 #include <algorithm>
35 
36 // This file should only include device code logic.
37 // Host-only or host/device code should be defined in the string_view.hpp header file.
38 
39 namespace CUDF_EXPORT cudf {
40 namespace strings {
41 namespace detail {
42 
50 __device__ inline size_type characters_in_string(char const* str, size_type bytes)
51 {
52  if ((str == nullptr) || (bytes == 0)) return 0;
53  auto ptr = reinterpret_cast<uint8_t const*>(str);
54 #ifndef CUDF_JIT_UDF
55  return thrust::count_if(
56  thrust::seq, ptr, ptr + bytes, [](uint8_t chr) { return is_begin_utf8_char(chr); });
57 #else
58  size_type chars = 0;
59  auto const end = ptr + bytes;
60  while (ptr < end) {
61  chars += is_begin_utf8_char(*ptr++);
62  }
63  return chars;
64 #endif
65 }
66 
78 __device__ inline std::pair<size_type, size_type> bytes_to_character_position(string_view d_str,
79  size_type pos)
80 {
81  size_type bytes = 0;
82  auto ptr = d_str.data();
83  auto const end_ptr = ptr + d_str.size_bytes();
84  while ((pos > 0) && (ptr < end_ptr)) {
85  auto const width = strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(*ptr));
86  if (width) { --pos; }
87  bytes += width;
88  ++ptr;
89  }
90  return {bytes, pos};
91 }
92 
// Byte sequence that string_view::max() returns a 4-byte view of; the fifth
// array element holds the string literal's terminating null.
static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"};
103 } // namespace detail
104 } // namespace strings
105 
114 CUDF_HOST_DEVICE inline string_view string_view::min() { return {}; }
115 
125 CUDF_HOST_DEVICE inline string_view string_view::max()
126 {
127  char const* psentinel{nullptr};
128 #if defined(__CUDA_ARCH__)
129  psentinel = &cudf::strings::detail::max_string_sentinel[0];
130 #else
132  cudaGetSymbolAddress((void**)&psentinel, cudf::strings::detail::max_string_sentinel));
133 #endif
134  return {psentinel, 4};
135 }
136 
137 __device__ inline size_type string_view::length() const
138 {
139  if (_length == UNKNOWN_STRING_LENGTH)
140  _length = strings::detail::characters_in_string(_data, _bytes);
141  return _length;
142 }
143 
144 // @cond
145 // this custom iterator knows about UTF8 encoding
146 __device__ inline string_view::const_iterator::const_iterator(string_view const& str, size_type pos)
147  : p{str.data()}, bytes{str.size_bytes()}, char_pos{pos}, byte_pos{str.byte_offset(pos)}
148 {
149 }
150 
151 __device__ inline string_view::const_iterator::const_iterator(string_view const& str,
152  size_type pos,
153  size_type offset)
154  : p{str.data()}, bytes{str.size_bytes()}, char_pos{pos}, byte_pos{offset}
155 {
156 }
157 
158 __device__ inline string_view::const_iterator& string_view::const_iterator::operator++()
159 {
160  if (byte_pos < bytes)
161  byte_pos += strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[byte_pos]));
162  ++char_pos;
163  return *this;
164 }
165 
166 __device__ inline string_view::const_iterator string_view::const_iterator::operator++(int)
167 {
168  string_view::const_iterator tmp(*this);
169  operator++();
170  return tmp;
171 }
172 
173 __device__ inline string_view::const_iterator string_view::const_iterator::operator+(
174  string_view::const_iterator::difference_type offset) const
175 {
176  const_iterator tmp(*this);
177  size_type adjust = abs(offset);
178  while (adjust-- > 0)
179  offset > 0 ? ++tmp : --tmp;
180  return tmp;
181 }
182 
183 __device__ inline string_view::const_iterator& string_view::const_iterator::operator+=(
184  string_view::const_iterator::difference_type offset)
185 {
186  size_type adjust = abs(offset);
187  while (adjust-- > 0)
188  offset > 0 ? operator++() : operator--();
189  return *this;
190 }
191 
192 __device__ inline string_view::const_iterator& string_view::const_iterator::operator--()
193 {
194  if (byte_pos > 0)
195  while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[--byte_pos])) == 0)
196  ;
197  --char_pos;
198  return *this;
199 }
200 
201 __device__ inline string_view::const_iterator string_view::const_iterator::operator--(int)
202 {
203  string_view::const_iterator tmp(*this);
204  operator--();
205  return tmp;
206 }
207 
208 __device__ inline string_view::const_iterator& string_view::const_iterator::operator-=(
209  string_view::const_iterator::difference_type offset)
210 {
211  size_type adjust = abs(offset);
212  while (adjust-- > 0)
213  offset > 0 ? operator--() : operator++();
214  return *this;
215 }
216 
217 __device__ inline string_view::const_iterator string_view::const_iterator::operator-(
218  string_view::const_iterator::difference_type offset) const
219 {
220  const_iterator tmp(*this);
221  size_type adjust = abs(offset);
222  while (adjust-- > 0)
223  offset > 0 ? --tmp : ++tmp;
224  return tmp;
225 }
226 
227 __device__ inline string_view::const_iterator& string_view::const_iterator::move_to(
228  size_type new_pos)
229 {
230  *this += (new_pos - char_pos); // more efficient than recounting from the start
231  return *this;
232 }
233 
234 __device__ inline bool string_view::const_iterator::operator==(
235  string_view::const_iterator const& rhs) const
236 {
237  return (p == rhs.p) && (char_pos == rhs.char_pos);
238 }
239 
240 __device__ inline bool string_view::const_iterator::operator!=(
241  string_view::const_iterator const& rhs) const
242 {
243  return (p != rhs.p) || (char_pos != rhs.char_pos);
244 }
245 
246 __device__ inline bool string_view::const_iterator::operator<(
247  string_view::const_iterator const& rhs) const
248 {
249  return (p == rhs.p) && (char_pos < rhs.char_pos);
250 }
251 
252 __device__ inline bool string_view::const_iterator::operator<=(
253  string_view::const_iterator const& rhs) const
254 {
255  return (p == rhs.p) && (char_pos <= rhs.char_pos);
256 }
257 
258 __device__ inline bool string_view::const_iterator::operator>(
259  string_view::const_iterator const& rhs) const
260 {
261  return (p == rhs.p) && (char_pos > rhs.char_pos);
262 }
263 
264 __device__ inline bool string_view::const_iterator::operator>=(
265  string_view::const_iterator const& rhs) const
266 {
267  return (p == rhs.p) && (char_pos >= rhs.char_pos);
268 }
269 
270 __device__ inline char_utf8 string_view::const_iterator::operator*() const
271 {
272  char_utf8 chr = 0;
273  strings::detail::to_char_utf8(p + byte_offset(), chr);
274  return chr;
275 }
276 
277 __device__ inline size_type string_view::const_iterator::position() const { return char_pos; }
278 
279 __device__ inline size_type string_view::const_iterator::byte_offset() const { return byte_pos; }
280 
281 __device__ inline string_view::const_iterator string_view::begin() const
282 {
283  return const_iterator(*this, 0, 0);
284 }
285 
286 __device__ inline string_view::const_iterator string_view::end() const
287 {
288  return const_iterator(*this, length(), size_bytes());
289 }
290 // @endcond
291 
292 __device__ inline char_utf8 string_view::operator[](size_type pos) const
293 {
294  size_type offset = byte_offset(pos);
295  if (offset >= _bytes) return 0;
296  char_utf8 chr = 0;
297  strings::detail::to_char_utf8(data() + offset, chr);
298  return chr;
299 }
300 
301 __device__ inline size_type string_view::byte_offset(size_type pos) const
302 {
303  if (length() == size_bytes()) return pos;
304  return std::get<0>(strings::detail::bytes_to_character_position(*this, pos));
305 }
306 
307 __device__ inline int string_view::compare(string_view const& in) const
308 {
309  return compare(in.data(), in.size_bytes());
310 }
311 
312 __device__ inline int string_view::compare(char const* data, size_type bytes) const
313 {
314  size_type const len1 = size_bytes();
315  auto const* ptr1 = reinterpret_cast<unsigned char const*>(this->data());
316  auto const* ptr2 = reinterpret_cast<unsigned char const*>(data);
317  if ((ptr1 == ptr2) && (bytes == len1)) return 0;
318  size_type idx = 0;
319  for (; (idx < len1) && (idx < bytes); ++idx) {
320  if (*ptr1 != *ptr2) return static_cast<int32_t>(*ptr1) - static_cast<int32_t>(*ptr2);
321  ++ptr1;
322  ++ptr2;
323  }
324  if (idx < len1) return 1;
325  if (idx < bytes) return -1;
326  return 0;
327 }
328 
329 __device__ inline bool string_view::operator==(string_view const& rhs) const
330 {
331  return (size_bytes() == rhs.size_bytes()) && (compare(rhs) == 0);
332 }
333 
334 __device__ inline bool string_view::operator!=(string_view const& rhs) const
335 {
336  return compare(rhs) != 0;
337 }
338 
339 __device__ inline bool string_view::operator<(string_view const& rhs) const
340 {
341  return compare(rhs) < 0;
342 }
343 
344 __device__ inline bool string_view::operator>(string_view const& rhs) const
345 {
346  return compare(rhs) > 0;
347 }
348 
349 __device__ inline bool string_view::operator<=(string_view const& rhs) const
350 {
351  int rc = compare(rhs);
352  return (rc == 0) || (rc < 0);
353 }
354 
355 __device__ inline bool string_view::operator>=(string_view const& rhs) const
356 {
357  int rc = compare(rhs);
358  return (rc == 0) || (rc > 0);
359 }
360 
361 __device__ inline size_type string_view::find(string_view const& str,
362  size_type pos,
363  size_type count) const
364 {
365  return find(str.data(), str.size_bytes(), pos, count);
366 }
367 
368 template <bool forward>
369 __device__ inline size_type string_view::find_impl(char const* str,
370  size_type bytes,
371  size_type pos,
372  size_type count) const
373 {
374  auto const nchars = length();
375  if (!str || pos < 0 || pos > nchars) return npos;
376  if (count < 0) count = nchars;
377 
378  // use iterator to help reduce character/byte counting
379  auto itr = begin() + pos;
380  auto const spos = itr.byte_offset();
381  auto const epos = ((pos + count) < nchars) ? (itr + count).byte_offset() : size_bytes();
382 
383  auto const find_length = (epos - spos) - bytes + 1;
384 
385  auto ptr = data() + (forward ? spos : (epos - bytes));
386  for (size_type idx = 0; idx < find_length; ++idx) {
387  bool match = true;
388  for (size_type jdx = 0; match && (jdx < bytes); ++jdx) {
389  match = (ptr[jdx] == str[jdx]);
390  }
391  if (match) { return forward ? pos : character_offset(epos - bytes - idx); }
392  // use pos to record the current find position
393  pos += strings::detail::is_begin_utf8_char(*ptr);
394  forward ? ++ptr : --ptr;
395  }
396  return npos;
397 }
398 
399 __device__ inline size_type string_view::find(char const* str,
400  size_type bytes,
401  size_type pos,
402  size_type count) const
403 {
404  return find_impl<true>(str, bytes, pos, count);
405 }
406 
407 __device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size_type count) const
408 {
409  char str[sizeof(char_utf8)];
410  size_type chwidth = strings::detail::from_char_utf8(chr, str);
411  return find(str, chwidth, pos, count);
412 }
413 
414 __device__ inline size_type string_view::rfind(string_view const& str,
415  size_type pos,
416  size_type count) const
417 {
418  return rfind(str.data(), str.size_bytes(), pos, count);
419 }
420 
421 __device__ inline size_type string_view::rfind(char const* str,
422  size_type bytes,
423  size_type pos,
424  size_type count) const
425 {
426  return find_impl<false>(str, bytes, pos, count);
427 }
428 
429 __device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, size_type count) const
430 {
431  char str[sizeof(char_utf8)];
432  size_type chwidth = strings::detail::from_char_utf8(chr, str);
433  return rfind(str, chwidth, pos, count);
434 }
435 
436 // parameters are character position values
437 __device__ inline string_view string_view::substr(size_type pos, size_type count) const
438 {
439  if (pos < 0 || pos >= length()) { return string_view{}; }
440  auto const itr = begin() + pos;
441  auto const spos = itr.byte_offset();
442  auto const epos = count >= 0 ? (itr + count).byte_offset() : size_bytes();
443  return {data() + spos, epos - spos};
444 }
445 
446 __device__ inline size_type string_view::character_offset(size_type bytepos) const
447 {
448  if (length() == size_bytes()) return bytepos;
449  return strings::detail::characters_in_string(data(), bytepos);
450 }
451 
452 } // namespace CUDF_EXPORT cudf
A non-owning, immutable view of device data that is a variable length char array representing a UTF-8...
Definition: string_view.hpp:44
CUDF_HOST_DEVICE size_type size_bytes() const
Return the number of bytes in this string.
Definition: string_view.hpp:51
size_type rfind(string_view const &str, size_type pos=0, size_type count=-1) const
Returns the character position of the last occurrence where the argument str is found in this string ...
size_type length() const
Return the number of characters in this string.
string_view substr(size_type start, size_type length) const
Return a sub-string of this string. The original string and device memory must still be maintained fo...
bool operator==(string_view const &rhs) const
Returns true if rhs matches this string exactly.
const_iterator end() const
Return new iterator pointing past the end of this string.
int compare(string_view const &str) const
Comparing target string with this string. Each character is compared as a UTF-8 code-point value.
bool operator>=(string_view const &rhs) const
Returns true if rhs matches or is ordered before this string.
CUDF_HOST_DEVICE char const * data() const
Return a pointer to the internal device array.
Definition: string_view.hpp:63
size_type byte_offset(size_type pos) const
Return the byte offset from data() for a given character position.
const_iterator begin() const
Return new iterator pointing to the beginning of this string.
char_utf8 operator[](size_type pos) const
Return single UTF-8 character at the given character position.
bool operator!=(string_view const &rhs) const
Returns true if rhs does not match this string.
size_type find(string_view const &str, size_type pos=0, size_type count=-1) const
Returns the character position of the first occurrence where the argument str is found in this string...
bool operator<=(string_view const &rhs) const
Returns true if this string matches or is ordered before rhs.
bool operator<(string_view const &rhs) const
Returns true if this string is ordered before rhs.
bool operator>(string_view const &rhs) const
Returns true if rhs is ordered before this string.
static cudf::size_type const npos
No-position value.
bool operator==(polymorphic_allocator< T > const &lhs, polymorphic_allocator< U > const &rhs)
bool operator!=(polymorphic_allocator< T > const &lhs, polymorphic_allocator< U > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator-(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator>=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator<=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator*(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator>(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator+(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator<(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
#define CUDF_CUDA_TRY(call)
Error checking macro for CUDA runtime API functions.
Definition: error.hpp:264
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
uint32_t char_utf8
UTF-8 characters are 1-4 bytes.
Definition: string_view.hpp:31
cuDF interfaces
Definition: aggregation.hpp:35
Class definition for cudf::string_view.
#define CUDF_HOST_DEVICE
Indicates that the function or method is usable on host and device.
Definition: types.hpp:32