string_view.cuh
1 /*
2  * Copyright (c) 2019-2022, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/strings/detail/utf8.hpp>
21 
22 #ifndef __CUDA_ARCH__
23 #include <cudf/utilities/error.hpp>
24 #endif
25 
26 // This is defined when including this header in a https://github.com/NVIDIA/jitify
27 // or jitify2 source file. The jitify cannot include thrust headers at this time.
28 #ifndef CUDF_JIT_UDF
29 #include <thrust/count.h>
30 #include <thrust/execution_policy.h>
31 #endif
32 
33 #include <algorithm>
34 
35 // This file should only include device code logic.
36 // Host-only or host/device code should be defined in the string_view.hpp header file.
37 
38 namespace cudf {
39 namespace strings {
40 namespace detail {
41 
/**
 * @brief Count the UTF-8 characters contained in a range of bytes.
 *
 * A byte contributes to the count when `is_begin_utf8_char` reports it as
 * the first byte of a character.
 *
 * @param str Pointer to the first byte to inspect; may be nullptr.
 * @param bytes Number of bytes to inspect starting at `str`.
 * @return The number of character-start bytes found, or 0 when `str` is
 *         nullptr or `bytes` is 0.
 */
__device__ inline size_type characters_in_string(const char* str, size_type bytes)
{
  if (str == nullptr) { return 0; }
  if (bytes == 0) { return 0; }
  auto const first = reinterpret_cast<uint8_t const*>(str);
  auto const last  = first + bytes;
#ifndef CUDF_JIT_UDF
  return thrust::count_if(
    thrust::seq, first, last, [](uint8_t byte) { return is_begin_utf8_char(byte); });
#else
  // thrust headers are not included for jitify builds, so count by hand
  size_type count = 0;
  for (auto itr = first; itr < last; ++itr) {
    count += is_begin_utf8_char(*itr);
  }
  return count;
#endif
}
65 
/**
 * @brief Bytes used as the contents of the string returned by `string_view::max()`.
 *
 * Holds the four bytes 0xF7 0xBF 0xBF 0xBF followed by a null terminator and is
 * declared `__constant__`; `string_view::max()` builds a 4-byte view over it.
 */
static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"};
76 } // namespace detail
77 } // namespace strings
78 
/**
 * @brief Return the minimum value associated with the string type.
 *
 * @return A default-constructed string_view.
 */
CUDF_HOST_DEVICE inline string_view string_view::min()
{
  return string_view{};
}
88 
/**
 * @brief Return the maximum value associated with the string type.
 *
 * The result is a 4-byte view over `cudf::strings::detail::max_string_sentinel`.
 * In device code the symbol is addressed directly. In host code its address is
 * obtained with `cudaGetSymbolAddress`, and the call is wrapped in
 * `CUDF_CUDA_TRY` so a failing CUDA call is reported rather than ignored.
 *
 * @return A string_view of 4 bytes pointing at the sentinel.
 */
CUDF_HOST_DEVICE inline string_view string_view::max()
{
  const char* psentinel{nullptr};
#if defined(__CUDA_ARCH__)
  psentinel = &cudf::strings::detail::max_string_sentinel[0];
#else
  CUDF_CUDA_TRY(
    cudaGetSymbolAddress((void**)&psentinel, cudf::strings::detail::max_string_sentinel));
#endif
  return string_view(psentinel, 4);
}
109 
/**
 * @brief Return the number of characters in this string.
 *
 * The count is computed with `characters_in_string` the first time it is
 * requested (while `_length` still equals `UNKNOWN_STRING_LENGTH`) and is
 * stored in `_length` for later calls.
 *
 * @return The character count.
 */
__device__ inline size_type string_view::length() const
{
  if (_length != UNKNOWN_STRING_LENGTH) { return _length; }
  _length = strings::detail::characters_in_string(_data, _bytes);
  return _length;
}
116 
117 // @cond
118 // this custom iterator knows about UTF8 encoding
/**
 * @brief Construct an iterator over `str` positioned at character `pos`.
 *
 * The matching byte position is obtained from `str.byte_offset(pos)`.
 *
 * @param str The string to iterate over.
 * @param pos Character position at which the iterator starts.
 */
__device__ inline string_view::const_iterator::const_iterator(const string_view& str, size_type pos)
  : p{str.data()},
    bytes{str.size_bytes()},
    char_pos{pos},
    byte_pos{str.byte_offset(pos)}
{
}
123 
/**
 * @brief Pre-increment: advance the iterator by one character.
 *
 * The byte position moves forward by the width reported by
 * `bytes_in_utf8_byte` for the current byte, but only while the byte position
 * is still inside the string. The character position is always incremented.
 *
 * @return Reference to this iterator.
 */
__device__ inline string_view::const_iterator& string_view::const_iterator::operator++()
{
  bool const in_range = byte_pos < bytes;
  if (in_range) {
    auto const lead = static_cast<uint8_t>(p[byte_pos]);
    byte_pos += strings::detail::bytes_in_utf8_byte(lead);
  }
  ++char_pos;
  return *this;
}
131 
/**
 * @brief Post-increment: advance the iterator by one character.
 *
 * @return A copy of the iterator as it was before advancing.
 */
__device__ inline string_view::const_iterator string_view::const_iterator::operator++(int)
{
  string_view::const_iterator previous(*this);
  ++(*this);
  return previous;
}
138 
/**
 * @brief Return a copy of this iterator moved by `offset` characters.
 *
 * A positive offset steps the copy forward; any other value steps it backward
 * by `abs(offset)` characters.
 *
 * @param offset Number of characters to move.
 * @return The moved iterator; this iterator is not modified.
 */
__device__ inline string_view::const_iterator string_view::const_iterator::operator+(
  string_view::const_iterator::difference_type offset)
{
  const_iterator result(*this);
  bool const forward = offset > 0;
  for (size_type remaining = abs(offset); remaining > 0; --remaining) {
    if (forward) {
      ++result;
    } else {
      --result;
    }
  }
  return result;
}
148 
/**
 * @brief Move this iterator by `offset` characters.
 *
 * A positive offset steps forward; any other value steps backward by
 * `abs(offset)` characters.
 *
 * @param offset Number of characters to move.
 * @return Reference to this iterator.
 */
__device__ inline string_view::const_iterator& string_view::const_iterator::operator+=(
  string_view::const_iterator::difference_type offset)
{
  bool const forward = offset > 0;
  for (size_type remaining = abs(offset); remaining > 0; --remaining) {
    if (forward) {
      operator++();
    } else {
      operator--();
    }
  }
  return *this;
}
157 
/**
 * @brief Pre-decrement: move the iterator back by one character.
 *
 * When the byte position is greater than 0 it is stepped back one byte at a
 * time until `bytes_in_utf8_byte` reports a non-zero width for the byte it
 * lands on. The character position is always decremented.
 *
 * @return Reference to this iterator.
 */
__device__ inline string_view::const_iterator& string_view::const_iterator::operator--()
{
  if (byte_pos > 0) {
    do {
      --byte_pos;
    } while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[byte_pos])) == 0);
  }
  --char_pos;
  return *this;
}
166 
/**
 * @brief Post-decrement: move the iterator back by one character.
 *
 * @return A copy of the iterator as it was before moving.
 */
__device__ inline string_view::const_iterator string_view::const_iterator::operator--(int)
{
  string_view::const_iterator previous(*this);
  --(*this);
  return previous;
}
173 
/**
 * @brief Move this iterator back by `offset` characters.
 *
 * A positive offset steps backward; any other value steps forward by
 * `abs(offset)` characters.
 *
 * @param offset Number of characters to move.
 * @return Reference to this iterator.
 */
__device__ inline string_view::const_iterator& string_view::const_iterator::operator-=(
  string_view::const_iterator::difference_type offset)
{
  bool const backward = offset > 0;
  for (size_type remaining = abs(offset); remaining > 0; --remaining) {
    if (backward) {
      operator--();
    } else {
      operator++();
    }
  }
  return *this;
}
182 
/**
 * @brief Return a copy of this iterator moved back by `offset` characters.
 *
 * A positive offset steps the copy backward; any other value steps it forward
 * by `abs(offset)` characters.
 *
 * @param offset Number of characters to move.
 * @return The moved iterator; this iterator is not modified.
 */
__device__ inline string_view::const_iterator string_view::const_iterator::operator-(
  string_view::const_iterator::difference_type offset)
{
  const_iterator result(*this);
  bool const backward = offset > 0;
  for (size_type remaining = abs(offset); remaining > 0; --remaining) {
    if (backward) {
      --result;
    } else {
      ++result;
    }
  }
  return result;
}
192 
/**
 * @brief Equality: same data pointer and same character position.
 *
 * @param rhs Iterator to compare against.
 * @return true when both the data pointers and the character positions match.
 */
__device__ inline bool string_view::const_iterator::operator==(
  const string_view::const_iterator& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos == rhs.char_pos;
}
198 
/**
 * @brief Inequality: different data pointer or different character position.
 *
 * @param rhs Iterator to compare against.
 * @return true when the data pointers or the character positions differ.
 */
__device__ inline bool string_view::const_iterator::operator!=(
  const string_view::const_iterator& rhs) const
{
  bool const same = (p == rhs.p) && (char_pos == rhs.char_pos);
  return !same;
}
204 
/**
 * @brief Less-than on character position.
 *
 * @param rhs Iterator to compare against.
 * @return true only when both iterators share the same data pointer and this
 *         iterator's character position is less than that of `rhs`.
 */
__device__ inline bool string_view::const_iterator::operator<(
  const string_view::const_iterator& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos < rhs.char_pos;
}
210 
/**
 * @brief Less-than-or-equal on character position.
 *
 * @param rhs Iterator to compare against.
 * @return true only when both iterators share the same data pointer and this
 *         iterator's character position is not greater than that of `rhs`.
 */
__device__ inline bool string_view::const_iterator::operator<=(
  const string_view::const_iterator& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos <= rhs.char_pos;
}
216 
/**
 * @brief Greater-than on character position.
 *
 * @param rhs Iterator to compare against.
 * @return true only when both iterators share the same data pointer and this
 *         iterator's character position is greater than that of `rhs`.
 */
__device__ inline bool string_view::const_iterator::operator>(
  const string_view::const_iterator& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos > rhs.char_pos;
}
222 
/**
 * @brief Greater-than-or-equal on character position.
 *
 * @param rhs Iterator to compare against.
 * @return true only when both iterators share the same data pointer and this
 *         iterator's character position is not less than that of `rhs`.
 */
__device__ inline bool string_view::const_iterator::operator>=(
  const string_view::const_iterator& rhs) const
{
  if (p != rhs.p) { return false; }
  return char_pos >= rhs.char_pos;
}
228 
/**
 * @brief Dereference: decode the character at the iterator's byte position.
 *
 * @return The value produced by `to_char_utf8` for the bytes starting at the
 *         current byte offset.
 */
__device__ inline char_utf8 string_view::const_iterator::operator*() const
{
  char_utf8 decoded  = 0;
  auto const current = p + byte_offset();
  strings::detail::to_char_utf8(current, decoded);
  return decoded;
}
235 
/**
 * @brief Return the iterator's current character position.
 */
__device__ inline size_type string_view::const_iterator::position() const
{
  return char_pos;
}
237 
/**
 * @brief Return the iterator's current byte position.
 */
__device__ inline size_type string_view::const_iterator::byte_offset() const
{
  return byte_pos;
}
239 
/**
 * @brief Return a new iterator positioned at character 0 of this string.
 */
__device__ inline string_view::const_iterator string_view::begin() const
{
  const_iterator first(*this, 0);
  return first;
}
244 
/**
 * @brief Return a new iterator positioned at character `length()`, i.e. one
 *        past the last character of this string.
 */
__device__ inline string_view::const_iterator string_view::end() const
{
  size_type const nchars = length();
  const_iterator last(*this, nchars);
  return last;
}
249 // @endcond
250 
/**
 * @brief Return the single UTF-8 character at the given character position.
 *
 * @param pos Character position to read.
 * @return The decoded character, or 0 when the byte offset for `pos` is not
 *         less than the string's byte size.
 */
__device__ inline char_utf8 string_view::operator[](size_type pos) const
{
  size_type const offset = byte_offset(pos);
  if (offset < _bytes) {
    char_utf8 decoded = 0;
    strings::detail::to_char_utf8(data() + offset, decoded);
    return decoded;
  }
  return 0;
}
259 
/**
 * @brief Return the byte offset from data() for a given character position.
 *
 * When the character count equals the byte count, `pos` is returned unchanged.
 * Otherwise the bytes are scanned from the start: each byte for which
 * `bytes_in_utf8_byte` is non-zero consumes one character of `pos` and adds
 * its width to the offset, until `pos` reaches 0 or the bytes are exhausted.
 *
 * @param pos Character position.
 * @return Byte offset corresponding to `pos`.
 */
__device__ inline size_type string_view::byte_offset(size_type pos) const
{
  if (length() == size_bytes()) { return pos; }
  size_type offset = 0;
  auto const last  = _data + _bytes;
  for (auto itr = _data; (pos > 0) && (itr < last); ++itr) {
    size_type const width = strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(*itr));
    if (width != 0) { --pos; }
    offset += width;
  }
  return offset;
}
273 
/**
 * @brief Compare this string with another string_view.
 *
 * Forwards to the pointer/size overload using the other string's data and
 * byte size.
 *
 * @param in String to compare against.
 * @return The result of `compare(in.data(), in.size_bytes())`.
 */
__device__ inline int string_view::compare(const string_view& in) const
{
  auto const other_data  = in.data();
  auto const other_bytes = in.size_bytes();
  return compare(other_data, other_bytes);
}
278 
/**
 * @brief Compare this string with a byte array, byte by byte as unsigned values.
 *
 * @param data Pointer to the bytes to compare against.
 * @param bytes Number of bytes in `data`.
 * @return 0 when both refer to the same pointer with the same size, or when all
 *         bytes match and the sizes are equal; the difference of the first pair
 *         of differing bytes otherwise; 1 when this string is longer and shares
 *         the common prefix; -1 when `data` is longer and shares the prefix.
 */
__device__ inline int string_view::compare(const char* data, size_type bytes) const
{
  size_type const this_bytes = size_bytes();
  auto const lhs             = reinterpret_cast<const unsigned char*>(this->data());
  auto const rhs             = reinterpret_cast<const unsigned char*>(data);
  if ((lhs == rhs) && (bytes == this_bytes)) { return 0; }
  size_type idx = 0;
  while ((idx < this_bytes) && (idx < bytes)) {
    auto const diff = static_cast<int32_t>(lhs[idx]) - static_cast<int32_t>(rhs[idx]);
    if (diff != 0) { return diff; }
    ++idx;
  }
  // the common prefix matches, so the longer string orders last
  if (idx < this_bytes) { return 1; }
  return (idx < bytes) ? -1 : 0;
}
295 
/**
 * @brief Returns true if rhs matches this string exactly.
 *
 * The byte sizes are checked first; `compare` is only called when they agree.
 *
 * @param rhs String to compare against.
 */
__device__ inline bool string_view::operator==(const string_view& rhs) const
{
  if (size_bytes() != rhs.size_bytes()) { return false; }
  return compare(rhs) == 0;
}
300 
/**
 * @brief Returns true if rhs does not match this string.
 *
 * @param rhs String to compare against.
 */
__device__ inline bool string_view::operator!=(const string_view& rhs) const
{
  int const rc = compare(rhs);
  return rc != 0;
}
305 
/**
 * @brief Returns true if this string is ordered before rhs.
 *
 * @param rhs String to compare against.
 */
__device__ inline bool string_view::operator<(const string_view& rhs) const
{
  int const rc = compare(rhs);
  return rc < 0;
}
310 
/**
 * @brief Returns true if rhs is ordered before this string.
 *
 * @param rhs String to compare against.
 */
__device__ inline bool string_view::operator>(const string_view& rhs) const
{
  int const rc = compare(rhs);
  return rc > 0;
}
315 
/**
 * @brief Returns true if this string matches or is ordered before rhs.
 *
 * @param rhs String to compare against.
 */
__device__ inline bool string_view::operator<=(const string_view& rhs) const
{
  return compare(rhs) <= 0;
}
321 
/**
 * @brief Returns true if rhs matches or is ordered before this string.
 *
 * @param rhs String to compare against.
 */
__device__ inline bool string_view::operator>=(const string_view& rhs) const
{
  return compare(rhs) >= 0;
}
327 
/**
 * @brief Find the first occurrence of `str` in this string.
 *
 * Forwards to the pointer/size overload of `find`.
 *
 * @param str Target string to search for.
 * @param pos Character position at which to start the search.
 * @param count Number of characters from `pos` to include in the search.
 * @return Whatever the pointer/size overload returns for the same arguments.
 */
__device__ inline size_type string_view::find(const string_view& str,
                                              size_type pos,
                                              size_type count) const
{
  auto const target       = str.data();
  auto const target_bytes = str.size_bytes();
  return find(target, target_bytes, pos, count);
}
334 
/**
 * @brief Shared implementation of `find` and `rfind`.
 *
 * Searches the byte range covering characters [pos, min(pos + count, length()))
 * for the byte sequence `str`. With `forward == true` candidates are tried from
 * the start of the range moving up; otherwise from the last possible start
 * moving down.
 *
 * @tparam forward true to search front-to-back, false to search back-to-front.
 * @param str Bytes to search for.
 * @param bytes Number of bytes in `str`.
 * @param pos Character position where the search range starts.
 * @param count Number of characters in the search range; a negative value is
 *              replaced by the string's character count.
 * @return Character position of the match, or `npos` when `str` is null,
 *         `pos` is negative, or no match is found.
 */
template <bool forward>
__device__ inline size_type string_view::find_impl(const char* str,
                                                   size_type bytes,
                                                   size_type pos,
                                                   size_type count) const
{
  if ((str == nullptr) || (pos < 0)) { return npos; }
  auto const nchars = length();
  if (count < 0) { count = nchars; }
  auto const spos = byte_offset(pos);
  auto const epos = byte_offset(std::min(pos + count, nchars));

  // number of candidate start offsets inside [spos, epos)
  auto const find_length = (epos - spos) - bytes + 1;

  // byte offset of the candidate currently being tested
  size_type current = forward ? spos : (epos - bytes);
  for (size_type idx = 0; idx < find_length; ++idx) {
    auto const candidate = data() + current;
    bool differs         = false;
    for (size_type jdx = 0; (jdx < bytes) && !differs; ++jdx) {
      differs = (candidate[jdx] != str[jdx]);
    }
    if (!differs) { return character_offset(current); }
    current += forward ? 1 : -1;
  }
  return npos;
}
360 
/**
 * @brief Find the first occurrence of a byte sequence in this string.
 *
 * Runs `find_impl` in its forward direction.
 *
 * @param str Bytes to search for.
 * @param bytes Number of bytes in `str`.
 * @param pos Character position at which to start the search.
 * @param count Number of characters from `pos` to include in the search.
 * @return The value returned by `find_impl<true>`.
 */
__device__ inline size_type string_view::find(const char* str,
                                              size_type bytes,
                                              size_type pos,
                                              size_type count) const
{
  constexpr bool search_forward = true;
  return find_impl<search_forward>(str, bytes, pos, count);
}
368 
/**
 * @brief Find the first occurrence of a single character in this string.
 *
 * The character is written to a local byte buffer with `from_char_utf8` and
 * the pointer/size overload of `find` is called on that buffer.
 *
 * @param chr Character to search for.
 * @param pos Character position at which to start the search.
 * @param count Number of characters from `pos` to include in the search.
 * @return The value returned by the pointer/size overload of `find`.
 */
__device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size_type count) const
{
  char encoded[sizeof(char_utf8)];
  size_type const width = strings::detail::from_char_utf8(chr, encoded);
  return find(encoded, width, pos, count);
}
375 
/**
 * @brief Find the last occurrence of `str` in this string.
 *
 * Forwards to the pointer/size overload of `rfind`.
 *
 * @param str Target string to search for.
 * @param pos Character position where the search range starts.
 * @param count Number of characters from `pos` to include in the search.
 * @return Whatever the pointer/size overload returns for the same arguments.
 */
__device__ inline size_type string_view::rfind(const string_view& str,
                                               size_type pos,
                                               size_type count) const
{
  auto const target       = str.data();
  auto const target_bytes = str.size_bytes();
  return rfind(target, target_bytes, pos, count);
}
382 
/**
 * @brief Find the last occurrence of a byte sequence in this string.
 *
 * Runs `find_impl` in its backward direction.
 *
 * @param str Bytes to search for.
 * @param bytes Number of bytes in `str`.
 * @param pos Character position where the search range starts.
 * @param count Number of characters from `pos` to include in the search.
 * @return The value returned by `find_impl<false>`.
 */
__device__ inline size_type string_view::rfind(const char* str,
                                               size_type bytes,
                                               size_type pos,
                                               size_type count) const
{
  constexpr bool search_forward = false;
  return find_impl<search_forward>(str, bytes, pos, count);
}
390 
/**
 * @brief Find the last occurrence of a single character in this string.
 *
 * The character is written to a local byte buffer with `from_char_utf8` and
 * the pointer/size overload of `rfind` is called on that buffer.
 *
 * @param chr Character to search for.
 * @param pos Character position where the search range starts.
 * @param count Number of characters from `pos` to include in the search.
 * @return The value returned by the pointer/size overload of `rfind`.
 */
__device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, size_type count) const
{
  char encoded[sizeof(char_utf8)];
  size_type const width = strings::detail::from_char_utf8(chr, encoded);
  return rfind(encoded, width, pos, count);
}
397 
/**
 * @brief Return a sub-string of this string.
 *
 * Both parameters are character position values, not byte values. The end of
 * the requested range is clamped to the byte size of this string. The returned
 * view points into this string's data and does not copy it.
 *
 * @param pos Character position at which the sub-string starts.
 * @param length Number of characters to include.
 * @return A view over the selected bytes, or an empty string_view when the
 *         computed start offset is not before the computed end offset.
 */
__device__ inline string_view string_view::substr(size_type pos, size_type length) const
{
  size_type spos = byte_offset(pos);
  size_type epos = byte_offset(pos + length);
  if (epos > size_bytes()) epos = size_bytes();
  if (spos >= epos) return string_view("", 0);
  return string_view(data() + spos, epos - spos);
}
407 
/**
 * @brief Return the character position for a given byte offset.
 *
 * When the character count equals the byte count, `bytepos` is returned
 * unchanged. Otherwise the characters in the first `bytepos` bytes are counted
 * with `characters_in_string`.
 *
 * @param bytepos Byte offset from data().
 * @return Character position corresponding to `bytepos`.
 */
__device__ inline size_type string_view::character_offset(size_type bytepos) const
{
  bool const one_byte_per_char = (length() == size_bytes());
  if (one_byte_per_char) { return bytepos; }
  return strings::detail::characters_in_string(data(), bytepos);
}
413 
414 } // namespace cudf
cudf::string_view::substr
string_view substr(size_type start, size_type length) const
Return a sub-string of this string. The original string and device memory must still be maintained fo...
Definition: string_view.cuh:399
cudf::string_view::end
const_iterator end() const
Return new iterator pointing past the end of this string.
numeric::operator>=
CUDF_HOST_DEVICE bool operator>=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
Definition: fixed_point.hpp:796
cudf::char_utf8
uint32_t char_utf8
UTF-8 characters are 1-4 bytes.
Definition: string_view.hpp:31
string_view.hpp
Class definition for cudf::string_view.
numeric::operator+
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator+(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
Definition: fixed_point.hpp:703
cudf::size_type
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:80
cudf::string_view::byte_offset
size_type byte_offset(size_type pos) const
Return the byte offset from data() for a given character position.
Definition: string_view.cuh:260
cudf::string_view
A non-owning, immutable view of device data that is a variable length char array representing a UTF-8...
Definition: string_view.hpp:44
cudf::string_view::size_bytes
CUDF_HOST_DEVICE size_type size_bytes() const
Return the number of bytes in this string.
Definition: string_view.hpp:51
cudf::string_view::operator<=
bool operator<=(const string_view &rhs) const
Returns true if this string matches or is ordered before rhs.
Definition: string_view.cuh:316
cudf::string_view::operator[]
char_utf8 operator[](size_type pos) const
Return single UTF-8 character at the given character position.
Definition: string_view.cuh:251
cudf::string_view::operator<
bool operator<(const string_view &rhs) const
Returns true if this string is ordered before rhs.
Definition: string_view.cuh:306
cudf::string_view::operator>
bool operator>(const string_view &rhs) const
Returns true if rhs is ordered before this string.
Definition: string_view.cuh:311
cudf::string_view::rfind
size_type rfind(const string_view &str, size_type pos=0, size_type count=-1) const
Returns the character position of the last occurrence where the argument str is found in this string ...
Definition: string_view.cuh:376
cudf::string_view::string_view
CUDF_HOST_DEVICE string_view()
Default constructor represents an empty string.
Definition: string_view.hpp:332
cudf::string_view::npos
static cudf::size_type const npos
No-position value.
Definition: string_view.hpp:366
cudf::string_view::compare
int compare(const string_view &str) const
Comparing target string with this string. Each character is compared as a UTF-8 code-point value.
Definition: string_view.cuh:274
cudf::string_view::operator!=
bool operator!=(const string_view &rhs) const
Returns true if rhs does not match this string.
Definition: string_view.cuh:301
numeric::operator*
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator*(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
Definition: fixed_point.hpp:739
cudf::string_view::begin
const_iterator begin() const
Return new iterator pointing to the beginning of this string.
cudf
cuDF interfaces
Definition: aggregation.hpp:34
numeric::operator-
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator-(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
Definition: fixed_point.hpp:721
numeric::operator>
CUDF_HOST_DEVICE bool operator>(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
Definition: fixed_point.hpp:814
cudf::string_view::length
size_type length() const
Return the number of characters in this string.
Definition: string_view.cuh:110
CUDF_CUDA_TRY
#define CUDF_CUDA_TRY(call)
Error checking macro for CUDA runtime API functions.
Definition: error.hpp:209
cudf::string_view::max
static CUDF_HOST_DEVICE string_view max()
Return maximum value associated with the string type.
Definition: string_view.cuh:98
cudf::string_view::operator==
bool operator==(const string_view &rhs) const
Returns true if rhs matches this string exactly.
Definition: string_view.cuh:296
numeric::operator==
CUDF_HOST_DEVICE bool operator==(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
Definition: fixed_point.hpp:769
cudf::string_view::find
size_type find(const string_view &str, size_type pos=0, size_type count=-1) const
Returns the character position of the first occurrence where the argument str is found in this string...
Definition: string_view.cuh:328
cudf::string_view::data
CUDF_HOST_DEVICE const char * data() const
Return a pointer to the internal device array.
Definition: string_view.hpp:63
numeric::operator<
CUDF_HOST_DEVICE bool operator<(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
Definition: fixed_point.hpp:805
cudf::string_view::min
static CUDF_HOST_DEVICE string_view min()
Return minimum value associated with the string type.
Definition: string_view.cuh:87
numeric::operator<=
CUDF_HOST_DEVICE bool operator<=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
Definition: fixed_point.hpp:787
numeric::operator!=
CUDF_HOST_DEVICE bool operator!=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
Definition: fixed_point.hpp:778
error.hpp
cudf::string_view::operator>=
bool operator>=(const string_view &rhs) const
Returns true if rhs matches or is ordered before this string.
Definition: string_view.cuh:322