string_view.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2019-2021, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
17 
18 #include <cuda_runtime.h>
19 #include <cstddef>
20 #include <cudf/types.hpp>
21 #include <iterator>
22 
28 namespace cudf {
29 
30 using char_utf8 = uint32_t;
31 
37 constexpr cudf::size_type UNKNOWN_STRING_LENGTH{-1};
38 
44 constexpr int8_t UNKNOWN_CHAR_WIDTH{-1};
45 
50 constexpr int8_t VARIABLE_CHAR_WIDTH{0};
51 
63 class string_view {
64  public:
68  CUDA_HOST_DEVICE_CALLABLE size_type size_bytes() const { return _bytes; }
72  CUDA_DEVICE_CALLABLE size_type length() const;
76  CUDA_HOST_DEVICE_CALLABLE const char* data() const { return _data; }
77 
81  CUDA_HOST_DEVICE_CALLABLE bool empty() const { return size_bytes() == 0; }
82 
87  public:
88  using difference_type = ptrdiff_t;
89  using value_type = char_utf8;
90  using reference = char_utf8&;
91  using pointer = char_utf8*;
92  using iterator_category = std::input_iterator_tag;
93  CUDA_DEVICE_CALLABLE const_iterator(const string_view& str, size_type pos);
94  const_iterator(const const_iterator& mit) = default;
95  const_iterator(const_iterator&& mit) = default;
96  const_iterator& operator=(const const_iterator&) = default;
97  const_iterator& operator=(const_iterator&&) = default;
98  CUDA_DEVICE_CALLABLE const_iterator& operator++();
99  CUDA_DEVICE_CALLABLE const_iterator operator++(int);
100  CUDA_DEVICE_CALLABLE const_iterator& operator+=(difference_type);
101  CUDA_DEVICE_CALLABLE const_iterator operator+(difference_type);
102  CUDA_DEVICE_CALLABLE const_iterator& operator--();
103  CUDA_DEVICE_CALLABLE const_iterator operator--(int);
104  CUDA_DEVICE_CALLABLE const_iterator& operator-=(difference_type);
105  CUDA_DEVICE_CALLABLE const_iterator operator-(difference_type);
106  CUDA_DEVICE_CALLABLE bool operator==(const const_iterator&) const;
107  CUDA_DEVICE_CALLABLE bool operator!=(const const_iterator&) const;
108  CUDA_DEVICE_CALLABLE bool operator<(const const_iterator&) const;
109  CUDA_DEVICE_CALLABLE bool operator<=(const const_iterator&) const;
110  CUDA_DEVICE_CALLABLE bool operator>(const const_iterator&) const;
111  CUDA_DEVICE_CALLABLE bool operator>=(const const_iterator&) const;
112  CUDA_DEVICE_CALLABLE char_utf8 operator*() const;
113  CUDA_DEVICE_CALLABLE size_type position() const;
114  CUDA_DEVICE_CALLABLE size_type byte_offset() const;
115 
116  private:
117  const char* p{};
118  size_type bytes{};
119  size_type char_pos{};
120  size_type byte_pos{};
121  };
122 
126  CUDA_DEVICE_CALLABLE const_iterator begin() const;
130  CUDA_DEVICE_CALLABLE const_iterator end() const;
131 
137  CUDA_DEVICE_CALLABLE char_utf8 operator[](size_type pos) const;
143  CUDA_DEVICE_CALLABLE size_type byte_offset(size_type pos) const;
144 
158  CUDA_DEVICE_CALLABLE int compare(const string_view& str) const;
173  CUDA_DEVICE_CALLABLE int compare(const char* str, size_type bytes) const;
174 
178  CUDA_DEVICE_CALLABLE bool operator==(const string_view& rhs) const;
182  CUDA_DEVICE_CALLABLE bool operator!=(const string_view& rhs) const;
186  CUDA_DEVICE_CALLABLE bool operator<(const string_view& rhs) const;
190  CUDA_DEVICE_CALLABLE bool operator>(const string_view& rhs) const;
194  CUDA_DEVICE_CALLABLE bool operator<=(const string_view& rhs) const;
198  CUDA_DEVICE_CALLABLE bool operator>=(const string_view& rhs) const;
199 
210  CUDA_DEVICE_CALLABLE size_type find(const string_view& str,
211  size_type pos = 0,
212  size_type count = -1) const;
224  CUDA_DEVICE_CALLABLE size_type find(const char* str,
225  size_type bytes,
226  size_type pos = 0,
227  size_type count = -1) const;
238  CUDA_DEVICE_CALLABLE size_type find(char_utf8 character,
239  size_type pos = 0,
240  size_type count = -1) const;
251  CUDA_DEVICE_CALLABLE size_type rfind(const string_view& str,
252  size_type pos = 0,
253  size_type count = -1) const;
265  CUDA_DEVICE_CALLABLE size_type rfind(const char* str,
266  size_type bytes,
267  size_type pos = 0,
268  size_type count = -1) const;
279  CUDA_DEVICE_CALLABLE size_type rfind(char_utf8 character,
280  size_type pos = 0,
281  size_type count = -1) const;
282 
291  CUDA_DEVICE_CALLABLE string_view substr(size_type start, size_type length) const;
292 
301  CUDA_HOST_DEVICE_CALLABLE static string_view min();
302 
312  CUDA_HOST_DEVICE_CALLABLE static string_view max();
313 
317  CUDA_HOST_DEVICE_CALLABLE string_view() : _data(""), _bytes(0), _length(0), _char_width(0) {}
318 
325  CUDA_HOST_DEVICE_CALLABLE string_view(const char* data, size_type bytes)
326  : _data(data), _bytes(bytes), _length(UNKNOWN_STRING_LENGTH), _char_width(UNKNOWN_CHAR_WIDTH)
327  {
328  }
329 
330  string_view(const string_view&) = default;
331  string_view(string_view&&) = default;
332  ~string_view() = default;
333  string_view& operator=(const string_view&) = default;
334  string_view& operator=(string_view&&) = default;
335 
336  private:
337  const char* _data{};
338  size_type _bytes{};
339  mutable size_type _length{};
340  mutable int8_t _char_width{};
341 
348  CUDA_DEVICE_CALLABLE size_type character_offset(size_type bytepos) const;
349 };
350 
351 namespace strings {
352 namespace detail {
353 
/**
 * @brief This will return true if passed the first byte of a UTF-8 character.
 *
 * @param byte Any single byte of a UTF-8 encoded string.
 * @return true if `byte` is not a continuation byte.
 */
constexpr bool is_begin_utf8_char(uint8_t byte)
{
  // A continuation byte has the form 10xxxxxx, so its top two bits are binary 10.
  return (byte >> 6) != 0x2;
}
365 
372 constexpr size_type bytes_in_char_utf8(char_utf8 character)
373 {
374  return 1 + static_cast<size_type>((character & unsigned{0x0000FF00}) > 0) +
375  static_cast<size_type>((character & unsigned{0x00FF0000}) > 0) +
376  static_cast<size_type>((character & unsigned{0xFF000000}) > 0);
377 }
378 
389 constexpr size_type bytes_in_utf8_byte(uint8_t byte)
390 {
391  return 1 + static_cast<size_type>((byte & 0xF0) == 0xF0) // 4-byte character prefix
392  + static_cast<size_type>((byte & 0xE0) == 0xE0) // 3-byte character prefix
393  + static_cast<size_type>((byte & 0xC0) == 0xC0) // 2-byte character prefix
394  - static_cast<size_type>((byte & 0xC0) == 0x80); // intermediate byte
395 }
396 
404 CUDA_HOST_DEVICE_CALLABLE size_type to_char_utf8(const char* str, char_utf8& character)
405 {
406  size_type const chr_width = bytes_in_utf8_byte(static_cast<uint8_t>(*str));
407 
408  character = static_cast<char_utf8>(*str++) & 0xFF;
409  if (chr_width > 1) {
410  character = character << 8;
411  character |= (static_cast<char_utf8>(*str++) & 0xFF); // << 8;
412  if (chr_width > 2) {
413  character = character << 8;
414  character |= (static_cast<char_utf8>(*str++) & 0xFF); // << 16;
415  if (chr_width > 3) {
416  character = character << 8;
417  character |= (static_cast<char_utf8>(*str++) & 0xFF); // << 24;
418  }
419  }
420  }
421  return chr_width;
422 }
423 
431 CUDA_HOST_DEVICE_CALLABLE size_type from_char_utf8(char_utf8 character, char* str)
432 {
433  size_type const chr_width = bytes_in_char_utf8(character);
434  for (size_type idx = 0; idx < chr_width; ++idx) {
435  str[chr_width - idx - 1] = static_cast<char>(character) & 0xFF;
436  character = character >> 8;
437  }
438  return chr_width;
439 }
440 
441 } // namespace detail
442 } // namespace strings
443 } // namespace cudf
cudf::string_view::operator<=
CUDA_DEVICE_CALLABLE bool operator<=(const string_view &rhs) const
Returns true if this string matches or is ordered before rhs.
Definition: string_view.cuh:304
cudf::string_view::size_bytes
CUDA_HOST_DEVICE_CALLABLE size_type size_bytes() const
Return the number of bytes in this string.
Definition: string_view.hpp:68
cudf::string_view::end
CUDA_DEVICE_CALLABLE const_iterator end() const
Return new iterator pointing past the end of this string.
Definition: string_view.cuh:235
cudf::char_utf8
uint32_t char_utf8
UTF-8 characters are 1-4 bytes.
Definition: string_view.hpp:30
cudf::string_view::min
static CUDA_HOST_DEVICE_CALLABLE string_view min()
Return minimum value associated with the string type.
Definition: string_view.cuh:69
cudf::strings::detail::bytes_in_utf8_byte
constexpr size_type bytes_in_utf8_byte(uint8_t byte)
Returns the number of bytes used to represent the provided byte.
Definition: string_view.hpp:389
cudf::strings::detail::from_char_utf8
CUDA_HOST_DEVICE_CALLABLE size_type from_char_utf8(char_utf8 character, char *str)
Place a char_utf8 value into a char array.
Definition: string_view.hpp:431
types.hpp
Type declarations for libcudf.
cudf::string_view::rfind
CUDA_DEVICE_CALLABLE size_type rfind(const string_view &str, size_type pos=0, size_type count=-1) const
Returns the character position of the last occurrence where the argument str is found in this string ...
Definition: string_view.cuh:358
cudf::strings::detail::bytes_in_char_utf8
constexpr size_type bytes_in_char_utf8(char_utf8 character)
Returns the number of bytes in the specified character.
Definition: string_view.hpp:372
cudf::string_view
A non-owning, immutable view of device data that is a variable length char array representing a UTF-8...
Definition: string_view.hpp:63
cudf::string_view::operator==
CUDA_DEVICE_CALLABLE bool operator==(const string_view &rhs) const
Returns true if rhs matches this string exactly.
Definition: string_view.cuh:284
cudf::string_view::operator>=
CUDA_DEVICE_CALLABLE bool operator>=(const string_view &rhs) const
Returns true if rhs matches or is ordered before this string.
Definition: string_view.cuh:310
cudf::string_view::string_view
CUDA_HOST_DEVICE_CALLABLE string_view()
Default constructor represents an empty string.
Definition: string_view.hpp:317
cudf::strings::detail::is_begin_utf8_char
constexpr bool is_begin_utf8_char(uint8_t byte)
This will return true if passed the first byte of a UTF-8 character.
Definition: string_view.hpp:360
cudf::string_view::operator!=
CUDA_DEVICE_CALLABLE bool operator!=(const string_view &rhs) const
Returns true if rhs does not match this string.
Definition: string_view.cuh:289
cudf::UNKNOWN_STRING_LENGTH
constexpr cudf::size_type UNKNOWN_STRING_LENGTH
The string length is initialized to this value as a place-holder.
Definition: string_view.hpp:37
cudf::VARIABLE_CHAR_WIDTH
constexpr int8_t VARIABLE_CHAR_WIDTH
This value is assigned to the _char_width member if the string contains characters of different width...
Definition: string_view.hpp:50
cudf::strings::detail::to_char_utf8
CUDA_HOST_DEVICE_CALLABLE size_type to_char_utf8(const char *str, char_utf8 &character)
Convert a char array into a char_utf8 value.
Definition: string_view.hpp:404
cudf::string_view::operator<
CUDA_DEVICE_CALLABLE bool operator<(const string_view &rhs) const
Returns true if this string is ordered before rhs.
Definition: string_view.cuh:294
cudf::string_view::operator>
CUDA_DEVICE_CALLABLE bool operator>(const string_view &rhs) const
Returns true if rhs is ordered before this string.
Definition: string_view.cuh:299
cudf
cuDF interfaces
Definition: aggregation.hpp:34
cudf::string_view::compare
CUDA_DEVICE_CALLABLE int compare(const string_view &str) const
Comparing target string with this string. Each character is compared as a UTF-8 code-point value.
Definition: string_view.cuh:263
cudf::string_view::substr
CUDA_DEVICE_CALLABLE string_view substr(size_type start, size_type length) const
Return a sub-string of this string. The original string and device memory must still be maintained fo...
Definition: string_view.cuh:400
cudf::string_view::const_iterator
Handy iterator for navigating through encoded characters.
Definition: string_view.hpp:86
cudf::string_view::operator[]
CUDA_DEVICE_CALLABLE char_utf8 operator[](size_type pos) const
Return single UTF-8 character at the given character position.
Definition: string_view.cuh:240
cudf::string_view::find
CUDA_DEVICE_CALLABLE size_type find(const string_view &str, size_type pos=0, size_type count=-1) const
Returns the character position of the first occurrence where the argument str is found in this string...
Definition: string_view.cuh:316
cudf::UNKNOWN_CHAR_WIDTH
constexpr int8_t UNKNOWN_CHAR_WIDTH
The char width is initialized to this value as a place-holder.
Definition: string_view.hpp:44
cudf::string_view::byte_offset
CUDA_DEVICE_CALLABLE size_type byte_offset(size_type pos) const
Return the byte offset from data() for a given character position.
Definition: string_view.cuh:249
cudf::string_view::max
static CUDA_HOST_DEVICE_CALLABLE string_view max()
Return maximum value associated with the string type.
Definition: string_view.cuh:80
cudf::string_view::length
CUDA_DEVICE_CALLABLE size_type length() const
Return the number of characters in this string.
Definition: string_view.cuh:91
cudf::string_view::data
CUDA_HOST_DEVICE_CALLABLE const char * data() const
Return a pointer to the internal device array.
Definition: string_view.hpp:76
cudf::string_view::empty
CUDA_HOST_DEVICE_CALLABLE bool empty() const
Return true if string has no characters.
Definition: string_view.hpp:81
cudf::string_view::begin
CUDA_DEVICE_CALLABLE const_iterator begin() const
Return new iterator pointing to the beginning of this string.
Definition: string_view.cuh:230