libcudf: subword_tokenize.hpp Source File

 /*

  * Copyright (c) 2020-2024, NVIDIA CORPORATION.

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

 #pragma once


 #include <cudf/column/column.hpp>

 #include <cudf/column/column_view.hpp>

 #include <cudf/strings/strings_column_view.hpp>

 #include <cudf/utilities/export.hpp>

 #include <cudf/utilities/memory_resource.hpp>


 namespace CUDF_EXPORT nvtext {


 /**
  * @brief The vocabulary data for use with the subword_tokenize function.
  *
  * Instances are produced by `load_vocabulary_file` and consumed (by const
  * reference) by `subword_tokenize`. The scalar members are value-initialized
  * to zero and the column members to null until populated.
  */
 struct hashed_vocabulary {
   // Token ids. NOTE(review): names suggest the first/separator/unknown
   // token ids of the vocabulary -- confirm against the loader.
   uint16_t first_token_id{};
   uint16_t separator_token_id{};
   uint16_t unknown_token_id{};

   // Hashing parameters. NOTE(review): presumably the `a`/`b` coefficients of
   // the outer hash and the number of hash bins -- confirm against the loader.
   uint32_t outer_hash_a{};
   uint32_t outer_hash_b{};
   uint16_t num_bins{};

   // Hash-table storage columns. NOTE(review): element types and layout are
   // not visible in this header -- see the libcudf API documentation.
   std::unique_ptr<cudf::column> table;
   std::unique_ptr<cudf::column> bin_coefficients;
   std::unique_ptr<cudf::column> bin_offsets;

   /// uint32 column, The code point metadata table to use for normalization
   std::unique_ptr<cudf::column> cp_metadata;

   /// uint64 column, The auxiliary code point table to use for normalization
   std::unique_ptr<cudf::column> aux_cp_table;
 };


 /**
  * @brief Load the hashed vocabulary file into device memory.
  *
  * The returned object is the vocabulary input expected by `subword_tokenize`.
  *
  * @param filename_hashed_vocabulary Name of the hashed vocabulary file to load.
  *        NOTE(review): the expected file format is not visible in this header.
  * @param stream CUDA stream argument; defaults to `cudf::get_default_stream()`.
  *        NOTE(review): presumably used for device allocations and kernel
  *        launches -- confirm against the implementation.
  * @param mr Device memory resource reference; defaults to
  *        `cudf::get_current_device_resource_ref()`. NOTE(review): presumably
  *        used to allocate the returned device memory -- confirm.
  * @return Owning pointer to the loaded `hashed_vocabulary`.
  */
 std::unique_ptr<hashed_vocabulary> load_vocabulary_file(

   std::string const& filename_hashed_vocabulary,

   rmm::cuda_stream_view stream      = cudf::get_default_stream(),

   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());


 /**
  * @brief Result object for the subword_tokenize functions.
  *
  * The scalar members are value-initialized to zero and the column members to
  * null until populated.
  */
 struct tokenizer_result {
   // NOTE(review): presumably the number of rows in the output tensors --
   // confirm against the implementation.
   uint32_t nrows_tensor{};

   // NOTE(review): presumably the number of token-ids per tensor row --
   // confirm against the implementation.
   uint32_t sequence_length{};

   /// A vector of token-ids for each row.
   std::unique_ptr<cudf::column> tensor_token_ids;

   /// This mask identifies which tensor-token-ids are valid.
   std::unique_ptr<cudf::column> tensor_attention_mask;

   /// The metadata for each tensor row.
   std::unique_ptr<cudf::column> tensor_metadata;
 };


 /**
  * @brief Creates a tokenizer that cleans the text, splits it into tokens and
  * returns token-ids from an input vocabulary.
  *
  * @param strings The input strings column to tokenize.
  * @param vocabulary_table The vocabulary data to use, e.g. as returned by
  *        `load_vocabulary_file`.
  * @param max_sequence_length NOTE(review): presumably the limit on the number
  *        of token-ids per output tensor row -- confirm against the libcudf
  *        API documentation.
  * @param stride NOTE(review): presumably controls how token-ids overlap
  *        between consecutive output rows of the same string -- confirm.
  * @param do_lower_case NOTE(review): presumably requests lower-casing of the
  *        input during text cleaning -- confirm.
  * @param do_truncate NOTE(review): presumably requests discarding token-ids
  *        beyond `max_sequence_length` instead of continuing on a new row --
  *        confirm.
  * @param stream CUDA stream argument; defaults to `cudf::get_default_stream()`.
  *        NOTE(review): presumably used for device allocations and kernel
  *        launches -- confirm against the implementation.
  * @param mr Device memory resource reference; defaults to
  *        `cudf::get_current_device_resource_ref()`. NOTE(review): presumably
  *        used to allocate the returned columns -- confirm.
  * @return A `tokenizer_result` holding the token-id, attention-mask and
  *         metadata columns together with their row count and sequence length.
  */
 tokenizer_result subword_tokenize(

   cudf::strings_column_view const& strings,

   hashed_vocabulary const& vocabulary_table,

   uint32_t max_sequence_length,

   uint32_t stride,

   bool do_lower_case,

   bool do_truncate,

   rmm::cuda_stream_view stream      = cudf::get_default_stream(),

   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

   // end of group

 }  // namespace CUDF_EXPORT nvtext

cudf::strings_column_view
Given a column-view of strings type, an instance of this class provides a wrapper on this compound column for strings operations.
Definition: strings_column_view.hpp:38

rmm::cuda_stream_view

column.hpp
Class definition for cudf::column.

column_view.hpp
column view class definitions

cudf::get_default_stream
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.

cudf::get_current_device_resource_ref
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
Definition: memory_resource.hpp:47

device_async_resource_ref
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref

nvtext::subword_tokenize
tokenizer_result subword_tokenize(cudf::strings_column_view const &strings, hashed_vocabulary const &vocabulary_table, uint32_t max_sequence_length, uint32_t stride, bool do_lower_case, bool do_truncate, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Creates a tokenizer that cleans the text, splits it into tokens and returns token-ids from an input vocabulary.

nvtext::load_vocabulary_file
std::unique_ptr< hashed_vocabulary > load_vocabulary_file(std::string const &filename_hashed_vocabulary, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Load the hashed vocabulary file into device memory.

memory_resource.hpp

nvtext
NVText APIs.
Definition: byte_pair_encoding.hpp:26

strings_column_view.hpp
Class definition for cudf::strings_column_view.

nvtext::hashed_vocabulary
The vocabulary data for use with the subword_tokenize function.
Definition: subword_tokenize.hpp:35

nvtext::hashed_vocabulary::bin_coefficients
std::unique_ptr< cudf::column > bin_coefficients
Definition: subword_tokenize.hpp:44

nvtext::hashed_vocabulary::aux_cp_table
std::unique_ptr< cudf::column > aux_cp_table
uint64 column, The auxiliary code point table to use for normalization
Definition: subword_tokenize.hpp:51

nvtext::hashed_vocabulary::bin_offsets
std::unique_ptr< cudf::column > bin_offsets
Definition: subword_tokenize.hpp:46

nvtext::hashed_vocabulary::table
std::unique_ptr< cudf::column > table
Definition: subword_tokenize.hpp:42

nvtext::hashed_vocabulary::cp_metadata
std::unique_ptr< cudf::column > cp_metadata
uint32 column, The code point metadata table to use for normalization
Definition: subword_tokenize.hpp:49

nvtext::tokenizer_result
Result object for the subword_tokenize functions.
Definition: subword_tokenize.hpp:77

nvtext::tokenizer_result::tensor_token_ids
std::unique_ptr< cudf::column > tensor_token_ids
A vector of token-ids for each row.
Definition: subword_tokenize.hpp:92

nvtext::tokenizer_result::tensor_metadata
std::unique_ptr< cudf::column > tensor_metadata
The metadata for each tensor row.
Definition: subword_tokenize.hpp:105

nvtext::tokenizer_result::tensor_attention_mask
std::unique_ptr< cudf::column > tensor_attention_mask
This mask identifies which tensor-token-ids are valid.
Definition: subword_tokenize.hpp:98