remote_handle.hpp
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 #pragma once
6 
7 #include <cassert>
8 #include <cstddef>
9 #include <cstring>
10 #include <memory>
11 #include <optional>
12 #include <string>
13 
14 #include <kvikio/defaults.hpp>
15 #include <kvikio/error.hpp>
16 #include <kvikio/threadpool_wrapper.hpp>
17 #include <kvikio/utils.hpp>
18 
// Forward declaration so this header can declare a `curl_slist*` member
// (S3Endpoint::_curl_header_list) without pulling in the header that defines the type.
// NOTE(review): presumably libcurl's string-list type -- confirm against the implementation file.
19 struct curl_slist;
20 
21 namespace kvikio {
22 
// Forward declaration only: the endpoint interfaces below take `CurlHandle&` parameters and do
// not need the full definition here. Per the generated reference, CurlHandle is the
// representation of a curl easy handle pointer and its operations, defined in libcurl.hpp.
23 class CurlHandle; // Forward declaration
24 
/**
 * @brief Types of remote file endpoints supported by KvikIO.
 *
 * The underlying type is uint8_t. No explicit values are assigned, so the enumerators take
 * consecutive values starting at 0 in declaration order.
 */
31 enum class RemoteEndpointType : uint8_t {
// Default used by RemoteEndpoint::_remote_endpoint_type and by RemoteHandle::open.
// NOTE(review): presumably means "detect the endpoint type from the URL" -- confirm.
32  AUTO,
// NOTE(review): presumably corresponds to S3Endpoint (credentialed S3 access) -- confirm.
34  S3,
// NOTE(review): presumably corresponds to S3PublicEndpoint -- confirm.
37  S3_PUBLIC,
// NOTE(review): no WebHDFS endpoint class is declared in this listing -- confirm where it lives.
42  WEBHDFS,
// NOTE(review): presumably corresponds to HttpEndpoint -- confirm.
44  HTTP,
46 };
47 
/**
 * @brief Selects the remote I/O backend.
 *
 * The underlying type is uint8_t and both enumerators carry explicit values (0 and 1).
 *
 * NOTE(review): the name of the enumerator assigned the value 0 is not present in this listing
 * (the line preceding `0,` is absent). Restore it from the original header before relying on
 * this definition.
 */
54 enum class RemoteIOBackend : uint8_t {
56  0,
// Backend selected by value 1; RemoteReactorDispatch refers to this backend by name.
60  MULTI_POLL =
61  1,
65 };
66 
/**
 * @brief How sub-ranges of a single pread() are distributed across reactor threads in
 * connection with the MULTI_POLL backend.
 *
 * The underlying type is uint8_t and both enumerators carry explicit values.
 *
 * NOTE(review): the summary in the generated reference is truncated after "MULTI_POLL
 * backend"; consult the full header comment for the exact conditions.
 */
74 enum class RemoteReactorDispatch : uint8_t {
// NOTE(review): presumably dispatches at the granularity of individual chunks -- confirm.
75  PER_CHUNK =
76  0,
// NOTE(review): presumably dispatches at the granularity of a whole pread() call -- confirm.
80  PER_PREAD =
81  1,
85 };
86 
// Members of RemoteEndpoint, the abstract base class for remote endpoints.
// NOTE(review): the `class RemoteEndpoint {` header line is not present in this listing; the
// class name is taken from the destructor below.
96  protected:
 // Type of this endpoint; default-initialized to RemoteEndpointType::AUTO. Protected, so derived
 // endpoint classes can access it directly.
97  RemoteEndpointType _remote_endpoint_type{RemoteEndpointType::AUTO};
99 
100  public:
 // Virtual so that derived endpoints are destroyed correctly through a base pointer
 // (RemoteHandle stores its endpoint as std::unique_ptr<RemoteEndpoint>).
101  virtual ~RemoteEndpoint() = default;
102 
 /**
  * @brief Set needed connection options on a curl handle.
  *
  * @param curl The curl handle to configure.
  */
110  virtual void setopt(CurlHandle& curl) = 0;
111 
 /**
  * @brief Get a description of this remote endpoint instance.
  *
  * @return A string description.
  */
117  virtual std::string str() const = 0;
118 
 /**
  * @brief Get the size of the remote file.
  *
  * Not const-qualified, so implementations are allowed to modify the endpoint.
  *
  * @return The file size.
  */
124  virtual std::size_t get_file_size() = 0;
125 
 /**
  * @brief Set up the range request in order to read part of a file given the file offset and
  * read size.
  *
  * @param curl The curl handle on which the range request is set up.
  * @param file_offset Offset into the file at which the read starts.
  * @param size Size of the read.
  */
130  virtual void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) = 0;
131 
 /**
  * @brief Get the type of the remote file.
  *
  * Non-virtual and noexcept.
  *
  * @return The endpoint type (presumably the value of `_remote_endpoint_type` -- confirm).
  */
137  [[nodiscard]] RemoteEndpointType remote_endpoint_type() const noexcept;
138 };
139 
/**
 * @brief A remote endpoint for HTTP/HTTPS resources.
 */
146 class HttpEndpoint : public RemoteEndpoint {
147  private:
 // URL string held by the endpoint (presumably the `url` given to the constructor -- confirm).
148  std::string _url;
149 
150  public:
 /**
  * @brief Create an HTTP endpoint from a URL.
  *
  * NOTE(review): this single-argument constructor is not `explicit`, unlike the single-string
  * constructors of S3PublicEndpoint and S3EndpointWithPresignedUrl, so a std::string converts
  * implicitly to HttpEndpoint. Confirm whether that asymmetry is intended.
  *
  * @param url URL of the remote resource.
  */
156  HttpEndpoint(std::string url);
157 
158  ~HttpEndpoint() override = default;
 // Overrides of the RemoteEndpoint interface:
 // Set needed connection options on a curl handle.
159  void setopt(CurlHandle& curl) override;
 // Get a description of this remote endpoint instance.
160  std::string str() const override;
 // Get the size of the remote file.
161  std::size_t get_file_size() override;
 // Set up the range request to read part of a file given the file offset and read size.
162  void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override;
163 
 /**
  * @brief Whether the given URL is valid for HTTP/HTTPS endpoints.
  *
  * Static and noexcept, so it can be called without constructing an endpoint.
  *
  * @param url The URL to check.
  * @return Boolean answer.
  */
170  static bool is_url_valid(std::string const& url) noexcept;
171 };
172 
/**
 * @brief A remote endpoint for AWS S3 storage requiring credentials.
 *
 * NOTE(review): the class holds a raw `curl_slist*` and has a user-declared, non-defaulted
 * destructor, but declares no copy or move operations in this header, so the implicit copy
 * operations would copy the raw pointer. If the destructor releases `_curl_header_list`,
 * copying an S3Endpoint would release it twice -- confirm against the implementation.
 */
179 class S3Endpoint : public RemoteEndpoint {
180  private:
 // URL string held by the endpoint.
181  std::string _url;
 // NOTE(review): presumably the AWS SigV4 signing parameter handed to curl -- confirm.
182  std::string _aws_sigv4;
 // NOTE(review): presumably the credential string handed to curl -- confirm.
183  std::string _aws_userpwd;
 // Raw pointer to a header list; value-initialized to null.
184  curl_slist* _curl_header_list{};
185 
186  public:
 /**
  * @brief Get the URL from an AWS S3 bucket and object name.
  *
  * @param bucket_name Name of the bucket.
  * @param object_name Name of the object.
  * @param aws_region Optional AWS region (no default: must be passed, may be std::nullopt).
  * @param aws_endpoint_url Optional endpoint URL (no default: must be passed, may be
  * std::nullopt).
  * @return The URL as a string.
  */
202  static std::string url_from_bucket_and_object(std::string bucket_name,
203  std::string object_name,
204  std::optional<std::string> aws_region,
205  std::optional<std::string> aws_endpoint_url);
206 
 /**
  * @brief Given a URL like "s3://<bucket>/<object>", return the name of the bucket and object.
  *
  * @param s3_url The S3 URL to parse.
  * @return A pair of strings (presumably ordered as bucket name, then object name -- confirm).
  */
215  [[nodiscard]] static std::pair<std::string, std::string> parse_s3_url(std::string const& s3_url);
216 
 /**
  * @brief Create an S3 endpoint from a URL.
  *
  * Every optional argument defaults to std::nullopt.
  * NOTE(review): how an absent value is resolved (e.g. from the environment) is not visible
  * in this header -- confirm in the implementation.
  *
  * @param url URL of the remote object.
  * @param aws_region Optional AWS region.
  * @param aws_access_key Optional AWS access key.
  * @param aws_secret_access_key Optional AWS secret access key.
  * @param aws_session_token Optional AWS session token.
  */
232  S3Endpoint(std::string url,
233  std::optional<std::string> aws_region = std::nullopt,
234  std::optional<std::string> aws_access_key = std::nullopt,
235  std::optional<std::string> aws_secret_access_key = std::nullopt,
236  std::optional<std::string> aws_session_token = std::nullopt);
237 
 /**
  * @brief Create an S3 endpoint from a bucket and object name.
  *
  * Every optional argument defaults to std::nullopt. Note that the parameter order differs
  * from the URL-based constructor: `aws_endpoint_url` comes before `aws_session_token` here.
  *
  * @param bucket_and_object_names Pair of strings naming the bucket and the object.
  * @param aws_region Optional AWS region.
  * @param aws_access_key Optional AWS access key.
  * @param aws_secret_access_key Optional AWS secret access key.
  * @param aws_endpoint_url Optional endpoint URL.
  * @param aws_session_token Optional AWS session token.
  */
255  S3Endpoint(std::pair<std::string, std::string> bucket_and_object_names,
256  std::optional<std::string> aws_region = std::nullopt,
257  std::optional<std::string> aws_access_key = std::nullopt,
258  std::optional<std::string> aws_secret_access_key = std::nullopt,
259  std::optional<std::string> aws_endpoint_url = std::nullopt,
260  std::optional<std::string> aws_session_token = std::nullopt);
261 
 // Declared here and defined elsewhere (not defaulted), unlike the other endpoint classes.
262  ~S3Endpoint() override;
 // Overrides of the RemoteEndpoint interface:
 // Set needed connection options on a curl handle.
263  void setopt(CurlHandle& curl) override;
 // Get a description of this remote endpoint instance.
264  std::string str() const override;
 // Get the size of the remote file.
265  std::size_t get_file_size() override;
 // Set up the range request to read part of a file given the file offset and read size.
266  void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override;
267 
 /**
  * @brief Whether the given URL is valid for S3 endpoints (excluding presigned URL).
  *
  * @param url The URL to check.
  * @return Boolean answer.
  */
274  static bool is_url_valid(std::string const& url) noexcept;
275 };
276 
// Members of S3PublicEndpoint: a remote endpoint for publicly accessible S3 objects without
// authentication.
// NOTE(review): the `class S3PublicEndpoint ...` header line is not present in this listing; the
// `override` specifiers below imply it derives from a base declaring these virtuals
// (presumably RemoteEndpoint -- confirm).
284  private:
 // URL string held by the endpoint.
285  std::string _url;
286 
287  public:
 // Construct from a URL. `explicit` prevents implicit conversion from std::string.
288  explicit S3PublicEndpoint(std::string url);
289 
290  ~S3PublicEndpoint() override = default;
 // Set needed connection options on a curl handle.
291  void setopt(CurlHandle& curl) override;
 // Get a description of this remote endpoint instance.
292  std::string str() const override;
 // Get the size of the remote file.
293  std::size_t get_file_size() override;
 // Set up the range request to read part of a file given the file offset and read size.
294  void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override;
295 
 /**
  * @brief Whether the given URL is valid for S3 public endpoints.
  *
  * @param url The URL to check.
  * @return Boolean answer.
  */
302  static bool is_url_valid(std::string const& url) noexcept;
303 };
304 
// Members of S3EndpointWithPresignedUrl: a remote endpoint for AWS S3 storage using presigned
// URLs.
// NOTE(review): the `class S3EndpointWithPresignedUrl ...` header line is not present in this
// listing; the `override` specifiers below imply it derives from a base declaring these
// virtuals (presumably RemoteEndpoint -- confirm).
312  private:
 // URL string held by the endpoint (presumably the presigned URL given to the constructor).
313  std::string _url;
314 
315  public:
 // Construct from a presigned URL. `explicit` prevents implicit conversion from std::string.
316  explicit S3EndpointWithPresignedUrl(std::string presigned_url);
317 
318  ~S3EndpointWithPresignedUrl() override = default;
 // Set needed connection options on a curl handle.
319  void setopt(CurlHandle& curl) override;
 // Get a description of this remote endpoint instance.
320  std::string str() const override;
 // Get the size of the remote file.
321  std::size_t get_file_size() override;
 // Set up the range request to read part of a file given the file offset and read size.
322  void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override;
323 
 /**
  * @brief Whether the given URL is valid for S3 endpoints with presigned URL.
  *
  * @param url The URL to check.
  * @return Boolean answer.
  */
330  static bool is_url_valid(std::string const& url) noexcept;
331 };
332 
// Members of RemoteHandle: handle of a remote file.
// NOTE(review): the `class RemoteHandle {` header line is not present in this listing.
337  private:
 // Owning pointer to the endpoint. Becomes null in a moved-from handle, because the move
 // operations below are defaulted.
338  std::unique_ptr<RemoteEndpoint> _endpoint;
 // NOTE(review): presumably the size of the remote file, as returned by nbytes() -- confirm.
339  std::size_t _nbytes;
340 
341  public:
 /**
  * @brief Create a remote file handle from a URL.
  *
  * NOTE(review): this listing omits one parameter line. The generated reference gives the
  * full signature as
  *   open(std::string url,
  *        RemoteEndpointType remote_endpoint_type = RemoteEndpointType::AUTO,
  *        std::optional<std::vector<RemoteEndpointType>> allow_list = std::nullopt,
  *        std::optional<std::size_t> nbytes = std::nullopt)
  * so `remote_endpoint_type` belongs between `url` and `allow_list`.
  *
  * @param url URL of the remote file.
  * @param allow_list Optional list of endpoint types; defaults to std::nullopt.
  * @param nbytes Optional file size; defaults to std::nullopt.
  * @return A RemoteHandle, returned by value.
  */
419  static RemoteHandle open(std::string url,
421  std::optional<std::vector<RemoteEndpointType>> allow_list = std::nullopt,
422  std::optional<std::size_t> nbytes = std::nullopt);
423 
 /**
  * @brief Create a new remote handle from an endpoint and a file size.
  *
  * @param endpoint Remote endpoint; ownership is transferred to the handle.
  * @param nbytes Size of the remote file.
  */
430  RemoteHandle(std::unique_ptr<RemoteEndpoint> endpoint, std::size_t nbytes);
431 
 /**
  * @brief Create a new remote handle from an endpoint (infers the file size).
  *
  * @param endpoint Remote endpoint; ownership is transferred to the handle.
  */
439  RemoteHandle(std::unique_ptr<RemoteEndpoint> endpoint);
440 
441  // A remote handle is moveable but not copyable.
442  RemoteHandle(RemoteHandle&& o) = default;
443  RemoteHandle& operator=(RemoteHandle&& o) = default;
444  RemoteHandle(RemoteHandle const&) = delete;
445  RemoteHandle& operator=(RemoteHandle const&) = delete;
446 
 /**
  * @brief Get the type of the remote file.
  *
  * @return The endpoint type.
  */
452  [[nodiscard]] RemoteEndpointType remote_endpoint_type() const noexcept;
453 
 // NOTE(review): presumably returns the file size stored in `_nbytes` -- confirm.
462  [[nodiscard]] std::size_t nbytes() const noexcept;
463 
 // Returns a const reference to a RemoteEndpoint.
 // NOTE(review): presumably refers to the object owned by `_endpoint`, in which case it must
 // not be called on a moved-from handle -- confirm.
469  [[nodiscard]] RemoteEndpoint const& endpoint() const noexcept;
470 
 // Read from the remote file into `buf`; `file_offset` defaults to 0.
 // NOTE(review): presumably blocking, and returns the number of bytes read -- confirm.
483  std::size_t read(void* buf, std::size_t size, std::size_t file_offset = 0);
484 
 // Asynchronous counterpart of read(): returns a std::future<std::size_t>.
 // Defaults: file_offset = 0, task_size = defaults::task_size(), thread_pool = the address of
 // defaults::thread_pool().
 // NOTE(review): presumably splits the read into tasks of `task_size` executed on
 // `thread_pool`, with the future yielding the total bytes read -- confirm.
511  std::future<std::size_t> pread(void* buf,
512  std::size_t size,
513  std::size_t file_offset = 0,
514  std::size_t task_size = defaults::task_size(),
515  ThreadPool* thread_pool = &defaults::thread_pool());
516 };
517 
518 } // namespace kvikio
Representation of a curl easy handle pointer and its operations.
Definition: libcurl.hpp:80
A remote endpoint for HTTP/HTTPS resources.
void setup_range_request(CurlHandle &curl, std::size_t file_offset, std::size_t size) override
Set up the range request in order to read part of a file given the file offset and read size.
std::size_t get_file_size() override
Get the size of the remote file.
std::string str() const override
Get a description of this remote endpoint instance.
HttpEndpoint(std::string url)
Create an HTTP endpoint from a URL.
void setopt(CurlHandle &curl) override
Set needed connection options on a curl handle.
static bool is_url_valid(std::string const &url) noexcept
Whether the given URL is valid for HTTP/HTTPS endpoints.
Abstract base class for remote endpoints.
RemoteEndpointType remote_endpoint_type() const noexcept
Get the type of the remote file.
virtual std::size_t get_file_size()=0
Get the size of the remote file.
virtual void setup_range_request(CurlHandle &curl, std::size_t file_offset, std::size_t size)=0
Set up the range request in order to read part of a file given the file offset and read size.
virtual void setopt(CurlHandle &curl)=0
Set needed connection options on a curl handle.
virtual std::string str() const =0
Get a description of this remote endpoint instance.
Handle of a remote file.
RemoteEndpointType remote_endpoint_type() const noexcept
Get the type of the remote file.
static RemoteHandle open(std::string url, RemoteEndpointType remote_endpoint_type=RemoteEndpointType::AUTO, std::optional< std::vector< RemoteEndpointType >> allow_list=std::nullopt, std::optional< std::size_t > nbytes=std::nullopt)
Create a remote file handle from a URL.
RemoteHandle(std::unique_ptr< RemoteEndpoint > endpoint)
Create a new remote handle from an endpoint (infers the file size).
RemoteHandle(std::unique_ptr< RemoteEndpoint > endpoint, std::size_t nbytes)
Create a new remote handle from an endpoint and a file size.
A remote endpoint for AWS S3 storage using presigned URLs.
std::size_t get_file_size() override
Get the size of the remote file.
std::string str() const override
Get a description of this remote endpoint instance.
void setup_range_request(CurlHandle &curl, std::size_t file_offset, std::size_t size) override
Set up the range request in order to read part of a file given the file offset and read size.
static bool is_url_valid(std::string const &url) noexcept
Whether the given URL is valid for S3 endpoints with presigned URL.
void setopt(CurlHandle &curl) override
Set needed connection options on a curl handle.
A remote endpoint for AWS S3 storage requiring credentials.
static std::string url_from_bucket_and_object(std::string bucket_name, std::string object_name, std::optional< std::string > aws_region, std::optional< std::string > aws_endpoint_url)
Get the URL from an AWS S3 bucket and object name.
static bool is_url_valid(std::string const &url) noexcept
Whether the given URL is valid for S3 endpoints (excluding presigned URL).
S3Endpoint(std::pair< std::string, std::string > bucket_and_object_names, std::optional< std::string > aws_region=std::nullopt, std::optional< std::string > aws_access_key=std::nullopt, std::optional< std::string > aws_secret_access_key=std::nullopt, std::optional< std::string > aws_endpoint_url=std::nullopt, std::optional< std::string > aws_session_token=std::nullopt)
Create an S3 endpoint from a bucket and object name.
S3Endpoint(std::string url, std::optional< std::string > aws_region=std::nullopt, std::optional< std::string > aws_access_key=std::nullopt, std::optional< std::string > aws_secret_access_key=std::nullopt, std::optional< std::string > aws_session_token=std::nullopt)
Create an S3 endpoint from a URL.
void setopt(CurlHandle &curl) override
Set needed connection options on a curl handle.
void setup_range_request(CurlHandle &curl, std::size_t file_offset, std::size_t size) override
Set up the range request in order to read part of a file given the file offset and read size.
static std::pair< std::string, std::string > parse_s3_url(std::string const &s3_url)
Given a URL like "s3://<bucket>/<object>", return the name of the bucket and object.
std::string str() const override
Get a description of this remote endpoint instance.
std::size_t get_file_size() override
Get the size of the remote file.
A remote endpoint for publicly accessible S3 objects without authentication.
static bool is_url_valid(std::string const &url) noexcept
Whether the given URL is valid for S3 public endpoints.
std::size_t get_file_size() override
Get the size of the remote file.
void setup_range_request(CurlHandle &curl, std::size_t file_offset, std::size_t size) override
Set up the range request in order to read part of a file given the file offset and read size.
void setopt(CurlHandle &curl) override
Set needed connection options on a curl handle.
std::string str() const override
Get a description of this remote endpoint instance.
Singleton class of default values used throughout KvikIO.
Definition: defaults.hpp:122
KvikIO namespace.
Definition: batch.hpp:16
BS::thread_pool ThreadPool
Thread pool type used for parallel I/O operations.
RemoteReactorDispatch
How sub-ranges of a single pread() are distributed across reactor threads when the MULTI_POLL backend...
RemoteIOBackend
Selects the remote I/O backend.
RemoteEndpointType
Types of remote file endpoints supported by KvikIO.