diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 5e678466ddf0..147a800a48b8 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -44,7 +44,7 @@ Imports: utils, vctrs Roxygen: list(markdown = TRUE, r6 = FALSE, load = "source") -RoxygenNote: 7.3.3 +RoxygenNote: 7.3.3.9000 Config/testthat/edition: 3 Config/build/bootstrap: TRUE Suggests: diff --git a/r/NAMESPACE b/r/NAMESPACE index cdeb27c4067f..40d65cab1950 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -183,6 +183,7 @@ S3method(vec_ptype_full,arrow_fixed_size_list) S3method(vec_ptype_full,arrow_large_list) S3method(vec_ptype_full,arrow_list) export(Array) +export(AzureFileSystem) export(Buffer) export(BufferOutputStream) export(BufferReader) @@ -282,6 +283,7 @@ export(arrow_available) export(arrow_info) export(arrow_table) export(arrow_with_acero) +export(arrow_with_azure) export(arrow_with_dataset) export(arrow_with_gcs) export(arrow_with_json) @@ -295,6 +297,7 @@ export(as_data_type) export(as_record_batch) export(as_record_batch_reader) export(as_schema) +export(az_container) export(binary) export(bool) export(boolean) diff --git a/r/R/arrow-info.R b/r/R/arrow-info.R index 699f94dcbdb5..91b46788aab2 100644 --- a/r/R/arrow-info.R +++ b/r/R/arrow-info.R @@ -46,6 +46,7 @@ arrow_info <- function() { json = arrow_with_json(), s3 = arrow_with_s3(), gcs = arrow_with_gcs(), + azure = arrow_with_azure(), utf8proc = "utf8_upper" %in% compute_funcs, re2 = "replace_substring_regex" %in% compute_funcs, vapply(tolower(names(CompressionType)[-1]), codec_is_available, logical(1)) @@ -128,6 +129,15 @@ arrow_with_gcs <- function() { }) } +#' @rdname arrow_info +#' @export +arrow_with_azure <- function() { + tryCatch(.Call(`_azure_available`), error = function(e) { + return(FALSE) + }) +} + + #' @rdname arrow_info #' @export arrow_with_json <- function() { diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 455e6bc8a7fd..5722913b9704 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1408,6 +1408,10 @@ 
fs___GcsFileSystem__options <- function(fs) { .Call(`_arrow_fs___GcsFileSystem__options`, fs) } +fs___AzureFileSystem__Make <- function(options) { + .Call(`_arrow_fs___AzureFileSystem__Make`, options) +} + io___Readable__Read <- function(x, nbytes) { .Call(`_arrow_io___Readable__Read`, x, nbytes) } diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 99c09c40dc3b..863051334692 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -189,6 +189,31 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' - `default_metadata`: default metadata to write in new objects. #' - `project_id`: the project to use for creating buckets. #' +#' `AzureFileSystem$create()` takes following required argument: +#' +#' - `account_name`: Azure Blob Storage account name. +#' +#' `AzureFileSystem$create()` takes following optional arguments: +#' +#' - `account_key`: Account key of the storage account. Cannot be used with +#' `sas_token`. +#' - `blob_storage_authority`: Hostname of the blob service, defaulting to +#' `"blob.core.windows.net"`. +#' - `blob_storage_scheme`: Either `"http"` or `"https"` (the default). +#' - `client_id`: The client/application ID for Azure Active Directory +#' authentication. If used with `client_secret` and `tenant_id` then it is the +#' application ID for a registered Azure AD application. Otherwise, it is the +#' client ID of a user-assigned managed identity. +#' - `client_secret`: Client secret for Azure Active Directory authentication. +#' Must be provided with both `client_id` and `tenant_id`. +#' - `dfs_storage_authority`: Hostname of the data lake (gen 2) service, +#' defaulting to `"dfs.core.windows.net"`. +#' - `dfs_storage_scheme`: Either `"http"` or `"https"` (the default). +#' - `sas_token`: Shared access signature (SAS) token for the storage account. +#' Cannot be used with `account key`. +#' - `tenant_id`: Tenant ID for Azure Active Directory authentication. 
Must +#' be provided with both `client_id` and `client_secret`. +#' #' @section Methods: #' #' - `path(x)`: Create a `SubTreeFileSystem` from the current `FileSystem` @@ -253,6 +278,10 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' (the default), 'ERROR', 'WARN', 'INFO', 'DEBUG' (recommended), 'TRACE', and #' 'OFF'. #' +#' On `AzureFileSystem`, passing no arguments for authentication uses the +#' `AzureDefaultCredential` for authentication, so that several authentication +#' types are tried until one succeeds. +#' #' @usage NULL #' @format NULL #' @docType class @@ -645,6 +674,93 @@ GcsFileSystem$create <- function(anonymous = FALSE, retry_limit_seconds = 15, .. fs___GcsFileSystem__Make(anonymous, options) } +#' @usage NULL +#' @format NULL +#' @rdname FileSystem +#' @importFrom utils modifyList +#' @export +AzureFileSystem <- R6Class( + "AzureFileSystem", + inherit = FileSystem +) + +AzureFileSystem$create <- function(account_name, ...) { + options <- list(...) + valid_opts <- c( + "account_key", + "blob_storage_authority", + "blob_storage_scheme", + "client_id", + "client_secret", + "dfs_storage_authority", + "dfs_storage_scheme", + "sas_token", + "tenant_id" + ) + + invalid_opts <- setdiff(names(options), valid_opts) + if (length(invalid_opts)) { + stop( + "Invalid options for AzureFileSystem: ", + oxford_paste(invalid_opts), + call. = FALSE + ) + } + if (!is.null(options$tenant_id) || !is.null(options$client_id) || !is.null(options$client_secret)) { + if (is.null(options$client_id)) { + stop( + "`client_id` must be given with `tenant_id` and `client_secret`", + call. = FALSE + ) + } + if (sum(is.null(options$tenant_id), is.null(options$client_secret)) == 1) { + stop( + "Provide only `client_id` to authenticate with ", + "Managed Identity Credential, or provide `client_id`, `tenant_id`, ", + "and`client_secret` to authenticate with Client Secret Credential", + call. 
= FALSE + ) + } + } else if (!is.null(options$account_key) && !is.null(options$sas_token)) { + stop( + "Cannot specify both `account_key` and `sas_token`", + call. = FALSE + ) + } + + fs___AzureFileSystem__Make(c(account_name = account_name, options)) +} + +#' Connect to an Azure Blob Storage container +#' +#' `az_container` is a convenience function to create an `AzureFileSystem` object +#' that provides a file system interface for blob storage containers in an Azure +#' Storage Account. +#' +#' @param container_path string Container name or path. +#' @param ... Additional connection options, passed to `AzureFileSystem$create()`. +#' +#' @return A `SubTreeFileSystem` containing an `AzureFileSystem` and the container's +#' relative path. Note that this function's success does not guarantee that you +#' are authorized to access the container's contents. +#' @examplesIf FALSE +#' container_fs <- az_container( +#' container_path = "arrow-datasets", +#' account_name = azurite_account_name, +#' account_key = azurite_account_key, +#' blob_storage_authority = azurite_blob_storage_authority, +#' blob_storage_scheme = azurite_blob_storage_scheme +#' ) +#' @export +az_container <- function(container_path, ...) { + assert_that(is.string(container_path)) + args <- list2(...) 
+ + fs <- exec(AzureFileSystem$create, !!!args) + + SubTreeFileSystem$create(container_path, fs) +} + #' @usage NULL #' @format NULL #' @rdname FileSystem diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index 39700914db4b..5d9e432414b6 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -261,10 +261,11 @@ reference: - title: File systems desc: > - Functions for working with files on S3 and GCS + Functions for working with files on S3, GCS, and Azure contents: - s3_bucket - gs_bucket + - az_container - copy_files - title: Flight diff --git a/r/configure b/r/configure index 9e92eb6b47f2..7642b9b9dd3f 100755 --- a/r/configure +++ b/r/configure @@ -359,10 +359,14 @@ add_feature_flags () { if arrow_built_with ARROW_S3; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_S3" fi + if arrow_built_with ARROW_AZURE; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_AZURE" + PKG_LIBS_FEATURES="$PKG_LIBS_FEATURES -lcurl -lxml2 -lssl" + fi if arrow_built_with ARROW_GCS; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_GCS" fi - if arrow_built_with ARROW_GCS || arrow_built_with ARROW_S3; then + if arrow_built_with ARROW_GCS || arrow_built_with ARROW_S3 || arrow_built_with ARROW_AZURE; then # If pkg-config is available it will handle this for us automatically SSL_LIBS_WITHOUT_PC="-lcurl -lssl -lcrypto" fi diff --git a/r/configure.win b/r/configure.win index 16c5ec1bee8d..67fcaf2feb44 100755 --- a/r/configure.win +++ b/r/configure.win @@ -187,10 +187,13 @@ add_feature_flags () { if arrow_built_with ARROW_S3; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_S3" fi + if arrow_built_with ARROW_AZURE; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_AZURE" + fi if arrow_built_with ARROW_GCS; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_GCS" fi - if arrow_built_with ARROW_GCS || arrow_built_with ARROW_S3; then + if arrow_built_with ARROW_GCS || arrow_built_with ARROW_S3 || arrow_built_with ARROW_AZURE; then 
# If pkg-config is available it will handle this for us automatically SSL_LIBS_WITHOUT_PC="-lcurl -lssl -lcrypto" fi diff --git a/r/data-raw/codegen.R b/r/data-raw/codegen.R index 9acfef109c56..8a78ba7ecaac 100644 --- a/r/data-raw/codegen.R +++ b/r/data-raw/codegen.R @@ -30,7 +30,7 @@ # Ensure that all machines are sorting the same way invisible(Sys.setlocale("LC_COLLATE", "C")) -features <- c("acero", "dataset", "substrait", "parquet", "s3", "gcs", "json") +features <- c("acero", "dataset", "substrait", "parquet", "s3", "gcs", "azure", "json") suppressPackageStartupMessages({ library(decor) diff --git a/r/inst/build_arrow_static.sh b/r/inst/build_arrow_static.sh index 241994223d3e..fdc97c63a632 100755 --- a/r/inst/build_arrow_static.sh +++ b/r/inst/build_arrow_static.sh @@ -79,6 +79,7 @@ ${CMAKE} -DARROW_BOOST_USE_SHARED=OFF \ -Dlz4_SOURCE=${lz4_SOURCE:-} \ -DARROW_FILESYSTEM=ON \ -DARROW_GCS=${ARROW_GCS:-OFF} \ + -DARROW_AZURE=${ARROW_AZURE:-OFF} \ -DARROW_JEMALLOC=${ARROW_JEMALLOC:-$ARROW_DEFAULT_PARAM} \ -DARROW_MIMALLOC=${ARROW_MIMALLOC:-ON} \ -DARROW_JSON=${ARROW_JSON:-ON} \ diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd index 83e7fc652616..0cca6d3767d8 100644 --- a/r/man/FileSystem.Rd +++ b/r/man/FileSystem.Rd @@ -6,6 +6,7 @@ \alias{LocalFileSystem} \alias{S3FileSystem} \alias{GcsFileSystem} +\alias{AzureFileSystem} \alias{SubTreeFileSystem} \title{FileSystem classes} \description{ @@ -89,6 +90,33 @@ the filesystem encounters errors. Default is 15 seconds. \item \code{default_metadata}: default metadata to write in new objects. \item \code{project_id}: the project to use for creating buckets. } + +\code{AzureFileSystem$create()} takes following required argument: +\itemize{ +\item \code{account_name}: Azure Blob Storage account name. +} + +\code{AzureFileSystem$create()} takes following optional arguments: +\itemize{ +\item \code{account_key}: Account key of the storage account. Cannot be used with +\code{sas_token}. 
+\item \code{blob_storage_authority}: Hostname of the blob service, defaulting to +\code{"blob.core.windows.net"}. +\item \code{blob_storage_scheme}: Either \code{"http"} or \code{"https"} (the default). +\item \code{client_id}: The client/application ID for Azure Active Directory +authentication. If used with \code{client_secret} and \code{tenant_id} then it is the +application ID for a registered Azure AD application. Otherwise, it is the +client ID of a user-assigned managed identity. +\item \code{client_secret}: Client secret for Azure Active Directory authentication. +Must be provided with both \code{client_id} and \code{tenant_id}. +\item \code{dfs_storage_authority}: Hostname of the data lake (gen 2) service, +defaulting to \code{"dfs.core.windows.net"}. +\item \code{dfs_storage_scheme}: Either \code{"http"} or \code{"https"} (the default). +\item \code{sas_token}: Shared access signature (SAS) token for the storage account. +Cannot be used with \verb{account key}. +\item \code{tenant_id}: Tenant ID for Azure Active Directory authentication. Must +be provided with both \code{client_id} and \code{client_secret}. +} } \section{Methods}{ @@ -161,5 +189,9 @@ environment variable \code{ARROW_S3_LOG_LEVEL} (e.g., to running any code that interacts with S3. Possible values include 'FATAL' (the default), 'ERROR', 'WARN', 'INFO', 'DEBUG' (recommended), 'TRACE', and 'OFF'. + +On \code{AzureFileSystem}, passing no arguments for authentication uses the +\code{AzureDefaultCredential} for authentication, so that several authentication +types are tried until one succeeds. } diff --git a/r/man/acero.Rd b/r/man/acero.Rd index ee156cc9129b..9355f6063c90 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -32,7 +32,7 @@ Table into an R \code{tibble}. \item \code{\link[dplyr:distinct]{distinct()}}: \code{.keep_all = TRUE} returns a non-missing value if present, only returning missing values if all are missing. 
\item \code{\link[dplyr:explain]{explain()}} \item \code{\link[dplyr:filter]{filter()}} -\item \code{\link[dplyr:filter]{filter_out()}} +\item \code{\link[dplyr:filter_out]{filter_out()}} \item \code{\link[dplyr:mutate-joins]{full_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:glimpse]{glimpse()}} \item \code{\link[dplyr:group_by]{group_by()}} @@ -199,7 +199,7 @@ Valid values are "s", "ms" (default), "us", "ns". \itemize{ \item \code{\link[dplyr:across]{across()}} \item \code{\link[dplyr:between]{between()}} -\item \code{\link[dplyr:case-and-replace-when]{case_when()}}: \code{.ptype} and \code{.size} arguments not supported +\item \code{\link[dplyr:case_when]{case_when()}}: \code{.ptype} and \code{.size} arguments not supported \item \code{\link[dplyr:coalesce]{coalesce()}} \item \code{\link[dplyr:desc]{desc()}} \item \code{\link[dplyr:across]{if_all()}} diff --git a/r/man/arrow_info.Rd b/r/man/arrow_info.Rd index a839d3ba8fd2..4e6d12c46cbe 100644 --- a/r/man/arrow_info.Rd +++ b/r/man/arrow_info.Rd @@ -9,6 +9,7 @@ \alias{arrow_with_parquet} \alias{arrow_with_s3} \alias{arrow_with_gcs} +\alias{arrow_with_azure} \alias{arrow_with_json} \title{Report information on the package's capabilities} \usage{ @@ -28,6 +29,8 @@ arrow_with_s3() arrow_with_gcs() +arrow_with_azure() + arrow_with_json() } \value{ diff --git a/r/man/az_container.Rd b/r/man/az_container.Rd new file mode 100644 index 000000000000..a749b4a4e188 --- /dev/null +++ b/r/man/az_container.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filesystem.R +\name{az_container} +\alias{az_container} +\title{Connect to an Azure Blob Storage container} +\usage{ +az_container(container_path, ...) 
+}
+\arguments{
+\item{container_path}{string Container name or path.}
+
+\item{...}{Additional connection options, passed to \code{AzureFileSystem$create()}.}
+}
+\value{
+A \code{SubTreeFileSystem} containing an \code{AzureFileSystem} and the container's
+relative path. Note that this function's success does not guarantee that you
+are authorized to access the container's contents.
+}
+\description{
+\code{az_container} is a convenience function to create an \code{AzureFileSystem} object
+that provides a file system interface for blob storage containers in an Azure
+Storage Account.
+}
+\examples{
+\dontshow{if (FALSE) withAutoprint(\{ # examplesIf}
+container_fs <- az_container(
+  container_path = "arrow-datasets",
+  account_name = azurite_account_name,
+  account_key = azurite_account_key,
+  blob_storage_authority = azurite_blob_storage_authority,
+  blob_storage_scheme = azurite_blob_storage_scheme
+)
+\dontshow{\}) # examplesIf}
+} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index adfd90c8a5d0..be82e5b5fcb0 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3642,6 +3642,21 @@ extern "C" SEXP _arrow_fs___GcsFileSystem__options(SEXP fs_sexp){ } #endif +// filesystem.cpp +#if defined(ARROW_R_WITH_AZURE) +std::shared_ptr fs___AzureFileSystem__Make(cpp11::list options); +extern "C" SEXP _arrow_fs___AzureFileSystem__Make(SEXP options_sexp){ +BEGIN_CPP11 + arrow::r::Input::type options(options_sexp); + return cpp11::as_sexp(fs___AzureFileSystem__Make(options)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_fs___AzureFileSystem__Make(SEXP options_sexp){ + Rf_error("Cannot call fs___AzureFileSystem__Make(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. 
"); +} +#endif + // io.cpp std::shared_ptr io___Readable__Read(const std::shared_ptr& x, int64_t nbytes); extern "C" SEXP _arrow_io___Readable__Read(SEXP x_sexp, SEXP nbytes_sexp){ @@ -5691,6 +5706,15 @@ return Rf_ScalarLogical( #endif ); } +extern "C" SEXP _azure_available() { +return Rf_ScalarLogical( +#if defined(ARROW_R_WITH_AZURE) + TRUE +#else + FALSE +#endif +); +} extern "C" SEXP _json_available() { return Rf_ScalarLogical( #if defined(ARROW_R_WITH_JSON) @@ -5707,6 +5731,7 @@ static const R_CallMethodDef CallEntries[] = { { "_parquet_available", (DL_FUNC)& _parquet_available, 0 }, { "_s3_available", (DL_FUNC)& _s3_available, 0 }, { "_gcs_available", (DL_FUNC)& _gcs_available, 0 }, + { "_azure_available", (DL_FUNC)& _azure_available, 0 }, { "_json_available", (DL_FUNC)& _json_available, 0 }, { "_arrow_is_arrow_altrep", (DL_FUNC) &_arrow_is_arrow_altrep, 1}, { "_arrow_test_arrow_altrep_set_string_elt", (DL_FUNC) &_arrow_test_arrow_altrep_set_string_elt, 3}, @@ -6060,6 +6085,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_FinalizeS3", (DL_FUNC) &_arrow_FinalizeS3, 0}, { "_arrow_fs___GcsFileSystem__Make", (DL_FUNC) &_arrow_fs___GcsFileSystem__Make, 2}, { "_arrow_fs___GcsFileSystem__options", (DL_FUNC) &_arrow_fs___GcsFileSystem__options, 1}, + { "_arrow_fs___AzureFileSystem__Make", (DL_FUNC) &_arrow_fs___AzureFileSystem__Make, 1}, { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1}, { "_arrow_io___OutputStream__Close", (DL_FUNC) &_arrow_io___OutputStream__Close, 1}, diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index 82cf99514d8c..774139f3d80b 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -501,3 +501,55 @@ cpp11::list fs___GcsFileSystem__options(const std::shared_ptr } #endif + +#if defined(ARROW_R_WITH_AZURE) +#include + +// [[azure::export]] +std::shared_ptr fs___AzureFileSystem__Make(cpp11::list options) { + 
fs::AzureOptions azure_opts; + + // Set account name + azure_opts.account_name = cpp11::as_cpp(options["account_name"]); + + if (!Rf_isNull(options["blob_storage_authority"])) { + azure_opts.blob_storage_authority = + cpp11::as_cpp(options["blob_storage_authority"]); + } + if (!Rf_isNull(options["dfs_storage_authority"])) { + azure_opts.dfs_storage_authority = + cpp11::as_cpp(options["dfs_storage_authority"]); + } + if (!Rf_isNull(options["blob_storage_scheme"])) { + azure_opts.blob_storage_scheme = + cpp11::as_cpp(options["blob_storage_scheme"]); + } + if (!Rf_isNull(options["dfs_storage_scheme"])) { + azure_opts.dfs_storage_scheme = + cpp11::as_cpp(options["dfs_storage_scheme"]); + } + + if (!Rf_isNull(options["client_id"])) { + if (Rf_isNull(options["tenant_id"]) && Rf_isNull(options["client_secret"])) { + StopIfNotOk(azure_opts.ConfigureManagedIdentityCredential( + cpp11::as_cpp(options["client_id"]))); + } else if (!Rf_isNull(options["tenant_id"]) && !Rf_isNull(options["client_secret"])) { + StopIfNotOk(azure_opts.ConfigureClientSecretCredential( + cpp11::as_cpp(options["tenant_id"]), + cpp11::as_cpp(options["client_id"]), + cpp11::as_cpp(options["client_secret"]))); + } + } else if (!Rf_isNull(options["account_key"])) { + StopIfNotOk(azure_opts.ConfigureAccountKeyCredential( + cpp11::as_cpp(options["account_key"]))); + } else if (!Rf_isNull(options["sas_token"])) { + StopIfNotOk(azure_opts.ConfigureSASCredential(cpp11::as_cpp(options["sas_token"]))); + } else { + StopIfNotOk(azure_opts.ConfigureDefaultCredential()); + } + + auto io_context = MainRThread::GetInstance().CancellableIOContext(); + return ValueOrStop(fs::AzureFileSystem::Make(azure_opts, io_context)); +} + +#endif diff --git a/r/tests/testthat/helper-filesystems.R b/r/tests/testthat/helper-filesystems.R index 7b37abf764b0..9fba086a18e3 100644 --- a/r/tests/testthat/helper-filesystems.R +++ b/r/tests/testthat/helper-filesystems.R @@ -25,12 +25,18 @@ #' returns a URI containing the filesystem scheme 
(e.g. 's3://', 'gs://'), the #' absolute path, and any necessary connection options as URL query parameters. test_filesystem <- function(name, fs, path_formatter, uri_formatter) { - # NOTE: it's important that we label these tests with name of filesystem so + # NOTE 1: it's important that we label these tests with name of filesystem so # that we can differentiate the different calls to these test in the output. - test_that(sprintf("read/write Feather on %s using URIs", name), { - write_feather(example_data, uri_formatter("test.feather")) - expect_identical(read_feather(uri_formatter("test.feather")), example_data) - }) + + # NOTE 2: as far as I can tell, Azure doesn't support passing a URI directly + # like we can do in S3/GCS. Skipping any tests that rely on this feature + # for name == "azure". + if (name != "azure") { + test_that(sprintf("read/write Feather on %s using URIs", name), { + write_feather(example_data, uri_formatter("test.feather")) + expect_identical(read_feather(uri_formatter("test.feather")), example_data) + }) + } test_that(sprintf("read/write Feather on %s using Filesystem", name), { write_feather(example_data, fs$path(path_formatter("test2.feather"))) @@ -72,11 +78,13 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { ) }) - test_that(sprintf("read/write Parquet on %s", name), { - skip_if_not_available("parquet") - write_parquet(example_data, fs$path(path_formatter("test.parquet"))) - expect_identical(read_parquet(uri_formatter("test.parquet")), example_data) - }) + if (name != "azure") { + test_that(sprintf("read/write Parquet on %s", name), { + skip_if_not_available("parquet") + write_parquet(example_data, fs$path(path_formatter("test.parquet"))) + expect_identical(read_parquet(uri_formatter("test.parquet")), example_data) + }) + } if (arrow_with_dataset()) { make_temp_dir <- function() { @@ -85,39 +93,41 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { normalizePath(path, winslash = "/") } - 
test_that(sprintf("open_dataset with an %s file (not directory) URI", name), { - skip_if_not_available("parquet") - expect_identical( - open_dataset(uri_formatter("test.parquet")) |> collect() |> arrange(int), - example_data |> arrange(int) - ) - }) - - test_that(sprintf("open_dataset with vector of %s file URIs", name), { - expect_identical( - open_dataset( - c(uri_formatter("test.feather"), uri_formatter("test2.feather")), - format = "feather" - ) |> - arrange(int) |> - collect(), - rbind(example_data, example_data) |> arrange(int) - ) - }) - - test_that(sprintf("open_dataset errors if passed URIs mixing %s and local fs", name), { - td <- make_temp_dir() - expect_error( - open_dataset( - c( - uri_formatter("test.feather"), - paste0("file://", file.path(td, "fake.feather")) + if (name != "azure") { + test_that(sprintf("open_dataset with an %s file (not directory) URI", name), { + skip_if_not_available("parquet") + expect_identical( + open_dataset(uri_formatter("test.parquet")) |> collect() |> arrange(int), + example_data |> arrange(int) + ) + }) + + test_that(sprintf("open_dataset with vector of %s file URIs", name), { + expect_identical( + open_dataset( + c(uri_formatter("test.feather"), uri_formatter("test2.feather")), + format = "feather" + ) |> + arrange(int) |> + collect(), + rbind(example_data, example_data) |> arrange(int) + ) + }) + + test_that(sprintf("open_dataset errors if passed URIs mixing %s and local fs", name), { + td <- make_temp_dir() + expect_error( + open_dataset( + c( + uri_formatter("test.feather"), + paste0("file://", file.path(td, "fake.feather")) + ), + format = "feather" ), - format = "feather" - ), - "Vectors of URIs for different file systems are not supported" - ) - }) + "Vectors of URIs for different file systems are not supported" + ) + }) + } # Dataset test setup, cf. 
test-dataset.R first_date <- lubridate::ymd_hms("2015-04-29 03:12:39") @@ -167,18 +177,21 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { write_dataset(ds, fs$path(path_formatter("new_dataset_dir"))) expect_length(fs$ls(path_formatter("new_dataset_dir")), 1) }) - + if (name != "azure") { + test_that(sprintf("copy files with %s", name), { + td <- make_temp_dir() + copy_files(uri_formatter("hive_dir"), td) + expect_length(dir(td), 2) + ds <- open_dataset(td) + expect_identical( + ds |> select(int, dbl, lgl) |> collect() |> arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) |> arrange(int) + ) + }) + } test_that(sprintf("copy files with %s", name), { td <- make_temp_dir() - copy_files(uri_formatter("hive_dir"), td) - expect_length(dir(td), 2) - ds <- open_dataset(td) - expect_identical( - ds |> select(int, dbl, lgl) |> collect() |> arrange(int), - rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) |> arrange(int) - ) - - # Let's copy the other way and use a SubTreeFileSystem rather than URI + copy_files(fs$path(path_formatter("hive_dir")), td) copy_files(td, fs$path(path_formatter("hive_dir2"))) ds2 <- open_dataset(fs$path(path_formatter("hive_dir2"))) expect_identical( diff --git a/r/tests/testthat/test-azure.R b/r/tests/testthat/test-azure.R new file mode 100644 index 000000000000..378444791981 --- /dev/null +++ b/r/tests/testthat/test-azure.R @@ -0,0 +1,247 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("azure") + +# test_filesystem requires dplyr +library(dplyr) + +# This test script depends on ./ci/scripts/install_azurite.sh +skip_if_not(nzchar(Sys.which("azurite")), message = "azurite is not installed.") + +# Use default azurite credentials, +# see https://learn.microsoft.com/en-us/azure/storage/common/storage-connect-azurite?tabs=blob-storage +azurite_account_name <- "devstoreaccount1" +# Note that this is a well-known default credential for local development on Azurite. +azurite_account_key <- "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" +azurite_blob_host <- "127.0.0.1" +azurite_blob_port <- "10000" +azurite_blob_storage_authority <- sprintf("%s:%s", azurite_blob_host, azurite_blob_port) +azurite_blob_storage_scheme <- "http" + +pid_azurite <- sys::exec_background( + "azurite", + c("azurite", "--inMemoryPersistence", "--blobHost", azurite_blob_host), + std_out = FALSE +) +# Kill azurite background process once tests have finished running. +withr::defer(tools::pskill(pid_azurite)) + +# Helper functions for Azure URIs and paths +azure_uri <- function(...) { + endpoint <- sprintf("%s%s%s", azurite_blob_host, "%3A", azurite_blob_port) + template <- "abfs://%s:%s@%s?endpoint=%s" + # URL encode the account key because it contains reserved characters + encoded_key <- curl::curl_escape(azurite_account_key) + sprintf(template, azurite_account_name, encoded_key, azure_path(...), endpoint) +} + +azure_path <- function(...) 
{ + # 'dir' is the container name (following the convention in the s3 tests). + paste(dir, ..., sep = "/") +} + +fs <- AzureFileSystem$create( + account_name = azurite_account_name, + account_key = azurite_account_key, + blob_storage_authority = azurite_blob_storage_authority, + blob_storage_scheme = azurite_blob_storage_scheme +) + +# (1) CreateDir and DeleteDir work correctly +dir <- "test" +fs$CreateDir(dir) +# Clean up when we're all done +withr::defer(fs$DeleteDir(dir)) + +# (2) Run default filesystem tests on azure filesystem + +# TODO: As far as I can tell, there is no way to pass an Azurite URI to write_feather +# (or any other read/write helper), so some of the test_filesystem tests can't be run +# with AzureFilesystem. Some tests below cover some of the skipped cases in +# test_filesystem. +test_filesystem("azure", fs, azure_path, azure_uri) + +# (3) Test write/read parquet + +example_data <- tibble::tibble( + int = c(1:3, NA_integer_, 5:10), + dbl = c(1:8, NA, 10) + 0.1, + dbl2 = rep(5, 10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + false = logical(10), + chr = letters[c(1:5, NA, 7:10)], + fct = factor(letters[c(1:4, NA, NA, 7:10)]) +) + +test_that("read/write Parquet on azure", { + skip_if_not_available("parquet") + write_parquet(example_data, fs$path(azure_path("test.parquet"))) + expect_identical(read_parquet(fs$path(azure_path("test.parquet"))), example_data) +}) + +# (4) open_dataset with a vector of azure file paths + +# TODO: I couldn't pass a vector of paths similar to the original test in +# test_filesystem, but you can pass a folder containing many files. 
+write_feather(example_data, fs$path(azure_path("openmulti/dataset1.feather"))) +write_feather(example_data, fs$path(azure_path("openmulti/dataset2.feather"))) + +open_multi_fs <- arrow:::az_container( + container_path = azure_path("openmulti"), + account_name = azurite_account_name, + account_key = azurite_account_key, + blob_storage_authority = azurite_blob_storage_authority, + blob_storage_scheme = azurite_blob_storage_scheme +) + +test_that("open_dataset with AzureFileSystem folder", { + expect_identical( + open_dataset( + open_multi_fs, + format = "feather" + ) |> + arrange(int) |> + collect(), + rbind(example_data, example_data) |> arrange(int) + ) +}) + +# (5) Check that multiple valid combinations of options can be used to +# instantiate AzureFileSystem. + +fs1 <- AzureFileSystem$create(account_name = "fake-account-name") +expect_s3_class(fs1, "AzureFileSystem") + +fs2 <- AzureFileSystem$create(account_name = "fake-account-name", account_key = "fakeaccountkey") +expect_s3_class(fs2, "AzureFileSystem") + + +fs3 <- AzureFileSystem$create( + account_name = "fake-account", + account_key = "fakeaccount", + blob_storage_authority = "fake-blob-authority", + dfs_storage_authority = "fake-dfs-authority", + blob_storage_scheme = "https", + dfs_storage_scheme = "https" +) +expect_s3_class(fs3, "AzureFileSystem") + +fs4 <- AzureFileSystem$create( + account_name = "fake-account-name", + sas_token = "fakesastoken" +) +expect_s3_class(fs4, "AzureFileSystem") + +fs5 <- AzureFileSystem$create( + account_name = "fake-account-name", + tenant_id = "fake-tenant-id", + client_id = "fake-client-id", + client_secret = "fake-client-secret" +) +expect_s3_class(fs5, "AzureFileSystem") + +fs6 <- AzureFileSystem$create( + account_name = "fake-account-name", + client_id = "fake-client-id" +) +expect_s3_class(fs6, "AzureFileSystem") + +# (6) Check that invalid argument combinations are caught upfront +# with appropriate error message. 
+ +error_msg_1 <- "`client_id` must be given with `tenant_id` and `client_secret`" +error_msg_2 <- "Provide only `client_id` to authenticate with Managed Identity Credential, or provide `client_id`, `tenant_id`, and`client_secret` to authenticate with Client Secret Credential" # nolint + +test_that("client_id must be specified with account_name and tenant_id", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + tenant_id = "fake-tenant-id" + ), + error_msg_1, + fixed = TRUE + ) +}) + +test_that("client_id must be specified with account_name and client_secret", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + client_secret = "fake-client-secret" + ), + error_msg_1, + fixed = TRUE + ) +}) + +test_that("client_secret must not be provided with client_id", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + client_id = "fake-client-id", + client_secret = "fake-client-secret" + ), + error_msg_2, + fixed = TRUE + ) +}) + +test_that("client_id must be specified with account_name, tenant_id, and client_secret", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + tenant_id = "fake-tenant-id", + client_secret = "fake-client-secret" + ), + error_msg_1, + fixed = TRUE + ) +}) + + +test_that("client_id must be provided alone or with tenant_id and client_secret", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + tenant_id = "fake-tenant-id", + client_id = "fake-client-id" + ), + error_msg_2, + fixed = TRUE + ) +}) + +test_that("cannot specify both account_key and sas_token", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + account_key = "fakeaccount", + sas_token = "fakesastoken" + ), + "Cannot specify both `account_key` and `sas_token`", + fixed = TRUE + ) +}) + +test_that("at a minimum account_name must be passed", { + expect_error( + AzureFileSystem$create(), + 'argument 
"account_name" is missing, with no default', + fixed = TRUE + ) +}) diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index d50191ac18a1..4b3c8691e237 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -605,6 +605,7 @@ build_libarrow <- function(src_dir, dst_dir) { env_var_list <- c( env_var_list, ARROW_S3 = Sys.getenv("ARROW_S3", "ON"), + ARROW_AZURE = Sys.getenv("ARROW_AZURE", "ON"), # ARROW_GCS = Sys.getenv("ARROW_GCS", "ON"), ARROW_WITH_ZSTD = Sys.getenv("ARROW_WITH_ZSTD", "ON") ) @@ -799,6 +800,7 @@ turn_off_all_optional_features <- function(env_var_list) { "ARROW_DATASET" = "OFF", # depends on parquet "ARROW_S3" = "OFF", "ARROW_GCS" = "OFF", + "ARROW_AZURE" = "OFF", "ARROW_WITH_GOOGLE_CLOUD_CPP" = "OFF", "ARROW_WITH_NLOHMANN_JSON" = "OFF", "ARROW_SUBSTRAIT" = "OFF", @@ -886,13 +888,15 @@ is_feature_requested <- function(env_varname, env_var_list, default = env_is("LI with_cloud_support <- function(env_var_list) { arrow_s3 <- is_feature_requested("ARROW_S3", env_var_list) arrow_gcs <- is_feature_requested("ARROW_GCS", env_var_list) + arrow_azure <- is_feature_requested("ARROW_AZURE", env_var_list) - if (arrow_s3 || arrow_gcs) { - # User wants S3 or GCS support. + if (arrow_s3 || arrow_gcs || arrow_azure) { + # User wants S3 or GCS or Azure support. 
    # Make sure that we have curl and openssl system libs
     feats <- c(
       if (arrow_s3) "S3",
-      if (arrow_gcs) "GCS"
+      if (arrow_gcs) "GCS",
+      if (arrow_azure) "AZURE"
     )
     start_msg <- paste(feats, collapse = "/")
     off_flags <- paste("ARROW_", feats, "=OFF", sep = "", collapse = " and ")
@@ -907,16 +911,19 @@ with_cloud_support <- function(env_var_list) {
       print_warning("requires libcurl-devel (rpm) or libcurl4-openssl-dev (deb)")
       arrow_s3 <- FALSE
       arrow_gcs <- FALSE
+      arrow_azure <- FALSE
     } else if (!cmake_find_package("OpenSSL", "1.0.2", env_var_list)) {
       print_warning("requires version >= 1.0.2 of openssl-devel (rpm), libssl-dev (deb), or openssl (brew)")
       arrow_s3 <- FALSE
       arrow_gcs <- FALSE
+      arrow_azure <- FALSE
     }
   }

   # Update the build flags
   env_var_list <- replace(env_var_list, "ARROW_S3", ifelse(arrow_s3, "ON", "OFF"))
-  replace(env_var_list, "ARROW_GCS", ifelse(arrow_gcs, "ON", "OFF"))
+  # replace() is copy-on-modify: the GCS result must be assigned, otherwise it
+  # is discarded once AZURE's replace() becomes the final (returned) expression.
+  env_var_list <- replace(env_var_list, "ARROW_GCS", ifelse(arrow_gcs, "ON", "OFF"))
+  replace(env_var_list, "ARROW_AZURE", ifelse(arrow_azure, "ON", "OFF"))
 }

 cmake_find_package <- function(pkg, version = NULL, env_var_list) {
diff --git a/r/vignettes/developers/setup.Rmd b/r/vignettes/developers/setup.Rmd
index e61436df31db..2432325f61bc 100644
--- a/r/vignettes/developers/setup.Rmd
+++ b/r/vignettes/developers/setup.Rmd
@@ -155,6 +155,7 @@ To enable optional features including: S3 support, an alternative memory allocat
   -DARROW_GCS=ON \
   -DARROW_MIMALLOC=ON \
   -DARROW_S3=ON \
+  -DARROW_AZURE=ON \
   -DARROW_WITH_BROTLI=ON \
   -DARROW_WITH_BZ2=ON \
   -DARROW_WITH_LZ4=ON \
@@ -228,6 +229,7 @@ cmake \
   -DARROW_MIMALLOC=ON \
   -DARROW_PARQUET=ON \
   -DARROW_S3=ON \
+  -DARROW_AZURE=ON \
   -DARROW_WITH_BROTLI=ON \
   -DARROW_WITH_BZ2=ON \
   -DARROW_WITH_LZ4=ON \
diff --git a/r/vignettes/fs.Rmd b/r/vignettes/fs.Rmd
index 52652ad7e9ed..4c2138f693f8 100644
--- a/r/vignettes/fs.Rmd
+++ b/r/vignettes/fs.Rmd
@@ -1,29 +1,30 @@
 ---
-title: "Using cloud storage (S3, GCS)"
+title: "Using cloud storage (S3, GCS, Azure)"
 description: >
   Learn how to work with data sets stored in an
-  Amazon S3 bucket or on
Google Cloud Storage + Amazon S3 bucket, on Google Cloud Storage, or on Azure output: rmarkdown::html_vignette --- -Working with data stored in cloud storage systems like [Amazon Simple Storage Service](https://docs.aws.amazon.com/s3/) (S3) and [Google Cloud Storage](https://cloud.google.com/storage/docs) (GCS) is a very common task. Because of this, the Arrow C++ library provides a toolkit aimed to make it as simple to work with cloud storage as it is to work with the local filesystem. +Working with data stored in cloud storage systems like [Amazon Simple Storage Service](https://docs.aws.amazon.com/s3/) (S3), [Google Cloud Storage](https://cloud.google.com/storage/docs) (GCS), and [Microsoft Azure](https://azure.microsoft.com) is a very common task. Because of this, the Arrow C++ library provides a toolkit aimed to make it as simple to work with cloud storage as it is to work with the local filesystem. -To make this work, the Arrow C++ library contains a general-purpose interface for file systems, and the arrow package exposes this interface to R users. For instance, if you want to you can create a `LocalFileSystem` object that allows you to interact with the local file system in the usual ways: copying, moving, and deleting files, obtaining information about files and folders, and so on (see `help("FileSystem", package = "arrow")` for details). In general you probably don't need this functionality because you already have tools for working with your local file system, but this interface becomes much more useful in the context of remote file systems. Currently there is a specific implementation for Amazon S3 provided by the `S3FileSystem` class, and another one for Google Cloud Storage provided by `GcsFileSystem`. +To make this work, the Arrow C++ library contains a general-purpose interface for file systems, and the arrow package exposes this interface to R users. 
For instance, if you want to you can create a `LocalFileSystem` object that allows you to interact with the local file system in the usual ways: copying, moving, and deleting files, obtaining information about files and folders, and so on (see `help("FileSystem", package = "arrow")` for details). In general you probably don't need this functionality because you already have tools for working with your local file system, but this interface becomes much more useful in the context of remote file systems. Currently there is a specific implementation for Amazon S3 provided by the `S3FileSystem` class, one for Google Cloud Storage provided by `GcsFileSystem`, and another for Microsoft Azure provided by the `AzureFileSystem` class. -This article provides an overview of working with both S3 and GCS data using the Arrow toolkit. +This article provides an overview of working with S3, GCS, and Azure data using the Arrow toolkit. -## S3 and GCS support +## S3, GCS, and Azure support -Before you start, make sure that your arrow installation has support for S3 and/or GCS enabled. You can check whether support is enabled via helper functions: +Before you start, make sure that your arrow installation has support for S3, GCS, and/or Azure enabled. You can check whether support is enabled via helper functions: ```r arrow_with_s3() arrow_with_gcs() +arrow_with_azure() ``` If these return `TRUE` then the relevant support is enabled. -CRAN builds of arrow include S3 support but not GCS support. If you need GCS support, you can install arrow with full features using one of the following methods: +CRAN builds of arrow include S3 and Azure support but not GCS support. 
If you need GCS support, you can install arrow with full features using one of the following methods: ```r # Option 1: Install from R-universe @@ -36,15 +37,15 @@ Sys.setenv("NOT_CRAN" = "true") install.packages("arrow", type = "source") ``` -On Linux, S3 and GCS support is not always enabled by default when installing from source, and there are additional system requirements involved. See the [installation article](./install.html) for details. +On Linux, S3, GCS, and Azure support is not always enabled by default when installing from source, and there are additional system requirements involved. See the [installation article](./install.html) for details. ## Connecting to cloud storage One way of working with filesystems is to create `?FileSystem` objects. `?S3FileSystem` objects can be created with the `s3_bucket()` function, which automatically detects the bucket's AWS region. Similarly, `?GcsFileSystem` objects -can be created with the `gs_bucket()` function. The resulting -`FileSystem` will consider paths relative to the bucket's path (so for example +can be created with the `gs_bucket()` function and `?AzureFileSystem` objects can be created with the `az_container()` function. The resulting +`FileSystem` will consider paths relative to the bucket/container's path (so for example you don't need to prefix the bucket path when listing a directory). With a `FileSystem` object, you can point to specific files in it with the `$path()` method @@ -52,7 +53,7 @@ and pass the result to file readers and writers (`read_parquet()`, `write_feathe Often the reason users work with cloud storage in real world analysis is to access large data sets. An example of this is discussed in the [datasets article](./dataset.html), but new users may prefer to work with a much smaller data set while learning how the arrow cloud storage interface works. 
To that end, the examples in this article rely on a multi-file Parquet dataset that stores a copy of the `diamonds` data made available through the [`ggplot2`](https://ggplot2.tidyverse.org/) package, documented in `help("diamonds", package = "ggplot2")`. The cloud storage version of this data set consists of 5 Parquet files totaling less than 1MB in size. -The diamonds data set is hosted on both S3 and GCS, in a bucket named `arrow-datasets`. To create an S3FileSystem object that refers to that bucket, use the following command: +The diamonds data set is hosted on both S3 and GCS, in a bucket named `arrow-datasets`. To create an `S3FileSystem` object that refers to that bucket, use the following command: ```r bucket <- s3_bucket("arrow-datasets") @@ -147,7 +148,7 @@ june2019 <- SubTreeFileSystem$create("s3://arrow-datasets/nyc-taxi/year=2019/mon ## Connecting directly with a URI -In most use cases, the easiest and most natural way to connect to cloud storage in arrow is to use the FileSystem objects returned by `s3_bucket()` and `gs_bucket()`, especially when multiple file operations are required. However, in some cases you may want to download a file directly by specifying the URI. This is permitted by arrow, and functions like `read_parquet()`, `write_feather()`, `open_dataset()` etc will all accept URIs to cloud resources hosted on S3 or GCS. The format of an S3 URI is as follows: +In most use cases, the easiest and most natural way to connect to cloud storage in arrow is to use the FileSystem objects returned by `s3_bucket()`, `gs_bucket()`, and `az_container()`, especially when multiple file operations are required. However, in some cases you may want to download a file directly by specifying the URI. This is permitted by arrow, and functions like `read_parquet()`, `write_feather()`, `open_dataset()` etc will all accept URIs to cloud resources hosted on S3, GCS, or Azure. 
The format of an S3 URI is as follows: ``` s3://[access_key:secret_key@]bucket/path[?region=] @@ -160,6 +161,12 @@ gs://[access_key:secret_key@]bucket/path gs://anonymous@bucket/path ``` +For Azure, the URI format looks like this: + +``` +abfs://container@account_name.dfs.core.windows.net/path +``` + For example, the Parquet file storing the "good cut" diamonds that we downloaded earlier in the article is available on both S3 and CGS. The relevant URIs are as follows: ```r @@ -258,6 +265,21 @@ df <- read_parquet("gs://anonymous@arrow-datasets/diamonds/cut=Good/part-0.parqu +### Azure Authentication + +By default, `AzureFileSystem$create()` and `az_container()` use the [DefaultAzureCredential]( https://github.com/Azure/azure-sdk-for-cpp/blob/main/sdk/identity/azure-identity/README.md#defaultazurecredential) for authentication. This will try several different types of authentication, using the first one that succeeds. Like with GCS, a simple way to authenticate with Azure is to first use [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/?view=azure-cli-latest) to login and setup default credentials: + +``` +az login +``` + +It is possible to use other forms of authentication with Azure when calling `AzureFileSystem$create()` and `az_container()`. + +- Passing `client_id` on its own will use [`ManagedIdentityCredential`](https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/overview) to authenticate. +- Passing `client_id` with `tenant_id` and `client_secret` will use [`ClientSecretCredential`](https://learn.microsoft.com/en-us/entra/identity-platform/app-objects-and-service-principals?tabs=browser) to authenticate. +- Passing `sas_token` will use a shared access signature (SAS) token for the storage account. +- Passing `account_key` will use the account key for the storage account. 
+ ## Using a proxy server If you need to use a proxy server to connect to an S3 bucket, you can provide @@ -329,10 +351,8 @@ variables, you can set environment variable `AWS_EC2_METADATA_DISABLED` to Sys.setenv(AWS_EC2_METADATA_DISABLED = TRUE) ``` - ## Further reading -- To learn more about `FileSystem` classes, including `S3FileSystem` and `GcsFileSystem`, see `help("FileSystem", package = "arrow")`. -- To see a data analysis example that relies on data hosted on cloud storage, see the [dataset article](./dataset.html). - +- To learn more about `FileSystem` classes, including `S3FileSystem`, `GcsFileSystem`, and `AzureFileSystem`, see `help("FileSystem", package = "arrow")`. +- To see a data analysis example that relies on data hosted on cloud storage, see the [dataset article](./dataset.html). diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index a058975ccf19..01955d6fdc72 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -32,13 +32,13 @@ exception, as it ships with gcc 4.8. ### Libraries -Optional support for reading from cloud storage--AWS S3 and -Google Cloud Storage (GCS)--requires additional system dependencies: +Optional support for reading from cloud storage--AWS S3, +Google Cloud Storage (GCS), and Azure--requires additional system dependencies: * CURL: install `libcurl-devel` (rpm) or `libcurl4-openssl-dev` (deb) * OpenSSL >= 3.0: install `openssl-devel` (rpm) or `libssl-dev` (deb) -The prebuilt binaries come with S3 and GCS support enabled, so you will need to meet these system requirements in order to use them. If you're building everything from source, the install script will check for the presence of these dependencies and turn off S3 and GCS support in the build if the prerequisites are not met--installation will succeed but without S3 or GCS functionality. If afterwards you install the missing system requirements, you'll need to reinstall the package in order to enable S3 and GCS support. 
+The prebuilt binaries come with S3, GCS, and Azure support enabled, so you will need to meet these system requirements in order to use them. If you're building everything from source, the install script will check for the presence of these dependencies and turn off S3, GCS, and Azure support in the build if the prerequisites are not met--installation will succeed but without S3, GCS, or Azure functionality. If afterwards you install the missing system requirements, you'll need to reinstall the package in order to enable S3, GCS, and Azure support. ## Install release version (easy way) @@ -99,9 +99,9 @@ install.packages("arrow") This installs the source version of the R package, but during the installation process will check for compatible libarrow binaries that we host and use those if available. If no binary is available or can't be found, then this option falls back onto method 2 below (full source build), but setting the environment variable results in a more fully-featured build than default. -The libarrow binaries include support for AWS S3 and GCS, so they require the +The libarrow binaries include support for AWS S3, GCS, and Azure, so they require the libcurl and openssl libraries installed separately, as noted above. -If you don't have these installed, the libarrow binary won't be used, and you will fall back to the full source build (with S3 and GCS support disabled). +If you don't have these installed, the libarrow binary won't be used, and you will fall back to the full source build (with S3, GCS, and Azure support disabled). If the internet access of your computer doesn't allow downloading the libarrow binaries (e.g. if access is limited to CRAN), you can first identify the right source and version by trying to install on the offline computer: @@ -204,19 +204,19 @@ information about dependencies and minimum versions. 
If downloading dependencies at build time is not an option, as when building on a system that is disconnected or behind a firewall, there are a few options. See "Offline builds" below.

-#### Dependencies for S3 and GCS support
+#### Dependencies for S3, GCS, and Azure support

-Support for working with data in S3 and GCS is not enabled in the default
+Support for working with data in S3, GCS, and Azure is not enabled in the default
 source build, and it has additional system requirements as described above. To
 enable it, set the environment variable `LIBARROW_MINIMAL=false` or
 `NOT_CRAN=true` to choose the full-featured build, or more selectively set
-`ARROW_S3=ON` and/or `ARROW_GCS=ON`.
+`ARROW_S3=ON`, `ARROW_GCS=ON`, and/or `ARROW_AZURE=ON`.

-When either feature is enabled, the install script will check for the presence
-of the required dependencies, and if the prerequisites are met, it will turn
-off S3 and GCS support--installation will succeed but without S3 or GCS
+When one of these features is enabled, the install script will check for the presence
+of the required dependencies, and if the prerequisites are not met, it will turn
+off S3, GCS, and Azure support--installation will succeed but without S3, GCS, or Azure
 functionality. If afterwards you install the missing system requirements,
-you'll need to reinstall the package in order to enable S3 and GCS support.
+you'll need to reinstall the package in order to enable S3, GCS, and Azure support.

 ### Advanced configuration

@@ -233,6 +233,7 @@ default values are shown below.
 | ---| --- | :-: |
 | `ARROW_S3` | S3 support (if dependencies are met)* | `OFF` |
 | `ARROW_GCS` | GCS support (if dependencies are met)* | `OFF` |
+| `ARROW_AZURE` | Azure support (if dependencies are met)* | `OFF` |
 | `ARROW_JEMALLOC` | The `jemalloc` memory allocator | `ON` |
 | `ARROW_MIMALLOC` | The `mimalloc` memory allocator | `ON` |
 | `ARROW_PARQUET` | | `ON` |