From c167a8b98b422e86cde9ac6af8fa319b0d1bff84 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Tue, 3 Mar 2026 04:05:27 +0000 Subject: [PATCH 01/39] Add AzureBlobFileSystem placeholder, verify devtools::document() behaves correctly. --- r/DESCRIPTION | 2 +- r/NAMESPACE | 1 + r/R/filesystem.R | 17 +++++++++++++++++ r/man/FileSystem.Rd | 1 + r/man/acero.Rd | 4 ++-- 5 files changed, 22 insertions(+), 3 deletions(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 5e678466ddf0..147a800a48b8 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -44,7 +44,7 @@ Imports: utils, vctrs Roxygen: list(markdown = TRUE, r6 = FALSE, load = "source") -RoxygenNote: 7.3.3 +RoxygenNote: 7.3.3.9000 Config/testthat/edition: 3 Config/build/bootstrap: TRUE Suggests: diff --git a/r/NAMESPACE b/r/NAMESPACE index cdeb27c4067f..5a027b1a4c85 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -183,6 +183,7 @@ S3method(vec_ptype_full,arrow_fixed_size_list) S3method(vec_ptype_full,arrow_large_list) S3method(vec_ptype_full,arrow_list) export(Array) +export(AzureBlobFileSystem) export(Buffer) export(BufferOutputStream) export(BufferReader) diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 99c09c40dc3b..0fc10c4702ec 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -645,6 +645,23 @@ GcsFileSystem$create <- function(anonymous = FALSE, retry_limit_seconds = 15, .. 
fs___GcsFileSystem__Make(anonymous, options) } +#' @usage NULL +#' @format NULL +#' @rdname FileSystem +#' @importFrom utils modifyList +#' @export +AzureBlobFileSystem <- R6Class( + "AzureBlobFileSystem", + inherit = FileSystem, + active = list( + region = function() fs___S3FileSystem__region(self) + ) +) + +AzureBlobFileSystem$test <- function(msg) { + sprintf("Hello, %s", msg) +} + #' @usage NULL #' @format NULL #' @rdname FileSystem diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd index 83e7fc652616..4ff80e26220e 100644 --- a/r/man/FileSystem.Rd +++ b/r/man/FileSystem.Rd @@ -6,6 +6,7 @@ \alias{LocalFileSystem} \alias{S3FileSystem} \alias{GcsFileSystem} +\alias{AzureBlobFileSystem} \alias{SubTreeFileSystem} \title{FileSystem classes} \description{ diff --git a/r/man/acero.Rd b/r/man/acero.Rd index ee156cc9129b..9355f6063c90 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -32,7 +32,7 @@ Table into an R \code{tibble}. \item \code{\link[dplyr:distinct]{distinct()}}: \code{.keep_all = TRUE} returns a non-missing value if present, only returning missing values if all are missing. \item \code{\link[dplyr:explain]{explain()}} \item \code{\link[dplyr:filter]{filter()}} -\item \code{\link[dplyr:filter]{filter_out()}} +\item \code{\link[dplyr:filter_out]{filter_out()}} \item \code{\link[dplyr:mutate-joins]{full_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:glimpse]{glimpse()}} \item \code{\link[dplyr:group_by]{group_by()}} @@ -199,7 +199,7 @@ Valid values are "s", "ms" (default), "us", "ns". 
\itemize{ \item \code{\link[dplyr:across]{across()}} \item \code{\link[dplyr:between]{between()}} -\item \code{\link[dplyr:case-and-replace-when]{case_when()}}: \code{.ptype} and \code{.size} arguments not supported +\item \code{\link[dplyr:case_when]{case_when()}}: \code{.ptype} and \code{.size} arguments not supported \item \code{\link[dplyr:coalesce]{coalesce()}} \item \code{\link[dplyr:desc]{desc()}} \item \code{\link[dplyr:across]{if_all()}} From 2ed3ab74f489c03f181e60e4985e4bd65eb8055b Mon Sep 17 00:00:00 2001 From: Steve Martin Date: Wed, 4 Mar 2026 21:26:48 -0500 Subject: [PATCH 02/39] Added c++ stub --- r/R/filesystem.R | 20 ++++++++++++++------ r/src/filesystem.cpp | 18 +++++++++++++++++- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 0fc10c4702ec..3af19b7b3312 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -652,14 +652,22 @@ GcsFileSystem$create <- function(anonymous = FALSE, retry_limit_seconds = 15, .. #' @export AzureBlobFileSystem <- R6Class( "AzureBlobFileSystem", - inherit = FileSystem, - active = list( - region = function() fs___S3FileSystem__region(self) - ) + inherit = FileSystem ) -AzureBlobFileSystem$test <- function(msg) { - sprintf("Hello, %s", msg) +# TODO: +AzureBlobFileSystem$create <- function(...) { + fs___AzureFileSystem__Make(...) +} + +# TODO: +az_bucket <- function(bucket, ...) { + assert_that(is.string(bucket)) + args <- list2(...) 
+ + fs <- exec(AzureFileSystem$create, !!!args) + + SubTreeFileSystem$create(bucket, fs) } #' @usage NULL diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index 82cf99514d8c..cc69072ae430 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -253,7 +253,7 @@ std::shared_ptr fs___SubTreeFileSystem__base_fs( // [[arrow::export]] std::string fs___SubTreeFileSystem__base_path( const std::shared_ptr& file_system) { - return file_system->base_path(); + // return file_system->base_path(); } // [[arrow::export]] @@ -501,3 +501,19 @@ cpp11::list fs___GcsFileSystem__options(const std::shared_ptr } #endif + +// TODO: +#if defined(ARROW_R_WITH_AZURE) +#include + +// [[azure::export]] +std::shared_ptr fs___AzureFileSystem__Make(cpp11::list options) { + fs::AzureOptions azure_opts; + azure_opts = fs::AzureOptions::Defaults(); + + auto io_context = MainRThread::GetInstance().CancellableIOContext(); + return ValueOrStop(fs::AzureFileSystem::Make(azure_opts, io_context)); + +} + +#endif \ No newline at end of file From 8659e7306dc1845d0c89638251ba7c8af27115bc Mon Sep 17 00:00:00 2001 From: Steve Martin Date: Wed, 4 Mar 2026 21:41:43 -0500 Subject: [PATCH 03/39] Updated codegen --- r/R/arrowExports.R | 4 ++++ r/data-raw/codegen.R | 2 +- r/src/arrowExports.cpp | 26 ++++++++++++++++++++++++++ r/src/filesystem.cpp | 2 +- 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 455e6bc8a7fd..5722913b9704 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1408,6 +1408,10 @@ fs___GcsFileSystem__options <- function(fs) { .Call(`_arrow_fs___GcsFileSystem__options`, fs) } +fs___AzureFileSystem__Make <- function(options) { + .Call(`_arrow_fs___AzureFileSystem__Make`, options) +} + io___Readable__Read <- function(x, nbytes) { .Call(`_arrow_io___Readable__Read`, x, nbytes) } diff --git a/r/data-raw/codegen.R b/r/data-raw/codegen.R index 9acfef109c56..8a78ba7ecaac 100644 --- a/r/data-raw/codegen.R +++ 
b/r/data-raw/codegen.R @@ -30,7 +30,7 @@ # Ensure that all machines are sorting the same way invisible(Sys.setlocale("LC_COLLATE", "C")) -features <- c("acero", "dataset", "substrait", "parquet", "s3", "gcs", "json") +features <- c("acero", "dataset", "substrait", "parquet", "s3", "gcs", "azure", "json") suppressPackageStartupMessages({ library(decor) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index adfd90c8a5d0..be82e5b5fcb0 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3642,6 +3642,21 @@ extern "C" SEXP _arrow_fs___GcsFileSystem__options(SEXP fs_sexp){ } #endif +// filesystem.cpp +#if defined(ARROW_R_WITH_AZURE) +std::shared_ptr fs___AzureFileSystem__Make(cpp11::list options); +extern "C" SEXP _arrow_fs___AzureFileSystem__Make(SEXP options_sexp){ +BEGIN_CPP11 + arrow::r::Input::type options(options_sexp); + return cpp11::as_sexp(fs___AzureFileSystem__Make(options)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_fs___AzureFileSystem__Make(SEXP options_sexp){ + Rf_error("Cannot call fs___AzureFileSystem__Make(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. 
"); +} +#endif + // io.cpp std::shared_ptr io___Readable__Read(const std::shared_ptr& x, int64_t nbytes); extern "C" SEXP _arrow_io___Readable__Read(SEXP x_sexp, SEXP nbytes_sexp){ @@ -5691,6 +5706,15 @@ return Rf_ScalarLogical( #endif ); } +extern "C" SEXP _azure_available() { +return Rf_ScalarLogical( +#if defined(ARROW_R_WITH_AZURE) + TRUE +#else + FALSE +#endif +); +} extern "C" SEXP _json_available() { return Rf_ScalarLogical( #if defined(ARROW_R_WITH_JSON) @@ -5707,6 +5731,7 @@ static const R_CallMethodDef CallEntries[] = { { "_parquet_available", (DL_FUNC)& _parquet_available, 0 }, { "_s3_available", (DL_FUNC)& _s3_available, 0 }, { "_gcs_available", (DL_FUNC)& _gcs_available, 0 }, + { "_azure_available", (DL_FUNC)& _azure_available, 0 }, { "_json_available", (DL_FUNC)& _json_available, 0 }, { "_arrow_is_arrow_altrep", (DL_FUNC) &_arrow_is_arrow_altrep, 1}, { "_arrow_test_arrow_altrep_set_string_elt", (DL_FUNC) &_arrow_test_arrow_altrep_set_string_elt, 3}, @@ -6060,6 +6085,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_FinalizeS3", (DL_FUNC) &_arrow_FinalizeS3, 0}, { "_arrow_fs___GcsFileSystem__Make", (DL_FUNC) &_arrow_fs___GcsFileSystem__Make, 2}, { "_arrow_fs___GcsFileSystem__options", (DL_FUNC) &_arrow_fs___GcsFileSystem__options, 1}, + { "_arrow_fs___AzureFileSystem__Make", (DL_FUNC) &_arrow_fs___AzureFileSystem__Make, 1}, { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1}, { "_arrow_io___OutputStream__Close", (DL_FUNC) &_arrow_io___OutputStream__Close, 1}, diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index cc69072ae430..47cfeccb0a88 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -516,4 +516,4 @@ std::shared_ptr fs___AzureFileSystem__Make(cpp11::list opti } -#endif \ No newline at end of file +#endif From bd602ff72bbca1857e7957d58108cdc4bdf154c1 Mon Sep 17 00:00:00 2001 From: Steve Martin Date: Wed, 4 Mar 
2026 21:44:11 -0500 Subject: [PATCH 04/39] Added a comment --- r/src/filesystem.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index 47cfeccb0a88..8fe078b22c03 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -502,7 +502,8 @@ cpp11::list fs___GcsFileSystem__options(const std::shared_ptr #endif -// TODO: +// TODO: Write the Rcpp function to interface with the AzureFileSystem class in +// arrow/filesystem/azurefs.h. #if defined(ARROW_R_WITH_AZURE) #include From e8b0452b905dc693baf6c7718df08ea10f4c7f18 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Thu, 5 Mar 2026 02:59:07 +0000 Subject: [PATCH 05/39] add simple test function to work through codegen.R --- r/R/arrowExports.R | 4 ++++ r/src/arrowExports.cpp | 9 +++++++++ r/src/filesystem.cpp | 15 +++++++++++++++ r/tmp.md | 11 +++++++++++ 4 files changed, 39 insertions(+) create mode 100644 r/tmp.md diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 455e6bc8a7fd..292901ba776f 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1400,6 +1400,10 @@ FinalizeS3 <- function() { invisible(.Call(`_arrow_FinalizeS3`)) } +azurefs_is_functional_test <- function(input_string) { + .Call(`_arrow_azurefs_is_functional_test`, input_string) +} + fs___GcsFileSystem__Make <- function(anonymous, options) { .Call(`_arrow_fs___GcsFileSystem__Make`, anonymous, options) } diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index adfd90c8a5d0..32aeee4a8abe 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3612,6 +3612,14 @@ BEGIN_CPP11 END_CPP11 } // filesystem.cpp +bool azurefs_is_functional_test(std::string input_string); +extern "C" SEXP _arrow_azurefs_is_functional_test(SEXP input_string_sexp){ +BEGIN_CPP11 + arrow::r::Input::type input_string(input_string_sexp); + return cpp11::as_sexp(azurefs_is_functional_test(input_string)); +END_CPP11 +} +// filesystem.cpp #if defined(ARROW_R_WITH_GCS) std::shared_ptr 
fs___GcsFileSystem__Make(bool anonymous, cpp11::list options); extern "C" SEXP _arrow_fs___GcsFileSystem__Make(SEXP anonymous_sexp, SEXP options_sexp){ @@ -6058,6 +6066,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 18}, { "_arrow_fs___S3FileSystem__region", (DL_FUNC) &_arrow_fs___S3FileSystem__region, 1}, { "_arrow_FinalizeS3", (DL_FUNC) &_arrow_FinalizeS3, 0}, + { "_arrow_azurefs_is_functional_test", (DL_FUNC) &_arrow_azurefs_is_functional_test, 1}, { "_arrow_fs___GcsFileSystem__Make", (DL_FUNC) &_arrow_fs___GcsFileSystem__Make, 2}, { "_arrow_fs___GcsFileSystem__options", (DL_FUNC) &_arrow_fs___GcsFileSystem__options, 1}, { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index 82cf99514d8c..f7d07913e37e 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -356,6 +356,21 @@ void FinalizeS3() { #endif } +#if defined(ARROW_R_WITH_AZUREFS) + +#include + +// [[arrow::export]] +bool azurefs_is_functional_test(std::string input_string) { + // This just proves we can pass data in and out of the guarded block + if (input_string == "hello") { + return true; + } + return false; +} + +#endif + #if defined(ARROW_R_WITH_GCS) #include diff --git a/r/tmp.md b/r/tmp.md new file mode 100644 index 000000000000..872728514064 --- /dev/null +++ b/r/tmp.md @@ -0,0 +1,11 @@ +# Temporary development notes + +> TODO: Remove this before we open a PR to upstream arrow library. + +## Using codegen.R + +1. Install repo dependencies in `arrow/r`: `install.packages("remotes")`, then `remotes::install_deps(dependencies = TRUE)` + +2. Rscript `data-raw/codegen.R` + +The second step auto-generates stubs in `arrowExports.R` and `arrowExports.cpp` based on which C++ functions have `// [[arrow::export]]` comments above them. 
From 033e9a8dfba6e1b26116a1c28d46dd2a9d3fd5c6 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Thu, 5 Mar 2026 03:56:16 +0000 Subject: [PATCH 06/39] temporarily "force" build DARROW_R_WITH_AZUREFS build flag. We'll want to use the proper build flag before submitting the PR but for now this successfully links the C++ binding to the azurefs_is_functional_test function. --- r/configure | 3 +++ r/tmp.md | 2 ++ 2 files changed, 5 insertions(+) diff --git a/r/configure b/r/configure index 9e92eb6b47f2..72524e7d8954 100755 --- a/r/configure +++ b/r/configure @@ -359,6 +359,9 @@ add_feature_flags () { if arrow_built_with ARROW_S3; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_S3" fi + if arrow_built_with ARROW_AZURE || [ "$ARROW_R_WITH_AZUREFS" = "true" ]; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_AZUREFS" + fi if arrow_built_with ARROW_GCS; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_GCS" fi diff --git a/r/tmp.md b/r/tmp.md index 872728514064..4ee07eadea58 100644 --- a/r/tmp.md +++ b/r/tmp.md @@ -9,3 +9,5 @@ 2. Rscript `data-raw/codegen.R` The second step auto-generates stubs in `arrowExports.R` and `arrowExports.cpp` based on which C++ functions have `// [[arrow::export]]` comments above them. + +**Note**: at the moment we need to run `export ARROW_R_WITH_AZUREFS=true` before `R CMD INSTALL .` to export the environment variable that "forces" the Azure build flag. 
From a9efa92730579daf8afe5b1cdd70cd00dd51d7a7 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Thu, 5 Mar 2026 21:16:10 +0000 Subject: [PATCH 07/39] cleanup azurefs test function code --- r/R/arrowExports.R | 4 ---- r/src/arrowExports.cpp | 9 --------- r/src/filesystem.cpp | 15 --------------- 3 files changed, 28 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index b1e68c16e550..5722913b9704 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1400,10 +1400,6 @@ FinalizeS3 <- function() { invisible(.Call(`_arrow_FinalizeS3`)) } -azurefs_is_functional_test <- function(input_string) { - .Call(`_arrow_azurefs_is_functional_test`, input_string) -} - fs___GcsFileSystem__Make <- function(anonymous, options) { .Call(`_arrow_fs___GcsFileSystem__Make`, anonymous, options) } diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index acc4311d9317..be82e5b5fcb0 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3612,14 +3612,6 @@ BEGIN_CPP11 END_CPP11 } // filesystem.cpp -bool azurefs_is_functional_test(std::string input_string); -extern "C" SEXP _arrow_azurefs_is_functional_test(SEXP input_string_sexp){ -BEGIN_CPP11 - arrow::r::Input::type input_string(input_string_sexp); - return cpp11::as_sexp(azurefs_is_functional_test(input_string)); -END_CPP11 -} -// filesystem.cpp #if defined(ARROW_R_WITH_GCS) std::shared_ptr fs___GcsFileSystem__Make(bool anonymous, cpp11::list options); extern "C" SEXP _arrow_fs___GcsFileSystem__Make(SEXP anonymous_sexp, SEXP options_sexp){ @@ -6091,7 +6083,6 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 18}, { "_arrow_fs___S3FileSystem__region", (DL_FUNC) &_arrow_fs___S3FileSystem__region, 1}, { "_arrow_FinalizeS3", (DL_FUNC) &_arrow_FinalizeS3, 0}, - { "_arrow_azurefs_is_functional_test", (DL_FUNC) &_arrow_azurefs_is_functional_test, 1}, { "_arrow_fs___GcsFileSystem__Make", (DL_FUNC) 
&_arrow_fs___GcsFileSystem__Make, 2}, { "_arrow_fs___GcsFileSystem__options", (DL_FUNC) &_arrow_fs___GcsFileSystem__options, 1}, { "_arrow_fs___AzureFileSystem__Make", (DL_FUNC) &_arrow_fs___AzureFileSystem__Make, 1}, diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index acd440ed54e0..8fe078b22c03 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -356,21 +356,6 @@ void FinalizeS3() { #endif } -#if defined(ARROW_R_WITH_AZUREFS) - -#include - -// [[arrow::export]] -bool azurefs_is_functional_test(std::string input_string) { - // This just proves we can pass data in and out of the guarded block - if (input_string == "hello") { - return true; - } - return false; -} - -#endif - #if defined(ARROW_R_WITH_GCS) #include From 7733eab7ffc5c50cb52c715f912fd355c71a6dc6 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Thu, 5 Mar 2026 21:16:31 +0000 Subject: [PATCH 08/39] document instructions to start local azurite container --- r/tmp.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/r/tmp.md b/r/tmp.md index 4ee07eadea58..8b145cfb03bd 100644 --- a/r/tmp.md +++ b/r/tmp.md @@ -11,3 +11,8 @@ The second step auto-generates stubs in `arrowExports.R` and `arrowExports.cpp` based on which C++ functions have `// [[arrow::export]]` comments above them. **Note**: at the moment we need to run `export ARROW_R_WITH_AZUREFS=true` before `R CMD INSTALL .` to export the environment variable that "forces" the Azure build flag. 
+ +## Using Azurite + +`docker run -p 10000:10000 -p 10001:10001 -p 10002:10002 mcr.microsoft.com/azure-storage/azurite` (see README.md in https://github.com/Azure/Azurite) + From 99aad4e5c06d49068c9620491f29791e855614f5 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Thu, 5 Mar 2026 21:49:37 +0000 Subject: [PATCH 09/39] add arrow_with_azure helper following convention for s3/gcp --- r/NAMESPACE | 1 + r/R/arrow-info.R | 10 ++++++++++ r/man/arrow_info.Rd | 3 +++ 3 files changed, 14 insertions(+) diff --git a/r/NAMESPACE b/r/NAMESPACE index 5a027b1a4c85..6584bbf1ec21 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -283,6 +283,7 @@ export(arrow_available) export(arrow_info) export(arrow_table) export(arrow_with_acero) +export(arrow_with_azure) export(arrow_with_dataset) export(arrow_with_gcs) export(arrow_with_json) diff --git a/r/R/arrow-info.R b/r/R/arrow-info.R index 699f94dcbdb5..91b46788aab2 100644 --- a/r/R/arrow-info.R +++ b/r/R/arrow-info.R @@ -46,6 +46,7 @@ arrow_info <- function() { json = arrow_with_json(), s3 = arrow_with_s3(), gcs = arrow_with_gcs(), + azure = arrow_with_azure(), utf8proc = "utf8_upper" %in% compute_funcs, re2 = "replace_substring_regex" %in% compute_funcs, vapply(tolower(names(CompressionType)[-1]), codec_is_available, logical(1)) @@ -128,6 +129,15 @@ arrow_with_gcs <- function() { }) } +#' @rdname arrow_info +#' @export +arrow_with_azure <- function() { + tryCatch(.Call(`_azure_available`), error = function(e) { + return(FALSE) + }) +} + + #' @rdname arrow_info #' @export arrow_with_json <- function() { diff --git a/r/man/arrow_info.Rd b/r/man/arrow_info.Rd index a839d3ba8fd2..4e6d12c46cbe 100644 --- a/r/man/arrow_info.Rd +++ b/r/man/arrow_info.Rd @@ -9,6 +9,7 @@ \alias{arrow_with_parquet} \alias{arrow_with_s3} \alias{arrow_with_gcs} +\alias{arrow_with_azure} \alias{arrow_with_json} \title{Report information on the package's capabilities} \usage{ @@ -28,6 +29,8 @@ arrow_with_s3() arrow_with_gcs() +arrow_with_azure() + arrow_with_json() 
} \value{ From 2a7f8ea6b2a00620e30813a55b2f6855dd4cedf9 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Thu, 5 Mar 2026 23:36:19 +0000 Subject: [PATCH 10/39] add ARROW_AZURE flag to nixlibs.R --- r/tools/nixlibs.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index d50191ac18a1..bd9ffbe8a3a3 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -605,6 +605,7 @@ build_libarrow <- function(src_dir, dst_dir) { env_var_list <- c( env_var_list, ARROW_S3 = Sys.getenv("ARROW_S3", "ON"), + # ARROW_AZURE = Sys.getenv("ARROW_AZURE", "ON"), # ARROW_GCS = Sys.getenv("ARROW_GCS", "ON"), ARROW_WITH_ZSTD = Sys.getenv("ARROW_WITH_ZSTD", "ON") ) From 8ca7bc695322edd305e07cda32a2206574d85f4d Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Thu, 5 Mar 2026 23:53:09 +0000 Subject: [PATCH 11/39] debug first argument check Note: I think what's happening is that the build of Arrow I install doesn't have the ARROW_AZURE flag enabled. When I "force" the environment variable I get an error like "fs___AzureFileSystem__Make not found". 
--- r/src/arrowExports.cpp | 4 ++-- r/src/filesystem.cpp | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index be82e5b5fcb0..f6302873c535 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3643,7 +3643,7 @@ extern "C" SEXP _arrow_fs___GcsFileSystem__options(SEXP fs_sexp){ #endif // filesystem.cpp -#if defined(ARROW_R_WITH_AZURE) +#if defined(ARROW_R_WITH_AZUREFS) std::shared_ptr fs___AzureFileSystem__Make(cpp11::list options); extern "C" SEXP _arrow_fs___AzureFileSystem__Make(SEXP options_sexp){ BEGIN_CPP11 @@ -5708,7 +5708,7 @@ return Rf_ScalarLogical( } extern "C" SEXP _azure_available() { return Rf_ScalarLogical( -#if defined(ARROW_R_WITH_AZURE) +#if defined(ARROW_R_WITH_AZUREFS) TRUE #else FALSE diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index 8fe078b22c03..dd2eebf97d5e 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -504,13 +504,17 @@ cpp11::list fs___GcsFileSystem__options(const std::shared_ptr // TODO: Write the Rcpp function to interface with the AzureFileSystem class in // arrow/filesystem/azurefs.h. 
-#if defined(ARROW_R_WITH_AZURE) +#if defined(ARROW_R_WITH_AZUREFS) #include // [[azure::export]] std::shared_ptr fs___AzureFileSystem__Make(cpp11::list options) { fs::AzureOptions azure_opts; - azure_opts = fs::AzureOptions::Defaults(); + + // Set account name + if (!Rf_isNull(options["account_name"])) { + azure_opts.account_name = cpp11::as_cpp(options["account_name"]); + } auto io_context = MainRThread::GetInstance().CancellableIOContext(); return ValueOrStop(fs::AzureFileSystem::Make(azure_opts, io_context)); From 1111ea69bc1d8b8178efda4212d5bc5cd7ab5836 Mon Sep 17 00:00:00 2001 From: Steve Martin Date: Tue, 10 Mar 2026 22:29:03 -0400 Subject: [PATCH 12/39] Renamed R6 class correctly --- r/NAMESPACE | 2 +- r/R/filesystem.R | 6 +++--- r/man/FileSystem.Rd | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 6584bbf1ec21..027878693fb6 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -183,7 +183,7 @@ S3method(vec_ptype_full,arrow_fixed_size_list) S3method(vec_ptype_full,arrow_large_list) S3method(vec_ptype_full,arrow_list) export(Array) -export(AzureBlobFileSystem) +export(AzureFileSystem) export(Buffer) export(BufferOutputStream) export(BufferReader) diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 3af19b7b3312..302fc43e3267 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -650,13 +650,13 @@ GcsFileSystem$create <- function(anonymous = FALSE, retry_limit_seconds = 15, .. #' @rdname FileSystem #' @importFrom utils modifyList #' @export -AzureBlobFileSystem <- R6Class( - "AzureBlobFileSystem", +AzureFileSystem <- R6Class( + "AzureFileSystem", inherit = FileSystem ) # TODO: -AzureBlobFileSystem$create <- function(...) { +AzureFileSystem$create <- function(...) { fs___AzureFileSystem__Make(...) 
} diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd index 4ff80e26220e..eeccda31b04c 100644 --- a/r/man/FileSystem.Rd +++ b/r/man/FileSystem.Rd @@ -6,7 +6,7 @@ \alias{LocalFileSystem} \alias{S3FileSystem} \alias{GcsFileSystem} -\alias{AzureBlobFileSystem} +\alias{AzureFileSystem} \alias{SubTreeFileSystem} \title{FileSystem classes} \description{ From ed01c1397d59f6dd7e60f0c96b509d17349b9ce4 Mon Sep 17 00:00:00 2001 From: Steve Martin Date: Tue, 10 Mar 2026 23:06:45 -0400 Subject: [PATCH 13/39] Added endpoint + key, token, and default authentication --- r/R/filesystem.R | 33 ++++++++++++++++++++++++++++++++- r/src/filesystem.cpp | 26 ++++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 302fc43e3267..92be0bf9f6e2 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -657,10 +657,41 @@ AzureFileSystem <- R6Class( # TODO: AzureFileSystem$create <- function(...) { + options <- list(...) + valid_opts <- c( + "account_name", + "account_key", + "blob_storage_authority", + "blob_storage_scheme", + "client_id", + "client_secret", + "dfs_storage_authority", + "dfs_storage_scheme", + "sas_token", + "tenant_id" + ) + + invalid_opts <- setdiff(names(options), valid_opts) + if (length(invalid_opts)) { + stop( + "Invalid options for AzureFileSystem: ", + oxford_paste(invalid_opts), + call. = FALSE + ) + } + if (!is.null(options$account_key) && !is.null(options$sas_token)) { + stop( + "Cannot specify both `account_key` and `sas_token`", + call. = FALSE + ) + } + # TODO: Validate combinations of tenant id/client id/client secret before + # handing off to C++. + fs___AzureFileSystem__Make(...) } -# TODO: +# TODO: Probably shouldn't be called bucket. az_bucket <- function(bucket, ...) { assert_that(is.string(bucket)) args <- list2(...) 
diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index dd2eebf97d5e..efd3afd282c3 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -510,15 +510,37 @@ cpp11::list fs___GcsFileSystem__options(const std::shared_ptr // [[azure::export]] std::shared_ptr fs___AzureFileSystem__Make(cpp11::list options) { fs::AzureOptions azure_opts; - + // Set account name if (!Rf_isNull(options["account_name"])) { azure_opts.account_name = cpp11::as_cpp(options["account_name"]); } + if (!Rf_isNull(options["blob_storage_authority"])) { + azure_opts.blob_storage_authority = cpp11::as_cpp(options["blob_storage_authority"]); + } + if (!Rf_isNull(options["dfs_storage_authority"])) { + azure_opts.dfs_storage_authority = cpp11::as_cpp(options["dfs_storage_authority"]); + } + if (!Rf_isNull(options["blob_storage_schema"])) { + azure_opts.blob_storage_schema = cpp11::as_cpp(options["blob_storage_schema"]); + } + if (!Rf_isNull(options["dfs_storage_schema"])) { + azure_opts.dfs_storage_schema = cpp11::as_cpp(options["dfs_storage_schema"]); + } + + // TODO: Deal with different combinations of tenant id/client id/client secret. 
+ if (!Rf_isNull(options["account_key"])) { + azure_opts.ConfigureAccountKeyCredential(cpp11::as_cpp(options["account_key"])); + } else if (!Rf_isNull(options["sas_token"])) { + azure_opts.ConfigureSASCredential(cpp11::as_cpp(options["sas_token"])); + } else { + azure_opts.ConfigureDefaultCredential(); + } + auto io_context = MainRThread::GetInstance().CancellableIOContext(); return ValueOrStop(fs::AzureFileSystem::Make(azure_opts, io_context)); - + } #endif From 6795548d12cdf6adfb7e110faf40332b9ca96feb Mon Sep 17 00:00:00 2001 From: Steve Martin Date: Wed, 11 Mar 2026 00:11:14 -0400 Subject: [PATCH 14/39] Finished logical for AzureFileSystem to match pyarrow --- r/R/filesystem.R | 26 +++++++++++++++++++++----- r/src/filesystem.cpp | 25 ++++++++++++++++--------- 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 92be0bf9f6e2..1d97ad29ac91 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -655,7 +655,6 @@ AzureFileSystem <- R6Class( inherit = FileSystem ) -# TODO: AzureFileSystem$create <- function(...) { options <- list(...) valid_opts <- c( @@ -679,19 +678,36 @@ AzureFileSystem$create <- function(...) { call. = FALSE ) } - if (!is.null(options$account_key) && !is.null(options$sas_token)) { + if (is.null(options$account_name)) { + stop("Missing `account_name`", call. = FALSE) + } + if (!is.null(options$tenant_id) || !is.null(options$client_id) || !is.null(options$client_secret)) { + if (is.null(options$client_id)) { + stop( + "`client_id` must be given with `tenant_id` and `client_secret`", + call. = FALSE + ) + } + if (sum(is.null(options$tenant_id), is.null(options$client_secret)) == 1) { + stop( + "Provide only `client_id` to authenticate with ", + "Managed Identity Credential, or provide `client_id`, `tenant_id`, ", + "and`client_secret` to authenticate with Client Secret Credential", + call. 
= FALSE + ) + } + } else if (!is.null(options$account_key) && !is.null(options$sas_token)) { stop( "Cannot specify both `account_key` and `sas_token`", call. = FALSE ) } - # TODO: Validate combinations of tenant id/client id/client secret before - # handing off to C++. - fs___AzureFileSystem__Make(...) + fs___AzureFileSystem__Make(options) } # TODO: Probably shouldn't be called bucket. +# TODO: Add documentation. az_bucket <- function(bucket, ...) { assert_that(is.string(bucket)) args <- list2(...) diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index efd3afd282c3..689a2ef8b192 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -512,9 +512,7 @@ std::shared_ptr fs___AzureFileSystem__Make(cpp11::list opti fs::AzureOptions azure_opts; // Set account name - if (!Rf_isNull(options["account_name"])) { - azure_opts.account_name = cpp11::as_cpp(options["account_name"]); - } + azure_opts.account_name = cpp11::as_cpp(options["account_name"]); if (!Rf_isNull(options["blob_storage_authority"])) { azure_opts.blob_storage_authority = cpp11::as_cpp(options["blob_storage_authority"]); @@ -522,15 +520,24 @@ std::shared_ptr fs___AzureFileSystem__Make(cpp11::list opti if (!Rf_isNull(options["dfs_storage_authority"])) { azure_opts.dfs_storage_authority = cpp11::as_cpp(options["dfs_storage_authority"]); } - if (!Rf_isNull(options["blob_storage_schema"])) { - azure_opts.blob_storage_schema = cpp11::as_cpp(options["blob_storage_schema"]); + if (!Rf_isNull(options["blob_storage_scheme"])) { + azure_opts.blob_storage_scheme = cpp11::as_cpp(options["blob_storage_scheme"]); } - if (!Rf_isNull(options["dfs_storage_schema"])) { - azure_opts.dfs_storage_schema = cpp11::as_cpp(options["dfs_storage_schema"]); + if (!Rf_isNull(options["dfs_storage_scheme"])) { + azure_opts.dfs_storage_scheme = cpp11::as_cpp(options["dfs_storage_scheme"]); } - // TODO: Deal with different combinations of tenant id/client id/client secret. 
- if (!Rf_isNull(options["account_key"])) { + if (!Rf_isNull(options["client_id"])) { + if (Rf_isNull(options["tenant_id"]) && Rf_isNull(options["client_secret"])) { + azure_opts.ConfigureManagedIdentityCredential(cpp11::as_cpp(options["client_id"])); + } else if (!Rf_isNull(options["tenant_id"]) && !Rf_isNull(options["client_secret"])) { + azure_opts.ConfigureClientSecretCredential( + cpp11::as_cpp(options["tenant_id"]), + cpp11::as_cpp(options["client_id"]), + cpp11::as_cpp(options["client_secret"]) + ); + } + } else if (!Rf_isNull(options["account_key"])) { azure_opts.ConfigureAccountKeyCredential(cpp11::as_cpp(options["account_key"])); } else if (!Rf_isNull(options["sas_token"])) { azure_opts.ConfigureSASCredential(cpp11::as_cpp(options["sas_token"])); From de711d3ff4cce02aceb96eb902fdc4ff04c639e6 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Fri, 13 Mar 2026 01:30:47 +0000 Subject: [PATCH 15/39] standardize on ARROW_R_WITH_AZURE instead of ARROW_R_WITH_AZUREFS --- r/src/filesystem.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index 689a2ef8b192..ef634ddd1af9 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -504,7 +504,7 @@ cpp11::list fs___GcsFileSystem__options(const std::shared_ptr // TODO: Write the Rcpp function to interface with the AzureFileSystem class in // arrow/filesystem/azurefs.h. 
-#if defined(ARROW_R_WITH_AZUREFS) +#if defined(ARROW_R_WITH_AZURE) #include // [[azure::export]] From 4c14d2f2add54fdfb49f7bbf34e50423814f796a Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Fri, 13 Mar 2026 01:31:08 +0000 Subject: [PATCH 16/39] standardize on ARROW_R_WITH_AZURE --- r/src/arrowExports.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index f6302873c535..be82e5b5fcb0 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3643,7 +3643,7 @@ extern "C" SEXP _arrow_fs___GcsFileSystem__options(SEXP fs_sexp){ #endif // filesystem.cpp -#if defined(ARROW_R_WITH_AZUREFS) +#if defined(ARROW_R_WITH_AZURE) std::shared_ptr fs___AzureFileSystem__Make(cpp11::list options); extern "C" SEXP _arrow_fs___AzureFileSystem__Make(SEXP options_sexp){ BEGIN_CPP11 @@ -5708,7 +5708,7 @@ return Rf_ScalarLogical( } extern "C" SEXP _azure_available() { return Rf_ScalarLogical( -#if defined(ARROW_R_WITH_AZUREFS) +#if defined(ARROW_R_WITH_AZURE) TRUE #else FALSE From 87049affe57192b09fbb4cc3e8615ebf03166bdc Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Fri, 13 Mar 2026 01:31:28 +0000 Subject: [PATCH 17/39] Turn on ARROW_AZURE flag in nixlibs.R --- r/tools/nixlibs.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index bd9ffbe8a3a3..14edac96c60f 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -605,7 +605,7 @@ build_libarrow <- function(src_dir, dst_dir) { env_var_list <- c( env_var_list, ARROW_S3 = Sys.getenv("ARROW_S3", "ON"), - # ARROW_AZURE = Sys.getenv("ARROW_AZURE", "ON"), + ARROW_AZURE = Sys.getenv("ARROW_AZURE", "ON"), # ARROW_GCS = Sys.getenv("ARROW_GCS", "ON"), ARROW_WITH_ZSTD = Sys.getenv("ARROW_WITH_ZSTD", "ON") ) From 94342236780ea31296faf3edb9dc927ae3199acc Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Fri, 13 Mar 2026 01:33:33 +0000 Subject: [PATCH 18/39] drop temporary arrow env var hack --- r/configure | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/configure b/r/configure index 72524e7d8954..b417a4dd04b4 100755 --- a/r/configure +++ b/r/configure @@ -359,7 +359,7 @@ add_feature_flags () { if arrow_built_with ARROW_S3; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_S3" fi - if arrow_built_with ARROW_AZURE || [ "$ARROW_R_WITH_AZUREFS" = "true" ]; then + if arrow_built_with ARROW_AZURE; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_AZUREFS" fi if arrow_built_with ARROW_GCS; then From 668bdb6dd6908aa51cb8f0ac5ccdcca61f37ac4d Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Fri, 13 Mar 2026 01:39:10 +0000 Subject: [PATCH 19/39] temporary documentation of what I've tried so far --- r/tmp.md | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/r/tmp.md b/r/tmp.md index 8b145cfb03bd..6712928d7bd2 100644 --- a/r/tmp.md +++ b/r/tmp.md @@ -16,3 +16,73 @@ The second step auto-generates stubs in `arrowExports.R` and `arrowExports.cpp` `docker run -p 10000:10000 -p 10001:10001 -p 10002:10002 mcr.microsoft.com/azure-storage/azurite` (see README.md in https://github.com/Azure/Azurite) +## Build troubleshooting continued + +```bash +export ARROW_HOME=/workspaces/arrow/dist + +cmake \ + -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DARROW_COMPUTE=ON \ + -DARROW_CSV=ON \ + -DARROW_DATASET=ON \ + -DARROW_EXTRA_ERROR_CONTEXT=ON \ + -DARROW_FILESYSTEM=ON \ + -DARROW_INSTALL_NAME_RPATH=OFF \ + -DARROW_JEMALLOC=ON \ + -DARROW_JSON=ON \ + -DARROW_PARQUET=ON \ + -DARROW_WITH_SNAPPY=ON \ + -DARROW_WITH_ZLIB=ON \ + -DARROW_AZURE=ON \ + .. + + +# Try building from source via R with the relevant env vars set for feature flags. 
+ +# Core Build Settings +export LIBARROW_MINIMAL=false +export FORCE_BUNDLED_BUILD=true +export ARROW_HOME=$ARROW_HOME +export BOOST_SOURCE=BUNDLED + +# Feature Toggles +export ARROW_COMPUTE=ON +export ARROW_CSV=ON +export ARROW_DATASET=ON +export ARROW_EXTRA_ERROR_CONTEXT=ON +export ARROW_FILESYSTEM=ON +export ARROW_JEMALLOC=ON +export ARROW_JSON=ON +export ARROW_PARQUET=ON +export ARROW_AZURE=ON + +# Visibility into build +export ARROW_R_DEV=TRUE + +# Use multiple available cores +export MAKEFLAGS="-j8" + +# Compression Codecs +export ARROW_WITH_SNAPPY=ON +export ARROW_WITH_ZLIB=ON + +# Library Linkage +export ARROW_BUILD_STATIC=ON +export ARROW_BUILD_SHARED=OFF + +# For R-specific behavior (replaces CMAKE_INSTALL_LIBDIR=lib) +export LIBARROW_BINARY=false + +export EXTRA_CMAKE_FLAGS="-DARROW_INSTALL_NAME_RPATH=OFF -DARROW_AZURE=ON -DCMAKE_SHARED_LINKER_FLAGS=-lxml2" + +export PKG_CONFIG_PATH="/usr/lib/x86_64-linux-gnu/pkgconfig" +export LDFLAGS=$(pkg-config --libs libxml-2.0) +export PKG_LIBS=$(pkg-config --libs libxml-2.0) + +# export LIBARROW_EXTERNAL_LIBDIR=/workspaces/arrow/r/libarrow + +R CMD INSTALL . --preclean + +``` \ No newline at end of file From fcfbd94c3d79c7e9728b9e74c42bb402bd048bfa Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Fri, 13 Mar 2026 03:04:09 +0000 Subject: [PATCH 20/39] Add TODO note in configure script to remove hard-coded link flags --- r/configure | 5 +++-- r/tmp.md | 10 +++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/r/configure b/r/configure index b417a4dd04b4..69b2a392a8f6 100755 --- a/r/configure +++ b/r/configure @@ -271,7 +271,8 @@ set_pkg_vars_with_pc () { PKG_CFLAGS="`${PKG_CONFIG} --cflags ${pkg_config_names}` $PKG_CFLAGS" PKG_CFLAGS="$PKG_CFLAGS $PKG_CFLAGS_FEATURES" PKG_LIBS=`${PKG_CONFIG} --libs-only-l --libs-only-other ${pkg_config_names}` - PKG_LIBS="$PKG_LIBS $PKG_LIBS_FEATURES" + # TODO: Figure out how to pass these link flags properly, need this temporarily to get R to link properly. 
+ PKG_LIBS="$PKG_LIBS $PKG_LIBS_FEATURES -lcurl -lxml2 -lssl" PKG_DIRS=`${PKG_CONFIG} --libs-only-L ${pkg_config_names}` } @@ -360,7 +361,7 @@ add_feature_flags () { PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_S3" fi if arrow_built_with ARROW_AZURE; then - PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_AZUREFS" + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_AZURE" fi if arrow_built_with ARROW_GCS; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_GCS" diff --git a/r/tmp.md b/r/tmp.md index 6712928d7bd2..c7c7118779ba 100644 --- a/r/tmp.md +++ b/r/tmp.md @@ -38,13 +38,21 @@ cmake \ -DARROW_AZURE=ON \ .. +cmake --build . --target install -j8 + + +R -e 'install.packages("remotes"); remotes::install_deps(dependencies = TRUE)' + +R CMD INSTALL --no-multiarch . + +# ---------------------- + # Try building from source via R with the relevant env vars set for feature flags. # Core Build Settings export LIBARROW_MINIMAL=false export FORCE_BUNDLED_BUILD=true -export ARROW_HOME=$ARROW_HOME export BOOST_SOURCE=BUNDLED # Feature Toggles From cdab7f1487c2d7d18bc961c119de5f5d89e1caa6 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Sat, 14 Mar 2026 18:44:09 +0000 Subject: [PATCH 21/39] initial filesystem tests --- r/tests/testthat/test-azure.R | 60 +++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 r/tests/testthat/test-azure.R diff --git a/r/tests/testthat/test-azure.R b/r/tests/testthat/test-azure.R new file mode 100644 index 000000000000..361ea474005a --- /dev/null +++ b/r/tests/testthat/test-azure.R @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("azure") +# TODO: Add local azurite install to setup script +# skip_if_not(nzchar(Sys.which("azurite")), message = "azurite is not installed.") + +# TODO: Start azurite from the test code instead of relying on it to be already running externally. + +# Use default azurite credentials, +# see https://learn.microsoft.com/en-us/azure/storage/common/storage-connect-azurite?tabs=blob-storage +azurite_account_name <- "devstoreaccount1" +# Note that this is a well-known default credential for local development on Azurite. +azurite_account_key <- "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" +azurite_blob_host <- "127.0.0.1" +azurite_blob_port <- "10000" +azurite_blob_storage_authority <- sprintf("%s:%s",azurite_blob_host, azurite_blob_port) +azurite_blob_storage_scheme <- "http" + +# Helper functions for Azure URIs and paths +azure_uri <- function(...) { + template <- "az://%s:%s@%s?scheme=http&blob_endpoint=localhost%s%s" + # URL encode the account key because it contains reserved characters + encoded_key <- curl::curl_escape(azurite_account_key) + sprintf(template, azurite_account_name, encoded_key, azure_path(...), "%3A", azurite_blob_port) +} + +azure_path <- azure_path <- function(...) { + # 'now' is the container name (following the convention in the s3 tests). 
+  paste(now, ..., sep = "/")
+}
+
+fs <- AzureFileSystem$create(
+  account_name="devstoreaccount1",
+  account_key="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==",
+  blob_storage_authority="127.0.0.1:10000",
+  blob_storage_scheme="http"
+)
+
+now <- as.character(as.numeric(Sys.time()))
+fs$CreateDir(now)
+# Clean up when we're all done
+withr::defer(fs$DeleteDir(now))
+
+# (1) Run default filesystem tests on azure filesystem
+test_filesystem("azure", fs, azure_path, azure_uri)

From 221aba40bde90f9eb9ae4b42d22173c454ee812c Mon Sep 17 00:00:00 2001
From: Collin Brown 
Date: Sun, 15 Mar 2026 02:25:21 +0000
Subject: [PATCH 22/39] uncomment line 256 of filesystem.cpp

Note: this was causing a seg fault when anything that inherits from
FileSystem tries to access the base_path property. Uncommenting the
return line and rebuilding the R package resolved this issue.

---
 r/src/filesystem.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp
index ef634ddd1af9..e9bc79102650 100644
--- a/r/src/filesystem.cpp
+++ b/r/src/filesystem.cpp
@@ -253,7 +253,7 @@ std::shared_ptr fs___SubTreeFileSystem__base_fs(
 // [[arrow::export]]
 std::string fs___SubTreeFileSystem__base_path(
     const std::shared_ptr& file_system) {
-  // return file_system->base_path();
+  return file_system->base_path();
 }
 
 // [[arrow::export]]

From b587aaf99c6e0fb8e7d0a8472b52ac3ce7c0d138 Mon Sep 17 00:00:00 2001
From: Collin Brown 
Date: Sun, 15 Mar 2026 02:27:20 +0000
Subject: [PATCH 23/39] checkpoint: resolved segfault error

At this point, I was able to call
write_feather(example_data, fs$path("test/test.feather")) successfully
against Azurite. Committing progress before I make any further changes.
--- r/tests/testthat/test-azure.R | 65 ++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 16 deletions(-) diff --git a/r/tests/testthat/test-azure.R b/r/tests/testthat/test-azure.R index 361ea474005a..b2d6c2550902 100644 --- a/r/tests/testthat/test-azure.R +++ b/r/tests/testthat/test-azure.R @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +library(arrow) skip_if_not_available("azure") # TODO: Add local azurite install to setup script # skip_if_not(nzchar(Sys.which("azurite")), message = "azurite is not installed.") @@ -26,35 +26,68 @@ skip_if_not_available("azure") azurite_account_name <- "devstoreaccount1" # Note that this is a well-known default credential for local development on Azurite. azurite_account_key <- "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" -azurite_blob_host <- "127.0.0.1" +azurite_blob_host <- "host.docker.internal" azurite_blob_port <- "10000" azurite_blob_storage_authority <- sprintf("%s:%s",azurite_blob_host, azurite_blob_port) azurite_blob_storage_scheme <- "http" # Helper functions for Azure URIs and paths azure_uri <- function(...) { - template <- "az://%s:%s@%s?scheme=http&blob_endpoint=localhost%s%s" + endpoint <- sprintf("%s%s%s", azurite_blob_host, "%3A", azurite_blob_port) + template <- "abfs://%s:%s@%s?endpoint=%s" # URL encode the account key because it contains reserved characters encoded_key <- curl::curl_escape(azurite_account_key) - sprintf(template, azurite_account_name, encoded_key, azure_path(...), "%3A", azurite_blob_port) + sprintf(template, azurite_account_name, encoded_key, azure_path(...), endpoint) } - -azure_path <- azure_path <- function(...) { - # 'now' is the container name (following the convention in the s3 tests). - paste(now, ..., sep = "/") +azure_path <- function(...) 
{ + # 'dir' is the container name (following the convention in the s3 tests). + paste(dir, ..., sep = "/") } fs <- AzureFileSystem$create( - account_name="devstoreaccount1", - account_key="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==", - blob_storage_authority="127.0.0.1:10000", - blob_storage_scheme="http" + account_name=azurite_account_name, + account_key=azurite_account_key, + blob_storage_authority=azurite_blob_storage_authority, + blob_storage_scheme=azurite_blob_storage_scheme +) + +fs2 <- arrow:::az_bucket( + bucket="test", + account_name=azurite_account_name, + account_key=azurite_account_key, + blob_storage_authority=azurite_blob_storage_authority, + blob_storage_scheme=azurite_blob_storage_scheme ) -now <- as.character(as.numeric(Sys.time())) -fs$CreateDir(now) +# TODO: Factor these into tests once finished debugging. + +# (1) CreateDir and DeleteDir work correctly +dir <- "test" +fs$CreateDir(dir) # Clean up when we're all done withr::defer(fs$DeleteDir(now)) -# (1) Run default filesystem tests on azure filesystem -test_filesystem("azure", fs, azure_path, azure_uri) +# (XX) Run default filesystem tests on azure filesystem +# TODO: As far as I can tell, there is no way to pass an Azurite URI to write_feather +#test_filesystem("azure", fs, azure_path, azure_uri) + +example_data <- tibble::tibble( + int = c(1:3, NA_integer_, 5:10), + dbl = c(1:8, NA, 10) + 0.1, + dbl2 = rep(5, 10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + false = logical(10), + chr = letters[c(1:5, NA, 7:10)], + fct = factor(letters[c(1:4, NA, NA, 7:10)]) +) + +# Verify that write file operation works +write_feather(example_data, azure_uri("test.feather")) + +encoded_key <- curl::curl_escape(azurite_account_key) +encoded_key +write_feather(example_data, sprintf("abfs://devstoreaccount1:%s@127.0.0.1:10000/test/test.feather", encoded_key)) + +write_feather(example_data, 
"az://test@devstoreaccount1:Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq%2FK1SZFPTOtr%2FKBHBeksoGMGw%3D%3D@127.0.0.1:10000/test/test.feather") + + From ed20ea15f7628a7d30759cc47211ee98972191c2 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Sun, 15 Mar 2026 04:16:09 +0000 Subject: [PATCH 24/39] skip test_filesystem tests that rely on being able to connect directly with URI with Azure. --- r/tests/testthat/helper-filesystems.R | 135 ++++++++++++++------------ 1 file changed, 74 insertions(+), 61 deletions(-) diff --git a/r/tests/testthat/helper-filesystems.R b/r/tests/testthat/helper-filesystems.R index 7b37abf764b0..4755bbbc9de1 100644 --- a/r/tests/testthat/helper-filesystems.R +++ b/r/tests/testthat/helper-filesystems.R @@ -25,12 +25,18 @@ #' returns a URI containing the filesystem scheme (e.g. 's3://', 'gs://'), the #' absolute path, and any necessary connection options as URL query parameters. test_filesystem <- function(name, fs, path_formatter, uri_formatter) { - # NOTE: it's important that we label these tests with name of filesystem so + # NOTE 1: it's important that we label these tests with name of filesystem so # that we can differentiate the different calls to these test in the output. - test_that(sprintf("read/write Feather on %s using URIs", name), { - write_feather(example_data, uri_formatter("test.feather")) - expect_identical(read_feather(uri_formatter("test.feather")), example_data) - }) + + # NOTE 2: as far as I can tell, Azure doesn't support passing a URI directly + # like we can do in S3/GCS. Skipping any tests that rely on this feature + # for name == "azure". 
+ if (name != "azure") { + test_that(sprintf("read/write Feather on %s using URIs", name), { + write_feather(example_data, uri_formatter("test.feather")) + expect_identical(read_feather(uri_formatter("test.feather")), example_data) + }) + } test_that(sprintf("read/write Feather on %s using Filesystem", name), { write_feather(example_data, fs$path(path_formatter("test2.feather"))) @@ -71,12 +77,14 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { example_data ) }) - - test_that(sprintf("read/write Parquet on %s", name), { - skip_if_not_available("parquet") - write_parquet(example_data, fs$path(path_formatter("test.parquet"))) - expect_identical(read_parquet(uri_formatter("test.parquet")), example_data) - }) + + if (name != "azure") { + test_that(sprintf("read/write Parquet on %s", name), { + skip_if_not_available("parquet") + write_parquet(example_data, fs$path(path_formatter("test.parquet"))) + expect_identical(read_parquet(uri_formatter("test.parquet")), example_data) + }) + } if (arrow_with_dataset()) { make_temp_dir <- function() { @@ -85,39 +93,41 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { normalizePath(path, winslash = "/") } - test_that(sprintf("open_dataset with an %s file (not directory) URI", name), { - skip_if_not_available("parquet") - expect_identical( - open_dataset(uri_formatter("test.parquet")) |> collect() |> arrange(int), - example_data |> arrange(int) - ) - }) - - test_that(sprintf("open_dataset with vector of %s file URIs", name), { - expect_identical( - open_dataset( - c(uri_formatter("test.feather"), uri_formatter("test2.feather")), - format = "feather" - ) |> - arrange(int) |> - collect(), - rbind(example_data, example_data) |> arrange(int) - ) - }) - - test_that(sprintf("open_dataset errors if passed URIs mixing %s and local fs", name), { - td <- make_temp_dir() - expect_error( - open_dataset( - c( - uri_formatter("test.feather"), - paste0("file://", file.path(td, "fake.feather")) + if 
(name != "azure") { + test_that(sprintf("open_dataset with an %s file (not directory) URI", name), { + skip_if_not_available("parquet") + expect_identical( + open_dataset(uri_formatter("test.parquet")) |> collect() |> arrange(int), + example_data |> arrange(int) + ) + }) + + test_that(sprintf("open_dataset with vector of %s file URIs", name), { + expect_identical( + open_dataset( + c(uri_formatter("test.feather"), uri_formatter("test2.feather")), + format = "feather" + ) |> + arrange(int) |> + collect(), + rbind(example_data, example_data) |> arrange(int) + ) + }) + + test_that(sprintf("open_dataset errors if passed URIs mixing %s and local fs", name), { + td <- make_temp_dir() + expect_error( + open_dataset( + c( + uri_formatter("test.feather"), + paste0("file://", file.path(td, "fake.feather")) + ), + format = "feather" ), - format = "feather" - ), - "Vectors of URIs for different file systems are not supported" - ) - }) + "Vectors of URIs for different file systems are not supported" + ) + }) + } # Dataset test setup, cf. 
test-dataset.R first_date <- lubridate::ymd_hms("2015-04-29 03:12:39") @@ -167,24 +177,27 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { write_dataset(ds, fs$path(path_formatter("new_dataset_dir"))) expect_length(fs$ls(path_formatter("new_dataset_dir")), 1) }) - + if (name != "azure") { + test_that(sprintf("copy files with %s", name), { + td <- make_temp_dir() + copy_files(uri_formatter("hive_dir"), td) + expect_length(dir(td), 2) + ds <- open_dataset(td) + expect_identical( + ds |> select(int, dbl, lgl) |> collect() |> arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) |> arrange(int) + ) + }) + } test_that(sprintf("copy files with %s", name), { - td <- make_temp_dir() - copy_files(uri_formatter("hive_dir"), td) - expect_length(dir(td), 2) - ds <- open_dataset(td) - expect_identical( - ds |> select(int, dbl, lgl) |> collect() |> arrange(int), - rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) |> arrange(int) - ) - - # Let's copy the other way and use a SubTreeFileSystem rather than URI - copy_files(td, fs$path(path_formatter("hive_dir2"))) - ds2 <- open_dataset(fs$path(path_formatter("hive_dir2"))) - expect_identical( - ds2 |> select(int, dbl, lgl) |> collect() |> arrange(int), - rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) |> arrange(int) - ) - }) + td <- make_temp_dir() + copy_files(fs$path(path_formatter("hive_dir")), td) + copy_files(td, fs$path(path_formatter("hive_dir2"))) + ds2 <- open_dataset(fs$path(path_formatter("hive_dir2"))) + expect_identical( + ds2 |> select(int, dbl, lgl) |> collect() |> arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) |> arrange(int) + ) + }) } # if(arrow_with_dataset()) } From 9f6f6066c3ee7811fa4d302147e209caac44a2ff Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Sun, 15 Mar 2026 04:20:26 +0000 Subject: [PATCH 25/39] Add most test cases from test_filesystem and recreate a couple that were 
skipped because of the URI issue. --- r/tests/testthat/test-azure.R | 67 ++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 21 deletions(-) diff --git a/r/tests/testthat/test-azure.R b/r/tests/testthat/test-azure.R index b2d6c2550902..88b781ed7648 100644 --- a/r/tests/testthat/test-azure.R +++ b/r/tests/testthat/test-azure.R @@ -14,8 +14,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -library(arrow) + skip_if_not_available("azure") + +# test_filesystem requires dplyr +library(dplyr) + # TODO: Add local azurite install to setup script # skip_if_not(nzchar(Sys.which("azurite")), message = "azurite is not installed.") @@ -39,6 +43,7 @@ azure_uri <- function(...) { encoded_key <- curl::curl_escape(azurite_account_key) sprintf(template, azurite_account_name, encoded_key, azure_path(...), endpoint) } + azure_path <- function(...) { # 'dir' is the container name (following the convention in the s3 tests). paste(dir, ..., sep = "/") @@ -51,25 +56,20 @@ fs <- AzureFileSystem$create( blob_storage_scheme=azurite_blob_storage_scheme ) -fs2 <- arrow:::az_bucket( - bucket="test", - account_name=azurite_account_name, - account_key=azurite_account_key, - blob_storage_authority=azurite_blob_storage_authority, - blob_storage_scheme=azurite_blob_storage_scheme -) - -# TODO: Factor these into tests once finished debugging. 
- # (1) CreateDir and DeleteDir work correctly dir <- "test" fs$CreateDir(dir) # Clean up when we're all done -withr::defer(fs$DeleteDir(now)) +withr::defer(fs$DeleteDir(dir)) -# (XX) Run default filesystem tests on azure filesystem -# TODO: As far as I can tell, there is no way to pass an Azurite URI to write_feather -#test_filesystem("azure", fs, azure_path, azure_uri) +# (2) Run default filesystem tests on azure filesystem + +# TODO: As far as I can tell, there is no way to pass an Azurite URI to write_feather, +# so some of the test_filesystem tests can't be run with AzureFilesystem. Some tests +# below cover some of the skipped cases in test_filesystem. +test_filesystem("azure", fs, azure_path, azure_uri) + +# (3) Test write/read parquet example_data <- tibble::tibble( int = c(1:3, NA_integer_, 5:10), @@ -81,13 +81,38 @@ example_data <- tibble::tibble( fct = factor(letters[c(1:4, NA, NA, 7:10)]) ) -# Verify that write file operation works -write_feather(example_data, azure_uri("test.feather")) +test_that("read/write Parquet on azure", { + skip_if_not_available("parquet") + write_parquet(example_data, fs$path(azure_path("test.parquet"))) + expect_identical(read_parquet(fs$path(azure_path("test.parquet"))), example_data) +}) + +# (4) open_dataset with a vector of azure file paths + +# TODO: I couldn't pass a vector of paths similar to the original test in +# test_filesystem, but you can pass a folder containing many files. 
+write_feather(example_data, fs$path(azure_path("openmulti/dataset1.feather"))) +write_feather(example_data, fs$path(azure_path("openmulti/dataset2.feather"))) + +open_multi_fs = arrow:::az_bucket( + bucket=azure_path("openmulti"), + account_name=azurite_account_name, + account_key=azurite_account_key, + blob_storage_authority=azurite_blob_storage_authority, + blob_storage_scheme=azurite_blob_storage_scheme +) -encoded_key <- curl::curl_escape(azurite_account_key) -encoded_key -write_feather(example_data, sprintf("abfs://devstoreaccount1:%s@127.0.0.1:10000/test/test.feather", encoded_key)) +test_that("open_dataset with AzureFileSystem folder", { + expect_identical( + open_dataset( + open_multi_fs, + format = "feather" + ) |> + arrange(int) |> + collect(), + rbind(example_data, example_data) |> arrange(int) + ) +}) -write_feather(example_data, "az://test@devstoreaccount1:Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq%2FK1SZFPTOtr%2FKBHBeksoGMGw%3D%3D@127.0.0.1:10000/test/test.feather") From 97118d8e647cdd94d3df78e09b2559aae02a9444 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Sun, 15 Mar 2026 04:26:30 +0000 Subject: [PATCH 26/39] rename az_bucket to az_container --- r/R/filesystem.R | 3 +-- r/tests/testthat/test-azure.R | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 1d97ad29ac91..e6dbd5c3fecc 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -706,9 +706,8 @@ AzureFileSystem$create <- function(...) { fs___AzureFileSystem__Make(options) } -# TODO: Probably shouldn't be called bucket. # TODO: Add documentation. -az_bucket <- function(bucket, ...) { +az_container <- function(bucket, ...) { assert_that(is.string(bucket)) args <- list2(...) 
diff --git a/r/tests/testthat/test-azure.R b/r/tests/testthat/test-azure.R index 88b781ed7648..2f7c5e7be4a4 100644 --- a/r/tests/testthat/test-azure.R +++ b/r/tests/testthat/test-azure.R @@ -94,7 +94,7 @@ test_that("read/write Parquet on azure", { write_feather(example_data, fs$path(azure_path("openmulti/dataset1.feather"))) write_feather(example_data, fs$path(azure_path("openmulti/dataset2.feather"))) -open_multi_fs = arrow:::az_bucket( +open_multi_fs = arrow:::az_container( bucket=azure_path("openmulti"), account_name=azurite_account_name, account_key=azurite_account_key, From 01fdf52d265720e8a9548504eaee7793907f5943 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Sun, 15 Mar 2026 21:34:12 +0000 Subject: [PATCH 27/39] check that azurite is installed as precondition for test-azure.R script. Switch over to locally hosted azurite instead of local docker container azurite. --- r/tests/testthat/test-azure.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-azure.R b/r/tests/testthat/test-azure.R index 2f7c5e7be4a4..ac5785702669 100644 --- a/r/tests/testthat/test-azure.R +++ b/r/tests/testthat/test-azure.R @@ -21,7 +21,7 @@ skip_if_not_available("azure") library(dplyr) # TODO: Add local azurite install to setup script -# skip_if_not(nzchar(Sys.which("azurite")), message = "azurite is not installed.") +skip_if_not(nzchar(Sys.which("azurite")), message = "azurite is not installed.") # TODO: Start azurite from the test code instead of relying on it to be already running externally. @@ -30,7 +30,7 @@ library(dplyr) azurite_account_name <- "devstoreaccount1" # Note that this is a well-known default credential for local development on Azurite. 
azurite_account_key <- "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" -azurite_blob_host <- "host.docker.internal" +azurite_blob_host <- "127.0.0.1" azurite_blob_port <- "10000" azurite_blob_storage_authority <- sprintf("%s:%s",azurite_blob_host, azurite_blob_port) azurite_blob_storage_scheme <- "http" From d58594f89b23910a53e13058b3dd52018d42e2e5 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Sun, 15 Mar 2026 22:22:32 +0000 Subject: [PATCH 28/39] add setup code to start azurite from the test-azure.R script, then kill the azurite process in a cleanup step. --- r/tests/testthat/test-azure.R | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/r/tests/testthat/test-azure.R b/r/tests/testthat/test-azure.R index ac5785702669..85a448e2f074 100644 --- a/r/tests/testthat/test-azure.R +++ b/r/tests/testthat/test-azure.R @@ -20,11 +20,9 @@ skip_if_not_available("azure") # test_filesystem requires dplyr library(dplyr) -# TODO: Add local azurite install to setup script +# This test script depends on ./ci/scripts/install_azurite.sh skip_if_not(nzchar(Sys.which("azurite")), message = "azurite is not installed.") -# TODO: Start azurite from the test code instead of relying on it to be already running externally. - # Use default azurite credentials, # see https://learn.microsoft.com/en-us/azure/storage/common/storage-connect-azurite?tabs=blob-storage azurite_account_name <- "devstoreaccount1" @@ -35,6 +33,14 @@ azurite_blob_port <- "10000" azurite_blob_storage_authority <- sprintf("%s:%s",azurite_blob_host, azurite_blob_port) azurite_blob_storage_scheme <- "http" +pid_azurite <- sys::exec_background( + "azurite", + c("azurite", "--inMemoryPersistence", "--blobHost", azurite_blob_host), + std_out = FALSE +) +# Kill azurite background process once tests have finished running. +withr::defer(tools::pskill(pid_azurite)) + # Helper functions for Azure URIs and paths azure_uri <- function(...) 
{ endpoint <- sprintf("%s%s%s", azurite_blob_host, "%3A", azurite_blob_port) @@ -64,9 +70,10 @@ withr::defer(fs$DeleteDir(dir)) # (2) Run default filesystem tests on azure filesystem -# TODO: As far as I can tell, there is no way to pass an Azurite URI to write_feather, -# so some of the test_filesystem tests can't be run with AzureFilesystem. Some tests -# below cover some of the skipped cases in test_filesystem. +# TODO: As far as I can tell, there is no way to pass an Azurite URI to write_feather +# (or any other read/write helper), so some of the test_filesystem tests can't be run +# with AzureFilesystem. Some tests below cover some of the skipped cases in +# test_filesystem. test_filesystem("azure", fs, azure_path, azure_uri) # (3) Test write/read parquet From 02feb8c4d608e0f3e8d351b76b4bdba5480389e6 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Sun, 15 Mar 2026 22:35:02 +0000 Subject: [PATCH 29/39] run air formatter --- r/tests/testthat/helper-filesystems.R | 22 +++++++++++----------- r/tests/testthat/test-azure.R | 23 ++++++++++------------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/r/tests/testthat/helper-filesystems.R b/r/tests/testthat/helper-filesystems.R index 4755bbbc9de1..9fba086a18e3 100644 --- a/r/tests/testthat/helper-filesystems.R +++ b/r/tests/testthat/helper-filesystems.R @@ -27,7 +27,7 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { # NOTE 1: it's important that we label these tests with name of filesystem so # that we can differentiate the different calls to these test in the output. - + # NOTE 2: as far as I can tell, Azure doesn't support passing a URI directly # like we can do in S3/GCS. Skipping any tests that rely on this feature # for name == "azure". 
@@ -77,7 +77,7 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { example_data ) }) - + if (name != "azure") { test_that(sprintf("read/write Parquet on %s", name), { skip_if_not_available("parquet") @@ -190,14 +190,14 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { }) } test_that(sprintf("copy files with %s", name), { - td <- make_temp_dir() - copy_files(fs$path(path_formatter("hive_dir")), td) - copy_files(td, fs$path(path_formatter("hive_dir2"))) - ds2 <- open_dataset(fs$path(path_formatter("hive_dir2"))) - expect_identical( - ds2 |> select(int, dbl, lgl) |> collect() |> arrange(int), - rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) |> arrange(int) - ) - }) + td <- make_temp_dir() + copy_files(fs$path(path_formatter("hive_dir")), td) + copy_files(td, fs$path(path_formatter("hive_dir2"))) + ds2 <- open_dataset(fs$path(path_formatter("hive_dir2"))) + expect_identical( + ds2 |> select(int, dbl, lgl) |> collect() |> arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) |> arrange(int) + ) + }) } # if(arrow_with_dataset()) } diff --git a/r/tests/testthat/test-azure.R b/r/tests/testthat/test-azure.R index 85a448e2f074..e0bc627551a1 100644 --- a/r/tests/testthat/test-azure.R +++ b/r/tests/testthat/test-azure.R @@ -30,7 +30,7 @@ azurite_account_name <- "devstoreaccount1" azurite_account_key <- "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" azurite_blob_host <- "127.0.0.1" azurite_blob_port <- "10000" -azurite_blob_storage_authority <- sprintf("%s:%s",azurite_blob_host, azurite_blob_port) +azurite_blob_storage_authority <- sprintf("%s:%s", azurite_blob_host, azurite_blob_port) azurite_blob_storage_scheme <- "http" pid_azurite <- sys::exec_background( @@ -56,10 +56,10 @@ azure_path <- function(...) 
{ } fs <- AzureFileSystem$create( - account_name=azurite_account_name, - account_key=azurite_account_key, - blob_storage_authority=azurite_blob_storage_authority, - blob_storage_scheme=azurite_blob_storage_scheme + account_name = azurite_account_name, + account_key = azurite_account_key, + blob_storage_authority = azurite_blob_storage_authority, + blob_storage_scheme = azurite_blob_storage_scheme ) # (1) CreateDir and DeleteDir work correctly @@ -102,11 +102,11 @@ write_feather(example_data, fs$path(azure_path("openmulti/dataset1.feather"))) write_feather(example_data, fs$path(azure_path("openmulti/dataset2.feather"))) open_multi_fs = arrow:::az_container( - bucket=azure_path("openmulti"), - account_name=azurite_account_name, - account_key=azurite_account_key, - blob_storage_authority=azurite_blob_storage_authority, - blob_storage_scheme=azurite_blob_storage_scheme + bucket = azure_path("openmulti"), + account_name = azurite_account_name, + account_key = azurite_account_key, + blob_storage_authority = azurite_blob_storage_authority, + blob_storage_scheme = azurite_blob_storage_scheme ) test_that("open_dataset with AzureFileSystem folder", { @@ -120,6 +120,3 @@ test_that("open_dataset with AzureFileSystem folder", { rbind(example_data, example_data) |> arrange(int) ) }) - - - From 6cc698797e9b81f2f752559ee1fb4b27b99d550e Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Sun, 15 Mar 2026 23:04:34 +0000 Subject: [PATCH 30/39] add documentation to az_container. rebuild docs with devtools::document(). 
rename argument in az_container to container_path instead of bucket --- r/NAMESPACE | 1 + r/R/filesystem.R | 28 ++++++++++++++++++++++++---- r/man/az_container.Rd | 34 ++++++++++++++++++++++++++++++++++ r/tests/testthat/test-azure.R | 2 +- 4 files changed, 60 insertions(+), 5 deletions(-) create mode 100644 r/man/az_container.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index 027878693fb6..40d65cab1950 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -297,6 +297,7 @@ export(as_data_type) export(as_record_batch) export(as_record_batch_reader) export(as_schema) +export(az_container) export(binary) export(bool) export(boolean) diff --git a/r/R/filesystem.R b/r/R/filesystem.R index e6dbd5c3fecc..e06a7c259765 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -706,14 +706,34 @@ AzureFileSystem$create <- function(...) { fs___AzureFileSystem__Make(options) } -# TODO: Add documentation. -az_container <- function(bucket, ...) { - assert_that(is.string(bucket)) +#' Connect to an Azure Blob Storage container +#' +#' `az_conainer` is a convenience function to create an `AzureFileSystem` object +#' that provides a file system interface for blob storage containers in an Azure +#' Storage Account. +#' +#' @param container_path string Container name or path +#' @param ... Additional connection options, passed to `AzureFileSystem$create()` +#' +#' @return A `SubTreeFileSystem` containing an `AzureFileSystem` and the container's +#' relative path. Note that this function's success does not guarantee that you +#' are authorized to access the container's contents. +#' @examplesIf FALSE +#' container_fs <- az_container( +#' container_path = "arrow-datasets", +#' account_name = azurite_account_name, +#' account_key = azurite_account_key, +#' blob_storage_authority = azurite_blob_storage_authority, +#' blob_storage_scheme = azurite_blob_storage_scheme +#' ) +#' @export +az_container <- function(container_path, ...) { + assert_that(is.string(container_path)) args <- list2(...) 
fs <- exec(AzureFileSystem$create, !!!args)
-  SubTreeFileSystem$create(bucket, fs)
+  SubTreeFileSystem$create(container_path, fs)
 }
 
 #' @usage NULL
diff --git a/r/man/az_container.Rd b/r/man/az_container.Rd
new file mode 100644
index 000000000000..da074b337b1b
--- /dev/null
+++ b/r/man/az_container.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/filesystem.R
+\name{az_container}
+\alias{az_container}
+\title{Connect to an Azure Blob Storage container}
+\usage{
+az_container(container_path, ...)
+}
+\arguments{
+\item{container_path}{string Container name or path}
+
+\item{...}{Additional connection options, passed to \code{AzureFileSystem$create()}}
+}
+\value{
+A \code{SubTreeFileSystem} containing an \code{AzureFileSystem} and the container's
+relative path. Note that this function's success does not guarantee that you
+are authorized to access the container's contents.
+}
+\description{
+\code{az_container} is a convenience function to create an \code{AzureFileSystem} object
+that provides a file system interface for blob storage containers in an Azure
+Storage Account.
+} +\examples{ +\dontshow{if (FALSE) withAutoprint(\{ # examplesIf} +container_fs <- az_container( + container_path = "arrow-datasets", + account_name = azurite_account_name, + account_key = azurite_account_key, + blob_storage_authority = azurite_blob_storage_authority, + blob_storage_scheme = azurite_blob_storage_scheme +) +\dontshow{\}) # examplesIf} +} diff --git a/r/tests/testthat/test-azure.R b/r/tests/testthat/test-azure.R index e0bc627551a1..30e1647f649f 100644 --- a/r/tests/testthat/test-azure.R +++ b/r/tests/testthat/test-azure.R @@ -102,7 +102,7 @@ write_feather(example_data, fs$path(azure_path("openmulti/dataset1.feather"))) write_feather(example_data, fs$path(azure_path("openmulti/dataset2.feather"))) open_multi_fs = arrow:::az_container( - bucket = azure_path("openmulti"), + container_path = azure_path("openmulti"), account_name = azurite_account_name, account_key = azurite_account_key, blob_storage_authority = azurite_blob_storage_authority, From ba2847782edfe893a413034444829693ba4f0f56 Mon Sep 17 00:00:00 2001 From: Steve Martin <62676717+marberts@users.noreply.github.com> Date: Tue, 17 Mar 2026 16:11:36 -0400 Subject: [PATCH 31/39] docs: Updated documentation for AzureFileSystem and updated vignette (#1) --- r/R/filesystem.R | 47 ++++++++++++++++++++++++++++--------- r/man/FileSystem.Rd | 31 +++++++++++++++++++++++++ r/man/az_container.Rd | 4 ++-- r/src/filesystem.cpp | 2 -- r/vignettes/fs.Rmd | 54 +++++++++++++++++++++++++++++-------------- 5 files changed, 106 insertions(+), 32 deletions(-) diff --git a/r/R/filesystem.R b/r/R/filesystem.R index e06a7c259765..863051334692 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -189,6 +189,31 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' - `default_metadata`: default metadata to write in new objects. #' - `project_id`: the project to use for creating buckets. 
#' +#' `AzureFileSystem$create()` takes following required argument: +#' +#' - `account_name`: Azure Blob Storage account name. +#' +#' `AzureFileSystem$create()` takes following optional arguments: +#' +#' - `account_key`: Account key of the storage account. Cannot be used with +#' `sas_token`. +#' - `blob_storage_authority`: Hostname of the blob service, defaulting to +#' `"blob.core.windows.net"`. +#' - `blob_storage_scheme`: Either `"http"` or `"https"` (the default). +#' - `client_id`: The client/application ID for Azure Active Directory +#' authentication. If used with `client_secret` and `tenant_id` then it is the +#' application ID for a registered Azure AD application. Otherwise, it is the +#' client ID of a user-assigned managed identity. +#' - `client_secret`: Client secret for Azure Active Directory authentication. +#' Must be provided with both `client_id` and `tenant_id`. +#' - `dfs_storage_authority`: Hostname of the data lake (gen 2) service, +#' defaulting to `"dfs.core.windows.net"`. +#' - `dfs_storage_scheme`: Either `"http"` or `"https"` (the default). +#' - `sas_token`: Shared access signature (SAS) token for the storage account. +#' Cannot be used with `account key`. +#' - `tenant_id`: Tenant ID for Azure Active Directory authentication. Must +#' be provided with both `client_id` and `client_secret`. +#' #' @section Methods: #' #' - `path(x)`: Create a `SubTreeFileSystem` from the current `FileSystem` @@ -253,6 +278,10 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' (the default), 'ERROR', 'WARN', 'INFO', 'DEBUG' (recommended), 'TRACE', and #' 'OFF'. #' +#' On `AzureFileSystem`, passing no arguments for authentication uses the +#' `AzureDefaultCredential` for authentication, so that several authentication +#' types are tried until one succeeds. 
+#'
 #' @usage NULL
 #' @format NULL
 #' @docType class
@@ -655,10 +684,9 @@ AzureFileSystem <- R6Class(
   "AzureFileSystem",
   inherit = FileSystem
 )
 
-AzureFileSystem$create <- function(...) {
+AzureFileSystem$create <- function(account_name, ...) {
   options <- list(...)
   valid_opts <- c(
-    "account_name",
     "account_key",
     "blob_storage_authority",
     "blob_storage_scheme",
@@ -678,9 +706,6 @@ AzureFileSystem$create <- function(...) {
       call. = FALSE
     )
   }
-  if (is.null(options$account_name)) {
-    stop("Missing `account_name`", call. = FALSE)
-  }
   if (!is.null(options$tenant_id) || !is.null(options$client_id) || !is.null(options$client_secret)) {
     if (is.null(options$client_id)) {
       stop(
@@ -703,18 +728,18 @@ AzureFileSystem$create <- function(...) {
     )
   }
 
-  fs___AzureFileSystem__Make(options)
+  fs___AzureFileSystem__Make(c(account_name = account_name, options))
 }
 
 #' Connect to an Azure Blob Storage container
-#' 
-#' `az_conainer` is a convenience function to create an `AzureFileSystem` object
+#'
+#' `az_container` is a convenience function to create an `AzureFileSystem` object
 #' that provides a file system interface for blob storage containers in an Azure
 #' Storage Account.
-#' 
-#' @param container_path string Container name or path
-#' @param ... Additional connection options, passed to `AzureFileSystem$create()`
-#' 
+#'
+#' @param container_path string Container name or path.
+#' @param ... Additional connection options, passed to `AzureFileSystem$create()`.
+#'
 #' @return A `SubTreeFileSystem` containing an `AzureFileSystem` and the container's
 #'   relative path. Note that this function's success does not guarantee that you
 #'   are authorized to access the container's contents.
diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd
index eeccda31b04c..0cca6d3767d8 100644
--- a/r/man/FileSystem.Rd
+++ b/r/man/FileSystem.Rd
@@ -90,6 +90,33 @@ the filesystem encounters errors. Default is 15 seconds.
 \item \code{default_metadata}: default metadata to write in new objects.
 \item \code{project_id}: the project to use for creating buckets.
 
} + +\code{AzureFileSystem$create()} takes following required argument: +\itemize{ +\item \code{account_name}: Azure Blob Storage account name. +} + +\code{AzureFileSystem$create()} takes following optional arguments: +\itemize{ +\item \code{account_key}: Account key of the storage account. Cannot be used with +\code{sas_token}. +\item \code{blob_storage_authority}: Hostname of the blob service, defaulting to +\code{"blob.core.windows.net"}. +\item \code{blob_storage_scheme}: Either \code{"http"} or \code{"https"} (the default). +\item \code{client_id}: The client/application ID for Azure Active Directory +authentication. If used with \code{client_secret} and \code{tenant_id} then it is the +application ID for a registered Azure AD application. Otherwise, it is the +client ID of a user-assigned managed identity. +\item \code{client_secret}: Client secret for Azure Active Directory authentication. +Must be provided with both \code{client_id} and \code{tenant_id}. +\item \code{dfs_storage_authority}: Hostname of the data lake (gen 2) service, +defaulting to \code{"dfs.core.windows.net"}. +\item \code{dfs_storage_scheme}: Either \code{"http"} or \code{"https"} (the default). +\item \code{sas_token}: Shared access signature (SAS) token for the storage account. +Cannot be used with \verb{account key}. +\item \code{tenant_id}: Tenant ID for Azure Active Directory authentication. Must +be provided with both \code{client_id} and \code{client_secret}. +} } \section{Methods}{ @@ -162,5 +189,9 @@ environment variable \code{ARROW_S3_LOG_LEVEL} (e.g., to running any code that interacts with S3. Possible values include 'FATAL' (the default), 'ERROR', 'WARN', 'INFO', 'DEBUG' (recommended), 'TRACE', and 'OFF'. + +On \code{AzureFileSystem}, passing no arguments for authentication uses the +\code{AzureDefaultCredential} for authentication, so that several authentication +types are tried until one succeeds. 
} diff --git a/r/man/az_container.Rd b/r/man/az_container.Rd index da074b337b1b..a749b4a4e188 100644 --- a/r/man/az_container.Rd +++ b/r/man/az_container.Rd @@ -7,9 +7,9 @@ az_container(container_path, ...) } \arguments{ -\item{container_path}{string Container name or path} +\item{container_path}{string Container name or path.} -\item{...}{Additional connection options, passed to \code{AzureFileSystem$create()}} +\item{...}{Additional connection options, passed to \code{AzureFileSystem$create()}.} } \value{ A \code{SubTreeFileSystem} containing an \code{AzureFileSystem} and the container's diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index e9bc79102650..6a9fc6d5a0b6 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -502,8 +502,6 @@ cpp11::list fs___GcsFileSystem__options(const std::shared_ptr #endif -// TODO: Write the Rcpp function to interface with the AzureFileSystem class in -// arrow/filesystem/azurefs.h. #if defined(ARROW_R_WITH_AZURE) #include diff --git a/r/vignettes/fs.Rmd b/r/vignettes/fs.Rmd index 52652ad7e9ed..4c2138f693f8 100644 --- a/r/vignettes/fs.Rmd +++ b/r/vignettes/fs.Rmd @@ -1,29 +1,30 @@ --- -title: "Using cloud storage (S3, GCS)" +title: "Using cloud storage (S3, GCS, Azure)" description: > Learn how to work with data sets stored in an - Amazon S3 bucket or on Google Cloud Storage + Amazon S3 bucket, on Google Cloud Storage, or on Azure output: rmarkdown::html_vignette --- -Working with data stored in cloud storage systems like [Amazon Simple Storage Service](https://docs.aws.amazon.com/s3/) (S3) and [Google Cloud Storage](https://cloud.google.com/storage/docs) (GCS) is a very common task. Because of this, the Arrow C++ library provides a toolkit aimed to make it as simple to work with cloud storage as it is to work with the local filesystem. 
+Working with data stored in cloud storage systems like [Amazon Simple Storage Service](https://docs.aws.amazon.com/s3/) (S3), [Google Cloud Storage](https://cloud.google.com/storage/docs) (GCS), and [Microsoft Azure](https://azure.microsoft.com) is a very common task. Because of this, the Arrow C++ library provides a toolkit aimed to make it as simple to work with cloud storage as it is to work with the local filesystem. -To make this work, the Arrow C++ library contains a general-purpose interface for file systems, and the arrow package exposes this interface to R users. For instance, if you want to you can create a `LocalFileSystem` object that allows you to interact with the local file system in the usual ways: copying, moving, and deleting files, obtaining information about files and folders, and so on (see `help("FileSystem", package = "arrow")` for details). In general you probably don't need this functionality because you already have tools for working with your local file system, but this interface becomes much more useful in the context of remote file systems. Currently there is a specific implementation for Amazon S3 provided by the `S3FileSystem` class, and another one for Google Cloud Storage provided by `GcsFileSystem`. +To make this work, the Arrow C++ library contains a general-purpose interface for file systems, and the arrow package exposes this interface to R users. For instance, if you want to you can create a `LocalFileSystem` object that allows you to interact with the local file system in the usual ways: copying, moving, and deleting files, obtaining information about files and folders, and so on (see `help("FileSystem", package = "arrow")` for details). In general you probably don't need this functionality because you already have tools for working with your local file system, but this interface becomes much more useful in the context of remote file systems. 
Currently there is a specific implementation for Amazon S3 provided by the `S3FileSystem` class, one for Google Cloud Storage provided by `GcsFileSystem`, and another for Microsoft Azure provided by the `AzureFileSystem` class. -This article provides an overview of working with both S3 and GCS data using the Arrow toolkit. +This article provides an overview of working with S3, GCS, and Azure data using the Arrow toolkit. -## S3 and GCS support +## S3, GCS, and Azure support -Before you start, make sure that your arrow installation has support for S3 and/or GCS enabled. You can check whether support is enabled via helper functions: +Before you start, make sure that your arrow installation has support for S3, GCS, and/or Azure enabled. You can check whether support is enabled via helper functions: ```r arrow_with_s3() arrow_with_gcs() +arrow_with_azure() ``` If these return `TRUE` then the relevant support is enabled. -CRAN builds of arrow include S3 support but not GCS support. If you need GCS support, you can install arrow with full features using one of the following methods: +CRAN builds of arrow include S3 and Azure support but not GCS support. If you need GCS support, you can install arrow with full features using one of the following methods: ```r # Option 1: Install from R-universe @@ -36,15 +37,15 @@ Sys.setenv("NOT_CRAN" = "true") install.packages("arrow", type = "source") ``` -On Linux, S3 and GCS support is not always enabled by default when installing from source, and there are additional system requirements involved. See the [installation article](./install.html) for details. +On Linux, S3, GCS, and Azure support is not always enabled by default when installing from source, and there are additional system requirements involved. See the [installation article](./install.html) for details. ## Connecting to cloud storage One way of working with filesystems is to create `?FileSystem` objects. 
`?S3FileSystem` objects can be created with the `s3_bucket()` function, which automatically detects the bucket's AWS region. Similarly, `?GcsFileSystem` objects -can be created with the `gs_bucket()` function. The resulting -`FileSystem` will consider paths relative to the bucket's path (so for example +can be created with the `gs_bucket()` function and `?AzureFileSystem` objects can be created with the `az_container()` function. The resulting +`FileSystem` will consider paths relative to the bucket/container's path (so for example you don't need to prefix the bucket path when listing a directory). With a `FileSystem` object, you can point to specific files in it with the `$path()` method @@ -52,7 +53,7 @@ and pass the result to file readers and writers (`read_parquet()`, `write_feathe Often the reason users work with cloud storage in real world analysis is to access large data sets. An example of this is discussed in the [datasets article](./dataset.html), but new users may prefer to work with a much smaller data set while learning how the arrow cloud storage interface works. To that end, the examples in this article rely on a multi-file Parquet dataset that stores a copy of the `diamonds` data made available through the [`ggplot2`](https://ggplot2.tidyverse.org/) package, documented in `help("diamonds", package = "ggplot2")`. The cloud storage version of this data set consists of 5 Parquet files totaling less than 1MB in size. -The diamonds data set is hosted on both S3 and GCS, in a bucket named `arrow-datasets`. To create an S3FileSystem object that refers to that bucket, use the following command: +The diamonds data set is hosted on both S3 and GCS, in a bucket named `arrow-datasets`. 
To create an `S3FileSystem` object that refers to that bucket, use the following command: ```r bucket <- s3_bucket("arrow-datasets") @@ -147,7 +148,7 @@ june2019 <- SubTreeFileSystem$create("s3://arrow-datasets/nyc-taxi/year=2019/mon ## Connecting directly with a URI -In most use cases, the easiest and most natural way to connect to cloud storage in arrow is to use the FileSystem objects returned by `s3_bucket()` and `gs_bucket()`, especially when multiple file operations are required. However, in some cases you may want to download a file directly by specifying the URI. This is permitted by arrow, and functions like `read_parquet()`, `write_feather()`, `open_dataset()` etc will all accept URIs to cloud resources hosted on S3 or GCS. The format of an S3 URI is as follows: +In most use cases, the easiest and most natural way to connect to cloud storage in arrow is to use the FileSystem objects returned by `s3_bucket()`, `gs_bucket()`, and `az_container()`, especially when multiple file operations are required. However, in some cases you may want to download a file directly by specifying the URI. This is permitted by arrow, and functions like `read_parquet()`, `write_feather()`, `open_dataset()` etc will all accept URIs to cloud resources hosted on S3, GCS, or Azure. The format of an S3 URI is as follows: ``` s3://[access_key:secret_key@]bucket/path[?region=] @@ -160,6 +161,12 @@ gs://[access_key:secret_key@]bucket/path gs://anonymous@bucket/path ``` +For Azure, the URI format looks like this: + +``` +abfs://container@account_name.dfs.core.windows.net/path +``` + For example, the Parquet file storing the "good cut" diamonds that we downloaded earlier in the article is available on both S3 and CGS. 
The relevant URIs are as follows: ```r @@ -258,6 +265,21 @@ df <- read_parquet("gs://anonymous@arrow-datasets/diamonds/cut=Good/part-0.parqu +### Azure Authentication + +By default, `AzureFileSystem$create()` and `az_container()` use the [DefaultAzureCredential]( https://github.com/Azure/azure-sdk-for-cpp/blob/main/sdk/identity/azure-identity/README.md#defaultazurecredential) for authentication. This will try several different types of authentication, using the first one that succeeds. Like with GCS, a simple way to authenticate with Azure is to first use [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/?view=azure-cli-latest) to login and setup default credentials: + +``` +az login +``` + +It is possible to use other forms of authentication with Azure when calling `AzureFileSystem$create()` and `az_container()`. + +- Passing `client_id` on its own will use [`ManagedIdentityCredential`](https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/overview) to authenticate. +- Passing `client_id` with `tenant_id` and `client_secret` will use [`ClientSecretCredential`](https://learn.microsoft.com/en-us/entra/identity-platform/app-objects-and-service-principals?tabs=browser) to authenticate. +- Passing `sas_token` will use a shared access signature (SAS) token for the storage account. +- Passing `account_key` will use the account key for the storage account. + ## Using a proxy server If you need to use a proxy server to connect to an S3 bucket, you can provide @@ -329,10 +351,8 @@ variables, you can set environment variable `AWS_EC2_METADATA_DISABLED` to Sys.setenv(AWS_EC2_METADATA_DISABLED = TRUE) ``` - ## Further reading -- To learn more about `FileSystem` classes, including `S3FileSystem` and `GcsFileSystem`, see `help("FileSystem", package = "arrow")`. -- To see a data analysis example that relies on data hosted on cloud storage, see the [dataset article](./dataset.html). 
- +- To learn more about `FileSystem` classes, including `S3FileSystem`, `GcsFileSystem`, and `AzureFileSystem`, see `help("FileSystem", package = "arrow")`. +- To see a data analysis example that relies on data hosted on cloud storage, see the [dataset article](./dataset.html). From 4b443c0fd7fc53ec5981e52e27a7c4df4f698d55 Mon Sep 17 00:00:00 2001 From: Steve Martin Date: Tue, 17 Mar 2026 22:12:07 -0400 Subject: [PATCH 32/39] Updated installation vignettes to include Azure --- r/vignettes/developers/setup.Rmd | 2 ++ r/vignettes/install.Rmd | 25 +++++++++++++------------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/r/vignettes/developers/setup.Rmd b/r/vignettes/developers/setup.Rmd index e61436df31db..2432325f61bc 100644 --- a/r/vignettes/developers/setup.Rmd +++ b/r/vignettes/developers/setup.Rmd @@ -155,6 +155,7 @@ To enable optional features including: S3 support, an alternative memory allocat -DARROW_GCS=ON \ -DARROW_MIMALLOC=ON \ -DARROW_S3=ON \ + -DARROW_AZURE=ON \ -DARROW_WITH_BROTLI=ON \ -DARROW_WITH_BZ2=ON \ -DARROW_WITH_LZ4=ON \ @@ -228,6 +229,7 @@ cmake \ -DARROW_MIMALLOC=ON \ -DARROW_PARQUET=ON \ -DARROW_S3=ON \ + -DARROW_AZURE=ON \ -DARROW_WITH_BROTLI=ON \ -DARROW_WITH_BZ2=ON \ -DARROW_WITH_LZ4=ON \ diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index a058975ccf19..01955d6fdc72 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -32,13 +32,13 @@ exception, as it ships with gcc 4.8. 
### Libraries -Optional support for reading from cloud storage--AWS S3 and -Google Cloud Storage (GCS)--requires additional system dependencies: +Optional support for reading from cloud storage--AWS S3, +Google Cloud Storage (GCS), and Azure--requires additional system dependencies: * CURL: install `libcurl-devel` (rpm) or `libcurl4-openssl-dev` (deb) * OpenSSL >= 3.0: install `openssl-devel` (rpm) or `libssl-dev` (deb) -The prebuilt binaries come with S3 and GCS support enabled, so you will need to meet these system requirements in order to use them. If you're building everything from source, the install script will check for the presence of these dependencies and turn off S3 and GCS support in the build if the prerequisites are not met--installation will succeed but without S3 or GCS functionality. If afterwards you install the missing system requirements, you'll need to reinstall the package in order to enable S3 and GCS support. +The prebuilt binaries come with S3, GCS, and Azure support enabled, so you will need to meet these system requirements in order to use them. If you're building everything from source, the install script will check for the presence of these dependencies and turn off S3, GCS, and Azure support in the build if the prerequisites are not met--installation will succeed but without S3, GCS, or Azure functionality. If afterwards you install the missing system requirements, you'll need to reinstall the package in order to enable S3, GCS, and Azure support. ## Install release version (easy way) @@ -99,9 +99,9 @@ install.packages("arrow") This installs the source version of the R package, but during the installation process will check for compatible libarrow binaries that we host and use those if available. If no binary is available or can't be found, then this option falls back onto method 2 below (full source build), but setting the environment variable results in a more fully-featured build than default. 
-The libarrow binaries include support for AWS S3 and GCS, so they require the +The libarrow binaries include support for AWS S3, GCS, and Azure, so they require the libcurl and openssl libraries installed separately, as noted above. -If you don't have these installed, the libarrow binary won't be used, and you will fall back to the full source build (with S3 and GCS support disabled). +If you don't have these installed, the libarrow binary won't be used, and you will fall back to the full source build (with S3, GCS, and Azure support disabled). If the internet access of your computer doesn't allow downloading the libarrow binaries (e.g. if access is limited to CRAN), you can first identify the right source and version by trying to install on the offline computer: @@ -204,19 +204,19 @@ information about dependencies and minimum versions. If downloading dependencies at build time is not an option, as when building on a system that is disconnected or behind a firewall, there are a few options. See "Offline builds" below. -#### Dependencies for S3 and GCS support +#### Dependencies for S3, GCS, and Azure support -Support for working with data in S3 and GCS is not enabled in the default +Support for working with data in S3, GCS, and Azure is not enabled in the default source build, and it has additional system requirements as described above. To enable it, set the environment variable `LIBARROW_MINIMAL=false` or `NOT_CRAN=true` to choose the full-featured build, or more selectively set -`ARROW_S3=ON` and/or `ARROW_GCS=ON`. +`ARROW_S3=ON`, `ARROW_GCS=ON`, and/or `ARROW_AZURE=ON`. 
 
-When either feature is enabled, the install script will check for the presence
-of the required dependencies, and if the prerequisites are met, it will turn
-off S3 and GCS support--installation will succeed but without S3 or GCS
+When one of these features is enabled, the install script will check for the presence
+of the required dependencies, and if the prerequisites are not met, it will turn
+off S3, GCS, and Azure support--installation will succeed but without S3, GCS, or Azure
 functionality. If afterwards you install the missing system requirements,
-you'll need to reinstall the package in order to enable S3 and GCS support.
+you'll need to reinstall the package in order to enable S3, GCS, and Azure support.
 
 ### Advanced configuration
 
@@ -233,6 +233,7 @@ default values are shown below.
 | ---| --- | :-: |
 | `ARROW_S3` | S3 support (if dependencies are met)* | `OFF` |
 | `ARROW_GCS` | GCS support (if dependencies are met)* | `OFF` |
+| `ARROW_AZURE` | Azure support (if dependencies are met)* | `OFF` |
 | `ARROW_JEMALLOC` | The `jemalloc` memory allocator | `ON` |
 | `ARROW_MIMALLOC` | The `mimalloc` memory allocator | `ON` |
 | `ARROW_PARQUET` | | `ON` |

From e671f4a996339d1f4011cc48638fec0157c2bb65 Mon Sep 17 00:00:00 2001
From: Steve Martin
Date: Tue, 17 Mar 2026 22:59:48 -0400
Subject: [PATCH 33/39] Updated install scripts

---
 r/configure                  | 2 +-
 r/configure.win              | 5 ++++-
 r/inst/build_arrow_static.sh | 1 +
 r/tools/nixlibs.R            | 10 ++++++++--
 4 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/r/configure b/r/configure
index 69b2a392a8f6..973234ac6658 100755
--- a/r/configure
+++ b/r/configure
@@ -366,7 +366,7 @@ add_feature_flags () {
   if arrow_built_with ARROW_GCS; then
     PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_GCS"
   fi
-  if arrow_built_with ARROW_GCS || arrow_built_with ARROW_S3; then
+  if arrow_built_with ARROW_GCS || arrow_built_with ARROW_S3 || arrow_built_with ARROW_AZURE; then
     # If pkg-config is available it will handle this for us 
automatically SSL_LIBS_WITHOUT_PC="-lcurl -lssl -lcrypto" fi diff --git a/r/configure.win b/r/configure.win index 16c5ec1bee8d..67fcaf2feb44 100755 --- a/r/configure.win +++ b/r/configure.win @@ -187,10 +187,13 @@ add_feature_flags () { if arrow_built_with ARROW_S3; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_S3" fi + if arrow_built_with ARROW_AZURE; then + PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_AZURE" + fi if arrow_built_with ARROW_GCS; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_GCS" fi - if arrow_built_with ARROW_GCS || arrow_built_with ARROW_S3; then + if arrow_built_with ARROW_GCS || arrow_built_with ARROW_S3 || arrow_built_with ARROW_AZURE; then # If pkg-config is available it will handle this for us automatically SSL_LIBS_WITHOUT_PC="-lcurl -lssl -lcrypto" fi diff --git a/r/inst/build_arrow_static.sh b/r/inst/build_arrow_static.sh index 241994223d3e..fdc97c63a632 100755 --- a/r/inst/build_arrow_static.sh +++ b/r/inst/build_arrow_static.sh @@ -79,6 +79,7 @@ ${CMAKE} -DARROW_BOOST_USE_SHARED=OFF \ -Dlz4_SOURCE=${lz4_SOURCE:-} \ -DARROW_FILESYSTEM=ON \ -DARROW_GCS=${ARROW_GCS:-OFF} \ + -DARROW_AZURE=${ARROW_AZURE:-OFF} \ -DARROW_JEMALLOC=${ARROW_JEMALLOC:-$ARROW_DEFAULT_PARAM} \ -DARROW_MIMALLOC=${ARROW_MIMALLOC:-ON} \ -DARROW_JSON=${ARROW_JSON:-ON} \ diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 14edac96c60f..12634db4a6cc 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -800,6 +800,7 @@ turn_off_all_optional_features <- function(env_var_list) { "ARROW_DATASET" = "OFF", # depends on parquet "ARROW_S3" = "OFF", "ARROW_GCS" = "OFF", + "ARROW_AZURE" = "OFF", "ARROW_WITH_GOOGLE_CLOUD_CPP" = "OFF", "ARROW_WITH_NLOHMANN_JSON" = "OFF", "ARROW_SUBSTRAIT" = "OFF", @@ -887,13 +888,15 @@ is_feature_requested <- function(env_varname, env_var_list, default = env_is("LI with_cloud_support <- function(env_var_list) { arrow_s3 <- is_feature_requested("ARROW_S3", env_var_list) arrow_gcs <- 
is_feature_requested("ARROW_GCS", env_var_list)
+  arrow_azure <- is_feature_requested("ARROW_AZURE", env_var_list)
 
-  if (arrow_s3 || arrow_gcs) {
-    # User wants S3 or GCS support.
+  if (arrow_s3 || arrow_gcs || arrow_azure) {
+    # User wants S3 or GCS or Azure support.
     # Make sure that we have curl and openssl system libs
     feats <- c(
       if (arrow_s3) "S3",
       if (arrow_gcs) "GCS"
+      if (arrow_azure) "AZURE"
     )
     start_msg <- paste(feats, collapse = "/")
     off_flags <- paste("ARROW_", feats, "=OFF", sep = "", collapse = " and ")
@@ -908,16 +911,19 @@ with_cloud_support <- function(env_var_list) {
       print_warning("requires libcurl-devel (rpm) or libcurl4-openssl-dev (deb)")
       arrow_s3 <- FALSE
       arrow_gcs <- FALSE
+      arrow_azure <- FALSE
     } else if (!cmake_find_package("OpenSSL", "1.0.2", env_var_list)) {
       print_warning("requires version >= 1.0.2 of openssl-devel (rpm), libssl-dev (deb), or openssl (brew)")
       arrow_s3 <- FALSE
       arrow_gcs <- FALSE
+      arrow_azure <- FALSE
     }
   }
 
   # Update the build flags
   env_var_list <- replace(env_var_list, "ARROW_S3", ifelse(arrow_s3, "ON", "OFF"))
-  replace(env_var_list, "ARROW_GCS", ifelse(arrow_gcs, "ON", "OFF"))
+  env_var_list <- replace(env_var_list, "ARROW_GCS", ifelse(arrow_gcs, "ON", "OFF"))
+  replace(env_var_list, "ARROW_AZURE", ifelse(arrow_azure, "ON", "OFF"))
 }
 
 cmake_find_package <- function(pkg, version = NULL, env_var_list) {

From bb66e4008d6f943facb04a00a2d30dbc0f4f09c7 Mon Sep 17 00:00:00 2001
From: Collin Brown
Date: Wed, 18 Mar 2026 23:25:07 +0000
Subject: [PATCH 34/39] add tests for valid and invalid combinations of options to AzureFileSystem$create method

---
 r/tests/testthat/test-azure.R | 123 +++++++++++++++++++++++++++++++++-
 1 file changed, 122 insertions(+), 1 deletion(-)

diff --git a/r/tests/testthat/test-azure.R b/r/tests/testthat/test-azure.R
index 30e1647f649f..671b72a4627a 100644
--- a/r/tests/testthat/test-azure.R
+++ b/r/tests/testthat/test-azure.R
@@ -101,7 +101,7 @@ test_that("read/write Parquet on azure", {
 
 write_feather(example_data, fs$path(azure_path("openmulti/dataset1.feather")))
 write_feather(example_data, 
fs$path(azure_path("openmulti/dataset2.feather"))) -open_multi_fs = arrow:::az_container( +open_multi_fs <- arrow:::az_container( container_path = azure_path("openmulti"), account_name = azurite_account_name, account_key = azurite_account_key, @@ -120,3 +120,124 @@ test_that("open_dataset with AzureFileSystem folder", { rbind(example_data, example_data) |> arrange(int) ) }) + +# (5) Check that multiple valid combinations of options can be used to +# instantiate AzureFileSystem. + +fs1 <- AzureFileSystem$create(account_name = "fake-account-name") +expect_s3_class(fs1, "AzureFileSystem") + +fs2 <- AzureFileSystem$create(account_name = "fake-account-name", account_key = "fakeaccountkey") +expect_s3_class(fs2, "AzureFileSystem") + + +fs3 <- AzureFileSystem$create( + account_name = "fake-account", account_key = "fakeaccount", + blob_storage_authority = "fake-blob-authority", + dfs_storage_authority = "fake-dfs-authority", + blob_storage_scheme = "https", + dfs_storage_scheme = "https" +) +expect_s3_class(fs3, "AzureFileSystem") + +fs4 <- AzureFileSystem$create( + account_name = "fake-account-name", + sas_token = "fakesastoken" +) +expect_s3_class(fs4, "AzureFileSystem") + +fs5 <- AzureFileSystem$create( + account_name = "fake-account-name", + tenant_id = "fake-tenant-id", + client_id = "fake-client-id", + client_secret = "fake-client-secret" +) +expect_s3_class(fs5, "AzureFileSystem") + +fs6 <- AzureFileSystem$create( + account_name = "fake-account-name", + client_id = "fake-client-id" +) +expect_s3_class(fs6, "AzureFileSystem") + +# (6) Check that invalid argument combinations are caught upfront +# with appropriate error message. 
+ +error_msg_1 <- "`client_id` must be given with `tenant_id` and `client_secret`" +error_msg_2 <- "Provide only `client_id` to authenticate with Managed Identity Credential, or provide `client_id`, `tenant_id`, and`client_secret` to authenticate with Client Secret Credential" + +test_that("client_id must be specified with account_name and tenant_id", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + tenant_id = "fake-tenant-id" + ), + error_msg_1, + fixed = TRUE + ) +}) + +test_that("client_id must be specified with account_name and client_secret", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + client_secret = "fake-client-secret" + ), + error_msg_1, + fixed = TRUE + ) +}) + +test_that("client_secret must not be provided with client_id", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + client_id = "fake-client-id", + client_secret = "fake-client-secret" + ), + error_msg_2, + fixed = TRUE + ) +}) + +test_that("client_id must be specified with account_name, tenant_id, and client_secret", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + tenant_id = "fake-tenant-id", + client_secret = "fake-client-secret" + ), + error_msg_1, + fixed = TRUE + ) +}) + + +test_that("client_id must be provided alone or with tenant_id and client_secret", { + expect_error( + AzureFileSystem$create( + account_name = "fake-account-name", + tenant_id = "fake-tenant-id", + client_id = "fake-client-id" + ), + error_msg_2, + fixed = TRUE + ) +}) + +test_that("cannot specify both account_key and sas_token", { + expect_error( + AzureFileSystem$create(account_name='fake-account-name', account_key='fakeaccount', + sas_token='fakesastoken'), + "Cannot specify both `account_key` and `sas_token`", + fixed = TRUE + ) +}) + +test_that("at a minimum account_name must be passed", { + expect_error( + AzureFileSystem$create(), + "Missing `account_name`", + fixed = TRUE + 
) +}) From bc20a6d2db74a15cc2e3509117b91f1d689e7496 Mon Sep 17 00:00:00 2001 From: Steve Martin Date: Wed, 18 Mar 2026 21:24:05 -0400 Subject: [PATCH 35/39] Ran pre-commit hooks --- r/_pkgdown.yml | 3 ++- r/src/filesystem.cpp | 26 +++++++++++++++----------- r/tests/testthat/test-azure.R | 12 ++++++++---- r/tools/nixlibs.R | 2 +- 4 files changed, 26 insertions(+), 17 deletions(-) diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index 39700914db4b..5d9e432414b6 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -261,10 +261,11 @@ reference: - title: File systems desc: > - Functions for working with files on S3 and GCS + Functions for working with files on S3, GCS, and Azure contents: - s3_bucket - gs_bucket + - az_container - copy_files - title: Flight diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index 6a9fc6d5a0b6..8d5c199afa4f 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -513,30 +513,35 @@ std::shared_ptr fs___AzureFileSystem__Make(cpp11::list opti azure_opts.account_name = cpp11::as_cpp(options["account_name"]); if (!Rf_isNull(options["blob_storage_authority"])) { - azure_opts.blob_storage_authority = cpp11::as_cpp(options["blob_storage_authority"]); + azure_opts.blob_storage_authority = + cpp11::as_cpp(options["blob_storage_authority"]); } if (!Rf_isNull(options["dfs_storage_authority"])) { - azure_opts.dfs_storage_authority = cpp11::as_cpp(options["dfs_storage_authority"]); + azure_opts.dfs_storage_authority = + cpp11::as_cpp(options["dfs_storage_authority"]); } if (!Rf_isNull(options["blob_storage_scheme"])) { - azure_opts.blob_storage_scheme = cpp11::as_cpp(options["blob_storage_scheme"]); + azure_opts.blob_storage_scheme = + cpp11::as_cpp(options["blob_storage_scheme"]); } if (!Rf_isNull(options["dfs_storage_scheme"])) { - azure_opts.dfs_storage_scheme = cpp11::as_cpp(options["dfs_storage_scheme"]); + azure_opts.dfs_storage_scheme = + cpp11::as_cpp(options["dfs_storage_scheme"]); } if (!Rf_isNull(options["client_id"])) { if 
(Rf_isNull(options["tenant_id"]) && Rf_isNull(options["client_secret"])) { - azure_opts.ConfigureManagedIdentityCredential(cpp11::as_cpp(options["client_id"])); + azure_opts.ConfigureManagedIdentityCredential( + cpp11::as_cpp(options["client_id"])); } else if (!Rf_isNull(options["tenant_id"]) && !Rf_isNull(options["client_secret"])) { azure_opts.ConfigureClientSecretCredential( - cpp11::as_cpp(options["tenant_id"]), - cpp11::as_cpp(options["client_id"]), - cpp11::as_cpp(options["client_secret"]) - ); + cpp11::as_cpp(options["tenant_id"]), + cpp11::as_cpp(options["client_id"]), + cpp11::as_cpp(options["client_secret"])); } } else if (!Rf_isNull(options["account_key"])) { - azure_opts.ConfigureAccountKeyCredential(cpp11::as_cpp(options["account_key"])); + azure_opts.ConfigureAccountKeyCredential( + cpp11::as_cpp(options["account_key"])); } else if (!Rf_isNull(options["sas_token"])) { azure_opts.ConfigureSASCredential(cpp11::as_cpp(options["sas_token"])); } else { @@ -545,7 +550,6 @@ std::shared_ptr fs___AzureFileSystem__Make(cpp11::list opti auto io_context = MainRThread::GetInstance().CancellableIOContext(); return ValueOrStop(fs::AzureFileSystem::Make(azure_opts, io_context)); - } #endif diff --git a/r/tests/testthat/test-azure.R b/r/tests/testthat/test-azure.R index 671b72a4627a..44721ba1ef66 100644 --- a/r/tests/testthat/test-azure.R +++ b/r/tests/testthat/test-azure.R @@ -132,7 +132,8 @@ expect_s3_class(fs2, "AzureFileSystem") fs3 <- AzureFileSystem$create( - account_name = "fake-account", account_key = "fakeaccount", + account_name = "fake-account", + account_key = "fakeaccount", blob_storage_authority = "fake-blob-authority", dfs_storage_authority = "fake-dfs-authority", blob_storage_scheme = "https", @@ -164,7 +165,7 @@ expect_s3_class(fs6, "AzureFileSystem") # with appropriate error message. 
error_msg_1 <- "`client_id` must be given with `tenant_id` and `client_secret`" -error_msg_2 <- "Provide only `client_id` to authenticate with Managed Identity Credential, or provide `client_id`, `tenant_id`, and`client_secret` to authenticate with Client Secret Credential" +error_msg_2 <- "Provide only `client_id` to authenticate with Managed Identity Credential, or provide `client_id`, `tenant_id`, and`client_secret` to authenticate with Client Secret Credential" # nolint test_that("client_id must be specified with account_name and tenant_id", { expect_error( @@ -227,8 +228,11 @@ test_that("client_id must be provided alone or with tenant_id and client_secret" test_that("cannot specify both account_key and sas_token", { expect_error( - AzureFileSystem$create(account_name='fake-account-name', account_key='fakeaccount', - sas_token='fakesastoken'), + AzureFileSystem$create( + account_name = "fake-account-name", + account_key = "fakeaccount", + sas_token = "fakesastoken" + ), "Cannot specify both `account_key` and `sas_token`", fixed = TRUE ) diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 12634db4a6cc..4b3c8691e237 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -895,7 +895,7 @@ with_cloud_support <- function(env_var_list) { # Make sure that we have curl and openssl system libs feats <- c( if (arrow_s3) "S3", - if (arrow_gcs) "GCS" + if (arrow_gcs) "GCS", if (arrow_azure) "AZURE" ) start_msg <- paste(feats, collapse = "/") From 99a85989f65e60164cd453801729bc6fcc4de35b Mon Sep 17 00:00:00 2001 From: Steve Martin Date: Wed, 18 Mar 2026 23:13:55 -0400 Subject: [PATCH 36/39] Removed tmp.md --- r/tmp.md | 96 -------------------------------------------------------- 1 file changed, 96 deletions(-) delete mode 100644 r/tmp.md diff --git a/r/tmp.md b/r/tmp.md deleted file mode 100644 index c7c7118779ba..000000000000 --- a/r/tmp.md +++ /dev/null @@ -1,96 +0,0 @@ -# Temporary development notes - -> TODO: Remove this before we open a PR to upstream arrow 
library. - -## Using codegen.R - -1. Install repo dependencies in `arrow/r`: `install.packages("remotes")`, then `remotes::install_deps(dependencies = TRUE)` - -2. Rscript `data-raw/codegen.R` - -The second step auto-generates stubs in `arrowExports.R` and `arrowExports.cpp` based on which C++ functions have `// [[arrow::export]]` comments above them. - -**Note**: at the moment we need to run `export ARROW_R_WITH_AZUREFS=true` before `R CMD INSTALL .` to export the environment variable that "forces" the Azure build flag. - -## Using Azurite - -`docker run -p 10000:10000 -p 10001:10001 -p 10002:10002 mcr.microsoft.com/azure-storage/azurite` (see README.md in https://github.com/Azure/Azurite) - -## Build troubleshooting continued - -```bash -export ARROW_HOME=/workspaces/arrow/dist - -cmake \ - -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ - -DCMAKE_INSTALL_LIBDIR=lib \ - -DARROW_COMPUTE=ON \ - -DARROW_CSV=ON \ - -DARROW_DATASET=ON \ - -DARROW_EXTRA_ERROR_CONTEXT=ON \ - -DARROW_FILESYSTEM=ON \ - -DARROW_INSTALL_NAME_RPATH=OFF \ - -DARROW_JEMALLOC=ON \ - -DARROW_JSON=ON \ - -DARROW_PARQUET=ON \ - -DARROW_WITH_SNAPPY=ON \ - -DARROW_WITH_ZLIB=ON \ - -DARROW_AZURE=ON \ - .. - -cmake --build . --target install -j8 - - -R -e 'install.packages("remotes"); remotes::install_deps(dependencies = TRUE)' - -R CMD INSTALL --no-multiarch . - -# ---------------------- - - -# Try building from source via R with the relevant env vars set for feature flags. 
- -# Core Build Settings -export LIBARROW_MINIMAL=false -export FORCE_BUNDLED_BUILD=true -export BOOST_SOURCE=BUNDLED - -# Feature Toggles -export ARROW_COMPUTE=ON -export ARROW_CSV=ON -export ARROW_DATASET=ON -export ARROW_EXTRA_ERROR_CONTEXT=ON -export ARROW_FILESYSTEM=ON -export ARROW_JEMALLOC=ON -export ARROW_JSON=ON -export ARROW_PARQUET=ON -export ARROW_AZURE=ON - -# Visibility into build -export ARROW_R_DEV=TRUE - -# Use multiple available cores -export MAKEFLAGS="-j8" - -# Compression Codecs -export ARROW_WITH_SNAPPY=ON -export ARROW_WITH_ZLIB=ON - -# Library Linkage -export ARROW_BUILD_STATIC=ON -export ARROW_BUILD_SHARED=OFF - -# For R-specific behavior (replaces CMAKE_INSTALL_LIBDIR=lib) -export LIBARROW_BINARY=false - -export EXTRA_CMAKE_FLAGS="-DARROW_INSTALL_NAME_RPATH=OFF -DARROW_AZURE=ON -DCMAKE_SHARED_LINKER_FLAGS=-lxml2" - -export PKG_CONFIG_PATH="/usr/lib/x86_64-linux-gnu/pkgconfig" -export LDFLAGS=$(pkg-config --libs libxml-2.0) -export PKG_LIBS=$(pkg-config --libs libxml-2.0) - -# export LIBARROW_EXTERNAL_LIBDIR=/workspaces/arrow/r/libarrow - -R CMD INSTALL . 
--preclean - -``` \ No newline at end of file From 942d3fc0f18403f5bbf4fcc6db679706f528d422 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Thu, 19 Mar 2026 14:30:44 +0000 Subject: [PATCH 37/39] wrap credential configuration methods with StopIfNotOk --- r/src/filesystem.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index 8d5c199afa4f..774139f3d80b 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -531,21 +531,21 @@ std::shared_ptr fs___AzureFileSystem__Make(cpp11::list opti if (!Rf_isNull(options["client_id"])) { if (Rf_isNull(options["tenant_id"]) && Rf_isNull(options["client_secret"])) { - azure_opts.ConfigureManagedIdentityCredential( - cpp11::as_cpp(options["client_id"])); + StopIfNotOk(azure_opts.ConfigureManagedIdentityCredential( + cpp11::as_cpp(options["client_id"]))); } else if (!Rf_isNull(options["tenant_id"]) && !Rf_isNull(options["client_secret"])) { - azure_opts.ConfigureClientSecretCredential( + StopIfNotOk(azure_opts.ConfigureClientSecretCredential( cpp11::as_cpp(options["tenant_id"]), cpp11::as_cpp(options["client_id"]), - cpp11::as_cpp(options["client_secret"])); + cpp11::as_cpp(options["client_secret"]))); } } else if (!Rf_isNull(options["account_key"])) { - azure_opts.ConfigureAccountKeyCredential( - cpp11::as_cpp(options["account_key"])); + StopIfNotOk(azure_opts.ConfigureAccountKeyCredential( + cpp11::as_cpp(options["account_key"]))); } else if (!Rf_isNull(options["sas_token"])) { - azure_opts.ConfigureSASCredential(cpp11::as_cpp(options["sas_token"])); + StopIfNotOk(azure_opts.ConfigureSASCredential(cpp11::as_cpp(options["sas_token"]))); } else { - azure_opts.ConfigureDefaultCredential(); + StopIfNotOk(azure_opts.ConfigureDefaultCredential()); } auto io_context = MainRThread::GetInstance().CancellableIOContext(); From f079ac04799efd3990a6f3b594baffd19c88a4e1 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Thu, 19 Mar 2026 14:53:05 +0000 
Subject: [PATCH 38/39] move link flags to arrow_built_with ARROW_AZURE block in configure script --- r/configure | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/configure b/r/configure index 973234ac6658..7642b9b9dd3f 100755 --- a/r/configure +++ b/r/configure @@ -271,8 +271,7 @@ set_pkg_vars_with_pc () { PKG_CFLAGS="`${PKG_CONFIG} --cflags ${pkg_config_names}` $PKG_CFLAGS" PKG_CFLAGS="$PKG_CFLAGS $PKG_CFLAGS_FEATURES" PKG_LIBS=`${PKG_CONFIG} --libs-only-l --libs-only-other ${pkg_config_names}` - # TODO: Figure out how to pass these link flags properly, need this temporarily to get R to link properly. - PKG_LIBS="$PKG_LIBS $PKG_LIBS_FEATURES -lcurl -lxml2 -lssl" + PKG_LIBS="$PKG_LIBS $PKG_LIBS_FEATURES" PKG_DIRS=`${PKG_CONFIG} --libs-only-L ${pkg_config_names}` } @@ -362,6 +361,7 @@ add_feature_flags () { fi if arrow_built_with ARROW_AZURE; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_AZURE" + PKG_LIBS_FEATURES="$PKG_LIBS_FEATURES -lcurl -lxml2 -lssl" fi if arrow_built_with ARROW_GCS; then PKG_CFLAGS_FEATURES="$PKG_CFLAGS_FEATURES -DARROW_R_WITH_GCS" From 0e4e2e24ca1af352c1a30aab542459d58fb13ed2 Mon Sep 17 00:00:00 2001 From: Collin Brown Date: Thu, 19 Mar 2026 14:53:44 +0000 Subject: [PATCH 39/39] fix error message to check in test for empty call to AzureFileSystem$create() --- r/tests/testthat/test-azure.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-azure.R b/r/tests/testthat/test-azure.R index 44721ba1ef66..378444791981 100644 --- a/r/tests/testthat/test-azure.R +++ b/r/tests/testthat/test-azure.R @@ -241,7 +241,7 @@ test_that("cannot specify both account_key and sas_token", { test_that("at a minimum account_name must be passed", { expect_error( AzureFileSystem$create(), - "Missing `account_name`", + 'argument "account_name" is missing, with no default', fixed = TRUE ) })