Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
df5ccf5
Fix build on s390x when training is disabled
AlekseiNikiforovIBM Feb 17, 2026
45d7e7c
Fix byteswapping raw data stored in bigger data types
AlekseiNikiforovIBM Feb 18, 2026
b372cf0
Add byteswapping in all overloads of SetRawDataInTensorProto
AlekseiNikiforovIBM Feb 19, 2026
29374ef
Fix SparseTensorConversionTests.TestConstantNodeConversion test on s390x
AlekseiNikiforovIBM Feb 18, 2026
ce9d15b
Fix byte order in SparsifyGeneric function
AlekseiNikiforovIBM Feb 19, 2026
152eda2
Fix test SparseTensorConversionTests.TestDenseToSparseConversion on s…
AlekseiNikiforovIBM Feb 19, 2026
6c1c9d1
Remove excessive byteswapping in Graph::Graph
AlekseiNikiforovIBM Feb 19, 2026
ce495f8
Remove big-endian-specific exceptions
AlekseiNikiforovIBM Feb 20, 2026
eda4d90
Byteswap dimensions obtained from flatbuffer structures
AlekseiNikiforovIBM Feb 20, 2026
6e5ce60
Add byteswapping in CopyTensorDataToByteSpan function
AlekseiNikiforovIBM Feb 23, 2026
c513bf3
Add byteswap after calling set_raw_data in various functions in Megat…
AlekseiNikiforovIBM Feb 23, 2026
c307fbf
Replace most TensorProto::set_raw_data calls with SetRawDataInTensorP…
AlekseiNikiforovIBM Feb 24, 2026
e3aa9c3
Fix unpacking raw data in tests
AlekseiNikiforovIBM Feb 24, 2026
11d437b
Apply lint fixes
AlekseiNikiforovIBM Feb 24, 2026
f940d67
Byteswap external memory data too when saving ort model
AlekseiNikiforovIBM Mar 5, 2026
3ace82c
Byteswap data when saving it to external file
AlekseiNikiforovIBM Mar 5, 2026
bd93518
Don't do excessive byteswapping in UnpackTensorWithExternalDataImpl f…
AlekseiNikiforovIBM Mar 6, 2026
b8042be
Write test file in little endian in SparseTensorConversionTests.TestC…
AlekseiNikiforovIBM Mar 6, 2026
c66921b
Move byteswapping to saving tensor to file
AlekseiNikiforovIBM Mar 6, 2026
1580f21
Fix byteswapping in TensorProtoWithExternalDataToTensorProto
AlekseiNikiforovIBM Mar 17, 2026
2cdaf92
Add more data types in GetElementSizeInTensorProto function
AlekseiNikiforovIBM Mar 17, 2026
18c5c3a
Fix byte order in SaveOrtTensorOrtFormat
AlekseiNikiforovIBM Mar 19, 2026
93eae14
Split utils::kTensorProtoMemoryAddressTag
AlekseiNikiforovIBM Mar 20, 2026
a5fd873
Replace WriteLittleEndian with ReadLittleEndian in GetExtDataFromTens…
AlekseiNikiforovIBM Mar 24, 2026
41d0d19
Move common unpacking code in SaveOrtTensorOrtFormat into a lambda
AlekseiNikiforovIBM Mar 24, 2026
38983f1
Add guards against onnxruntime::utils::GetElementSizeOfTensor returni…
AlekseiNikiforovIBM Mar 25, 2026
6e6e743
Add missing headers
AlekseiNikiforovIBM Mar 26, 2026
692ebf7
Add missing casts when creating gls::span
AlekseiNikiforovIBM Mar 27, 2026
28bbd3d
Add missing header file and type cast
AlekseiNikiforovIBM Mar 30, 2026
b84ba98
Add missing namespace when using endian::native and endian::little
AlekseiNikiforovIBM Mar 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -901,6 +901,14 @@ if(NOT IOS)

list(REMOVE_ITEM onnx_test_runner_common_srcs ${onnx_test_runner_src_dir}/main.cc)

# If training is disabled, endian_utils is still needed by the tests
if (NOT onnxruntime_ENABLE_TRAINING)
list(APPEND onnx_test_runner_common_srcs
${ONNXRUNTIME_ROOT}/core/framework/endian_utils.cc
${ONNXRUNTIME_ROOT}/core/framework/endian_utils.h
)
endif ()

onnxruntime_add_static_library(onnx_test_runner_common ${onnx_test_runner_common_srcs})
if(MSVC)
target_compile_options(onnx_test_runner_common PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /utf-8>"
Expand Down
6 changes: 6 additions & 0 deletions onnxruntime/core/framework/endian_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,5 +83,11 @@ common::Status ReadLittleEndian(size_t element_size,
return detail::CopyLittleEndian(element_size, source_bytes, destination_bytes);
}

// Writes element data to a little-endian destination buffer.
// Forwards to detail::CopyLittleEndian, mirroring ReadLittleEndian above:
// the copy is symmetric, swapping the bytes of each element_size-byte
// element only when the native byte order is not little-endian.
common::Status WriteLittleEndian(size_t element_size,
gsl::span<const unsigned char> source_bytes,
gsl::span<unsigned char> destination_bytes) {
return detail::CopyLittleEndian(element_size, source_bytes, destination_bytes);
}

} // namespace utils
} // namespace onnxruntime
9 changes: 8 additions & 1 deletion onnxruntime/core/framework/endian_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,21 @@ common::Status ReadLittleEndian(gsl::span<const unsigned char> source_bytes, gsl
return ReadLittleEndian(sizeof(T), source_bytes, destination_bytes);
}

/**
* Writes to a little-endian destination.
*/
common::Status WriteLittleEndian(size_t element_size,
gsl::span<const unsigned char> source_bytes,
gsl::span<unsigned char> destination_bytes);

/**
* Writes to a little-endian destination.
*/
template <typename T>
common::Status WriteLittleEndian(gsl::span<const T> source, gsl::span<unsigned char> destination_bytes) {
static_assert(std::is_trivially_copyable<T>::value, "T must be trivially copyable");
const auto source_bytes = gsl::make_span(reinterpret_cast<const unsigned char*>(source.data()), source.size_bytes());
return detail::CopyLittleEndian(sizeof(T), source_bytes, destination_bytes);
return WriteLittleEndian(sizeof(T), source_bytes, destination_bytes);
}

} // namespace utils
Expand Down
181 changes: 133 additions & 48 deletions onnxruntime/core/framework/tensorprotoutils.cc

Large diffs are not rendered by default.

28 changes: 25 additions & 3 deletions onnxruntime/core/framework/tensorprotoutils.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,17 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto,
*/
void ConvertRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto);

/**
* This function is used to get element size of tensor data.
*
 * For complex types, it returns the size of a single component element of the complex value.
*
* It will be used mostly to convert data on big endian systems
* after unpacking data.
* @param tensor_data_type tensor data type to get element size from
*/
size_t GetElementSizeOfTensor(ONNX_NAMESPACE::TensorProto_DataType tensor_data_type);

/**
* Wrapper function for set_raw_data.
* First calls the set_raw_data and then calls ConvertRawDataInTensorProto
Expand Down Expand Up @@ -156,7 +167,7 @@ common::Status CreateTensorFromTensorProto(const Env& env, const std::filesystem

/// The threshold for small tensors. If the size of the tensor is LE to this value,
/// The data will stay in the TensorProto. Otherwise, the data will be moved to a Tensor instance
/// and TensorProto will contain a kTensorProtoMemoryAddressTag reference as a result of
/// and TensorProto will contain a kTensorProtoNativeEndianMemoryAddressTag reference as a result of
/// TensorToTensorProto() below. This is because shape inferencing code in onnx for
/// like Reshape parses weights data and it needs to be in the TensorProto.
/// The value of 127 was chosen empirically to be the smallest value that is required
Expand All @@ -177,7 +188,7 @@ constexpr const size_t kMaxEmbeddedInitializerSizeInBytes = size_t{2} * 1024 * 1
* @param[in] tensor the Tensor whose data and shape will be used to create the TensorProto.
* @param[in] tensor_proto_name the name of the TensorProto.
* @param[in] use_tensor_buffer the tensor proto is set to use external location, with
* 'location' set to onnxruntime::utils::kTensorProtoMemoryAddressTag
* 'location' set to onnxruntime::utils::kTensorProtoNativeEndianMemoryAddressTag
* 'offset' set to tensor's memory location, and 'length' set to tensor's
* memory size. The caller is responsible to maintain the lifetime of
* the allocated memory buffer. Use with caution.
Expand Down Expand Up @@ -215,8 +226,19 @@ common::Status ValidateEmbeddedTensorProtoDataSizeAndShape(const ONNX_NAMESPACE:
Special marker used to indicate an existing memory buffer contains the TensorProto external data.
If the 'location' field of the external data info is set to this marker, the 'offset' field should contain the
address of the memory containing the data.

This marker is used when data is always in little endian format.
*/
constexpr const ORTCHAR_T* kTensorProtoLittleEndianMemoryAddressTag = ORT_TSTR("*/_ORT_MEM_ADDR_/*");

/**
Special marker used to indicate an existing memory buffer contains the TensorProto external data.
If the 'location' field of the external data info is set to this marker, the 'offset' field should contain the
address of the memory containing the data.

This marker is used when data is in native endian format, i.e. big endian on big endian systems.
*/
constexpr const ORTCHAR_T* kTensorProtoMemoryAddressTag = ORT_TSTR("*/_ORT_MEM_ADDR_/*");
constexpr const ORTCHAR_T* kTensorProtoNativeEndianMemoryAddressTag = ORT_TSTR("*/_ORT_NATIVE_ENDIAN_MEM_ADDR_/*");

/// <summary>
/// Creates a OrtValue with a tensor on top of the external data.
Expand Down
25 changes: 14 additions & 11 deletions onnxruntime/core/graph/graph.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1242,15 +1242,6 @@ Graph::Graph(const Model& owning_model,

const gsl::not_null<TensorProto*> tensor{graph_proto_->add_initializer()};
ORT_THROW_IF_ERROR(utils::ConstantNodeProtoToTensorProto(node, model_path, *tensor));
if constexpr (endian::native != endian::little) {
const AttributeProto& attrib = node.attribute(0);
if (attrib.type() == AttributeProto_AttributeType_SPARSE_TENSOR) {
const TensorProto& sparse_values = node.attribute(0).sparse_tensor().values();
if ((!(sparse_values.has_raw_data())) && utils::HasRawData(*tensor)) {
onnxruntime::utils::ConvertRawDataInTensorProto(*tensor);
}
}
}

// Ensure initializers are also graph inputs.
if (ir_version_ < 4) {
Expand Down Expand Up @@ -4901,6 +4892,18 @@ Status Graph::AddExternalInitializersToGraphProtoImpl(
std::vector<uint8_t> raw_data;
ORT_RETURN_IF_ERROR(utils::UnpackInitializerData(initializer, model_path, raw_data));
size_t tensor_bytes_size = raw_data.size();

// Convert the data to little endian before saving it to the file
if constexpr (endian::native != endian::little) {
size_t element_size = onnxruntime::utils::GetElementSizeOfTensor(static_cast<ONNX_NAMESPACE::TensorProto_DataType>(initializer.data_type()));

if (element_size > 1) {
onnxruntime::utils::SwapByteOrderInplace(
element_size,
gsl::make_span(reinterpret_cast<std::byte*>(raw_data.data()), tensor_bytes_size));
}
}

if (model_saving_options.force_embed_external_ini ||
tensor_bytes_size < model_saving_options.initializer_size_threshold) {
*output_proto = initializer;
Expand Down Expand Up @@ -6655,13 +6658,13 @@ Status Graph::LoadFromModelEditorApiModel(const OrtGraph& api_graph, bool updati
const void* data_offset = t.DataRaw(); // address of memory not offset into file
auto offset = narrow<ExternalDataInfo::OFFSET_TYPE>(reinterpret_cast<intptr_t>(data_offset));

ExternalDataInfo::SetExternalLocationToProto(onnxruntime::utils::kTensorProtoMemoryAddressTag,
ExternalDataInfo::SetExternalLocationToProto(onnxruntime::utils::kTensorProtoNativeEndianMemoryAddressTag,
offset, t.SizeInBytes(), tensor_proto);

// add OrtValue to ortvalue_initializers_ to keep it alive and to store the deleter if provided.
ortvalue_initializers_.emplace(name, std::move(v));
} else {
tensor_proto.set_raw_data(t.DataRaw(), t.SizeInBytes());
onnxruntime::utils::SetRawDataInTensorProto(tensor_proto, t.DataRaw(), t.SizeInBytes());
}

TypeProto type_proto{utils::TypeProtoFromTensorProto(tensor_proto)};
Expand Down
78 changes: 63 additions & 15 deletions onnxruntime/core/graph/graph_flatbuffers_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,19 @@
string_data = builder.CreateVectorOfStrings(string_data_vec);
} else {
std::vector<uint8_t> unpacked_tensor;
// We cannot convert this in place, because the session may be used
// after the model was saved in ORT format. If the session continues to be used,
// its in-memory initializers would be left with the wrong endianness
ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(initializer, model_path, unpacked_tensor));

// We cannot convert the data before unpacking it, because
// external data is not converted by the ConvertRawDataInTensorProto function.
// Instead, convert the data after unpacking it.
if constexpr (endian::native != endian::little) {
auto be_copy{initializer};
onnxruntime::utils::ConvertRawDataInTensorProto(be_copy);
ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(be_copy, model_path, unpacked_tensor));
} else {
ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(initializer, model_path, unpacked_tensor));
size_t element_size = onnxruntime::utils::GetElementSizeOfTensor(static_cast<ONNX_NAMESPACE::TensorProto_DataType>(initializer.data_type()));

if (element_size > 1) {
onnxruntime::utils::SwapByteOrderInplace(
element_size,
gsl::make_span(reinterpret_cast<std::byte*>(unpacked_tensor.data()), unpacked_tensor.size()));
}
}

if (external_writer && unpacked_tensor.size() >= kMinimumSizeForExternalData) {
Expand Down Expand Up @@ -316,7 +320,7 @@
// high bit, but that should be unlikely in a scenario where we care about memory usage enough to use this path.
auto offset = narrow<ExternalDataInfo::OFFSET_TYPE>(reinterpret_cast<intptr_t>(data_offset));

ExternalDataInfo::SetExternalLocationToProto(onnxruntime::utils::kTensorProtoMemoryAddressTag,
ExternalDataInfo::SetExternalLocationToProto(onnxruntime::utils::kTensorProtoLittleEndianMemoryAddressTag,
offset, fbs_raw_data->size(), initializer);

} else {
Expand Down Expand Up @@ -473,9 +477,31 @@
// To avoid issues with vtable offsets, raw_data fbs::vector must be constructed before the TensorBuilder begins
// building the tensor. See flatbuffer_builder.h's NotNested() function for more details.
flatbuffers::Offset<flatbuffers::Vector<uint8_t>> raw_data;

auto unpack_tensor_data_be = [&ort_tensor](std::vector<uint8_t>& unpacked_tensor_data) -> Status {
unpacked_tensor_data.resize(ort_tensor.SizeInBytes());

size_t element_size = onnxruntime::utils::GetElementSizeOfTensor(static_cast<ONNX_NAMESPACE::TensorProto_DataType>(ort_tensor.GetElementType()));
auto src_span = gsl::make_span(reinterpret_cast<const unsigned char*>(ort_tensor.DataRaw()), ort_tensor.SizeInBytes());
auto dst_span = gsl::make_span(reinterpret_cast<unsigned char*>(unpacked_tensor_data.data()), unpacked_tensor_data.size());

// If element size is unknown, set it to 1 to disable byteswapping
if (element_size < 1) element_size = 1;

return onnxruntime::utils::WriteLittleEndian(element_size, src_span, dst_span);
};

if (!external_data_writer) {
raw_data = builder.CreateVector(static_cast<const uint8_t*>(ort_tensor.DataRaw()),
ort_tensor.SizeInBytes());
if constexpr (endian::native != endian::little) {
std::vector<uint8_t> unpacked_tensor;

ORT_RETURN_IF_ERROR(unpack_tensor_data_be(unpacked_tensor));

raw_data = builder.CreateVector(unpacked_tensor.data(), unpacked_tensor.size());
} else {
raw_data = builder.CreateVector(static_cast<const uint8_t*>(ort_tensor.DataRaw()),
ort_tensor.SizeInBytes());
}
}

fbs::TensorBuilder tb(builder);
Expand All @@ -485,8 +511,17 @@
tb.add_data_type(static_cast<fbs::TensorDataType>(ort_tensor.GetElementType()));
if (external_data_writer) {
uint64_t offset = 0;
gsl::span<const uint8_t> ort_tensor_data_span(static_cast<const uint8_t*>(ort_tensor.DataRaw()), ort_tensor.SizeInBytes());
ORT_RETURN_IF_ERROR(external_data_writer(ort_tensor.GetElementType(), ort_tensor_data_span, offset));
if constexpr (endian::native != endian::little) {
std::vector<uint8_t> unpacked_tensor;

ORT_RETURN_IF_ERROR(unpack_tensor_data_be(unpacked_tensor));

gsl::span<const uint8_t> ort_tensor_data_span(static_cast<const uint8_t*>(unpacked_tensor.data()), unpacked_tensor.size());
ORT_RETURN_IF_ERROR(external_data_writer(ort_tensor.GetElementType(), ort_tensor_data_span, offset));
} else {
gsl::span<const uint8_t> ort_tensor_data_span(static_cast<const uint8_t*>(ort_tensor.DataRaw()), ort_tensor.SizeInBytes());
ORT_RETURN_IF_ERROR(external_data_writer(ort_tensor.GetElementType(), ort_tensor_data_span, offset));
}
int64_t external_data_offset = onnxruntime::narrow<int64_t>(offset);
tb.add_external_data_offset(external_data_offset);
} else {
Expand Down Expand Up @@ -546,8 +581,21 @@
const DataTypeImpl* tensor_dtype = DataTypeImpl::TensorTypeFromONNXEnum(
tensor_data_type)
->GetElementType();
ort_tensor = onnxruntime::Tensor(
tensor_dtype, TensorShape(tensor_dims->data(), tensor_dims->size()), allocator);

if constexpr (endian::native != endian::little) {
std::vector<typename std::remove_reference_t<decltype(*tensor_dims)>::return_type> byteswapped_data;

Check warning on line 586 in onnxruntime/core/graph/graph_flatbuffers_utils.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <vector> for vector<> [build/include_what_you_use] [4] Raw Output: onnxruntime/core/graph/graph_flatbuffers_utils.cc:586: Add #include <vector> for vector<> [build/include_what_you_use] [4]
byteswapped_data.resize(tensor_dims->size());

for (size_t i = 0; i < tensor_dims->size(); ++i) {
byteswapped_data[i] = tensor_dims->Get(i);
}

ort_tensor = onnxruntime::Tensor(
tensor_dtype, TensorShape(byteswapped_data.data(), byteswapped_data.size()), allocator);
} else {
ort_tensor = onnxruntime::Tensor(
tensor_dtype, TensorShape(tensor_dims->data(), tensor_dims->size()), allocator);
}

if (fbs_tensor.raw_data() && fbs_tensor.raw_data()->size() == 0U) {
// Empty tensor. Nothing to unpack.
Expand Down
3 changes: 2 additions & 1 deletion onnxruntime/core/graph/graph_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,8 @@ bool CheckInMemoryDataMatch(const ONNX_NAMESPACE::TensorProto& tensor_proto, con
// Retrieve external data using ExternalData structure
std::unique_ptr<ExternalDataInfo> external_data;
ORT_THROW_IF_ERROR(ExternalDataInfo::Create(tensor_proto.external_data(), external_data));
return (external_data->GetRelPath().compare(utils::kTensorProtoMemoryAddressTag) == 0) &&
return ((external_data->GetRelPath().compare(utils::kTensorProtoLittleEndianMemoryAddressTag) == 0) ||
(external_data->GetRelPath().compare(utils::kTensorProtoNativeEndianMemoryAddressTag) == 0)) &&
(tensor.DataRaw() == reinterpret_cast<const void*>(external_data->GetOffset()));
}
return false;
Expand Down
20 changes: 10 additions & 10 deletions onnxruntime/core/optimizer/qdq_transformer/where_dummy_dq.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,29 +63,29 @@ Status WhereDummyDq::InsertDummyDQ(Node& node, Graph& graph, bool& modified, con
case ONNX_NAMESPACE::TensorProto_DataType_INT8: {
int8_t zp = 0;
int8_t dummy_data = 1;
dummy_zp_proto.set_raw_data(&zp, 1);
dummy_data_proto.set_raw_data(&dummy_data, 1);
utils::SetRawDataInTensorProto(dummy_zp_proto, &zp, 1);
utils::SetRawDataInTensorProto(dummy_data_proto, &dummy_data, 1);
break;
}
case ONNX_NAMESPACE::TensorProto_DataType_UINT8: {
uint8_t zp = 0;
uint8_t dummy_data = 1;
dummy_zp_proto.set_raw_data(&zp, 1);
dummy_data_proto.set_raw_data(&dummy_data, 1);
utils::SetRawDataInTensorProto(dummy_zp_proto, &zp, 1);
utils::SetRawDataInTensorProto(dummy_data_proto, &dummy_data, 1);
break;
}
case ONNX_NAMESPACE::TensorProto_DataType_INT16: {
int16_t zp = 0;
int16_t dummy_data = 1;
dummy_zp_proto.set_raw_data(&zp, 2);
dummy_data_proto.set_raw_data(&dummy_data, 2);
utils::SetRawDataInTensorProto(dummy_zp_proto, &zp, 2);
utils::SetRawDataInTensorProto(dummy_data_proto, &dummy_data, 2);
break;
}
case ONNX_NAMESPACE::TensorProto_DataType_UINT16: {
uint16_t zp = 0;
uint16_t dummy_data = 1;
dummy_zp_proto.set_raw_data(&zp, 2);
dummy_data_proto.set_raw_data(&dummy_data, 2);
utils::SetRawDataInTensorProto(dummy_zp_proto, &zp, 2);
utils::SetRawDataInTensorProto(dummy_data_proto, &dummy_data, 2);
break;
}
default:
Expand All @@ -110,7 +110,7 @@ Status WhereDummyDq::InsertDummyDQ(Node& node, Graph& graph, bool& modified, con
switch (initializer.data_type()) {
case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: {
float* where_const_scalar = initializer.data<float>();
dummy_scale_proto.set_raw_data(where_const_scalar, sizeof(float));
utils::SetRawDataInTensorProto(dummy_scale_proto, where_const_scalar, sizeof(float));
break;
}
default:
Expand Down Expand Up @@ -166,4 +166,4 @@ Status WhereDummyDq::ApplyImpl(Graph& graph, bool& modified, int graph_level, co

return Status::OK();
}
} // namespace onnxruntime
} // namespace onnxruntime
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
#include "DmlGraphFusionHelper.h"
#include "DmlRuntimeFusedGraphKernel.h"

#include "core/common/endian.h"
#include "core/framework/endian_utils.h"

using namespace Windows::AI::MachineLearning::Adapter;

namespace Dml
Expand Down Expand Up @@ -121,7 +124,31 @@ namespace DmlGraphFusionHelper
onnxruntime::FileOffsetType fileOffset;
SafeInt<size_t> safeTensorByteSize;
THROW_IF_NOT_OK(onnxruntime::utils::GetExternalDataInfo(*initializer, graph.ModelPath(), /*out*/ externalFilePath, /*out*/ fileOffset, /*out*/ safeTensorByteSize));
if (externalFilePath == onnxruntime::utils::kTensorProtoMemoryAddressTag)
if (externalFilePath == onnxruntime::utils::kTensorProtoLittleEndianMemoryAddressTag)
{
if constexpr (onnxruntime::endian::native != onnxruntime::endian::little)
{
unpackedTensor.reset(new std::byte[safeTensorByteSize]);

auto src = gsl::make_span<const unsigned char>(reinterpret_cast<const unsigned char*>(fileOffset), safeTensorByteSize);
auto dst = gsl::make_span<unsigned char>(reinterpret_cast<unsigned char*>(unpackedTensor.get()), safeTensorByteSize);
size_t element_size = onnxruntime::utils::GetElementSizeOfTensor(static_cast<ONNX_NAMESPACE::TensorProto_DataType>(initializer->data_type()));

// If element size is unknown, set it to 1 to disable byteswapping
if (element_size < 1) element_size = 1;

THROW_IF_NOT_OK(onnxruntime::utils::ReadLittleEndian(element_size, src, dst));

tensorPtr = unpackedTensor.get();
tensorByteSize = safeTensorByteSize;
}
else
{
tensorPtr = reinterpret_cast<std::byte*>(fileOffset);
tensorByteSize = safeTensorByteSize;
}
}
else if (externalFilePath == onnxruntime::utils::kTensorProtoNativeEndianMemoryAddressTag)
{
tensorPtr = reinterpret_cast<std::byte*>(fileOffset);
tensorByteSize = safeTensorByteSize;
Expand Down
Loading
Loading