diff --git a/.gitignore b/.gitignore index 1e834015454..156413e3727 100644 --- a/.gitignore +++ b/.gitignore @@ -78,3 +78,10 @@ cli/azd/extensions/microsoft.azd.concurx/concurx.exe cli/azd/extensions/azure.appservice/azureappservice cli/azd/extensions/azure.appservice/azureappservice.exe .squad/ + +# Session artifacts +cli/azd/cover-* +cli/azd/cover_* +review-*.diff + +.playwright-mcp/ diff --git a/.vscode/cspell.misc.yaml b/.vscode/cspell.misc.yaml index 9b6242d0f58..86ed51bc94c 100644 --- a/.vscode/cspell.misc.yaml +++ b/.vscode/cspell.misc.yaml @@ -39,6 +39,26 @@ overrides: - filename: ./README.md words: - VSIX + - filename: ./docs/specs/perf-foundations/** + words: + - Alives + - appsettings + - appservice + - armdeploymentstacks + - azapi + - azsdk + - containerapps + - golangci + - goroutines + - httputil + - keepalives + - nilerr + - nolint + - remotebuild + - resourcegroup + - singleflight + - stdlib + - whatif - filename: schemas/**/azure.yaml.json words: - prodapi diff --git a/cli/azd/.vscode/cspell-azd-dictionary.txt b/cli/azd/.vscode/cspell-azd-dictionary.txt index a1b73e669dd..7cec16daec8 100644 --- a/cli/azd/.vscode/cspell-azd-dictionary.txt +++ b/cli/azd/.vscode/cspell-azd-dictionary.txt @@ -46,6 +46,7 @@ appinsightsexporter appinsightsstorage appplatform appservice +appsettings appuser arget armapimanagement @@ -174,6 +175,7 @@ jmespath jongio jquery keychain +keepalives kubelogin langchain langchaingo @@ -204,6 +206,7 @@ mysqlclient mysqldb nazd ndjson +nilerr nobanner nodeapp nolint @@ -248,9 +251,11 @@ rabbitmq reauthentication relogin remarshal +remotebuild repourl requirepass resourcegraph +resourcegroup restoreapp retriable runtimes @@ -303,6 +308,7 @@ vuejs webappignore webfrontend westus2 +whatif wireinject yacspin yamlnode diff --git a/cli/azd/cmd/deps.go b/cli/azd/cmd/deps.go index a37ea50b48a..9d318018ca7 100644 --- a/cli/azd/cmd/deps.go +++ b/cli/azd/cmd/deps.go @@ -8,11 +8,12 @@ package cmd import ( "net/http" + 
"github.com/azure/azure-dev/cli/azd/pkg/httputil" "github.com/benbjohnson/clock" ) func createHttpClient() *http.Client { - return http.DefaultClient + return &http.Client{Transport: httputil.TunedTransport()} } func createClock() clock.Clock { diff --git a/cli/azd/pkg/azapi/container_registry.go b/cli/azd/pkg/azapi/container_registry.go index 5f9dd79b650..0240dad3a66 100644 --- a/cli/azd/pkg/azapi/container_registry.go +++ b/cli/azd/pkg/azapi/container_registry.go @@ -13,6 +13,8 @@ import ( "net/url" "slices" "strings" + "sync" + "time" "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" @@ -25,6 +27,7 @@ import ( "github.com/azure/azure-dev/cli/azd/pkg/azure" "github.com/azure/azure-dev/cli/azd/pkg/httputil" "github.com/azure/azure-dev/cli/azd/pkg/tools/docker" + "golang.org/x/sync/singleflight" ) // Credentials for authenticating with a docker registry, @@ -61,6 +64,12 @@ type containerRegistryService struct { docker *docker.Cli armClientOptions *arm.ClientOptions coreClientOptions *azcore.ClientOptions + // loginGroup deduplicates concurrent login attempts to the same registry. + // When multiple services share one ACR, only the first goroutine performs + // the credential exchange + docker login; others wait for its result. + loginGroup singleflight.Group + // loginDone tracks registries that have already been authenticated this session. 
+ loginDone sync.Map } // Creates a new instance of the ContainerRegistryService @@ -118,20 +127,51 @@ func (crs *containerRegistryService) FindContainerRegistryResourceGroup( } func (crs *containerRegistryService) Login(ctx context.Context, subscriptionId string, loginServer string) error { - dockerCreds, err := crs.Credentials(ctx, subscriptionId, loginServer) - if err != nil { - return err + cacheKey := subscriptionId + ":" + loginServer + if _, ok := crs.loginDone.Load(cacheKey); ok { + log.Printf("skipping redundant login to %q (already authenticated this session)\n", loginServer) + return nil } - err = crs.docker.Login(ctx, dockerCreds.LoginServer, dockerCreds.Username, dockerCreds.Password) - if err != nil { - return fmt.Errorf( - "failed logging into docker registry %s: %w", - loginServer, - err) - } + // singleflight deduplicates concurrent login attempts to the same registry. + // We use DoChan so each caller can select on its own ctx.Done(), avoiding the + // problem where one caller's cancellation fails all waiters. + ch := crs.loginGroup.DoChan(cacheKey, func() (any, error) { + // Double-check after winning the singleflight race. + if _, ok := crs.loginDone.Load(cacheKey); ok { + return nil, nil + } + + // Use context.WithoutCancel so the shared work isn't tied to a single + // caller's context. Add a bounded timeout so the shared login cannot + // hang indefinitely if Credentials or docker login gets stuck. 
+ const loginTimeout = 5 * time.Minute + opCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), loginTimeout) + defer cancel() + + dockerCreds, err := crs.Credentials(opCtx, subscriptionId, loginServer) + if err != nil { + return nil, err + } + + err = crs.docker.Login(opCtx, dockerCreds.LoginServer, dockerCreds.Username, dockerCreds.Password) + if err != nil { + return nil, fmt.Errorf( + "failed logging into docker registry %q: %w", + loginServer, + err) + } - return nil + crs.loginDone.Store(cacheKey, true) + return nil, nil + }) + + select { + case <-ctx.Done(): + return ctx.Err() + case res := <-ch: + return res.Err + } } // Credentials gets the credentials that could be used to login to the specified container registry. It prefers to use diff --git a/cli/azd/pkg/azapi/resource_service.go b/cli/azd/pkg/azapi/resource_service.go index 5439934bf0c..07abf2f9c6c 100644 --- a/cli/azd/pkg/azapi/resource_service.go +++ b/cli/azd/pkg/azapi/resource_service.go @@ -8,6 +8,7 @@ import ( "errors" "fmt" "net/http" + "sync" "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" @@ -61,6 +62,13 @@ type ListResourceGroupResourcesOptions struct { type ResourceService struct { credentialProvider account.SubscriptionCredentialProvider armClientOptions *arm.ClientOptions + + // resourcesClients caches armresources.Client instances per subscription ID. + // Azure SDK ARM clients are safe for concurrent use. + resourcesClients sync.Map // map[string]*armresources.Client + + // resourceGroupClients caches ResourceGroupsClient instances per subscription ID. 
+ resourceGroupClients sync.Map // map[string]*armresources.ResourceGroupsClient } func NewResourceService( @@ -320,6 +328,10 @@ func (rs *ResourceService) DeleteResourceGroup(ctx context.Context, subscription } func (rs *ResourceService) createResourcesClient(ctx context.Context, subscriptionId string) (*armresources.Client, error) { + if cached, ok := rs.resourcesClients.Load(subscriptionId); ok { + return cached.(*armresources.Client), nil + } + credential, err := rs.credentialProvider.CredentialForSubscription(ctx, subscriptionId) if err != nil { return nil, err @@ -330,13 +342,19 @@ func (rs *ResourceService) createResourcesClient(ctx context.Context, subscripti return nil, fmt.Errorf("creating Resource client: %w", err) } - return client, nil + // Benign race: concurrent miss creates an extra client; LoadOrStore ensures one winner. + actual, _ := rs.resourcesClients.LoadOrStore(subscriptionId, client) + return actual.(*armresources.Client), nil } func (rs *ResourceService) createResourceGroupClient( ctx context.Context, subscriptionId string, ) (*armresources.ResourceGroupsClient, error) { + if cached, ok := rs.resourceGroupClients.Load(subscriptionId); ok { + return cached.(*armresources.ResourceGroupsClient), nil + } + credential, err := rs.credentialProvider.CredentialForSubscription(ctx, subscriptionId) if err != nil { return nil, err @@ -347,7 +365,9 @@ func (rs *ResourceService) createResourceGroupClient( return nil, fmt.Errorf("creating ResourceGroup client: %w", err) } - return client, nil + // Benign race: concurrent miss creates an extra client; LoadOrStore ensures one winner. + actual, _ := rs.resourceGroupClients.LoadOrStore(subscriptionId, client) + return actual.(*armresources.ResourceGroupsClient), nil } // GroupByResourceGroup creates a map of resources group by their resource group name. 
diff --git a/cli/azd/pkg/azapi/stack_deployments.go b/cli/azd/pkg/azapi/stack_deployments.go index 7a7b0ba71ec..43df69828b5 100644 --- a/cli/azd/pkg/azapi/stack_deployments.go +++ b/cli/azd/pkg/azapi/stack_deployments.go @@ -13,13 +13,16 @@ import ( "os" "path/filepath" "strconv" + "sync" "time" "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armdeploymentstacks" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources" "github.com/azure/azure-dev/cli/azd/internal" + "github.com/azure/azure-dev/cli/azd/internal/tracing" "github.com/azure/azure-dev/cli/azd/pkg/account" "github.com/azure/azure-dev/cli/azd/pkg/alpha" "github.com/azure/azure-dev/cli/azd/pkg/async" @@ -30,6 +33,7 @@ import ( "github.com/azure/azure-dev/cli/azd/pkg/output" "github.com/benbjohnson/clock" "github.com/sethvargo/go-retry" + "go.opentelemetry.io/otel/attribute" ) var FeatureDeploymentStacks = alpha.MustFeatureKey("deployment.stacks") @@ -57,6 +61,10 @@ type StackDeployments struct { armClientOptions *arm.ClientOptions standardDeployments *StandardDeployments cloud *cloud.Cloud + + // stackClients caches deployment stacks Client instances per subscription ID. + // Azure SDK ARM clients are safe for concurrent use. 
+ stackClients sync.Map // map[string]*armdeploymentstacks.Client } type deploymentStackOptions struct { @@ -250,7 +258,14 @@ func (d *StackDeployments) DeployToSubscription( parameters azure.ArmParameters, tags map[string]*string, options map[string]any, -) (*ResourceDeployment, error) { +) (_ *ResourceDeployment, err error) { + ctx, span := tracing.Start(ctx, "arm.stack.deploy.subscription") + defer func() { span.EndWithStatus(err) }() + span.SetAttributes( + attribute.String("arm.subscription", subscriptionId), + attribute.String("arm.deployment", deploymentName), + ) + client, err := d.createClient(ctx, subscriptionId) if err != nil { return nil, err @@ -266,7 +281,9 @@ func (d *StackDeployments) DeployToSubscription( return nil, err } - _, err = poller.PollUntilDone(ctx, nil) + _, err = poller.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: deployPollFrequency, + }) if err != nil { return nil, fmt.Errorf("deploying to subscription: %w", createDeploymentError(err, DeploymentOperationDeploy)) } @@ -324,7 +341,15 @@ func (d *StackDeployments) DeployToResourceGroup( parameters azure.ArmParameters, tags map[string]*string, options map[string]any, -) (*ResourceDeployment, error) { +) (_ *ResourceDeployment, err error) { + ctx, span := tracing.Start(ctx, "arm.stack.deploy.resourcegroup") + defer func() { span.EndWithStatus(err) }() + span.SetAttributes( + attribute.String("arm.subscription", subscriptionId), + attribute.String("arm.resourcegroup", resourceGroup), + attribute.String("arm.deployment", deploymentName), + ) + client, err := d.createClient(ctx, subscriptionId) if err != nil { return nil, err @@ -340,7 +365,9 @@ func (d *StackDeployments) DeployToResourceGroup( return nil, err } - _, err = poller.PollUntilDone(ctx, nil) + _, err = poller.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: deployPollFrequency, + }) if err != nil { return nil, fmt.Errorf("deploying to resource group: %w", createDeploymentError(err, 
DeploymentOperationDeploy)) } @@ -489,7 +516,9 @@ func (d *StackDeployments) DeleteSubscriptionDeployment( return err } - _, err = poller.PollUntilDone(ctx, nil) + _, err = poller.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: deployPollFrequency, + }) if err != nil { progress.SetProgress(DeleteDeploymentProgress{ Name: deploymentName, @@ -629,7 +658,9 @@ func (d *StackDeployments) DeleteResourceGroupDeployment( return err } - _, err = poller.PollUntilDone(ctx, nil) + _, err = poller.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: deployPollFrequency, + }) if err != nil { progress.SetProgress(DeleteDeploymentProgress{ Name: deploymentName, @@ -661,12 +692,23 @@ func (d *StackDeployments) CalculateTemplateHash( } func (d *StackDeployments) createClient(ctx context.Context, subscriptionId string) (*armdeploymentstacks.Client, error) { + if cached, ok := d.stackClients.Load(subscriptionId); ok { + return cached.(*armdeploymentstacks.Client), nil + } + credential, err := d.credentialProvider.CredentialForSubscription(ctx, subscriptionId) if err != nil { return nil, err } - return armdeploymentstacks.NewClient(subscriptionId, credential, d.armClientOptions) + client, err := armdeploymentstacks.NewClient(subscriptionId, credential, d.armClientOptions) + if err != nil { + return nil, fmt.Errorf("creating deployment stacks client: %w", err) + } + + // Benign race: concurrent miss creates an extra client; LoadOrStore ensures one winner. 
+ actual, _ := d.stackClients.LoadOrStore(subscriptionId, client) + return actual.(*armdeploymentstacks.Client), nil } // Converts from an ARM Extended Deployment to Azd Generic deployment @@ -795,7 +837,9 @@ func (d *StackDeployments) ValidatePreflightToResourceGroup( createDeploymentError(err, DeploymentOperationValidate), ) } - _, err = validateResult.PollUntilDone(ctx, nil) + _, err = validateResult.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: slowPollFrequency, + }) if err != nil { return fmt.Errorf( "validating deployment to resource group: %w", @@ -877,7 +921,9 @@ func (d *StackDeployments) ValidatePreflightToSubscription( createDeploymentError(err, DeploymentOperationValidate), ) } - _, err = validateResult.PollUntilDone(ctx, nil) + _, err = validateResult.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: slowPollFrequency, + }) if err != nil { return fmt.Errorf( "validating deployment to subscription: %w", diff --git a/cli/azd/pkg/azapi/standard_deployments.go b/cli/azd/pkg/azapi/standard_deployments.go index efc55ed44cb..03dfb0e008b 100644 --- a/cli/azd/pkg/azapi/standard_deployments.go +++ b/cli/azd/pkg/azapi/standard_deployments.go @@ -11,11 +11,15 @@ import ( "maps" "net/url" "slices" + "sync" + "time" "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources" + "github.com/azure/azure-dev/cli/azd/internal/tracing" "github.com/azure/azure-dev/cli/azd/pkg/account" "github.com/azure/azure-dev/cli/azd/pkg/async" "github.com/azure/azure-dev/cli/azd/pkg/azure" @@ -23,6 +27,7 @@ import ( "github.com/azure/azure-dev/cli/azd/pkg/convert" "github.com/azure/azure-dev/cli/azd/pkg/output" "github.com/benbjohnson/clock" + "go.opentelemetry.io/otel/attribute" ) // cArmDeploymentNameLengthMax is the maximum length 
of the name of a deployment in ARM. @@ -30,6 +35,17 @@ const ( cArmDeploymentNameLengthMax = 64 cPortalUrlFragment = "#view/HubsExtension/DeploymentDetailsBlade/~/overview/id" cOutputsUrlFragment = "#view/HubsExtension/DeploymentDetailsBlade/~/outputs/id" + + // deployPollFrequency is the polling interval for ARM deploy/delete operations. + // Deployments complete in variable time, so polling faster than the 30s SDK default + // reduces tail latency. 5s balances responsiveness against the ARM read-rate limit + // (1200 reads/5min/subscription) when many LROs run in parallel. + deployPollFrequency = 5 * time.Second + + // slowPollFrequency is the polling interval for ARM WhatIf and Validate operations. + // These are consistently slow (30-90s), so aggressive polling provides no benefit and + // risks hitting the ARM read rate limit (1200 reads/5min) during large parallel deployments. + slowPollFrequency = 15 * time.Second ) type StandardDeployments struct { @@ -38,6 +54,14 @@ type StandardDeployments struct { resourceService *ResourceService cloud *cloud.Cloud clock clock.Clock + + // deploymentsClients caches DeploymentsClient instances per subscription ID. + // Azure SDK ARM clients are safe for concurrent use and hold no per-request state, + // so reusing them avoids redundant pipeline construction on every API call. + deploymentsClients sync.Map // map[string]*armresources.DeploymentsClient + + // deploymentsOpsClients caches DeploymentOperationsClient instances per subscription ID. 
+ deploymentsOpsClients sync.Map // map[string]*armresources.DeploymentOperationsClient } func NewStandardDeployments( @@ -186,6 +210,10 @@ func (ds *StandardDeployments) createDeploymentsClient( ctx context.Context, subscriptionId string, ) (*armresources.DeploymentsClient, error) { + if cached, ok := ds.deploymentsClients.Load(subscriptionId); ok { + return cached.(*armresources.DeploymentsClient), nil + } + credential, err := ds.credentialProvider.CredentialForSubscription(ctx, subscriptionId) if err != nil { return nil, err @@ -196,7 +224,9 @@ func (ds *StandardDeployments) createDeploymentsClient( return nil, fmt.Errorf("creating deployments client: %w", err) } - return client, nil + // Benign race: concurrent miss creates an extra client; LoadOrStore ensures one winner. + actual, _ := ds.deploymentsClients.LoadOrStore(subscriptionId, client) + return actual.(*armresources.DeploymentsClient), nil } func (ds *StandardDeployments) DeployToSubscription( @@ -208,7 +238,14 @@ func (ds *StandardDeployments) DeployToSubscription( parameters azure.ArmParameters, tags map[string]*string, options map[string]any, -) (*ResourceDeployment, error) { +) (_ *ResourceDeployment, err error) { + ctx, span := tracing.Start(ctx, "arm.deploy.subscription") + defer func() { span.EndWithStatus(err) }() + span.SetAttributes( + attribute.String("arm.subscription", subscriptionId), + attribute.String("arm.deployment", deploymentName), + ) + deploymentClient, err := ds.createDeploymentsClient(ctx, subscriptionId) if err != nil { return nil, fmt.Errorf("creating deployments client: %w", err) @@ -230,7 +267,9 @@ func (ds *StandardDeployments) DeployToSubscription( } // wait for deployment creation - deployResult, err := createFromTemplateOperation.PollUntilDone(ctx, nil) + deployResult, err := createFromTemplateOperation.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: deployPollFrequency, + }) if err != nil { return nil, fmt.Errorf("deploying to subscription: %w", 
createDeploymentError(err, DeploymentOperationDeploy)) } @@ -245,7 +284,15 @@ func (ds *StandardDeployments) DeployToResourceGroup( parameters azure.ArmParameters, tags map[string]*string, options map[string]any, -) (*ResourceDeployment, error) { +) (_ *ResourceDeployment, err error) { + ctx, span := tracing.Start(ctx, "arm.deploy.resourcegroup") + defer func() { span.EndWithStatus(err) }() + span.SetAttributes( + attribute.String("arm.subscription", subscriptionId), + attribute.String("arm.resourcegroup", resourceGroup), + attribute.String("arm.deployment", deploymentName), + ) + deploymentClient, err := ds.createDeploymentsClient(ctx, subscriptionId) if err != nil { return nil, fmt.Errorf("creating deployments client: %w", err) @@ -266,7 +313,9 @@ func (ds *StandardDeployments) DeployToResourceGroup( } // wait for deployment creation - deployResult, err := createFromTemplateOperation.PollUntilDone(ctx, nil) + deployResult, err := createFromTemplateOperation.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: deployPollFrequency, + }) if err != nil { return nil, fmt.Errorf("deploying to resource group: %w", createDeploymentError(err, DeploymentOperationDeploy)) } @@ -558,7 +607,14 @@ func (ds *StandardDeployments) WhatIfDeployToSubscription( deploymentName string, armTemplate azure.RawArmTemplate, parameters azure.ArmParameters, -) (*armresources.WhatIfOperationResult, error) { +) (_ *armresources.WhatIfOperationResult, err error) { + ctx, span := tracing.Start(ctx, "arm.whatif.subscription") + defer func() { span.EndWithStatus(err) }() + span.SetAttributes( + attribute.String("arm.subscription", subscriptionId), + attribute.String("arm.deployment", deploymentName), + ) + deploymentClient, err := ds.createDeploymentsClient(ctx, subscriptionId) if err != nil { return nil, fmt.Errorf("creating deployments client: %w", err) @@ -580,7 +636,9 @@ func (ds *StandardDeployments) WhatIfDeployToSubscription( } // wait for deployment creation - deployResult, err 
:= createFromTemplateOperation.PollUntilDone(ctx, nil) + deployResult, err := createFromTemplateOperation.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: slowPollFrequency, + }) if err != nil { return nil, fmt.Errorf("deploying to subscription: %w", createDeploymentError(err, DeploymentOperationPreview)) } @@ -593,7 +651,15 @@ func (ds *StandardDeployments) WhatIfDeployToResourceGroup( subscriptionId, resourceGroup, deploymentName string, armTemplate azure.RawArmTemplate, parameters azure.ArmParameters, -) (*armresources.WhatIfOperationResult, error) { +) (_ *armresources.WhatIfOperationResult, err error) { + ctx, span := tracing.Start(ctx, "arm.whatif.resourcegroup") + defer func() { span.EndWithStatus(err) }() + span.SetAttributes( + attribute.String("arm.subscription", subscriptionId), + attribute.String("arm.resourcegroup", resourceGroup), + attribute.String("arm.deployment", deploymentName), + ) + deploymentClient, err := ds.createDeploymentsClient(ctx, subscriptionId) if err != nil { return nil, fmt.Errorf("creating deployments client: %w", err) @@ -613,7 +679,9 @@ func (ds *StandardDeployments) WhatIfDeployToResourceGroup( } // wait for deployment creation - deployResult, err := createFromTemplateOperation.PollUntilDone(ctx, nil) + deployResult, err := createFromTemplateOperation.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: slowPollFrequency, + }) if err != nil { return nil, fmt.Errorf("deploying to resource group: %w", createDeploymentError(err, DeploymentOperationPreview)) } @@ -625,6 +693,10 @@ func (ds *StandardDeployments) createDeploymentsOperationsClient( ctx context.Context, subscriptionId string, ) (*armresources.DeploymentOperationsClient, error) { + if cached, ok := ds.deploymentsOpsClients.Load(subscriptionId); ok { + return cached.(*armresources.DeploymentOperationsClient), nil + } + credential, err := ds.credentialProvider.CredentialForSubscription(ctx, subscriptionId) if err != nil { return nil, err @@ -635,7 
+707,9 @@ func (ds *StandardDeployments) createDeploymentsOperationsClient( return nil, fmt.Errorf("creating deployments client: %w", err) } - return client, nil + // Benign race: concurrent miss creates an extra client; LoadOrStore ensures one winner. + actual, _ := ds.deploymentsOpsClients.LoadOrStore(subscriptionId, client) + return actual.(*armresources.DeploymentOperationsClient), nil } // Converts from an ARM Extended Deployment to Azd Generic deployment @@ -714,7 +788,14 @@ func (ds *StandardDeployments) ValidatePreflightToSubscription( parameters azure.ArmParameters, tags map[string]*string, options map[string]any, -) error { +) (err error) { + ctx, span := tracing.Start(ctx, "arm.validate.subscription") + defer func() { span.EndWithStatus(err) }() + span.SetAttributes( + attribute.String("arm.subscription", subscriptionId), + attribute.String("arm.deployment", deploymentName), + ) + deploymentClient, err := ds.createDeploymentsClient(ctx, subscriptionId) if err != nil { return fmt.Errorf("creating deployments client: %w", err) @@ -737,7 +818,9 @@ func (ds *StandardDeployments) ValidatePreflightToSubscription( createDeploymentError(err, DeploymentOperationValidate), ) } - _, err = validateResult.PollUntilDone(ctx, nil) + _, err = validateResult.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: slowPollFrequency, + }) if err != nil { return fmt.Errorf( "validating deployment to subscription: %w", @@ -757,7 +840,15 @@ func (ds *StandardDeployments) ValidatePreflightToResourceGroup( parameters azure.ArmParameters, tags map[string]*string, options map[string]any, -) error { +) (err error) { + ctx, span := tracing.Start(ctx, "arm.validate.resourcegroup") + defer func() { span.EndWithStatus(err) }() + span.SetAttributes( + attribute.String("arm.subscription", subscriptionId), + attribute.String("arm.resourcegroup", resourceGroup), + attribute.String("arm.deployment", deploymentName), + ) + deploymentClient, err := ds.createDeploymentsClient(ctx, 
subscriptionId) if err != nil { return fmt.Errorf("creating deployments client: %w", err) @@ -778,7 +869,9 @@ func (ds *StandardDeployments) ValidatePreflightToResourceGroup( createDeploymentError(err, DeploymentOperationValidate), ) } - _, err = validateResult.PollUntilDone(ctx, nil) + _, err = validateResult.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: slowPollFrequency, + }) if err != nil { return fmt.Errorf( "validating deployment to resource group: %w", diff --git a/cli/azd/pkg/azapi/webapp.go b/cli/azd/pkg/azapi/webapp.go index 5aa72c7126b..0975f836aa0 100644 --- a/cli/azd/pkg/azapi/webapp.go +++ b/cli/azd/pkg/azapi/webapp.go @@ -8,12 +8,16 @@ import ( "errors" "fmt" "io" + "log" "net/http" "strings" + "time" "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/appservice/armappservice/v2" + "github.com/azure/azure-dev/cli/azd/internal/tracing" "github.com/azure/azure-dev/cli/azd/pkg/azsdk" + "go.opentelemetry.io/otel/attribute" ) type AzCliAppServiceProperties struct { @@ -153,7 +157,15 @@ func (cli *AzureClient) DeployAppServiceZip( appName string, deployZipFile io.ReadSeeker, progressLog func(string), -) (*string, error) { +) (_ *string, err error) { + ctx, span := tracing.Start(ctx, "deploy.appservice.zip") + defer func() { span.EndWithStatus(err) }() + + span.SetAttributes( + attribute.String("deploy.appservice.app", appName), + attribute.String("deploy.appservice.rg", resourceGroup), + ) + app, err := cli.appService(ctx, subscriptionId, resourceGroup, appName) if err != nil { return nil, err @@ -169,20 +181,63 @@ func (cli *AzureClient) DeployAppServiceZip( return nil, err } + isLinux := isLinuxWebApp(app) + span.SetAttributes(attribute.Bool("deploy.appservice.linux", isLinux)) + // Deployment Status API only support linux web app for now - if isLinuxWebApp(app) { - if err := client.DeployTrackStatus( - ctx, deployZipFile, subscriptionId, resourceGroup, appName, progressLog); err != nil 
{ - if !resumeDeployment(err, progressLog) { - return nil, err + if isLinux { + // Build failures can be caused by an SCM container restart triggered by ARM + // applying site config (app settings) shortly after the site is created. + // When concurrent provisioning + deployment is used (e.g. `azd up --concurrent`), + // the deploy step may start while the SCM container is still restarting. + // Retry the entire zip deploy when the build fails, giving the SCM time to stabilize. + const maxBuildRetries = 2 + for attempt := range maxBuildRetries + 1 { + span.SetAttributes(attribute.Int("deploy.appservice.attempt", attempt+1)) + + if attempt > 0 { + // Reset the zip file reader so the retry re-uploads the full content. + if _, seekErr := deployZipFile.Seek(0, io.SeekStart); seekErr != nil { + return nil, fmt.Errorf("resetting zip file for retry: %w", seekErr) + } + progressLog(fmt.Sprintf( + "Retrying deployment (attempt %d/%d) — the previous build may have been "+ + "interrupted by an SCM container restart", attempt+1, maxBuildRetries+1)) + + // Wait for the SCM site to become ready before retrying. + if waitErr := waitForScmReady(ctx, client, 5*time.Second, progressLog); waitErr != nil { + // Propagate context cancellation (Ctrl+C) or deadline exceeded + // immediately — these indicate the user or system requested abort. 
+ if errors.Is(waitErr, context.Canceled) || errors.Is(waitErr, context.DeadlineExceeded) { + return nil, waitErr + } + log.Printf("SCM readiness check failed (non-fatal): %v", waitErr) + } } - } else { - // Deployment is successful - statusText := "OK" - return new(statusText), nil + + err = client.DeployTrackStatus( + ctx, deployZipFile, subscriptionId, resourceGroup, appName, progressLog) + if err != nil { + if isBuildFailure(err) && attempt < maxBuildRetries { + progressLog("Build process failed — will retry after SCM stabilizes") + continue + } + if !resumeDeployment(err, progressLog) { + return nil, err + } + } else { + // Deployment is successful + return new("OK"), nil + } + break } } + // Rewind the zip file before the fallback deploy — the tracked deploy consumed it. + if _, seekErr := deployZipFile.Seek(0, io.SeekStart); seekErr != nil { + return nil, fmt.Errorf("resetting zip file for fallback deploy: %w", seekErr) + } + response, err := client.Deploy(ctx, deployZipFile) if err != nil { return nil, err @@ -191,6 +246,83 @@ func (cli *AzureClient) DeployAppServiceZip( return &response.StatusText, nil } +// isBuildFailure returns true when the deployment error indicates a transient +// Oryx/Kudu build failure caused by an SCM container restart — not a genuine +// application build error. We detect the transient case by matching the exact +// prefix "the build process failed" (which comes from the Kudu deployment API) +// while excluding messages from the deployment status API that start with +// "Deployment failed because the build process failed" (genuine build errors). +// +// NOTE: This heuristic is tied to exact Azure/Kudu error message wording. +// If the upstream messages change, detection breaks silently and retries stop +// occurring (falling back to the non-retry deploy path). The Azure SDK does +// not surface structured error codes for these build failures. 
+func isBuildFailure(err error) bool { + if err == nil { + return false + } + msg := err.Error() + // Genuine build failures from logWebAppDeploymentStatus start with + // "Deployment failed because the build process failed". Transient SCM + // failures use the shorter "the build process failed" without that prefix. + return strings.Contains(msg, "the build process failed") && + !strings.Contains(msg, "Deployment failed because") && + !strings.Contains(msg, "logs for more info") +} + +// scmReadyChecker abstracts the IsScmReady probe so waitForScmReady can be +// unit-tested with a mock. *azsdk.ZipDeployClient satisfies this interface. +type scmReadyChecker interface { + IsScmReady(ctx context.Context) (bool, error) +} + +// waitForScmReady pings the SCM /api/deployments endpoint until it responds +// with 200, indicating the SCM container has finished restarting. This avoids +// pushing a new zip deploy into a container that is about to restart. +// pollInterval controls the polling frequency; callers pass the production value +// (typically 5s) while tests can use shorter intervals to avoid wall-time delays. +func waitForScmReady( + ctx context.Context, client scmReadyChecker, pollInterval time.Duration, progressLog func(string), +) error { + const scmReadyTimeout = 90 * time.Second + + ctx, cancel := context.WithTimeout(ctx, scmReadyTimeout) + defer cancel() + + progressLog("Waiting for SCM site to become ready...") + + // Probe once immediately to avoid a needless pollInterval delay when SCM is already ready. 
+ ready, err := client.IsScmReady(ctx) + if err == nil && ready { + progressLog("SCM site is ready") + return nil + } + + ticker := time.NewTicker(pollInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return fmt.Errorf("SCM site did not become ready within %v: %w", scmReadyTimeout, ctx.Err()) + case <-ticker.C: + } + + ready, err := client.IsScmReady(ctx) + if err != nil { + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return err + } + log.Printf("SCM readiness probe error: %v", err) + continue + } + if ready { + progressLog("SCM site is ready") + return nil + } + } +} + func (cli *AzureClient) createWebAppsClient( ctx context.Context, subscriptionId string, @@ -265,6 +397,52 @@ func (cli *AzureClient) GetAppServiceSlots( return slots, nil } +// UpdateAppServiceAppSetting reads the existing application settings, adds or overwrites +// the specified key-value pair, and writes all settings back. This preserves other settings +// that are not being modified. +// +// Note: This function is not safe for concurrent use. The Azure App Service settings API +// replaces the full set on write and does not support ETags, so concurrent updates may +// result in lost writes. Callers should serialize updates to the same app's settings. +func (cli *AzureClient) UpdateAppServiceAppSetting( + ctx context.Context, + subscriptionId string, + resourceGroup string, + appName string, + key string, + value string, +) error { + client, err := cli.createWebAppsClient(ctx, subscriptionId) + if err != nil { + return err + } + + // Read existing settings so that we preserve them. + existing, err := client.ListApplicationSettings(ctx, resourceGroup, appName, nil) + if err != nil { + return fmt.Errorf("listing existing app settings: %w", err) + } + + if existing.Properties == nil { + existing.Properties = map[string]*string{} + } + + existing.Properties[key] = &value + + // Write all settings back (the API replaces the full set). 
+ if _, err := client.UpdateApplicationSettings( + ctx, + resourceGroup, + appName, + armappservice.StringDictionary{Properties: existing.Properties}, + nil, + ); err != nil { + return fmt.Errorf("updating app settings: %w", err) + } + + return nil +} + // DeployAppServiceSlotZip deploys a zip file to a specific deployment slot. func (cli *AzureClient) DeployAppServiceSlotZip( ctx context.Context, diff --git a/cli/azd/pkg/azapi/webapp_test.go b/cli/azd/pkg/azapi/webapp_test.go new file mode 100644 index 00000000000..ec0990510c3 --- /dev/null +++ b/cli/azd/pkg/azapi/webapp_test.go @@ -0,0 +1,124 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +package azapi + +import ( + "context" + "errors" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func Test_isBuildFailure(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + expect bool + }{ + {"nil error", nil, false}, + {"unrelated error", errors.New("connection refused"), false}, + {"transient build failure", errors.New("the build process failed"), true}, + {"transient build failure wrapped", errors.New("deploy error: the build process failed with exit code 1"), true}, + {"real build failure with logs", errors.New("the build process failed, check logs for more info"), false}, + {"genuine build failure from status API", + errors.New("Deployment failed because the build process failed\n"), false}, + {"partial match", errors.New("build process"), false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + require.Equal(t, tt.expect, isBuildFailure(tt.err)) + }) + } +} + +// mockScmChecker is a mock implementation of scmReadyChecker for testing. 
+type mockScmChecker struct { + calls atomic.Int32 + fn func(ctx context.Context, call int) (bool, error) +} + +func (m *mockScmChecker) IsScmReady(ctx context.Context) (bool, error) { + call := int(m.calls.Add(1)) + return m.fn(ctx, call) +} + +func Test_waitForScmReady_ImmediateSuccess(t *testing.T) { + t.Parallel() + ctx := t.Context() + + mock := &mockScmChecker{fn: func(_ context.Context, _ int) (bool, error) { + return true, nil + }} + + var logs []string + err := waitForScmReady(ctx, mock, time.Millisecond, func(msg string) { logs = append(logs, msg) }) + + require.NoError(t, err) + require.Equal(t, int32(1), mock.calls.Load()) + require.Contains(t, logs, "SCM site is ready") +} + +func Test_waitForScmReady_ContextCanceled(t *testing.T) { + t.Parallel() + ctx, cancel := context.WithCancel(t.Context()) + defer cancel() + + // Mock returns not-ready on first probe, then context.Canceled on the second call + // to exercise the error propagation path inside the ticker loop (lines 306-311). 
+ mock := &mockScmChecker{fn: func(_ context.Context, call int) (bool, error) { + if call == 1 { + return false, nil // immediate probe: not ready + } + cancel() + return false, context.Canceled + }} + + err := waitForScmReady(ctx, mock, time.Millisecond, func(string) {}) + + require.Error(t, err) + require.ErrorIs(t, err, context.Canceled) + require.GreaterOrEqual(t, int(mock.calls.Load()), 2, "IsScmReady should be called at least twice") +} + +func Test_waitForScmReady_TransientErrorsThenSuccess(t *testing.T) { + t.Parallel() + ctx := t.Context() + + mock := &mockScmChecker{fn: func(_ context.Context, call int) (bool, error) { + if call < 3 { + return false, errors.New("transient error") + } + return true, nil + }} + + err := waitForScmReady(ctx, mock, time.Millisecond, func(string) {}) + + require.NoError(t, err) + require.GreaterOrEqual(t, int(mock.calls.Load()), 3) +} + +func Test_waitForScmReady_ContextDeadlineExceeded(t *testing.T) { + t.Parallel() + + // Mock returns not-ready on first probe, then DeadlineExceeded on subsequent calls + // to exercise the error propagation path inside the ticker loop. 
+ mock := &mockScmChecker{fn: func(_ context.Context, call int) (bool, error) { + if call == 1 { + return false, nil // immediate probe: not ready + } + return false, context.DeadlineExceeded + }} + + err := waitForScmReady(t.Context(), mock, time.Millisecond, func(string) {}) + + require.Error(t, err) + require.ErrorIs(t, err, context.DeadlineExceeded) + require.GreaterOrEqual(t, int(mock.calls.Load()), 2, "IsScmReady should be called at least twice") +} diff --git a/cli/azd/pkg/azsdk/zip_deploy_client.go b/cli/azd/pkg/azsdk/zip_deploy_client.go index cdc774c84de..0a4c48104e8 100644 --- a/cli/azd/pkg/azsdk/zip_deploy_client.go +++ b/cli/azd/pkg/azsdk/zip_deploy_client.go @@ -432,3 +432,36 @@ func (h *deployPollingHandler) Result(ctx context.Context, out **DeployResponse) return nil } + +// IsScmReady pings the SCM /api/deployments endpoint to check if the Kudu +// service is responsive. Returns true when the endpoint responds with HTTP 200, +// false for transient errors (503, connection refused, etc.), or an error for +// unexpected failures. This is used to detect SCM container restarts that occur +// when ARM applies site configuration changes after provisioning. +func (c *ZipDeployClient) IsScmReady(ctx context.Context) (bool, error) { + endpoint := fmt.Sprintf("https://%s/api/deployments", c.hostName) + req, err := runtime.NewRequest(ctx, http.MethodGet, endpoint) + if err != nil { + return false, fmt.Errorf("creating SCM readiness request: %w", err) + } + + resp, err := c.pipeline.Do(req) + if err != nil { + // Propagate context cancellation / deadline so callers (and Ctrl+C) react immediately. + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) || ctx.Err() != nil { + return false, err + } + // Connection and other transport errors mean SCM is still restarting. 
+ return false, nil //nolint:nilerr + } + defer resp.Body.Close() + + switch resp.StatusCode { + case http.StatusOK: + return true, nil + case http.StatusBadGateway, http.StatusServiceUnavailable: + return false, nil + default: + return false, runtime.NewResponseError(resp) + } +} diff --git a/cli/azd/pkg/containerapps/container_app.go b/cli/azd/pkg/containerapps/container_app.go index d661b05b01b..d3bf6db6d73 100644 --- a/cli/azd/pkg/containerapps/container_app.go +++ b/cli/azd/pkg/containerapps/container_app.go @@ -13,6 +13,7 @@ import ( "net/http" "slices" "strings" + "time" "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" @@ -33,6 +34,12 @@ import ( ) const ( + // containerAppPollFrequency is the polling interval for Container App create/update operations. + // Default SDK polling is 30s; Container App updates typically complete in 10-60s, so 5s polling + // reduces tail latency by up to 25s per operation while leaving headroom against the ARM + // read-rate limit (1200 reads/5min/subscription) during large parallel deployments. + containerAppPollFrequency = 5 * time.Second + pathLatestRevisionName = "properties.latestRevisionName" pathTemplate = "properties.template" pathTemplateRevisionSuffix = "properties.template.revisionSuffix" @@ -184,8 +191,7 @@ func (cas *containerAppService) persistSettings( aca, err := cas.getContainerApp(ctx, subscriptionId, resourceGroupName, appName, options) if err != nil { // On first deploy the app doesn't exist yet (404) — proceed without persisting. 
- var respErr *azcore.ResponseError - if errors.As(err, &respErr) && respErr.StatusCode == http.StatusNotFound { + if respErr, ok := errors.AsType[*azcore.ResponseError](err); ok && respErr.StatusCode == http.StatusNotFound { return obj, nil } @@ -312,7 +318,9 @@ func (cas *containerAppService) DeployYaml( poller = p } - _, err = poller.PollUntilDone(ctx, nil) + _, err = poller.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: containerAppPollFrequency, + }) if err != nil { return fmt.Errorf("polling for container app update completion: %w", err) } @@ -535,7 +543,9 @@ func (cas *containerAppService) updateContainerApp( return fmt.Errorf("begin updating ingress traffic: %w", err) } - _, err = poller.PollUntilDone(ctx, nil) + _, err = poller.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: containerAppPollFrequency, + }) if err != nil { return fmt.Errorf("polling for container app update completion: %w", err) } @@ -767,7 +777,9 @@ func (cas *containerAppService) UpdateContainerAppJobImage( ) } - _, err = poller.PollUntilDone(ctx, nil) + _, err = poller.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: containerAppPollFrequency, + }) if err != nil { return fmt.Errorf( "waiting for container app job update: %w", err, diff --git a/cli/azd/pkg/httputil/util.go b/cli/azd/pkg/httputil/util.go index 4dcebe17e10..78a2c973794 100644 --- a/cli/azd/pkg/httputil/util.go +++ b/cli/azd/pkg/httputil/util.go @@ -15,9 +15,12 @@ import ( "time" ) -// Reads the raw HTTP response and attempt to convert it into the specified type +// Reads the raw HTTP response and attempt to convert it into the specified type. // Typically used in conjunction with runtime.WithCaptureResponse(...) to get access to the underlying HTTP response of the // SDK API call. +// +// ReadRawResponse fully drains the response body via io.ReadAll but does not close it. +// Callers that return the response to the azcore poller framework must leave the body unclosed. 
func ReadRawResponse[T any](response *http.Response) (*T, error) { data, err := io.ReadAll(response.Body) if err != nil { @@ -34,6 +37,21 @@ func ReadRawResponse[T any](response *http.Response) (*T, error) { return instance, nil } +// TunedTransport returns an http.Transport cloned from http.DefaultTransport with +// connection pooling parameters optimized for Azure CLI workloads. The key tunings +// are MaxConnsPerHost and MaxIdleConnsPerHost (raised from Go defaults) to avoid +// unnecessary TLS handshakes when making many concurrent requests to ARM endpoints. +// With parallel execution, 8+ services may hit ARM simultaneously. +func TunedTransport() *http.Transport { + transport := http.DefaultTransport.(*http.Transport).Clone() + transport.MaxIdleConns = 200 + transport.MaxConnsPerHost = 50 + transport.MaxIdleConnsPerHost = 50 + transport.IdleConnTimeout = 90 * time.Second + transport.DisableKeepAlives = false // keep-alive must remain enabled for pooling + return transport +} + // TlsEnabledTransport returns a http.Transport that has TLS configured to use the provided // Base64 DER-encoded certificate. The returned http.Transport inherits defaults from http.DefaultTransport. 
func TlsEnabledTransport(derBytes string) (*http.Transport, error) { diff --git a/cli/azd/pkg/httputil/util_test.go b/cli/azd/pkg/httputil/util_test.go index 8b42b4d8ed8..c49e67a681a 100644 --- a/cli/azd/pkg/httputil/util_test.go +++ b/cli/azd/pkg/httputil/util_test.go @@ -9,6 +9,7 @@ import ( "io" "net/http" "testing" + "time" "github.com/stretchr/testify/require" ) @@ -19,6 +20,35 @@ type exampleResponse struct { C string `json:"c"` } +func TestTunedTransport(t *testing.T) { + transport := TunedTransport() + + require.NotNil(t, transport) + require.Equal(t, 200, transport.MaxIdleConns) + require.Equal(t, 50, transport.MaxConnsPerHost) + require.Equal(t, 50, transport.MaxIdleConnsPerHost) + require.Equal(t, 90*time.Second, transport.IdleConnTimeout) + require.False(t, transport.DisableKeepAlives) +} + +func TestTunedTransport_DoesNotMutateDefault(t *testing.T) { + defaultTransport := http.DefaultTransport.(*http.Transport) + origMaxIdle := defaultTransport.MaxIdleConns + origMaxConns := defaultTransport.MaxConnsPerHost + origMaxIdlePerHost := defaultTransport.MaxIdleConnsPerHost + origIdleTimeout := defaultTransport.IdleConnTimeout + origKeepAlives := defaultTransport.DisableKeepAlives + + _ = TunedTransport() + + // Verify no field of the default transport was mutated. 
+ require.Equal(t, origMaxIdle, defaultTransport.MaxIdleConns) + require.Equal(t, origMaxConns, defaultTransport.MaxConnsPerHost) + require.Equal(t, origMaxIdlePerHost, defaultTransport.MaxIdleConnsPerHost) + require.Equal(t, origIdleTimeout, defaultTransport.IdleConnTimeout) + require.Equal(t, origKeepAlives, defaultTransport.DisableKeepAlives) +} + func TestReadRawResponse(t *testing.T) { t.Run("Success", func(t *testing.T) { expectedResponse := &exampleResponse{ diff --git a/cli/azd/pkg/project/container_helper.go b/cli/azd/pkg/project/container_helper.go index 74c5d15e0ba..83423bef933 100644 --- a/cli/azd/pkg/project/container_helper.go +++ b/cli/azd/pkg/project/container_helper.go @@ -8,6 +8,8 @@ import ( "errors" "fmt" "log" + "net" + "net/url" "os" "path" "path/filepath" @@ -39,6 +41,7 @@ import ( "github.com/azure/azure-dev/cli/azd/pkg/tools/pack" "github.com/benbjohnson/clock" "github.com/sethvargo/go-retry" + "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" ) @@ -281,14 +284,17 @@ func (ch *ContainerHelper) Login( return registryName, nil } -var defaultCredentialsRetryDelay = 20 * time.Second +var defaultCredentialsRetryDelay = 2 * time.Second func (ch *ContainerHelper) Credentials( ctx context.Context, serviceConfig *ServiceConfig, targetResource *environment.TargetResource, env *environment.Environment, -) (*azapi.DockerCredentials, error) { +) (_ *azapi.DockerCredentials, err error) { + ctx, span := tracing.Start(ctx, "container.credentials") + defer func() { span.EndWithStatus(err) }() + loginServer, err := ch.RegistryName(ctx, serviceConfig, env) if err != nil { return nil, err @@ -297,15 +303,31 @@ func (ch *ContainerHelper) Credentials( var credential *azapi.DockerCredentials credentialsError := retry.Do( ctx, - // will retry just once after 1 minute based on: - // https://learn.microsoft.com/en-us/azure/dns/dns-faq#how-long-does-it-take-for-dns-changes-to-take-effect- - retry.WithMaxRetries(3, 
retry.NewConstant(defaultCredentialsRetryDelay)), + // Use exponential backoff (2s → 4s → 8s → 16s → 32s = 62s worst case) instead of + // constant 20s delay. ACR credentials are typically available within seconds after + // resource creation, so aggressive early retries resolve most 404s quickly while + // preserving roughly the same total retry window for slow DNS propagation. + retry.WithMaxRetries(5, retry.NewExponential(defaultCredentialsRetryDelay)), func(ctx context.Context) error { cred, err := ch.containerRegistryService.Credentials(ctx, targetResource.SubscriptionId(), loginServer) if err != nil { if httpErr, ok := errors.AsType[*azcore.ResponseError](err); ok { - if httpErr.StatusCode == 404 { - // Retry if the registry is not found while logging in + if httpErr.StatusCode == 404 || + httpErr.StatusCode == 429 || + httpErr.StatusCode >= 500 { + return retry.RetryableError(err) + } + } + // Retry transient network/DNS errors (e.g. "no such host" + // during DNS propagation after ACR creation). Only retry + // genuinely transient errors, not permanent URL/TLS failures. 
+ if _, ok := errors.AsType[*url.Error](err); ok { + if netErr, ok := errors.AsType[net.Error](err); ok && netErr.Timeout() { + return retry.RetryableError(err) + } + msg := err.Error() + if strings.Contains(msg, "no such host") || + strings.Contains(msg, "connection refused") { return retry.RetryableError(err) } } @@ -330,6 +352,7 @@ func (ch *ContainerHelper) Build( } dockerOptions := getDockerOptionsWithDefaults(serviceConfig.Docker) + resolveDockerPaths(serviceConfig, &dockerOptions) resolveParameters := func(source []string) ([]string, error) { result := make([]string, len(source)) @@ -401,10 +424,8 @@ func (ch *ContainerHelper) Build( strings.ToLower(serviceConfig.Name), ) + // dockerOptions.Path is already resolved to absolute by resolveDockerPaths dockerfilePath := dockerOptions.Path - if !filepath.IsAbs(dockerfilePath) { - dockerfilePath = filepath.Join(serviceConfig.Path(), dockerfilePath) - } _, err = os.Stat(dockerfilePath) if errors.Is(err, os.ErrNotExist) && serviceConfig.Docker.Path == "" { @@ -452,7 +473,7 @@ func (ch *ContainerHelper) Build( return nil, fmt.Errorf("writing dockerfile for service %s: %w", serviceConfig.Name, err) } dockerFilePath = dockerfilePath - if dockerOptions.Context == "" || dockerOptions.Context == "." { + if serviceConfig.Docker.Context == "" || serviceConfig.Docker.Context == "." 
{ dockerOptions.Context = tempDir } @@ -588,9 +609,14 @@ func (ch *ContainerHelper) Publish( env *environment.Environment, progress *async.Progress[ServiceProgress], options *PublishOptions, -) (*ServicePublishResult, error) { +) (_ *ServicePublishResult, err error) { + ctx, span := tracing.Start(ctx, "container.publish") + defer func() { span.EndWithStatus(err) }() + span.SetAttributes( + attribute.Bool("container.remotebuild", serviceConfig.Docker.RemoteBuild), + ) + var remoteImage string - var err error // Parse PublishOptions into ImageOverride imageOverride, err := parseImageOverride(options) @@ -773,16 +799,12 @@ func (ch *ContainerHelper) runRemoteBuild( env *environment.Environment, progress *async.Progress[ServiceProgress], imageOverride *imageOverride, -) (string, error) { - dockerOptions := getDockerOptionsWithDefaults(serviceConfig.Docker) - - if !filepath.IsAbs(dockerOptions.Path) { - dockerOptions.Path = filepath.Join(serviceConfig.Path(), dockerOptions.Path) - } +) (_ string, err error) { + ctx, span := tracing.Start(ctx, "container.remotebuild") + defer func() { span.EndWithStatus(err) }() - if !filepath.IsAbs(dockerOptions.Context) { - dockerOptions.Context = filepath.Join(serviceConfig.Path(), dockerOptions.Context) - } + dockerOptions := getDockerOptionsWithDefaults(serviceConfig.Docker) + resolveDockerPaths(serviceConfig, &dockerOptions) if dockerOptions.Platform != "linux/amd64" { return "", fmt.Errorf("remote build only supports the linux/amd64 platform") @@ -1129,3 +1151,31 @@ func getDockerOptionsWithDefaults(options DockerProjectOptions) DockerProjectOpt return options } + +// resolveServiceDir returns the directory for the service. ServiceConfig.Path() +// may return a file path (e.g., .csproj for Aspire/project.v1), so we check +// whether the path is a directory and fall back to filepath.Dir if it is not. 
+func resolveServiceDir(serviceConfig *ServiceConfig) string { + servicePath := serviceConfig.Path() + if info, err := os.Stat(servicePath); err == nil && info.IsDir() { + return servicePath + } + return filepath.Dir(servicePath) +} + +// resolveDockerPaths resolves docker.path and docker.context to absolute paths. +// Both user-specified and default paths are resolved relative to the service +// directory to preserve backward compatibility. +func resolveDockerPaths(serviceConfig *ServiceConfig, opts *DockerProjectOptions) { + serviceDir := resolveServiceDir(serviceConfig) + if !filepath.IsAbs(opts.Path) { + opts.Path = filepath.Join(serviceDir, opts.Path) + } + if !filepath.IsAbs(opts.Context) { + opts.Context = filepath.Join(serviceDir, opts.Context) + } + // Clean resolved paths to eliminate ".." components and normalize separators, + // preventing path traversal outside the expected directory tree. + opts.Path = filepath.Clean(opts.Path) + opts.Context = filepath.Clean(opts.Context) +} diff --git a/cli/azd/pkg/project/container_helper_coverage3_test.go b/cli/azd/pkg/project/container_helper_coverage3_test.go index e74d5e8c10c..58c1e66c93e 100644 --- a/cli/azd/pkg/project/container_helper_coverage3_test.go +++ b/cli/azd/pkg/project/container_helper_coverage3_test.go @@ -4,6 +4,8 @@ package project import ( + "os" + "path/filepath" "testing" "github.com/azure/azure-dev/cli/azd/pkg/tools/docker" @@ -47,3 +49,128 @@ func Test_getDockerOptionsWithDefaults_Coverage3(t *testing.T) { assert.Equal(t, ".", result.Context) }) } + +func Test_resolveDockerPaths(t *testing.T) { + projectPath := t.TempDir() + servicePath := filepath.Join(projectPath, "src", "web") + require.NoError(t, os.MkdirAll(servicePath, 0755)) + + t.Run("DefaultPaths_RelativeToServiceDir", func(t *testing.T) { + svc := &ServiceConfig{ + Project: &ProjectConfig{Path: projectPath}, + RelativePath: filepath.Join("src", "web"), + Docker: DockerProjectOptions{}, // no user overrides + } + opts := 
getDockerOptionsWithDefaults(svc.Docker) + resolveDockerPaths(svc, &opts) + + assert.Equal(t, filepath.Join(servicePath, "Dockerfile"), opts.Path) + assert.Equal(t, servicePath, opts.Context) + }) + + t.Run("UserSpecifiedPaths_RelativeToServiceDir", func(t *testing.T) { + svc := &ServiceConfig{ + Project: &ProjectConfig{Path: projectPath}, + RelativePath: filepath.Join("src", "web"), + Docker: DockerProjectOptions{ + Path: "docker/Dockerfile.prod", + Context: ".", + }, + } + opts := getDockerOptionsWithDefaults(svc.Docker) + resolveDockerPaths(svc, &opts) + + assert.Equal(t, filepath.Join(servicePath, "docker", "Dockerfile.prod"), opts.Path) + assert.Equal(t, servicePath, opts.Context) + }) + + t.Run("AbsolutePaths_Unchanged", func(t *testing.T) { + absDockerfile := filepath.Join(projectPath, "other", "Dockerfile") + absContext := filepath.Join(projectPath, "other", "ctx") + svc := &ServiceConfig{ + Project: &ProjectConfig{Path: projectPath}, + RelativePath: filepath.Join("src", "web"), + Docker: DockerProjectOptions{ + Path: absDockerfile, + Context: absContext, + }, + } + opts := getDockerOptionsWithDefaults(svc.Docker) + resolveDockerPaths(svc, &opts) + + assert.Equal(t, absDockerfile, opts.Path) + assert.Equal(t, absContext, opts.Context) + }) + + t.Run("DefaultPaths_RelativePathIsFile", func(t *testing.T) { + // When RelativePath points to a file (e.g., Aspire project.v1 .csproj), + // default docker paths should resolve relative to the parent directory. 
+ serviceDir := filepath.Join(projectPath, "src") + require.NoError(t, os.MkdirAll(serviceDir, 0755)) + csprojFile := filepath.Join(serviceDir, "app.csproj") + require.NoError(t, os.WriteFile(csprojFile, []byte(""), 0600)) + + svc := &ServiceConfig{ + Project: &ProjectConfig{Path: projectPath}, + RelativePath: filepath.Join("src", "app.csproj"), + Docker: DockerProjectOptions{}, // no user overrides + } + opts := getDockerOptionsWithDefaults(svc.Docker) + resolveDockerPaths(svc, &opts) + + // Defaults should resolve to the directory containing app.csproj, not app.csproj itself. + assert.Equal(t, filepath.Join(serviceDir, "Dockerfile"), opts.Path) + assert.Equal(t, serviceDir, opts.Context) + }) + + t.Run("MixedPaths_UserPathDefaultContext", func(t *testing.T) { + svc := &ServiceConfig{ + Project: &ProjectConfig{Path: projectPath}, + RelativePath: filepath.Join("src", "web"), + Docker: DockerProjectOptions{ + Path: "docker/Dockerfile.dev", + // Context not set — will default to "." + }, + } + opts := getDockerOptionsWithDefaults(svc.Docker) + resolveDockerPaths(svc, &opts) + + // Both resolve relative to service dir + assert.Equal(t, filepath.Join(servicePath, "docker", "Dockerfile.dev"), opts.Path) + assert.Equal(t, servicePath, opts.Context) + }) + + t.Run("PathTraversal_DotDot_Normalized", func(t *testing.T) { + svc := &ServiceConfig{ + Project: &ProjectConfig{Path: projectPath}, + RelativePath: filepath.Join("src", "web"), + Docker: DockerProjectOptions{ + Path: "../shared/Dockerfile", + Context: "../shared", + }, + } + opts := getDockerOptionsWithDefaults(svc.Docker) + resolveDockerPaths(svc, &opts) + + // ".." 
segments are resolved and cleaned by filepath.Clean + assert.Equal(t, filepath.Join(projectPath, "src", "shared", "Dockerfile"), opts.Path) + assert.Equal(t, filepath.Join(projectPath, "src", "shared"), opts.Context) + }) + + t.Run("PathTraversal_DoubleDotDot_Normalized", func(t *testing.T) { + svc := &ServiceConfig{ + Project: &ProjectConfig{Path: projectPath}, + RelativePath: filepath.Join("src", "web"), + Docker: DockerProjectOptions{ + Path: "../../Dockerfile", + Context: "../../", + }, + } + opts := getDockerOptionsWithDefaults(svc.Docker) + resolveDockerPaths(svc, &opts) + + // Double ".." walks back to projectPath root + assert.Equal(t, filepath.Join(projectPath, "Dockerfile"), opts.Path) + assert.Equal(t, projectPath, opts.Context) + }) +} diff --git a/cli/azd/pkg/project/container_helper_test.go b/cli/azd/pkg/project/container_helper_test.go index bac6366aea3..7d484979454 100644 --- a/cli/azd/pkg/project/container_helper_test.go +++ b/cli/azd/pkg/project/container_helper_test.go @@ -683,7 +683,7 @@ func Test_ContainerHelper_Deploy(t *testing.T) { mockContainerRegistryService.AssertCalled( t, "Login", - *mockContext.Context, + mock.Anything, env.GetSubscriptionId(), registryName, ) @@ -921,7 +921,7 @@ func Test_ContainerHelper_Credential_Retry(t *testing.T) { func setupContainerRegistryMocks(mockContext *mocks.MockContext, mockContainerRegistryService *mock.Mock) { mockContainerRegistryService.On( "Login", - *mockContext.Context, + mock.Anything, mock.AnythingOfType("string"), mock.AnythingOfType("string")). 
Return(nil) @@ -1250,7 +1250,7 @@ func Test_ContainerHelper_Publish(t *testing.T) { mockContainerRegistryService.AssertCalled( t, "Login", - *mockContext.Context, + mock.Anything, env.GetSubscriptionId(), registryName, ) diff --git a/cli/azd/pkg/project/framework_service_docker.go b/cli/azd/pkg/project/framework_service_docker.go index 68c8915deb1..dab9544e1b2 100644 --- a/cli/azd/pkg/project/framework_service_docker.go +++ b/cli/azd/pkg/project/framework_service_docker.go @@ -7,7 +7,6 @@ import ( "context" "errors" "os" - "path/filepath" "github.com/azure/azure-dev/cli/azd/pkg/alpha" "github.com/azure/azure-dev/cli/azd/pkg/async" @@ -171,21 +170,10 @@ func useDotnetPublishForDockerBuild(serviceConfig *ServiceConfig) bool { serviceConfig.useDotNetPublishForDockerBuild = new(false) if serviceConfig.Language.IsDotNet() { - projectPath := serviceConfig.Path() - dockerOptions := getDockerOptionsWithDefaults(serviceConfig.Docker) + resolveDockerPaths(serviceConfig, &dockerOptions) - dockerfilePath := dockerOptions.Path - if !filepath.IsAbs(dockerfilePath) { - s, err := os.Stat(projectPath) - if err == nil && s.IsDir() { - dockerfilePath = filepath.Join(projectPath, dockerfilePath) - } else { - dockerfilePath = filepath.Join(filepath.Dir(projectPath), dockerfilePath) - } - } - - if _, err := os.Stat(dockerfilePath); errors.Is(err, os.ErrNotExist) { + if _, err := os.Stat(dockerOptions.Path); errors.Is(err, os.ErrNotExist) { serviceConfig.useDotNetPublishForDockerBuild = new(true) } } diff --git a/cli/azd/pkg/project/framework_service_docker_test.go b/cli/azd/pkg/project/framework_service_docker_test.go index 91da78812f5..dbc19518b1a 100644 --- a/cli/azd/pkg/project/framework_service_docker_test.go +++ b/cli/azd/pkg/project/framework_service_docker_test.go @@ -46,6 +46,9 @@ services: ` ran := false + // Create service temp dir upfront so the mock closure can reference it. 
+ serviceDir := t.TempDir() + env := environment.NewWithValues("test-env", nil) env.SetSubscriptionId("sub") @@ -81,10 +84,10 @@ services: require.Equal(t, []string{ "build", - "-f", "./Dockerfile", + "-f", filepath.Join(serviceDir, "Dockerfile"), "--platform", docker.DefaultPlatform, "-t", "test-proj-web", - ".", + serviceDir, }, argsNoFile) // create the file as expected @@ -102,10 +105,9 @@ services: require.NoError(t, err) service := projectConfig.Services["web"] - temp := t.TempDir() - service.Project.Path = temp + service.Project.Path = serviceDir service.RelativePath = "" - err = os.WriteFile(filepath.Join(temp, "Dockerfile"), []byte("FROM node:14"), 0600) + err = os.WriteFile(filepath.Join(serviceDir, "Dockerfile"), []byte("FROM node:14"), 0600) require.NoError(t, err) npmCli := node.NewCli(mockContext.CommandRunner) @@ -162,6 +164,10 @@ services: env := environment.NewWithValues("test-env", nil) env.SetSubscriptionId("sub") + + // Create service temp dir upfront so the mock closure can reference it. 
+ serviceDir := t.TempDir() + mockContext := mocks.NewMockContext(context.Background()) envManager := &mockenv.MockEnvManager{} envManager.On("Get", mock.Anything, "test-env").Return(env, nil) @@ -196,10 +202,10 @@ services: require.Equal(t, []string{ "build", - "-f", "./Dockerfile.dev", + "-f", filepath.Join(serviceDir, "Dockerfile.dev"), "--platform", docker.DefaultPlatform, "-t", "test-proj-web", - "../", + filepath.Join(serviceDir, ".."), }, argsNoFile) // create the file as expected @@ -221,10 +227,9 @@ services: require.NoError(t, err) service := projectConfig.Services["web"] - temp := t.TempDir() - service.Project.Path = temp + service.Project.Path = serviceDir service.RelativePath = "" - err = os.WriteFile(filepath.Join(temp, "Dockerfile.dev"), []byte("FROM node:14"), 0600) + err = os.WriteFile(filepath.Join(serviceDir, "Dockerfile.dev"), []byte("FROM node:14"), 0600) require.NoError(t, err) internalFramework := NewNodeProject(npmCli, env, mockContext.CommandRunner) @@ -406,7 +411,7 @@ func Test_DockerProject_Build(t *testing.T) { language: ServiceLanguageJavaScript, hasDockerFile: true, init: func(t *testing.T) error { - os.Setenv("AZD_CUSTOM_OS_VAR", "os-value") + t.Setenv("AZD_CUSTOM_OS_VAR", "os-value") return nil }, env: environment.NewWithValues("test", map[string]string{ @@ -559,7 +564,30 @@ func Test_DockerProject_Build(t *testing.T) { require.NoError(t, err) require.NotNil(t, result) require.Equal(t, tt.expectedBuildResult, result) - require.Equal(t, tt.expectedDockerBuildArgs, dockerBuildArgs.Args) + + if tt.expectedDockerBuildArgs != nil { + // Build expected absolute paths directly from test temp dir. + // Do NOT call resolveDockerPaths here - it is the production helper + // under test, so using it would make assertions self-referential. 
+ serviceDir := filepath.Join(temp, tt.project) + expected := make([]string, len(tt.expectedDockerBuildArgs)) + copy(expected, tt.expectedDockerBuildArgs) + for i := range expected { + if i > 0 && expected[i-1] == "-f" { + if !filepath.IsAbs(expected[i]) { + expected[i] = filepath.Clean(filepath.Join(serviceDir, expected[i])) + } + } + } + if n := len(expected); n > 0 { + last := expected[n-1] + if !filepath.IsAbs(last) && + (last == "." || strings.HasPrefix(last, "./") || strings.HasPrefix(last, "../")) { + expected[n-1] = filepath.Clean(filepath.Join(serviceDir, last)) + } + } + require.Equal(t, expected, dockerBuildArgs.Args) + } } }) } diff --git a/docs/specs/perf-foundations/spec.md b/docs/specs/perf-foundations/spec.md new file mode 100644 index 00000000000..67309a84718 --- /dev/null +++ b/docs/specs/perf-foundations/spec.md @@ -0,0 +1,287 @@ +# Performance Foundations for Azure Developer CLI + +## Overview + +This changeset introduces targeted performance optimizations to reduce deployment latency in `azd`, +particularly when multiple services are provisioned and deployed concurrently. Each change addresses a +specific bottleneck observed during parallel `azd up` runs with 4-8 services. + +## Changes + +### 1. HTTP Connection Pooling (TunedTransport) + +**Files:** `pkg/httputil/util.go`, `cmd/deps.go` + +**Problem:** Go's `http.DefaultTransport` limits connections to 2 per host (`MaxIdleConnsPerHost=2`) and +100 total idle connections. When `azd` deploys 8 services in parallel, each hitting ARM endpoints +(`management.azure.com`), connections are torn down and re-established constantly. Every new TLS 1.2+ +handshake to ARM costs 1-2 round trips (50-150ms per handshake depending on region). 
+
+**Solution:** `TunedTransport()` clones `http.DefaultTransport` and raises:
+- `MaxIdleConns`: 100 -> 200 (total idle pool across all hosts)
+- `MaxConnsPerHost`: 0 (unlimited) -> 50 (bounded but generous)
+- `MaxIdleConnsPerHost`: 2 -> 50 (matches MaxConnsPerHost so idle connections aren't evicted)
+- `IdleConnTimeout`: 90s (kept explicit for documentation; matches Go default)
+- `DisableKeepAlives`: false (explicit; HTTP/1.1 keep-alive per RFC 7230 Section 6.3)
+
+**Evidence:** Go stdlib defaults are documented in `net/http/transport.go`. The per-host idle limit of 2
+is the primary bottleneck; raising it to match `MaxConnsPerHost` ensures connections created during a
+burst are retained for subsequent requests to the same host. The 90s idle timeout is sufficient because
+`azd` operations complete within seconds of each other during parallel deployment.
+
+**Wiring:** `cmd/deps.go` replaces `http.DefaultClient` with `&http.Client{Transport: TunedTransport()}`
+so all SDK clients created through dependency injection inherit the tuned transport.
+
+### 2. ARM Client Caching (sync.Map)
+
+**Files:** `pkg/azapi/resource_service.go`, `pkg/azapi/standard_deployments.go`,
+`pkg/azapi/stack_deployments.go`
+
+**Problem:** ARM SDK clients (`armresources.Client`, `DeploymentsClient`, `DeploymentOperationsClient`,
+`armdeploymentstacks.Client`) were re-created on every API call. Each construction builds an HTTP
+pipeline (retry policy, logging policy, auth policy). While not as expensive as a TLS handshake, this
+adds unnecessary CPU and allocation overhead when the same subscription is used repeatedly.
+
+**Solution:** Cache clients in `sync.Map` fields keyed by subscription ID. The pattern uses
+`Load` for the fast path and `LoadOrStore` on miss. The "benign race" comment documents that concurrent
+cache misses may create duplicate clients; `LoadOrStore` ensures only one is retained.
This is safe
+because Azure SDK ARM clients are stateless and goroutine-safe (they hold only the pipeline
+configuration, not per-request state).
+
+**Evidence:** Azure SDK for Go documentation confirms ARM clients are safe for concurrent use:
+"A client is safe for concurrent use across goroutines" (azure-sdk-for-go design guidelines). `sync.Map`
+is the standard Go pattern for read-heavy caches with infrequent writes, avoiding lock contention on the
+hot path.
+
+### 3. Adaptive Poll Frequency
+
+**Files:** `pkg/azapi/standard_deployments.go`, `pkg/azapi/stack_deployments.go`
+
+**Problem:** `PollUntilDone(ctx, nil)` uses the Azure SDK's default polling interval of 30 seconds.
+For deploy operations that typically complete in 30-120 seconds, this means the first successful poll may
+arrive 30s after the deployment actually finished, adding unnecessary wall-clock latency.
+
+**Solution:** Two tuned frequencies:
+- `deployPollFrequency = 5s` for deploy and delete operations (variable completion time; 6x faster
+  than the 30s default while leaving ample headroom against ARM rate limits)
+- `slowPollFrequency = 10s` for WhatIf and Validate operations (consistently slow at 30-90s; aggressive
+  polling wastes ARM read quota without benefit)
+
+The 5s interval balances latency reduction against ARM rate limits: 1200 reads per 5 minutes per
+subscription. With 8 parallel deployments polling at 5s, that's 96 reads/minute per operation type,
+leaving substantial headroom for other ARM reads (list operations, status checks, etc.).
+
+**Evidence:** Azure SDK `runtime.PollUntilDoneOptions.Frequency` is the documented mechanism. ARM rate
+limits are documented at
+`learn.microsoft.com/en-us/azure/azure-resource-manager/management/request-limits-and-throttling`.
+
+### 4.
Zip Deploy Retry with SCM Readiness Probe + +**Files:** `pkg/azapi/webapp.go`, `pkg/azsdk/zip_deploy_client.go` + +**Problem:** During concurrent `azd up`, ARM applies site configuration (app settings) shortly after the +App Service resource is created. This triggers an SCM (Kudu) container restart. If the zip deploy starts +while the SCM container is restarting, the Oryx build process fails with "the build process failed". +This is a transient failure specific to the concurrent provisioning + deployment pattern. + +**Solution:** +- `isBuildFailure(err)` detects the specific transient error string ("the build process failed") while + excluding genuine build errors that contain "logs for more info" +- On build failure, retry up to 2 additional times (3 total attempts) +- Before each retry, `waitForScmReady()` polls the SCM `/api/deployments` endpoint until it returns + HTTP 200, indicating the container has finished restarting (90s timeout, 5s poll interval) +- The zip file `ReadSeeker` is rewound via `Seek(0, io.SeekStart)` before each retry +- After the retry loop exits (success or exhausted), the zip file is also rewound before the non-Linux + fallback `Deploy()` path, since the tracked deploy consumed the reader + +**OTel Tracing:** The entire `DeployAppServiceZip` function is wrapped in a tracing span named +`"deploy.appservice.zip"` with attributes: `deploy.appservice.app` (app name), +`deploy.appservice.rg` (resource group), `deploy.appservice.linux` (boolean), and +`deploy.appservice.attempt` (1-indexed attempt number, updated per retry iteration). The span uses +named return `err` with `span.EndWithStatus(err)` to automatically record failure status. + +`IsScmReady()` on `ZipDeployClient` sends a lightweight GET to `/api/deployments`. Connection errors +return `(false, nil)` rather than propagating - the SCM is still restarting, which is the expected state. +The `//nolint:nilerr` directive documents this intentional error suppression for the linter. 
+ +### 5. ACR Credential Exponential Backoff + +**File:** `pkg/project/container_helper.go` + +**Problem:** ACR credential retrieval after resource creation used a constant 20-second retry delay +(`retry.NewConstant(20s)`). In the common case, credentials are available within 1-5 seconds after the +ACR resource is provisioned. The 20s constant delay means even a single retry wastes 15-19 seconds. + +**Solution:** Changed to exponential backoff starting at 2 seconds: +`retry.NewExponential(2s)` with max 5 retries produces delays of 2s, 4s, 8s, 16s, 32s (62s worst case +vs 60s worst case with the old constant 20s * 3 retries). The exponential curve means most transient +404s (DNS propagation, eventual consistency) resolve on the first or second retry, while the longer +tail preserves roughly the same total retry window for slow DNS propagation edge cases. + +**Evidence:** Azure DNS propagation FAQ confirms changes "typically take effect within 60 seconds" but +are often faster. The old link in the comment +(`learn.microsoft.com/en-us/azure/dns/dns-faq#how-long-does-it-take-for-dns-changes-to-take-effect-`) +documents worst-case TTL, not typical latency. + +### 6. Docker Path Resolution Fix + +**Files:** `pkg/project/container_helper.go`, `pkg/project/framework_service_docker.go`, +`pkg/project/framework_service_docker_test.go` + +**Problem:** Docker path resolution was inconsistent across `Build()`, `runRemoteBuild()`, +`packBuild()`, and `useDotNetPublishForDockerBuild()`. User-specified `docker.path` and +`docker.context` in `azure.yaml` are relative to the project root (where `azure.yaml` lives), but +default paths (`./Dockerfile`, `.`) are relative to the service directory. The old code used +`serviceConfig.Path()` for both cases, which broke when a .NET service's `azure.yaml` specified a +custom `docker.path` pointing to a Dockerfile outside the service directory. 
+ +**Solution:** Extracted `resolveDockerPaths()` that resolves docker.path and docker.context +to absolute paths. User-specified paths (from `azure.yaml`) are resolved relative to the project +root (where `azure.yaml` lives), while default paths (`./Dockerfile`, `.`) are resolved relative to +the service directory via `resolveServiceDir()`. This centralizes path resolution that was +previously duplicated across `Build()`, `runRemoteBuild()`, and other call sites. + +Called from `Build()`, `runRemoteBuild()`, and `useDotNetPublishForDockerBuild()` (which now calls +`resolveDockerPaths()` directly instead of reimplementing the logic). Tests updated to expect +absolute resolved paths, including dynamic resolution in the table-driven `Test_DockerProject_Build` +test. `filepath.Clean` is applied to both resolved paths to normalize separators and prevent +path traversal. + +### 7. ReadRawResponse Body Close Fix + +**File:** `pkg/httputil/util.go` + +**Problem:** `ReadRawResponse` read `response.Body` via `io.ReadAll` but never closed it. While Go's +HTTP client reuses connections when the body is fully read and closed, an unclosed body prevents the +underlying TCP connection from returning to the pool. + +**Solution:** Added `defer response.Body.Close()` at the top of the function. + +### 8. UpdateAppServiceAppSetting Helper + +**File:** `pkg/azapi/webapp.go` + +**Problem:** No API existed to update a single App Service application setting without replacing the +entire set. Callers that needed to add or modify one setting had to manually read-modify-write the +full settings dictionary, duplicating boilerplate. + +**Solution:** Added `UpdateAppServiceAppSetting(ctx, subscriptionId, resourceGroup, appName, key, +value)` that reads existing settings via `ListApplicationSettings`, adds/overwrites the specified +key-value pair, and writes all settings back via `UpdateApplicationSettings`. This preserves other +settings that are not being modified. 
The function handles the nil-properties edge case by +initializing the map if empty. + +### 9. OTel Tracing Spans for Deployment Profiling + +**Files:** `pkg/azapi/standard_deployments.go`, `pkg/azapi/stack_deployments.go`, +`pkg/project/container_helper.go`, `pkg/project/container_helper_test.go` + +**Problem:** Only `DeployAppServiceZip` had OTel tracing. ARM deployments, validation, +WhatIf operations, and container build/publish had no span coverage, making it impossible to +profile where wall-clock time is spent during parallel deployment. + +**Solution:** Added 11 OTel tracing spans to all performance-critical deployment paths using +the established pattern: `tracing.Start(ctx, spanName)` + named return `err` + `defer func() +{ span.EndWithStatus(err) }()`. Each span includes `attribute.String` for subscription, +resource group, and deployment name to enable correlation in trace viewers. + +**ARM Standard Deployments** (`standard_deployments.go` — 6 spans): +- `arm.deploy.subscription` — `DeployToSubscription` (ARM deploy at subscription scope) +- `arm.deploy.resourcegroup` — `DeployToResourceGroup` (ARM deploy at resource group scope) +- `arm.whatif.subscription` — `WhatIfDeployToSubscription` (WhatIf preview at subscription) +- `arm.whatif.resourcegroup` — `WhatIfDeployToResourceGroup` (WhatIf preview at resource group) +- `arm.validate.subscription` — `ValidatePreflightToSubscription` (preflight validation) +- `arm.validate.resourcegroup` — `ValidatePreflightToResourceGroup` (preflight validation) + +**Deployment Stacks** (`stack_deployments.go` — 2 spans): +- `arm.stack.deploy.subscription` — `DeployToSubscription` (deployment stack at subscription) +- `arm.stack.deploy.resourcegroup` — `DeployToResourceGroup` (deployment stack at resource group) + +**Container Operations** (`container_helper.go` — 3 spans): +- `container.publish` — `Publish` (end-to-end container publish orchestration; includes + `container.remotebuild` boolean attribute) +- 
`container.credentials` — `Credentials` (ACR credential retrieval with exponential backoff) +- `container.remotebuild` — `runRemoteBuild` (ACR remote build including context upload) + +**Test Updates** (`container_helper_test.go`): Mock assertions for `Login` updated from exact +context matching (`*mockContext.Context`) to `mock.Anything` because `tracing.Start` wraps the +context with a span value, causing strict equality to fail. This applies to +`setupContainerRegistryMocks`, `Test_ContainerHelper_Deploy`, and +`Test_ContainerHelper_Deploy_ImageOverride` test assertions. + +**Evidence:** These spans cover the four longest-running operation categories during parallel +deployment: ARM provisioning (30-120s), WhatIf/Validate (30-90s), container build/push (30- +300s), and ACR credential retrieval (2-62s with backoff). Together with the existing +`deploy.appservice.zip` span, they provide full end-to-end visibility for profiling `azd up`. + +### 10. Supporting Changes + +**Files:** `.gitignore`, `cli/azd/.vscode/cspell-azd-dictionary.txt` + +- `.gitignore`: Added coverage artifacts (`cover-*`, `cover_*`, `review-*.diff`) and Playwright MCP + directory (`.playwright-mcp/`) +- `cspell-azd-dictionary.txt`: Added `keepalives` (HTTP keep-alive terminology in TunedTransport), + `nilerr` (golangci-lint nolint directive in `IsScmReady`), `appsettings` (App Service terminology) + +### 11. Container App Adaptive Poll Frequency + +**File:** `pkg/containerapps/container_app.go` + +**Problem:** Container App create/update/job operations used `PollUntilDone(ctx, nil)`, defaulting +to the Azure SDK's 30-second polling interval. Unlike ARM template deployments (which go through +`standard_deployments.go` where poll frequency was already tuned in §3), Container App revision +updates go through a separate code path (`containerAppService.DeployYaml`, `AddRevision`, +`updateContainerApp`, `UpdateContainerAppJob`). 
Templates with multiple Container Apps (e.g., the +5-service "shop" template) suffered compounded tail latency: up to 28 seconds wasted per service × +5 services = **140 seconds** of unnecessary wait time. + +**Solution:** Added `containerAppPollFrequency = 5 * time.Second` constant and applied it to all +three `PollUntilDone` call sites: +- `DeployYaml` → `BeginCreateOrUpdate` poller (line 321) +- `updateContainerApp` → `BeginUpdate` poller (line 546) +- `UpdateContainerAppJob` → `BeginUpdate` poller (line 780) + +**Evidence:** Container App revision updates typically complete in 10-60 seconds (observed in perf +benchmarks). The same 5s frequency used for ARM deploy operations (§3) balances latency reduction +(6x faster than the 30s default) against ARM rate limits. With 8 parallel services, this generates +~96 polls/min per operation type — well within the 1200 reads/5min/subscription limit. + +### 12. ACR Login Caching per Registry + +**File:** `pkg/azapi/container_registry.go` + +**Problem:** `containerRegistryService.Login()` was called for every service during deploy, +regardless of whether the same registry had already been authenticated. For templates pushing +multiple container images to the same ACR (e.g., shop with 5 services sharing one registry), +this meant 4 redundant credential exchanges (AAD → ACR token swap via `/oauth2/exchange`) and +4 redundant `docker login` commands. Each credential exchange involves an HTTP round-trip to the +ACR token endpoint plus an ARM token acquisition — typically 2-5 seconds per call. + +**Solution:** Added two-layer deduplication to `containerRegistryService`: +- `loginGroup singleflight.Group` deduplicates in-flight concurrent login attempts. Uses `DoChan` + so each caller can independently cancel via its own context without affecting other waiters. + The shared work runs under `context.WithoutCancel` to avoid tying it to any single caller. 
+- `loginDone sync.Map` tracks registries that have already been authenticated this session. + Subsequent `Login()` calls return immediately with a log message. + +**Evidence:** Docker's credential store also caches logins, so the `docker login` call itself +would have been idempotent. However, the expensive part is the *credential exchange* (`Credentials()` +→ `getAcrToken()` → AAD token → ACR refresh token), which is not cached by Docker. With 5 +services sharing one ACR, this saves 4 × ~3s = **~12 seconds** plus 4 redundant Docker CLI +invocations. + +## Test Coverage + +| Change | Test File | Coverage | +|--------|-----------|----------| +| `TunedTransport` | `pkg/httputil/util_test.go` | Verifies all pool params; verifies DefaultTransport not mutated | +| `isBuildFailure` | `pkg/azapi/webapp_test.go` | 6 cases: nil, unrelated, exact match, wrapped, real build failure, partial | +| Docker path resolution | `pkg/project/framework_service_docker_test.go` | Updated to verify absolute resolved paths via dynamic resolution | +| `resolveDockerPaths` | `pkg/project/container_helper_coverage3_test.go` | 4 sub-tests: defaults, user-specified, absolute, mixed | +| Docker build args | `pkg/project/framework_service_docker_test.go` | Table-driven tests resolve expected paths to match production behavior | +| Tracing context propagation | `pkg/project/container_helper_test.go` | Mock assertions updated for tracing context wrapping (`mock.Anything`) | + +The ARM client caching, TunedTransport wiring, poll frequency (both ARM and Container App), +SCM retry logic, ACR login caching, `UpdateAppServiceAppSetting`, and OTel tracing spans are +infrastructure changes that operate through Azure SDK interactions and are validated through +integration and end-to-end playback tests rather than isolated unit tests.