Skip to content

Commit 9350962

Browse files
committed
Handle error events in HCS notifications.
The shim parses the JSON result document and handles the error events (via `processHcsResult`) returned directly by HCS calls (e.g.,`vmcompute.HcsCreateComputeSystem`), but ignores the JSON (data) payload for notifications (which are either received from a `processAsyncHcsResult` in the appropriate system or process call in `"internal.hcs"`, or via `waitForNotification` in `waitBackground`). This leads to ambiguous failure errors (e.g., `"The data is invalid."`) that require ETW traces to track down the appropriate HCS logs, even though the error events already provide enough information to identify the issue. Add `processNotification()` to: 1. parse the `notificationData` JSON payload provided by HCS to the `notificationWatcher` callback into the an `hcsResult` struct; and 2. validate that the JSON data matches the notification HResult. Create a new error type (`resultError`) to mirror the original [HCS ResultError][result-error] and consolidate the events alongside the original error. Also, since `ErrorEvent`s are always converted to strings in the context of concatenating several of them together, add an `(*ErrorEvent).writeTo(*string.Builder)` function to provide more efficient error string generation for `(HCS|System|Process)Error`s. Additionally, consolidate the joining and formatting of error events for those error types. [result-error]: https://learn.microsoft.com/en-us/virtualization/api/hcs/schemareference#ResultError Signed-off-by: Hamza El-Saawy <hamzaelsaawy@microsoft.com>
1 parent 89d2357 commit 9350962

7 files changed

Lines changed: 201 additions & 70 deletions

File tree

internal/hcs/callback.go

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ package hcs
44

55
import (
66
"context"
7+
"encoding/json"
78
"fmt"
89
"sync"
910
"sync/atomic"
@@ -18,6 +19,8 @@ import (
1819
)
1920

2021
var (
22+
// TODO: don't delete notification contexts on close, so callback can handle delayed notifications
23+
2124
// used to lock [callbackMap].
2225
callbackMapLock = sync.RWMutex{}
2326
callbackMap = map[callbackNumber]*notificationWatcherContext{}
@@ -167,14 +170,14 @@ func notificationWatcher(
167170
notificationStatus uintptr,
168171
notificationData *uint16,
169172
) uintptr {
170-
entry := log.G(context.Background()).WithFields(logrus.Fields{
173+
ctx, entry := log.SetEntry(context.Background(), logrus.Fields{
171174
logfields.CallbackNumber: callbackNum,
172175
"notification-type": notificationType.String(),
173176
})
174177

175-
var result error
176-
if int32(notificationStatus) < 0 {
177-
result = interop.Win32FromHresult(notificationStatus)
178+
result := processNotification(ctx, notificationStatus, notificationData)
179+
if result != nil {
180+
entry.Data[logrus.ErrorKey] = result
178181
}
179182

180183
callbackMapLock.RLock()
@@ -198,3 +201,44 @@ func notificationWatcher(
198201

199202
return 0
200203
}
204+
205+
// processNotification parses and validates HCS notifications and returns the result as an error.
206+
func processNotification(ctx context.Context, notificationStatus uintptr, notificationData *uint16) (err error) {
207+
// TODO: merge/unify with [processHcsResult]
208+
status := int32(notificationStatus)
209+
if status < 0 {
210+
err = interop.Win32FromHresult(notificationStatus)
211+
}
212+
213+
if notificationData == nil {
214+
return err
215+
}
216+
217+
// don't call CoTaskMemFree since HCS_NOTIFICATION_CALLBACK's notificationData is PCWSTR.
218+
resultJSON := interop.ConvertString(notificationData)
219+
result := &hcsResult{}
220+
if jsonErr := json.Unmarshal([]byte(resultJSON), result); jsonErr != nil {
221+
log.G(ctx).WithFields(logrus.Fields{
222+
logfields.JSON: resultJSON,
223+
logrus.ErrorKey: jsonErr,
224+
}).Warn("could not unmarshal HCS result")
225+
return err
226+
}
227+
log.G(ctx).WithField("result", result).Trace("parsed notification data")
228+
229+
// the HResult and data payload should have the same error value
230+
if result.Error < 0 && status < 0 && status != result.Error {
231+
log.G(ctx).WithFields(logrus.Fields{
232+
"status": status,
233+
"data": result.Error,
234+
}).Warn("mismatched notification status and data HResult values")
235+
}
236+
237+
if len(result.ErrorEvents) > 0 {
238+
return &resultError{
239+
Err: err,
240+
Events: result.ErrorEvents,
241+
}
242+
}
243+
return err
244+
}

internal/hcs/errors.go

Lines changed: 111 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,13 @@ import (
88
"errors"
99
"fmt"
1010
"net"
11+
"strings"
1112
"syscall"
1213

14+
"github.com/sirupsen/logrus"
15+
1316
"github.com/Microsoft/hcsshim/internal/log"
17+
"github.com/Microsoft/hcsshim/internal/logfields"
1418
)
1519

1620
var (
@@ -102,43 +106,95 @@ type ErrorEvent struct {
102106
//Data []EventData `json:"Data,omitempty"` // Omit this as HCS doesn't encode this well. It's more confusing to include. It is however logged in debug mode (see processHcsResult function)
103107
}
104108

105-
type hcsResult struct {
106-
Error int32
107-
ErrorMessage string
108-
ErrorEvents []ErrorEvent `json:"ErrorEvents,omitempty"`
109+
func (ev *ErrorEvent) String() string {
110+
sb := new(strings.Builder)
111+
ev.writeTo(sb)
112+
return sb.String()
109113
}
110114

111-
func (ev *ErrorEvent) String() string {
112-
evs := "[Event Detail: " + ev.Message
115+
func (ev *ErrorEvent) writeTo(b *strings.Builder) {
116+
// rough wag at needed length
117+
b.Grow(64 + len(ev.Message) + len(ev.StackTrace) + len(ev.Provider) + len(ev.Source))
118+
119+
// [strings.Builder] Write* functions always return nil errors
120+
_, _ = b.WriteString("[Event Detail: " + ev.Message)
113121
if ev.StackTrace != "" {
114-
evs += " Stack Trace: " + ev.StackTrace
122+
_, _ = b.WriteString(" Stack Trace: " + ev.StackTrace)
115123
}
116124
if ev.Provider != "" {
117-
evs += " Provider: " + ev.Provider
125+
_, _ = b.WriteString(" Provider: " + ev.Provider)
118126
}
119127
if ev.EventID != 0 {
120-
evs = fmt.Sprintf("%s EventID: %d", evs, ev.EventID)
128+
fmt.Fprintf(b, " EventID: %d", ev.EventID)
121129
}
122130
if ev.Flags != 0 {
123-
evs = fmt.Sprintf("%s flags: %d", evs, ev.Flags)
131+
fmt.Fprintf(b, " flags: %d", ev.Flags)
124132
}
125133
if ev.Source != "" {
126-
evs += " Source: " + ev.Source
134+
_, _ = b.WriteString(" Source: " + ev.Source)
127135
}
128-
evs += "]"
129-
return evs
136+
_ = b.WriteByte(']')
137+
}
138+
139+
type hcsResult struct {
140+
Error int32
141+
ErrorMessage string // ErrorMessage should be the same as `windows.Errno(Err).Error()`.
142+
ErrorEvents []ErrorEvent `json:"ErrorEvents,omitempty"`
143+
// TODO: AttributionRecords
130144
}
131145

132146
func processHcsResult(ctx context.Context, resultJSON string) []ErrorEvent {
133-
if resultJSON != "" {
134-
result := &hcsResult{}
135-
if err := json.Unmarshal([]byte(resultJSON), result); err != nil {
136-
log.G(ctx).WithError(err).Warning("Could not unmarshal HCS result")
137-
return nil
138-
}
139-
return result.ErrorEvents
147+
if resultJSON == "" {
148+
return nil
140149
}
141-
return nil
150+
151+
result := &hcsResult{}
152+
if err := json.Unmarshal([]byte(resultJSON), result); err != nil {
153+
log.G(ctx).WithFields(logrus.Fields{
154+
logfields.JSON: resultJSON,
155+
logrus.ErrorKey: err,
156+
}).Warn("Could not unmarshal HCS result")
157+
return nil
158+
}
159+
return result.ErrorEvents
160+
}
161+
162+
// TODO: move [resultError] and [ErrorEvent] to schema2 and handle parsing results/data directly in vmcompute
163+
// See: https://learn.microsoft.com/en-us/virtualization/api/hcs/schemareference#ResultError
164+
165+
// resultError describes an HCS operation result and allows retaining the ErrorEvents
166+
// when an [error] is expected but an [HcsError] is not appropriate (i.e., the operation is not known).
167+
//
168+
// It is used in [notificationWatcher] to send the [ErrorEvent]s in a callback's notification data
169+
// to the corresponding [waitForNotification] via a [notificationChannel].
170+
type resultError struct {
171+
Err error
172+
Events []ErrorEvent
173+
}
174+
175+
func (e *resultError) Error() string {
176+
return appendErrorEvents(e.Err.Error(), e.Events)
177+
}
178+
179+
func (e *resultError) Is(target error) bool {
180+
return errors.Is(e.Err, target)
181+
}
182+
183+
func (e *resultError) Unwrap() error {
184+
return e.Err
185+
}
186+
187+
// getEvents checks to see if err is a [resultError], and if so, returns the unwrapped error
188+
// and [ErrorEvent]s.
189+
//
190+
// Currently, only [notificationWatcher] creates [resultError]s, and they are subsequently
191+
// handled in [waitForNotification].
192+
func getEvents(err error) ([]ErrorEvent, error) {
193+
var rErr *resultError
194+
if errors.As(err, &rErr) {
195+
return rErr.Events, rErr.Err
196+
}
197+
return nil, err
142198
}
143199

144200
type HcsError struct {
@@ -149,12 +205,22 @@ type HcsError struct {
149205

150206
var _ net.Error = &HcsError{}
151207

152-
func (e *HcsError) Error() string {
153-
s := e.Op + ": " + e.Err.Error()
154-
for _, ev := range e.Events {
155-
s += "\n" + ev.String()
208+
func makeHCSError(op string, err error, events []ErrorEvent) error {
209+
// Don't double wrap errors
210+
var e *HcsError
211+
if errors.As(err, &e) {
212+
return err
213+
}
214+
215+
return &HcsError{
216+
Op: op,
217+
Err: err,
218+
Events: events,
156219
}
157-
return s
220+
}
221+
222+
func (e *HcsError) Error() string {
223+
return appendErrorEvents(e.Op+": "+e.Err.Error(), e.Events)
158224
}
159225

160226
func (e *HcsError) Is(target error) bool {
@@ -194,11 +260,7 @@ type SystemError struct {
194260
var _ net.Error = &SystemError{}
195261

196262
func (e *SystemError) Error() string {
197-
s := e.Op + " " + e.ID + ": " + e.Err.Error()
198-
for _, ev := range e.Events {
199-
s += "\n" + ev.String()
200-
}
201-
return s
263+
return appendErrorEvents(fmt.Sprintf("%s %s: %s", e.Op, e.ID, e.Err.Error()), e.Events)
202264
}
203265

204266
func makeSystemError(system *System, op string, err error, events []ErrorEvent) error {
@@ -228,11 +290,7 @@ type ProcessError struct {
228290
var _ net.Error = &ProcessError{}
229291

230292
func (e *ProcessError) Error() string {
231-
s := fmt.Sprintf("%s %s:%d: %s", e.Op, e.SystemID, e.Pid, e.Err.Error())
232-
for _, ev := range e.Events {
233-
s += "\n" + ev.String()
234-
}
235-
return s
293+
return appendErrorEvents(fmt.Sprintf("%s %s:%d: %s", e.Op, e.SystemID, e.Pid, e.Err.Error()), e.Events)
236294
}
237295

238296
func makeProcessError(process *Process, op string, err error, events []ErrorEvent) error {
@@ -241,6 +299,7 @@ func makeProcessError(process *Process, op string, err error, events []ErrorEven
241299
if errors.As(err, &e) {
242300
return err
243301
}
302+
244303
return &ProcessError{
245304
Pid: process.Pid(),
246305
SystemID: process.SystemID(),
@@ -252,6 +311,22 @@ func makeProcessError(process *Process, op string, err error, events []ErrorEven
252311
}
253312
}
254313

314+
// common formatting for error strings followed by event data,
315+
func appendErrorEvents(s string, events []ErrorEvent) string {
316+
if len(events) == 0 {
317+
return s
318+
}
319+
320+
sb := new(strings.Builder)
321+
_, _ = sb.WriteString(s)
322+
for _, ev := range events {
323+
// don't join with newlines since those are ... awkward within error strings
324+
_, _ = sb.WriteString(": ")
325+
ev.writeTo(sb)
326+
}
327+
return sb.String()
328+
}
329+
255330
// IsNotExist checks if an error is caused by the Container or Process not existing.
256331
// Note: Currently, ErrElementNotFound can mean that a Process has either
257332
// already exited, or does not exist. Both IsAlreadyStopped and IsNotExist

internal/hcs/process.go

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -232,16 +232,15 @@ func (process *Process) waitBackground() {
232232
trace.Int64Attribute(logfields.ProcessID, int64(process.processID)))
233233

234234
var (
235-
err error
236235
exitCode = -1
237236
propertiesJSON string
238237
resultJSON string
239238
)
240239

241-
err = waitForNotification(ctx, process.callbackNumber, hcsNotificationProcessExited, nil)
240+
events, err := waitForNotification(ctx, process.callbackNumber, hcsNotificationProcessExited, nil)
242241
if err != nil {
243-
err = makeProcessError(process, operation, err, nil)
244-
log.G(ctx).WithError(err).Error("failed wait")
242+
err = makeProcessError(process, operation, err, events)
243+
log.G(ctx).WithError(err).Error("failed wait on process exit")
245244
} else {
246245
process.handleLock.RLock()
247246
defer process.handleLock.RUnlock()

internal/hcs/service.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ func GetServiceProperties(ctx context.Context, q hcsschema.PropertyQuery) (*hcss
2121
propertiesJSON, resultJSON, err := vmcompute.HcsGetServiceProperties(ctx, string(queryb))
2222
events := processHcsResult(ctx, resultJSON)
2323
if err != nil {
24-
return nil, &HcsError{Op: operation, Err: err, Events: events}
24+
return nil, makeHCSError(operation, err, events)
2525
}
2626

2727
if propertiesJSON == "" {
@@ -45,7 +45,7 @@ func ModifyServiceSettings(ctx context.Context, settings hcsschema.ModificationR
4545
resultJSON, err := vmcompute.HcsModifyServiceSettings(ctx, string(settingsJSON))
4646
events := processHcsResult(ctx, resultJSON)
4747
if err != nil {
48-
return &HcsError{Op: operation, Err: err, Events: events}
48+
return makeHCSError(operation, err, events)
4949
}
5050
return nil
5151
}

internal/hcs/system.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ func GetComputeSystems(ctx context.Context, q schema1.ComputeSystemQuery) ([]sch
180180
computeSystemsJSON, resultJSON, err := vmcompute.HcsEnumerateComputeSystems(ctx, string(queryb))
181181
events := processHcsResult(ctx, resultJSON)
182182
if err != nil {
183-
return nil, &HcsError{Op: operation, Err: err, Events: events}
183+
return nil, makeHCSError(operation, err, events)
184184
}
185185

186186
if computeSystemsJSON == "" {
@@ -284,15 +284,16 @@ func (computeSystem *System) waitBackground() {
284284
defer span.End()
285285
span.AddAttributes(trace.StringAttribute(logfields.SystemID, computeSystem.id))
286286

287-
err := waitForNotification(ctx, computeSystem.callbackNumber, hcsNotificationSystemExited, nil)
287+
events, err := waitForNotification(ctx, computeSystem.callbackNumber, hcsNotificationSystemExited, nil)
288288
if err == nil {
289289
log.G(ctx).Debug("system exited")
290290
} else if errors.Is(err, ErrVmcomputeUnexpectedExit) {
291291
log.G(ctx).Debug("unexpected system exit")
292292
computeSystem.exitError = makeSystemError(computeSystem, operation, err, nil)
293293
err = nil
294294
} else {
295-
err = makeSystemError(computeSystem, operation, err, nil)
295+
log.G(ctx).WithError(err).Error("failed wait on system exit")
296+
err = makeSystemError(computeSystem, operation, err, events)
296297
}
297298
computeSystem.closedWaitOnce.Do(func() {
298299
computeSystem.waitError = err

0 commit comments

Comments
 (0)