Skip to content

Commit 0336432

Browse files
committed
[shimV2] added network controller implementation
This change adds the network controller implementation for V2 shims which manages the network lifecycle for a single pod running inside a UVM. Signed-off-by: Harsh Rawat <harshrawat@microsoft.com>
1 parent 3e63b84 commit 0336432

9 files changed

Lines changed: 713 additions & 68 deletions

File tree

internal/controller/network/doc.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
//go:build windows
2+
3+
// Package network provides a controller for managing the network lifecycle of a pod
4+
// running inside a Utility VM (UVM).
5+
//
6+
// It handles attaching an HCN namespace and its endpoints to the guest VM,
7+
// and tearing them down on pod removal. The [Controller] interface is the
8+
// primary entry point, with [Manager] as its concrete implementation.
9+
//
10+
// # Lifecycle
11+
//
12+
// A network follows the state machine below.
13+
//
14+
//	        ┌────────────────────┐
//	        │ StateNotConfigured │
//	        └───┬────────────┬───┘
//	   Setup ok │            │ Setup fails
//	            ▼            ▼
//	┌─────────────────┐    ┌──────────────┐
//	│ StateConfigured │    │ StateInvalid │
//	└────────┬────────┘    └──────┬───────┘
//	         │ Teardown           │ Teardown
//	         ▼                    ▼
//	┌─────────────────────────────────────┐
//	│            StateTornDown            │
//	└─────────────────────────────────────┘
27+
//
28+
// State descriptions:
29+
//
30+
// - [StateNotConfigured]: initial state; no namespace or NICs have been configured.
31+
// - [StateConfigured]: after [Controller.Setup] succeeds; the HCN namespace is attached
32+
// and all endpoints are wired up inside the guest.
33+
// - [StateInvalid]: entered when [Controller.Setup] fails mid-way, or when
//   [Controller.Teardown] encounters errors; best-effort cleanup should be
//   performed (or retried) via [Controller.Teardown].
35+
// - [StateTornDown]: terminal state reached after [Controller.Teardown] completes without errors.
36+
//
37+
// # Platform Variants
38+
//
39+
// Guest-side operations differ between LCOW and WCOW and are implemented in
40+
// platform-specific source files selected via build tags
41+
// (default for LCOW shim, "wcow" tag for WCOW shim).
42+
package network
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
//go:build windows
2+
3+
package network
4+
5+
import (
6+
"context"
7+
8+
"github.com/Microsoft/hcsshim/internal/gcs"
9+
)
10+
11+
// Controller manages the network lifecycle for a single pod running inside a UVM.
12+
type Controller interface {
13+
// Setup attaches the HCN namespace and its endpoints to the guest VM.
14+
Setup(ctx context.Context, opts *SetupOptions) error
15+
16+
// Teardown removes all guest-side NICs and the network namespace from the VM.
17+
// It is idempotent: calling it on an already torn-down or unconfigured network is a no-op.
18+
Teardown(ctx context.Context) error
19+
}
20+
21+
// SetupOptions carries the inputs needed to configure networking for a pod.
type SetupOptions struct {
	// PodID identifies the pod whose network is being set up.
	PodID string

	// NetworkNamespace is the ID of the HCN namespace to attach to the guest.
	NetworkNamespace string

	// PolicyBasedRouting selects whether policy-based routing is configured
	// for the endpoints added to the guest. It only matters for LCOW.
	PolicyBasedRouting bool
}
33+
34+
// capabilitiesProvider is a narrow interface satisfied by guestmanager.Manager.
35+
// It exists so callers pass the guest manager scoped only to Capabilities(),
36+
// avoiding a hard dependency on the full guestmanager.Manager interface here.
37+
type capabilitiesProvider interface {
38+
Capabilities() gcs.GuestDefinedCapabilities
39+
}
Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
//go:build windows
2+
3+
package network
4+
5+
import (
6+
"context"
7+
"errors"
8+
"fmt"
9+
"slices"
10+
"strings"
11+
"sync"
12+
13+
"github.com/Microsoft/hcsshim/hcn"
14+
"github.com/Microsoft/hcsshim/internal/log"
15+
"github.com/Microsoft/hcsshim/internal/logfields"
16+
"github.com/Microsoft/hcsshim/internal/vm/guestmanager"
17+
"github.com/Microsoft/hcsshim/internal/vm/vmmanager"
18+
19+
"github.com/Microsoft/go-winio/pkg/guid"
20+
"github.com/sirupsen/logrus"
21+
)
22+
23+
// Manager is the concrete implementation of [Controller].
24+
type Manager struct {
25+
mu sync.Mutex
26+
27+
// podID is the identifier of the pod whose network this Controller manages.
28+
podID string
29+
30+
// namespaceID is the HCN namespace ID in use after a successful Setup.
31+
namespaceID string
32+
33+
// vmEndpoints maps nicID (ID within UVM) -> HCN endpoint.
34+
vmEndpoints map[string]*hcn.HostComputeEndpoint
35+
36+
// netState is the current lifecycle state of the network.
37+
netState State
38+
39+
// isNamespaceSupportedByGuest determines if network namespace is supported inside the guest
40+
isNamespaceSupportedByGuest bool
41+
42+
// vmNetManager performs host-side NIC hot-add/remove on the UVM.
43+
vmNetManager vmmanager.NetworkManager
44+
45+
// linuxGuestMgr performs guest-side NIC inject/remove for LCOW.
46+
linuxGuestMgr guestmanager.LCOWNetworkManager
47+
48+
// winGuestMgr performs guest-side NIC/namespace operations for WCOW.
49+
winGuestMgr guestmanager.WCOWNetworkManager
50+
51+
// capsProvider exposes the guest's declared capabilities.
52+
// Used to check IsNamespaceAddRequestSupported.
53+
capsProvider capabilitiesProvider
54+
}
55+
56+
// Assert that Manager implements Controller.
57+
var _ Controller = (*Manager)(nil)
58+
59+
// New creates a ready-to-use Manager in [StateNotConfigured].
60+
//
61+
// This method is called from [VMController.CreateNetworkController()]
62+
// which injects the necessary dependencies.
63+
func New(
64+
vmNetManager vmmanager.NetworkManager,
65+
linuxGuestMgr guestmanager.LCOWNetworkManager,
66+
windowsGuestMgr guestmanager.WCOWNetworkManager,
67+
capsProvider capabilitiesProvider,
68+
) *Manager {
69+
m := &Manager{
70+
vmNetManager: vmNetManager,
71+
linuxGuestMgr: linuxGuestMgr,
72+
winGuestMgr: windowsGuestMgr,
73+
capsProvider: capsProvider,
74+
netState: StateNotConfigured,
75+
vmEndpoints: make(map[string]*hcn.HostComputeEndpoint),
76+
}
77+
78+
// Cache once at construction so hot-add paths can branch without re-querying.
79+
if caps := capsProvider.Capabilities(); caps != nil {
80+
m.isNamespaceSupportedByGuest = caps.IsNamespaceAddRequestSupported()
81+
}
82+
83+
return m
84+
}
85+
86+
// Setup attaches the requested HCN namespace to the guest VM
87+
// and hot-adds all endpoints found in that namespace.
88+
// It must be called only once; subsequent calls return an error.
89+
func (m *Manager) Setup(ctx context.Context, opts *SetupOptions) (err error) {
90+
ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.Operation, "Network Setup"))
91+
92+
m.mu.Lock()
93+
defer m.mu.Unlock()
94+
95+
log.G(ctx).WithFields(logrus.Fields{
96+
logfields.PodID: opts.PodID,
97+
logfields.Namespace: opts.NetworkNamespace,
98+
}).Debug("starting network setup")
99+
100+
// If Setup has already been called, then error out.
101+
if m.netState != StateNotConfigured {
102+
return fmt.Errorf("cannot set up network in state %s", m.netState)
103+
}
104+
105+
defer func() {
106+
if err != nil {
107+
// If setup fails for any reason, move to invalid so no further
108+
// Setup calls are accepted.
109+
m.netState = StateInvalid
110+
log.G(ctx).WithError(err).Error("network setup failed, moving to invalid state")
111+
}
112+
}()
113+
114+
if opts.NetworkNamespace == "" {
115+
return fmt.Errorf("network namespace must not be empty")
116+
}
117+
118+
// Validate that the provided namespace exists.
119+
hcnNamespace, err := hcn.GetNamespaceByID(opts.NetworkNamespace)
120+
if err != nil {
121+
return fmt.Errorf("get network namespace %s: %w", opts.NetworkNamespace, err)
122+
}
123+
124+
// Fetch all endpoints in the namespace.
125+
endpoints, err := m.fetchEndpointsInNamespace(ctx, hcnNamespace)
126+
if err != nil {
127+
return fmt.Errorf("fetch endpoints in namespace %s: %w", hcnNamespace.Id, err)
128+
}
129+
130+
// Add the namespace to the guest.
131+
if err = m.addNetNSInsideGuest(ctx, hcnNamespace); err != nil {
132+
return fmt.Errorf("add network namespace to guest: %w", err)
133+
}
134+
135+
// Hot-add all endpoints in the namespace to the guest.
136+
for _, endpoint := range endpoints {
137+
nicGUID, err := guid.NewV4()
138+
if err != nil {
139+
return fmt.Errorf("generate NIC GUID: %w", err)
140+
}
141+
if err = m.addEndpointToGuestNamespace(ctx, nicGUID.String(), endpoint, opts.PolicyBasedRouting); err != nil {
142+
return fmt.Errorf("add endpoint %s to guest: %w", endpoint.Name, err)
143+
}
144+
}
145+
146+
m.podID = opts.PodID
147+
m.namespaceID = hcnNamespace.Id
148+
m.netState = StateConfigured
149+
150+
log.G(ctx).WithFields(logrus.Fields{
151+
logfields.PodID: opts.PodID,
152+
logfields.Namespace: hcnNamespace.Id,
153+
}).Info("network setup completed successfully")
154+
155+
return nil
156+
}
157+
158+
// Teardown removes all guest-side NICs and the HCN namespace from the UVM.
159+
//
160+
// It is idempotent: calling it when the network is already torn down or not yet
161+
// configured is a no-op.
162+
func (m *Manager) Teardown(ctx context.Context) error {
163+
ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.Operation, "Network Teardown"))
164+
165+
m.mu.Lock()
166+
defer m.mu.Unlock()
167+
168+
log.G(ctx).WithFields(logrus.Fields{
169+
logfields.PodID: m.podID,
170+
logfields.Namespace: m.namespaceID,
171+
"State": m.netState,
172+
}).Debug("starting network teardown")
173+
174+
if m.netState == StateTornDown {
175+
// Teardown is idempotent, so return nil if already torn down.
176+
log.G(ctx).Info("network already torn down, skipping")
177+
return nil
178+
}
179+
180+
if m.netState == StateNotConfigured {
181+
// Nothing was configured; nothing to clean up.
182+
log.G(ctx).Info("network not configured, skipping")
183+
return nil
184+
}
185+
186+
// Remove all endpoints from the guest.
187+
// Use a continue-on-error strategy: attempt every NIC regardless of individual
188+
// failures, then collect all errors.
189+
var teardownErrs []error
190+
for nicID, endpoint := range m.vmEndpoints {
191+
if err := m.removeEndpointFromGuestNamespace(ctx, nicID, endpoint); err != nil {
192+
teardownErrs = append(teardownErrs, fmt.Errorf("remove endpoint %s from guest: %w", endpoint.Name, err))
193+
continue // continue attempting to remove other endpoints
194+
}
195+
196+
delete(m.vmEndpoints, nicID)
197+
}
198+
199+
if err := m.removeNetNSInsideGuest(ctx, m.namespaceID); err != nil {
200+
teardownErrs = append(teardownErrs, fmt.Errorf("remove network namespace from guest: %w", err))
201+
}
202+
203+
if len(teardownErrs) > 0 {
204+
// If any errors were encountered during teardown, mark the state as invalid.
205+
m.netState = StateInvalid
206+
return errors.Join(teardownErrs...)
207+
}
208+
209+
// Mark as torn down if we do not encounter any errors.
210+
// No further Setup or Teardown calls are allowed.
211+
m.netState = StateTornDown
212+
213+
log.G(ctx).WithFields(logrus.Fields{
214+
logfields.PodID: m.podID,
215+
"networkNamespace": m.namespaceID,
216+
}).Info("network teardown completed successfully")
217+
218+
return nil
219+
}
220+
221+
// fetchEndpointsInNamespace retrieves all HCN endpoints present in
222+
// the given namespace.
223+
// Endpoints are sorted so that those with names ending in "eth0" appear first.
224+
func (m *Manager) fetchEndpointsInNamespace(ctx context.Context, ns *hcn.HostComputeNamespace) ([]*hcn.HostComputeEndpoint, error) {
225+
ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.Namespace, ns.Id))
226+
log.G(ctx).Info("fetching endpoints from the network namespace")
227+
228+
ids, err := hcn.GetNamespaceEndpointIds(ns.Id)
229+
if err != nil {
230+
return nil, fmt.Errorf("get endpoint IDs for namespace %s: %w", ns.Id, err)
231+
}
232+
endpoints := make([]*hcn.HostComputeEndpoint, 0, len(ids))
233+
for _, id := range ids {
234+
ep, err := hcn.GetEndpointByID(id)
235+
if err != nil {
236+
return nil, fmt.Errorf("get endpoint %s: %w", id, err)
237+
}
238+
endpoints = append(endpoints, ep)
239+
}
240+
241+
// Ensure the endpoint named "eth0" is added first when multiple endpoints are present,
242+
// so it maps to eth0 inside the guest. CNI results aren't available here, so we rely
243+
// on the endpoint name suffix as a heuristic.
244+
cmp := func(a, b *hcn.HostComputeEndpoint) int {
245+
if strings.HasSuffix(a.Name, "eth0") {
246+
return -1
247+
}
248+
if strings.HasSuffix(b.Name, "eth0") {
249+
return 1
250+
}
251+
return 0
252+
}
253+
254+
slices.SortStableFunc(endpoints, cmp)
255+
256+
log.G(ctx).Tracef("fetched endpoints from the network namespace %+v", endpoints)
257+
258+
return endpoints, nil
259+
}

0 commit comments

Comments
 (0)