Songmin Li 6269f399a5
Fix unnecessary retrying when the link is down in dhcp.
From the dhcp daemon log, we can see that dhcp will fail to acquire
the lease when the link is down, and success on retry.

```
2024/08/21 21:30:44 macvlan-dhcp/eth1: acquiring lease
2024/08/21 21:30:44 Link "eth1" down. Attempting to set up
2024/08/21 21:30:44 network is down
2024/08/21 21:30:44 retrying in 2.641696 seconds
2024/08/21 21:30:49 macvlan-dhcp/eth1: lease acquired, expiration is 2024-08-22 09:30:49.755367962 +0800 CST m=+43205.712107889
```

After move the code of set up link to the beginning of the function, the
dhcp success on first time.

```
2024/08/21 22:04:02 macvlan-dhcp/eth1: acquiring lease
2024/08/21 22:04:02 Link "eth1" down. Attempting to set up
2024/08/21 22:04:05 macvlan-dhcp/eth1: lease acquired, expiration is 2024-08-22 10:04:05.297887726 +0800 CST m=+43203.081141304
```

Signed-off-by: Songmin Li <lisongmin@protonmail.com>
2024-08-24 19:54:34 +08:00

504 lines
12 KiB
Go

// Copyright 2015 CNI authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"fmt"
"log"
"math/rand"
"net"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/d2g/dhcp4"
"github.com/d2g/dhcp4client"
"github.com/vishvananda/netlink"
"github.com/containernetworking/cni/pkg/types"
"github.com/containernetworking/plugins/pkg/ns"
)
// RFC 2131 suggests using exponential backoff, starting with 4sec
// and randomized to +/- 1sec
const (
resendDelay0 = 4 * time.Second
resendDelayMax = 62 * time.Second
)
// To speed up the retry for first few failures, we retry without
// backoff for a few times
const (
resendFastDelay = 2 * time.Second
resendFastMax = 4
)
const (
leaseStateBound = iota
leaseStateRenewing
leaseStateRebinding
)
// This implementation uses 1 OS thread per lease. This is because
// all the network operations have to be done in network namespace
// of the interface. This can be improved by switching to the proper
// namespace for network ops and using fewer threads. However, this
// needs to be done carefully as dhcp4client ops are blocking.
type DHCPLease struct {
clientID string
ack *dhcp4.Packet
opts dhcp4.Options
link netlink.Link
renewalTime time.Time
rebindingTime time.Time
expireTime time.Time
timeout time.Duration
resendMax time.Duration
broadcast bool
stopping uint32
stop chan struct{}
check chan struct{}
wg sync.WaitGroup
// list of requesting and providing options and if they are necessary / their value
optsRequesting map[dhcp4.OptionCode]bool
optsProviding map[dhcp4.OptionCode][]byte
}
var requestOptionsDefault = map[dhcp4.OptionCode]bool{
dhcp4.OptionRouter: true,
dhcp4.OptionSubnetMask: true,
}
func prepareOptions(cniArgs string, provideOptions []ProvideOption, requestOptions []RequestOption) (
map[dhcp4.OptionCode]bool, map[dhcp4.OptionCode][]byte, error,
) {
var optsRequesting map[dhcp4.OptionCode]bool
var optsProviding map[dhcp4.OptionCode][]byte
var err error
// parse CNI args
cniArgsParsed := map[string]string{}
for _, argPair := range strings.Split(cniArgs, ";") {
args := strings.SplitN(argPair, "=", 2)
if len(args) > 1 {
cniArgsParsed[args[0]] = args[1]
}
}
// parse providing options map
var optParsed dhcp4.OptionCode
optsProviding = make(map[dhcp4.OptionCode][]byte)
for _, opt := range provideOptions {
optParsed, err = parseOptionName(string(opt.Option))
if err != nil {
return nil, nil, fmt.Errorf("Can not parse option %q: %w", opt.Option, err)
}
if len(opt.Value) > 0 {
if len(opt.Value) > 255 {
return nil, nil, fmt.Errorf("value too long for option %q: %q", opt.Option, opt.Value)
}
optsProviding[optParsed] = []byte(opt.Value)
}
if value, ok := cniArgsParsed[opt.ValueFromCNIArg]; ok {
if len(value) > 255 {
return nil, nil, fmt.Errorf("value too long for option %q from CNI_ARGS %q: %q", opt.Option, opt.ValueFromCNIArg, opt.Value)
}
optsProviding[optParsed] = []byte(value)
}
}
// parse necessary options map
optsRequesting = make(map[dhcp4.OptionCode]bool)
skipRequireDefault := false
for _, opt := range requestOptions {
if opt.SkipDefault {
skipRequireDefault = true
}
optParsed, err = parseOptionName(string(opt.Option))
if err != nil {
return nil, nil, fmt.Errorf("Can not parse option %q: %w", opt.Option, err)
}
optsRequesting[optParsed] = true
}
for k, v := range requestOptionsDefault {
// only set if not skipping default and this value does not exists
if _, ok := optsRequesting[k]; !ok && !skipRequireDefault {
optsRequesting[k] = v
}
}
return optsRequesting, optsProviding, err
}
// AcquireLease gets an DHCP lease and then maintains it in the background
// by periodically renewing it. The acquired lease can be released by
// calling DHCPLease.Stop()
func AcquireLease(
clientID, netns, ifName string,
optsRequesting map[dhcp4.OptionCode]bool, optsProviding map[dhcp4.OptionCode][]byte,
timeout, resendMax time.Duration, broadcast bool,
) (*DHCPLease, error) {
errCh := make(chan error, 1)
l := &DHCPLease{
clientID: clientID,
stop: make(chan struct{}),
check: make(chan struct{}),
timeout: timeout,
resendMax: resendMax,
broadcast: broadcast,
optsRequesting: optsRequesting,
optsProviding: optsProviding,
}
log.Printf("%v: acquiring lease", clientID)
l.wg.Add(1)
go func() {
errCh <- ns.WithNetNSPath(netns, func(_ ns.NetNS) error {
defer l.wg.Done()
link, err := netlink.LinkByName(ifName)
if err != nil {
return fmt.Errorf("error looking up %q: %v", ifName, err)
}
l.link = link
if err = l.acquire(); err != nil {
return err
}
log.Printf("%v: lease acquired, expiration is %v", l.clientID, l.expireTime)
errCh <- nil
l.maintain()
return nil
})
}()
if err := <-errCh; err != nil {
return nil, err
}
return l, nil
}
// Stop terminates the background task that maintains the lease
// and issues a DHCP Release
func (l *DHCPLease) Stop() {
if atomic.CompareAndSwapUint32(&l.stopping, 0, 1) {
close(l.stop)
}
l.wg.Wait()
}
func (l *DHCPLease) Check() {
l.check <- struct{}{}
}
func (l *DHCPLease) getOptionsWithClientID() dhcp4.Options {
opts := make(dhcp4.Options)
opts[dhcp4.OptionClientIdentifier] = []byte(l.clientID)
// client identifier's first byte is "type"
newClientID := []byte{0}
newClientID = append(newClientID, opts[dhcp4.OptionClientIdentifier]...)
opts[dhcp4.OptionClientIdentifier] = newClientID
return opts
}
func (l *DHCPLease) getAllOptions() dhcp4.Options {
opts := l.getOptionsWithClientID()
for k, v := range l.optsProviding {
opts[k] = v
}
opts[dhcp4.OptionParameterRequestList] = []byte{}
for k := range l.optsRequesting {
opts[dhcp4.OptionParameterRequestList] = append(opts[dhcp4.OptionParameterRequestList], byte(k))
}
return opts
}
func (l *DHCPLease) acquire() error {
if (l.link.Attrs().Flags & net.FlagUp) != net.FlagUp {
log.Printf("Link %q down. Attempting to set up", l.link.Attrs().Name)
if err := netlink.LinkSetUp(l.link); err != nil {
return err
}
}
c, err := newDHCPClient(l.link, l.timeout, l.broadcast)
if err != nil {
return err
}
defer c.Close()
opts := l.getAllOptions()
pkt, err := backoffRetry(l.resendMax, func() (*dhcp4.Packet, error) {
ok, ack, err := DhcpRequest(c, opts)
switch {
case err != nil:
return nil, err
case !ok:
return nil, fmt.Errorf("DHCP server NACK'd own offer")
default:
return &ack, nil
}
})
if err != nil {
return err
}
return l.commit(pkt)
}
func (l *DHCPLease) commit(ack *dhcp4.Packet) error {
opts := ack.ParseOptions()
leaseTime, err := parseLeaseTime(opts)
if err != nil {
return err
}
rebindingTime, err := parseRebindingTime(opts)
if err != nil || rebindingTime > leaseTime {
// Per RFC 2131 Section 4.4.5, it should default to 85% of lease time
rebindingTime = leaseTime * 85 / 100
}
renewalTime, err := parseRenewalTime(opts)
if err != nil || renewalTime > rebindingTime {
// Per RFC 2131 Section 4.4.5, it should default to 50% of lease time
renewalTime = leaseTime / 2
}
now := time.Now()
l.expireTime = now.Add(leaseTime)
l.renewalTime = now.Add(renewalTime)
l.rebindingTime = now.Add(rebindingTime)
l.ack = ack
l.opts = opts
return nil
}
func (l *DHCPLease) maintain() {
state := leaseStateBound
for {
var sleepDur time.Duration
switch state {
case leaseStateBound:
sleepDur = time.Until(l.renewalTime)
if sleepDur <= 0 {
log.Printf("%v: renewing lease", l.clientID)
state = leaseStateRenewing
continue
}
case leaseStateRenewing:
if err := l.renew(); err != nil {
log.Printf("%v: %v", l.clientID, err)
if time.Now().After(l.rebindingTime) {
log.Printf("%v: renewal time expired, rebinding", l.clientID)
state = leaseStateRebinding
}
} else {
log.Printf("%v: lease renewed, expiration is %v", l.clientID, l.expireTime)
state = leaseStateBound
}
case leaseStateRebinding:
if err := l.acquire(); err != nil {
log.Printf("%v: %v", l.clientID, err)
if time.Now().After(l.expireTime) {
log.Printf("%v: lease expired, bringing interface DOWN", l.clientID)
l.downIface()
return
}
} else {
log.Printf("%v: lease rebound, expiration is %v", l.clientID, l.expireTime)
state = leaseStateBound
}
}
select {
case <-time.After(sleepDur):
case <-l.check:
log.Printf("%v: Checking lease", l.clientID)
case <-l.stop:
if err := l.release(); err != nil {
log.Printf("%v: failed to release DHCP lease: %v", l.clientID, err)
}
return
}
}
}
func (l *DHCPLease) downIface() {
if err := netlink.LinkSetDown(l.link); err != nil {
log.Printf("%v: failed to bring %v interface DOWN: %v", l.clientID, l.link.Attrs().Name, err)
}
}
func (l *DHCPLease) renew() error {
c, err := newDHCPClient(l.link, l.timeout, l.broadcast)
if err != nil {
return err
}
defer c.Close()
opts := l.getAllOptions()
pkt, err := backoffRetry(l.resendMax, func() (*dhcp4.Packet, error) {
ok, ack, err := DhcpRenew(c, *l.ack, opts)
switch {
case err != nil:
return nil, err
case !ok:
return nil, fmt.Errorf("DHCP server did not renew lease")
default:
return &ack, nil
}
})
if err != nil {
return err
}
l.commit(pkt)
return nil
}
func (l *DHCPLease) release() error {
log.Printf("%v: releasing lease", l.clientID)
c, err := newDHCPClient(l.link, l.timeout, l.broadcast)
if err != nil {
return err
}
defer c.Close()
opts := l.getOptionsWithClientID()
if err = DhcpRelease(c, *l.ack, opts); err != nil {
return fmt.Errorf("failed to send DHCPRELEASE")
}
return nil
}
func (l *DHCPLease) IPNet() (*net.IPNet, error) {
mask := parseSubnetMask(l.opts)
if mask == nil {
return nil, fmt.Errorf("DHCP option Subnet Mask not found in DHCPACK")
}
return &net.IPNet{
IP: l.ack.YIAddr(),
Mask: mask,
}, nil
}
func (l *DHCPLease) Gateway() net.IP {
return parseRouter(l.opts)
}
func (l *DHCPLease) Routes() []*types.Route {
routes := []*types.Route{}
// RFC 3442 states that if Classless Static Routes (option 121)
// exist, we ignore Static Routes (option 33) and the Router/Gateway.
opt121Routes := parseCIDRRoutes(l.opts)
if len(opt121Routes) > 0 {
return append(routes, opt121Routes...)
}
// Append Static Routes
routes = append(routes, parseRoutes(l.opts)...)
// The CNI spec says even if there is a gateway specified, we must
// add a default route in the routes section.
if gw := l.Gateway(); gw != nil {
_, defaultRoute, _ := net.ParseCIDR("0.0.0.0/0")
routes = append(routes, &types.Route{Dst: *defaultRoute, GW: gw})
}
return routes
}
// jitter returns a random value within [-span, span) range
func jitter(span time.Duration) time.Duration {
return time.Duration(float64(span) * (2.0*rand.Float64() - 1.0))
}
func backoffRetry(resendMax time.Duration, f func() (*dhcp4.Packet, error)) (*dhcp4.Packet, error) {
baseDelay := resendDelay0
var sleepTime time.Duration
fastRetryLimit := resendFastMax
for {
pkt, err := f()
if err == nil {
return pkt, nil
}
log.Print(err)
if fastRetryLimit == 0 {
sleepTime = baseDelay + jitter(time.Second)
} else {
sleepTime = resendFastDelay + jitter(time.Second)
fastRetryLimit--
}
log.Printf("retrying in %f seconds", sleepTime.Seconds())
time.Sleep(sleepTime)
// only adjust delay time if we are in normal backoff stage
if baseDelay < resendMax && fastRetryLimit == 0 {
baseDelay *= 2
} else if fastRetryLimit == 0 { // only break if we are at normal delay
break
}
}
return nil, errNoMoreTries
}
func newDHCPClient(
link netlink.Link,
timeout time.Duration,
broadcast bool,
) (*dhcp4client.Client, error) {
pktsock, err := dhcp4client.NewPacketSock(link.Attrs().Index)
if err != nil {
return nil, err
}
return dhcp4client.New(
dhcp4client.HardwareAddr(link.Attrs().HardwareAddr),
dhcp4client.Timeout(timeout),
dhcp4client.Broadcast(broadcast),
dhcp4client.Connection(pktsock),
)
}