Update Allocate method to reuse lease if present

Previously, the Allocate method of the daemon always created a new Lease
object. However, as both the CNI ADD and CHECK commands call Allocate,
and CHECK can be called multiple times, this resulted in multiple Lease
objects being created per pod.

Each of these leases was long lived with its own maintain() loop -
however the daemon only kept track of the most recent one, meaning any
old lease objects remained running forever (and held open their NetNS
files). After a long enough period, this resulted in the system crashing
out with "too many files" or a similar error limits-related error.

This commit updates the behaviour of Allocate() to first check if a
Lease already exists for the given clientID. If none is found, one is
created as before. If a Lease is found, a new Check() mechanism is
called, which simply wakes up the maintain() loop to cause it to check
the status of the lease.

This may fix #329.

Signed-off-by: Emily Shepherd <emily@redcoat.dev>
This commit is contained in:
Emily Shepherd 2022-12-05 00:13:25 +00:00
parent ec76e3c35c
commit 0fc229df5e
No known key found for this signature in database
GPG Key ID: 6044814E0DD4DF8B
2 changed files with 23 additions and 6 deletions

View File

@ -80,12 +80,20 @@ func (d *DHCP) Allocate(args *skel.CmdArgs, result *current.Result) error {
}
clientID := generateClientID(args.ContainerID, conf.Name, args.IfName)
hostNetns := d.hostNetnsPrefix + args.Netns
l, err := AcquireLease(clientID, hostNetns, args.IfName,
optsRequesting, optsProviding,
d.clientTimeout, d.clientResendMax, d.broadcast)
if err != nil {
return err
// If we already have an active lease for this clientID, do not create
// another one
l := d.getLease(clientID)
if l != nil {
l.Check()
} else {
hostNetns := d.hostNetnsPrefix + args.Netns
l, err = AcquireLease(clientID, hostNetns, args.IfName,
optsRequesting, optsProviding,
d.clientTimeout, d.clientResendMax, d.broadcast)
if err != nil {
return err
}
}
ipn, err := l.IPNet()

View File

@ -67,6 +67,7 @@ type DHCPLease struct {
broadcast bool
stopping uint32
stop chan struct{}
check chan struct{}
wg sync.WaitGroup
// list of requesting and providing options and if they are necessary / their value
optsRequesting map[dhcp4.OptionCode]bool
@ -150,6 +151,7 @@ func AcquireLease(
l := &DHCPLease{
clientID: clientID,
stop: make(chan struct{}),
check: make(chan struct{}),
timeout: timeout,
resendMax: resendMax,
broadcast: broadcast,
@ -200,6 +202,10 @@ func (l *DHCPLease) Stop() {
l.wg.Wait()
}
func (l *DHCPLease) Check() {
l.check <- struct{}{}
}
func (l *DHCPLease) getOptionsWithClientId() dhcp4.Options {
opts := make(dhcp4.Options)
opts[dhcp4.OptionClientIdentifier] = []byte(l.clientID)
@ -334,6 +340,9 @@ func (l *DHCPLease) maintain() {
select {
case <-time.After(sleepDur):
case <-l.check:
log.Printf("%v: Checking lease", l.clientID)
case <-l.stop:
if err := l.release(); err != nil {
log.Printf("%v: failed to release DHCP lease: %v", l.clientID, err)