// +build windows

package hcsoci

import (
	"fmt"
	"os"
	"path"
	"path/filepath"

	"github.com/Microsoft/hcsshim/internal/guestrequest"
	"github.com/Microsoft/hcsshim/internal/ospath"
	"github.com/Microsoft/hcsshim/internal/requesttype"
	"github.com/Microsoft/hcsshim/internal/schema2"
	"github.com/Microsoft/hcsshim/internal/uvm"
	"github.com/Microsoft/hcsshim/internal/wclayer"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
)

type lcowLayerEntry struct {
	hostPath string
	uvmPath  string
	scsi     bool
}

const scratchPath = "scratch"

// MountContainerLayers is a helper for clients to hide all the complexity of layer mounting.
// Layer folders are in order: base, [rolayer1..rolayern,] scratch
//
// v1/v2: Argon WCOW: Returns the mount path on the host as a volume GUID.
// v1:    Xenon WCOW: Done internally in HCS, so there is no point in doing anything here.
// v2:    Xenon WCOW: Returns a CombinedLayersV2 structure where ContainerRootPath is a folder
//                    inside the utility VM which is a GUID mapping of the scratch folder. Each
//                    of the layers are the VSMB locations where the read-only layers are mounted.
//
func MountContainerLayers(layerFolders []string, guestRoot string, uvm *uvm.UtilityVM) (interface{}, error) {
	logrus.Debugln("hcsshim::mountContainerLayers", layerFolders)

	if uvm == nil {
		if len(layerFolders) < 2 {
			return nil, fmt.Errorf("need at least two layers - base and scratch")
		}
		path := layerFolders[len(layerFolders)-1]
		rest := layerFolders[:len(layerFolders)-1]
		logrus.Debugln("hcsshim::mountContainerLayers ActivateLayer", path)
		if err := wclayer.ActivateLayer(path); err != nil {
			return nil, err
		}

		logrus.Debugln("hcsshim::mountContainerLayers PrepareLayer", path, rest)
		if err := wclayer.PrepareLayer(path, rest); err != nil {
			if err2 := wclayer.DeactivateLayer(path); err2 != nil {
				logrus.Warnf("Failed to Deactivate %s: %s", path, err2)
			}
			return nil, err
		}

		mountPath, err := wclayer.GetLayerMountPath(path)
		if err != nil {
			if err2 := wclayer.UnprepareLayer(path); err2 != nil {
				logrus.Warnf("Failed to Unprepare %s: %s", path, err2)
			}
			if err2 := wclayer.DeactivateLayer(path); err2 != nil {
				logrus.Warnf("Failed to Deactivate %s: %s", path, err2)
			}
			return nil, err
		}
		return mountPath, nil
	}

	// V2 UVM
	logrus.Debugf("hcsshim::mountContainerLayers Is a %s V2 UVM", uvm.OS())

	// Add each read-only layer. For Windows, this is a VSMB share with the ResourceUri ending in
	// a GUID based on the folder path. For Linux, this is a VPMEM device, except where the layer
	// is over the maximum size supported, in which case we put it on SCSI instead.
	//
	// Each layer is ref-counted so that multiple containers in the same utility VM can share them.
	var wcowLayersAdded []string
	var lcowlayersAdded []lcowLayerEntry
	attachedSCSIHostPath := ""

	for _, layerPath := range layerFolders[:len(layerFolders)-1] {
		var err error
		if uvm.OS() == "windows" {
			options := &hcsschema.VirtualSmbShareOptions{
				ReadOnly:            true,
				PseudoOplocks:       true,
				TakeBackupPrivilege: true,
				CacheIo:             true,
				ShareRead:           true,
			}
			err = uvm.AddVSMB(layerPath, "", options)
			if err == nil {
				wcowLayersAdded = append(wcowLayersAdded, layerPath)
			}
		} else {
			uvmPath := ""
			hostPath := filepath.Join(layerPath, "layer.vhd")

			var fi os.FileInfo
			fi, err = os.Stat(hostPath)
			if err == nil && uint64(fi.Size()) > uvm.PMemMaxSizeBytes() {
				// Too big for PMEM. Add on SCSI instead (at /tmp/S/).
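				// Descriptive note: the guest path recorded for a SCSI-attached layer is
				// derived from the controller/LUN pair returned by AddSCSILayer, giving
				// /tmp/S<controller>/<lun> as constructed below.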
				var (
					controller int
					lun        int32
				)
				controller, lun, err = uvm.AddSCSILayer(hostPath)
				if err == nil {
					lcowlayersAdded = append(lcowlayersAdded,
						lcowLayerEntry{
							hostPath: hostPath,
							uvmPath:  fmt.Sprintf("/tmp/S%d/%d", controller, lun),
							scsi:     true,
						})
				}
			} else {
				_, uvmPath, err = uvm.AddVPMEM(hostPath, true) // UVM path is calculated. Will be /tmp/vN/
				if err == nil {
					lcowlayersAdded = append(lcowlayersAdded,
						lcowLayerEntry{
							hostPath: hostPath,
							uvmPath:  uvmPath,
						})
				}
			}
		}
		if err != nil {
			cleanupOnMountFailure(uvm, wcowLayersAdded, lcowlayersAdded, attachedSCSIHostPath)
			return nil, err
		}
	}

	// Add the scratch at an unused SCSI location. The container path inside the
	// utility VM will be C:\.
	hostPath := filepath.Join(layerFolders[len(layerFolders)-1], "sandbox.vhdx")

	// BUGBUG Rename guestRoot better.
	containerScratchPathInUVM := ospath.Join(uvm.OS(), guestRoot, scratchPath)
	_, _, err := uvm.AddSCSI(hostPath, containerScratchPathInUVM, false)
	if err != nil {
		cleanupOnMountFailure(uvm, wcowLayersAdded, lcowlayersAdded, attachedSCSIHostPath)
		return nil, err
	}
	attachedSCSIHostPath = hostPath

	if uvm.OS() == "windows" {
		// Load the filter at the C:\s location calculated above. We pass into this request each of the
		// read-only layer folders.
		layers, err := computeV2Layers(uvm, wcowLayersAdded)
		if err != nil {
			cleanupOnMountFailure(uvm, wcowLayersAdded, lcowlayersAdded, attachedSCSIHostPath)
			return nil, err
		}
		guestRequest := guestrequest.CombinedLayers{
			ContainerRootPath: containerScratchPathInUVM,
			Layers:            layers,
		}
		combinedLayersModification := &hcsschema.ModifySettingRequest{
			GuestRequest: guestrequest.GuestRequest{
				Settings:     guestRequest,
				ResourceType: guestrequest.ResourceTypeCombinedLayers,
				RequestType:  requesttype.Add,
			},
		}
		if err := uvm.Modify(combinedLayersModification); err != nil {
			cleanupOnMountFailure(uvm, wcowLayersAdded, lcowlayersAdded, attachedSCSIHostPath)
			return nil, err
		}
		logrus.Debugln("hcsshim::mountContainerLayers Succeeded")
		return guestRequest, nil
	}

	// This is the LCOW layout inside the utility VM. NNN is the container "number"
	// which increments for each container created in a utility VM.
	//
	// /run/gcs/c/NNN/config.json
	// /run/gcs/c/NNN/rootfs
	// /run/gcs/c/NNN/scratch/upper
	// /run/gcs/c/NNN/scratch/work
	//
	// /dev/sda on /tmp/scratch type ext4 (rw,relatime,block_validity,delalloc,barrier,user_xattr,acl)
	// /dev/pmem0 on /tmp/v0 type ext4 (ro,relatime,block_validity,delalloc,norecovery,barrier,dax,user_xattr,acl)
	// /dev/sdb on /run/gcs/c/NNN/scratch type ext4 (rw,relatime,block_validity,delalloc,barrier,user_xattr,acl)
	// overlay on /run/gcs/c/NNN/rootfs type overlay (rw,relatime,lowerdir=/tmp/v0,upperdir=/run/gcs/c/NNN/scratch/upper,workdir=/run/gcs/c/NNN/scratch/work)
	//
	// Where /dev/sda is the scratch for the utility VM itself
	//       /dev/pmemX are read-only layers for containers
	//       /dev/sd(b...) are scratch spaces for each container
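	//
	// An illustrative example (assuming guestRoot is /run/gcs/c/NNN as in the layout above
	// and a single read-only layer mounted at /tmp/v0), the guest request built below would
	// look roughly like:
	//
	//	CombinedLayers{
	//		ContainerRootPath: "/run/gcs/c/NNN/rootfs",
	//		Layers:            []hcsschema.Layer{{Path: "/tmp/v0"}},
	//		ScratchPath:       "/run/gcs/c/NNN/scratch",
	//	}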

	layers := []hcsschema.Layer{}
	for _, l := range lcowlayersAdded {
		layers = append(layers, hcsschema.Layer{Path: l.uvmPath})
	}
	guestRequest := guestrequest.CombinedLayers{
		ContainerRootPath: path.Join(guestRoot, rootfsPath),
		Layers:            layers,
		ScratchPath:       containerScratchPathInUVM,
	}
	combinedLayersModification := &hcsschema.ModifySettingRequest{
		GuestRequest: guestrequest.GuestRequest{
			ResourceType: guestrequest.ResourceTypeCombinedLayers,
			RequestType:  requesttype.Add,
			Settings:     guestRequest,
		},
	}
	if err := uvm.Modify(combinedLayersModification); err != nil {
		cleanupOnMountFailure(uvm, wcowLayersAdded, lcowlayersAdded, attachedSCSIHostPath)
		return nil, err
	}
	logrus.Debugln("hcsshim::mountContainerLayers Succeeded")
	return guestRequest, nil
}

// UnmountOperation is used when calling Unmount() to determine what type of unmount is
// required. In V1 schema, this must be UnmountOperationAll. In V2, clients can be more
// optimal and only unmount what they need, which can be a minor performance improvement
// (eg if you know only one container is running in a utility VM, and the UVM is about to
// be torn down, there's no need to unmount the VSMB shares - just SCSI to have a
// consistent file system).
type UnmountOperation uint

const (
	UnmountOperationSCSI  UnmountOperation = 0x01
	UnmountOperationVSMB                   = 0x02
	UnmountOperationVPMEM                  = 0x04
	UnmountOperationAll                    = UnmountOperationSCSI | UnmountOperationVSMB | UnmountOperationVPMEM
)
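
// As an illustration (hypothetical caller, not part of this package): a client tearing down
// a single container while the utility VM - and any layers shared with other containers -
// stays alive might request only the scratch detach:
//
//	err := UnmountContainerLayers(layerFolders, guestRoot, vm, UnmountOperationSCSI)
//
// whereas the host-mounted (uvm == nil) path below accepts only UnmountOperationAll.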

// UnmountContainerLayers is a helper for clients to hide all the complexity of layer unmounting.
func UnmountContainerLayers(layerFolders []string, guestRoot string, uvm *uvm.UtilityVM, op UnmountOperation) error {
	logrus.Debugln("hcsshim::unmountContainerLayers", layerFolders)

	if uvm == nil {
		// Must be an argon - folders are mounted on the host
		if op != UnmountOperationAll {
			return fmt.Errorf("only operation supported for host-mounted folders is UnmountOperationAll")
		}
		if len(layerFolders) < 1 {
			return fmt.Errorf("need at least one layer for Unmount")
		}
		path := layerFolders[len(layerFolders)-1]
		logrus.Debugln("hcsshim::Unmount UnprepareLayer", path)
		if err := wclayer.UnprepareLayer(path); err != nil {
			return err
		}
		// TODO Should we try this anyway?
		logrus.Debugln("hcsshim::unmountContainerLayers DeactivateLayer", path)
		return wclayer.DeactivateLayer(path)
	}

	// V2 Xenon

	// Base+Scratch as a minimum. This is different from v1, which only requires the scratch.
	if len(layerFolders) < 2 {
		return fmt.Errorf("at least two layers are required for unmount")
	}

	var retError error

	// Unload the storage filter followed by the SCSI scratch
	if (op & UnmountOperationSCSI) == UnmountOperationSCSI {
		containerScratchPathInUVM := ospath.Join(uvm.OS(), guestRoot, scratchPath)
		logrus.Debugf("hcsshim::unmountContainerLayers CombinedLayers %s", containerScratchPathInUVM)
		combinedLayersModification := &hcsschema.ModifySettingRequest{
			GuestRequest: guestrequest.GuestRequest{
				ResourceType: guestrequest.ResourceTypeCombinedLayers,
				RequestType:  requesttype.Remove,
				Settings:     guestrequest.CombinedLayers{ContainerRootPath: containerScratchPathInUVM},
			},
		}
		if err := uvm.Modify(combinedLayersModification); err != nil {
			logrus.Errorf(err.Error())
		}

		// Hot remove the scratch from the SCSI controller
		hostScratchFile := filepath.Join(layerFolders[len(layerFolders)-1], "sandbox.vhdx")
		logrus.Debugf("hcsshim::unmountContainerLayers SCSI %s %s", containerScratchPathInUVM, hostScratchFile)
		if err := uvm.RemoveSCSI(hostScratchFile); err != nil {
			e := fmt.Errorf("failed to remove SCSI %s: %s", hostScratchFile, err)
			logrus.Debugln(e)
			if retError == nil {
				retError = e
			} else {
				retError = errors.Wrapf(retError, e.Error())
			}
		}
	}

	// Remove each of the read-only layers from VSMB. These are ref-counted and
	// only removed once the count drops to zero. This allows multiple containers
	// to share layers.
	if uvm.OS() == "windows" && len(layerFolders) > 1 && (op&UnmountOperationVSMB) == UnmountOperationVSMB {
		for _, layerPath := range layerFolders[:len(layerFolders)-1] {
			if e := uvm.RemoveVSMB(layerPath); e != nil {
				logrus.Debugln(e)
				if retError == nil {
					retError = e
				} else {
					retError = errors.Wrapf(retError, e.Error())
				}
			}
		}
	}

	// Remove each of the read-only layers from VPMEM (or SCSI). These are ref-counted
	// and only removed once the count drops to zero. This allows multiple containers to
	// share layers. Note that SCSI is used on large layers.
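	// (This mirrors the mount path above: layers larger than uvm.PMemMaxSizeBytes() were
	// attached via AddSCSILayer, so they are detached with RemoveSCSI; smaller ones with
	// RemoveVPMEM.)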
if uvm.OS() == "linux" && len(layerFolders) > 1 && (op&UnmountOperationVPMEM) == UnmountOperationVPMEM { for _, layerPath := range layerFolders[:len(layerFolders)-1] { hostPath := filepath.Join(layerPath, "layer.vhd") if fi, err := os.Stat(hostPath); err != nil { var e error if uint64(fi.Size()) > uvm.PMemMaxSizeBytes() { e = uvm.RemoveSCSI(hostPath) } else { e = uvm.RemoveVPMEM(hostPath) } if e != nil { logrus.Debugln(e) if retError == nil { retError = e } else { retError = errors.Wrapf(retError, e.Error()) } } } } } // TODO (possibly) Consider deleting the container directory in the utility VM return retError } func cleanupOnMountFailure(uvm *uvm.UtilityVM, wcowLayers []string, lcowLayers []lcowLayerEntry, scratchHostPath string) { for _, wl := range wcowLayers { if err := uvm.RemoveVSMB(wl); err != nil { logrus.Warnf("Possibly leaked vsmbshare on error removal path: %s", err) } } for _, ll := range lcowLayers { if ll.scsi { if err := uvm.RemoveSCSI(ll.hostPath); err != nil { logrus.Warnf("Possibly leaked SCSI on error removal path: %s", err) } } else if err := uvm.RemoveVPMEM(ll.hostPath); err != nil { logrus.Warnf("Possibly leaked vpmemdevice on error removal path: %s", err) } } if scratchHostPath != "" { if err := uvm.RemoveSCSI(scratchHostPath); err != nil { logrus.Warnf("Possibly leaked SCSI disk on error removal path: %s", err) } } } func computeV2Layers(vm *uvm.UtilityVM, paths []string) (layers []hcsschema.Layer, err error) { for _, path := range paths { uvmPath, err := vm.GetVSMBUvmPath(path) if err != nil { return nil, err } layerID, err := wclayer.LayerID(path) if err != nil { return nil, err } layers = append(layers, hcsschema.Layer{Id: layerID.String(), Path: uvmPath}) } return layers, nil }