diff --git a/src/agent/Cargo.lock b/src/agent/Cargo.lock index aebe1609ce12..23987c32e805 100644 --- a/src/agent/Cargo.lock +++ b/src/agent/Cargo.lock @@ -1233,6 +1233,7 @@ dependencies = [ "prometheus", "protobuf 3.2.0", "protocols", + "rand", "regex", "regorus", "rtnetlink", diff --git a/src/agent/Cargo.toml b/src/agent/Cargo.toml index 0dd91ffd301f..87a1e8e17507 100644 --- a/src/agent/Cargo.toml +++ b/src/agent/Cargo.toml @@ -19,6 +19,7 @@ serde_json = "1.0.39" scan_fmt = "0.2.3" scopeguard = "1.0.0" thiserror = "1.0.26" +rand = "0.8.5" regex = "1.10.4" serial_test = "0.5.1" kata-sys-util = { path = "../libs/kata-sys-util" } diff --git a/src/agent/src/storage/encryption.rs b/src/agent/src/storage/encryption.rs new file mode 100644 index 000000000000..86c351a084fa --- /dev/null +++ b/src/agent/src/storage/encryption.rs @@ -0,0 +1,223 @@ +use std::env::temp_dir; +use std::fs::File; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::process::Command; + +use anyhow::{anyhow, Result}; +use rand::{distributions::Alphanumeric, Rng}; +use slog::Logger; +use tracing::instrument; + +// encrypt_device encrypts and formats a device, then returns the path +// of the newly-created dm-crypt device. +#[instrument] +pub fn encrypt_device(logger: &Logger, device_path: &Path) -> Result { + // Path to the key file that will be passed to the cryptsetup + // commands. + let key_file_path = { + let random_string: String = rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(5) + .map(char::from) + .collect(); + let filename = format!("encrypted_storage_key_{}", random_string); + temp_dir().join(filename) + }; + + // Generate a random encryption key and write it to the key file. + let mut key = vec![0u8; 4096]; + rand::thread_rng().fill(&mut key[..]); + let mut key_file = File::create(&key_file_path)?; + key_file.write_all(&key)?; + + // Name of the devmapper that will live under /dev/mapper/. + let devmapper_device_name = device_path + .file_name() + .ok_or_else(|| anyhow!("invalid path"))? + .to_string_lossy() + .into_owned(); + + let script_path: PathBuf = temp_dir().join("luks-encrypt-storage.sh"); + if !script_path.exists() { + let mut script_file = File::create(&script_path)?; + script_file.write_all(LUKS_ENCRYPT_STORAGE_SCRIPT.as_bytes())?; + } + + info!(logger, "Running luks-encrypt-storage.sh"); + let output = Command::new("bash") + .args([ + script_path.display().to_string(), + device_path.display().to_string(), // device_path + devmapper_device_name.to_string(), // opened_device_name + "false".to_string(), // is_encrypted (false so the script encrypts it) + key_file_path.display().to_string(), // storage_key_path + "true".to_string(), // data_integrity + ]) + .output()?; + if !output.status.success() { + info!(logger, "Failed to run luks-encrypt-storage.sh"; + "status" => output.status.code().unwrap_or(-1), + "stdout" => String::from_utf8_lossy(&output.stdout).to_string(), + "stderr" => String::from_utf8_lossy(&output.stderr).to_string(), + ); + assert!(output.status.success()); + } + + // We're now mounting from the dm-crypt device, not the original + // device (now ciphertext), so we return the devmapper device. + let devmapper_device_path = PathBuf::from(format!("/dev/mapper/{devmapper_device_name}")); + Ok(devmapper_device_path) +} + +// Reference: https://github.com/confidential-containers/guest-components/blob/main/confidential-data-hub/storage/scripts/luks-encrypt-storage +static LUKS_ENCRYPT_STORAGE_SCRIPT: &str = r#" +#!/bin/bash +# +# Copyright (c) 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +set -o errexit +set -o nounset +set -o pipefail +set -o errtrace + +#[ -n "${DEBUG:-}" ] && set -o xtrace +set -o xtrace + +handle_error() { + local exit_code="${?}" + local line_number="${1:-}" + echo "error:" + echo "Failed at $line_number: ${BASH_COMMAND}" + exit "${exit_code}" +} +trap 'handle_error $LINENO' ERR + +die() +{ + local msg="$*" + echo >&2 "ERROR: $msg" + exit 1 +} + +setup() +{ + local cmds=() + + cmds+=("cryptsetup" "mkfs.ext4" "mount") + + local cmd + for cmd in "${cmds[@]}" + do + command -v "$cmd" &>/dev/null || die "need command: '$cmd'" + done +} + +setup + +device_path=${1:-} +if [ -z "$device_path" ]; then + die "invalid arguments, at least one param for device path" +fi + +opened_device_name=${2:-} +if [ -z "$opened_device_name" ]; then + die "invalid arguments, at least one param for device path" +fi + +is_encrypted="false" +if [ -n "${3-}" ]; then + is_encrypted="$3" +fi + +storage_key_path="/run/encrypt_storage.key" +if [ -n "${4-}" ]; then + storage_key_path="$4" +fi + +data_integrity="true" +if [ -n "${5-}" ]; then + data_integrity="$5" +fi + +if [[ -b "$device_path" ]]; then + + if [ "$is_encrypted" == "false" ]; then + echo >&2 "is_encrypted=false branch" + + if [ "$data_integrity" == "false" ]; then + echo >&2 "integ=false branch" + cryptsetup --verbose --debug --batch-mode luksFormat --type luks2 "$device_path" --sector-size 4096 \ + --cipher aes-xts-plain64 "$storage_key_path" + else + echo >&2 "integ=true branch" + # Wiping a device is a time consuming operation. To avoid a full wipe, integritysetup + # and crypt setup provide a --no-wipe option. + # However, an integrity device that is not wiped will have invalid checksums. Normally + # this should not be a problem since a page must first be written to before it can be read + # (otherwise the data would be arbitrary). The act of writing would populate the checksum + # for the page. + # However, tools like mkfs.ext4 read pages before they are written; sometimes the read + # of an unwritten page happens due to kernel buffering. + # See https://gitlab.com/cryptsetup/cryptsetup/-/issues/525 for explanation and fix. + # The way to propery format the non-wiped dm-integrity device is to figure out which pages + # mkfs.ext4 will write to and then to write to those pages before hand so that they will + # have valid integrity tags. + cryptsetup --verbose --debug --batch-mode luksFormat --type luks2 "$device_path" --sector-size 4096 \ + --cipher aes-xts-plain64 --integrity hmac-sha256 "$storage_key_path" \ + --integrity-no-wipe + fi + fi + + cryptsetup luksOpen -d "$storage_key_path" "$device_path" "$opened_device_name" + rm "$storage_key_path" + + if [ "$data_integrity" == "false" ]; then + mkfs.ext4 "/dev/mapper/$opened_device_name" -E lazy_journal_init + else + # mkfs.ext4 doesn't perform whole sector writes and this will cause checksum failures + # with an unwiped integrity device. Therefore, first perform a dry run. + output=$(mkfs.ext4 "/dev/mapper/$opened_device_name" -F -n) + + # The above command will produce output like + # mke2fs 1.46.5 (30-Dec-2021) + # Creating filesystem with 268435456 4k blocks and 67108864 inodes + # Filesystem UUID: 4a5ff012-91c0-47d9-b4bb-8f83e830825f + # Superblock backups stored on blocks: + # 32768, 98304, 163840, 229376, 294912, 819200, 884736, 1605632, 2654208, + # 4096000, 7962624, 11239424, 20480000, 23887872, 71663616, 78675968, + # 102400000, 214990848 + delimiter="Superblock backups stored on blocks:" + blocks_list=$([[ $output =~ $delimiter(.*) ]] && echo "${BASH_REMATCH[1]}") + + # Find list of blocks + block_nums=$(echo "$blocks_list" | grep -Eo '[0-9]{4,}' | sort -n) + + if [ -z "$block_nums" ]; then + die "Block numbers not found" + fi + + # Add zero to list of blocks + block_nums="0 $block_nums" + + # Iterate through each block and write to it to ensure that it has valid checksum + for block_num in $block_nums + do + echo "Clearing page at $block_num" + # Zero out the page + dd if=/dev/zero bs=4k count=1 oflag=direct \ + of="/dev/mapper/$opened_device_name" seek="$block_num" + done + + # Now perform the actual ext4 format. Use lazy_journal_init so that the journal is + # initialized on demand. This is safe for ephemeral storage since we don't expect + # ephemeral storage to survice a power cycle. + mkfs.ext4 "/dev/mapper/$opened_device_name" -E lazy_journal_init + fi +else + die "Invalid device: '$device_path'" +fi +"#; diff --git a/src/agent/src/storage/mod.rs b/src/agent/src/storage/mod.rs index 3440e4177784..642463c01ff7 100644 --- a/src/agent/src/storage/mod.rs +++ b/src/agent/src/storage/mod.rs @@ -25,12 +25,12 @@ use zerocopy::AsBytes; use self::bind_watcher_handler::BindWatcherHandler; use self::block_handler::{PmemHandler, ScsiHandler, VirtioBlkMmioHandler, VirtioBlkPciHandler}; use self::ephemeral_handler::EphemeralHandler; -use self::fs_handler::{OverlayfsHandler, Virtio9pHandler, VirtioFsHandler, SMBHandler}; +use self::fs_handler::{OverlayfsHandler, SMBHandler, Virtio9pHandler, VirtioFsHandler}; use self::local_handler::LocalHandler; use crate::device::{ DRIVER_9P_TYPE, DRIVER_BLK_MMIO_TYPE, DRIVER_BLK_PCI_TYPE, DRIVER_EPHEMERAL_TYPE, DRIVER_LOCAL_TYPE, DRIVER_NVDIMM_TYPE, DRIVER_OVERLAYFS_TYPE, DRIVER_SCSI_TYPE, - DRIVER_VIRTIOFS_TYPE, DRIVER_WATCHABLE_BIND_TYPE, DRIVER_SMB_TYPE, + DRIVER_SMB_TYPE, DRIVER_VIRTIOFS_TYPE, DRIVER_WATCHABLE_BIND_TYPE, }; use crate::mount::{baremount, is_mounted, remove_mounts}; use crate::sandbox::Sandbox; @@ -39,6 +39,7 @@ pub use self::ephemeral_handler::update_ephemeral_mounts; mod bind_watcher_handler; mod block_handler; +mod encryption; mod ephemeral_handler; mod fs_handler; mod local_handler; @@ -380,8 +381,22 @@ fn mount_storage(logger: &Logger, storage: &Storage) -> Result<()> { "mount-options" => options.as_str(), ); + let confidential = storage + .driver_options + .contains(&"confidential=true".to_string()); + let ephemeral = storage + .driver_options + .contains(&"ephemeral=true".to_string()); + + let src_path = if confidential && ephemeral { + // TODO: Call into the CDH instead after we've synced with upstream. + encryption::encrypt_device(&logger, src_path)? + } else { + src_path.to_path_buf() + }; + baremount( - src_path, + &src_path, mount_path, storage.fstype.as_str(), flags, diff --git a/src/runtime/pkg/direct-volume/utils.go b/src/runtime/pkg/direct-volume/utils.go index 2d5e55b62a27..8117c7f82428 100644 --- a/src/runtime/pkg/direct-volume/utils.go +++ b/src/runtime/pkg/direct-volume/utils.go @@ -20,6 +20,8 @@ const ( FSGroupMetadataKey = "fsGroup" FSGroupChangePolicyMetadataKey = "fsGroupChangePolicy" SensitiveMountOptions = "sensitiveMountOptions" + ConfidentialMetadataKey = "confidential" + EphemeralMetadataKey = "ephemeral" ) // FSGroupChangePolicy holds policies that will be used for applying fsGroup to a volume. diff --git a/src/runtime/virtcontainers/container.go b/src/runtime/virtcontainers/container.go index db5d6b68df24..596e7f8ed55a 100644 --- a/src/runtime/virtcontainers/container.go +++ b/src/runtime/virtcontainers/container.go @@ -658,6 +658,20 @@ func (c *Container) createBlockDevices(ctx context.Context) error { c.mounts[i].FSGroupChangePolicy = volume.FSGroupChangePolicy(value) case volume.SensitiveMountOptions: c.mounts[i].Options = append(c.mounts[i].Options, value) + case volume.ConfidentialMetadataKey: + confidential, err := strconv.ParseBool(value) + if err != nil { + c.Logger().Errorf("invalid value %q for metadata key %q, expected boolean string", value, key) + continue + } + c.mounts[i].Confidential = confidential + case volume.EphemeralMetadataKey: + ephemeral, err := strconv.ParseBool(value) + if err != nil { + c.Logger().Errorf("invalid value %q for metadata key %q, expected boolean string", value, key) + continue + } + c.mounts[i].Ephemeral = ephemeral default: c.Logger().Warnf("Ignoring unsupported direct-assignd volume metadata key: %s, value: %s", key, value) } diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index 9439afca1bdc..eca5c8a86ff3 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -1617,6 +1617,13 @@ func (k *kataAgent) handleDeviceBlockVolume(c *Container, m Mount, device api.De } } + if m.Confidential { + vol.DriverOptions = append(vol.DriverOptions, fmt.Sprintf("%s=true", volume.ConfidentialMetadataKey)) + } + if m.Ephemeral { + vol.DriverOptions = append(vol.DriverOptions, fmt.Sprintf("%s=true", volume.EphemeralMetadataKey)) + } + return vol, nil } diff --git a/src/runtime/virtcontainers/mount.go b/src/runtime/virtcontainers/mount.go index f261618905d5..e8a1cb984cb1 100644 --- a/src/runtime/virtcontainers/mount.go +++ b/src/runtime/virtcontainers/mount.go @@ -273,6 +273,13 @@ type Mount struct { // FSGroupChangePolicy specifies the policy that will be used when applying // group id ownership change for a volume. FSGroupChangePolicy volume.FSGroupChangePolicy + + // Confidential specifies whether the underlying storage is encrypted. + Confidential bool + + // Ephemeral specifies whether the underlying storage is ephemeral: + // https://kubernetes.io/docs/concepts/storage/ephemeral-volumes/ + Ephemeral bool } func isSymlink(path string) bool { diff --git a/src/tools/genpolicy/genpolicy-settings.json b/src/tools/genpolicy/genpolicy-settings.json index 7d35862afa73..e5c11fb410b2 100644 --- a/src/tools/genpolicy/genpolicy-settings.json +++ b/src/tools/genpolicy/genpolicy-settings.json @@ -277,13 +277,15 @@ "CAP_CHECKPOINT_RESTORE" ], "virtio_blk_storage_classes": [ - "cc-local-csi", "cc-managed-csi", "cc-managed-premium-csi" ], "smb_storage_classes": [ "cc-azurefile-csi", "cc-azurefile-premium-csi" + ], + "coco_ephemeral_storage_classes": [ + "cc-local-csi" ] }, "kata_config": { @@ -322,4 +324,4 @@ "UpdateEphemeralMountsRequest": false, "WriteStreamRequest": false } -} \ No newline at end of file +} diff --git a/src/tools/genpolicy/src/mount_and_storage.rs b/src/tools/genpolicy/src/mount_and_storage.rs index ecb8bf5776ff..0fb0cdace0da 100644 --- a/src/tools/genpolicy/src/mount_and_storage.rs +++ b/src/tools/genpolicy/src/mount_and_storage.rs @@ -246,9 +246,14 @@ fn get_persistent_volume_claim_mount( .and_then(|pvc_resource| pvc_resource.spec.storageClassName.as_ref()) .is_some_and(|sc| settings.common.smb_storage_classes.contains(sc)); + let is_coco_ephemeral_mount = pvc_resource + .and_then(|pvc_resource| pvc_resource.spec.storageClassName.as_ref()) + .is_some_and(|sc| settings.common.coco_ephemeral_storage_classes.contains(sc)); + handle_persistent_volume_claim( is_blk_mount, is_smb_mount, + is_coco_ephemeral_mount, yaml_mount, p_mounts, storages, @@ -431,14 +436,21 @@ fn get_ephemeral_mount( .as_ref() .map(|sc| settings.common.virtio_blk_storage_classes.contains(sc)) .unwrap_or(false); + let is_smb_mount = storage_class .as_ref() .map(|sc| settings.common.smb_storage_classes.contains(sc)) .unwrap_or(false); + let is_coco_ephemeral_mount = storage_class + .as_ref() + .map(|sc| settings.common.coco_ephemeral_storage_classes.contains(sc)) + .unwrap_or(false); + handle_persistent_volume_claim( is_blk_mount, is_smb_mount, + is_coco_ephemeral_mount, yaml_mount, p_mounts, storages, @@ -449,21 +461,28 @@ fn get_ephemeral_mount( pub fn handle_persistent_volume_claim( is_blk_mount: bool, is_smb_mount: bool, + is_coco_ephemeral_mount: bool, yaml_mount: &pod::VolumeMount, p_mounts: &mut Vec, storages: &mut Vec, mount_options: (&str, &str), ) { - if is_blk_mount || is_smb_mount { + if is_blk_mount || is_smb_mount || is_coco_ephemeral_mount { let source = "$(spath)/$(b64-direct-vol-path)".to_string(); + let mut driver_options = Vec::new(); + if is_coco_ephemeral_mount { + driver_options.push("confidential=true".to_string()); + driver_options.push("ephemeral=true".to_string()); + } + storages.push(agent::Storage { - driver: if is_blk_mount { + driver: if is_blk_mount || is_coco_ephemeral_mount { "blk".to_string() } else { "smb".to_string() }, - driver_options: Vec::new(), + driver_options, fs_group: None, source: "$(direct-vol-path)".to_string(), mount_point: source.to_string(), diff --git a/src/tools/genpolicy/src/policy.rs b/src/tools/genpolicy/src/policy.rs index baa382b7646a..d6eb113af65c 100644 --- a/src/tools/genpolicy/src/policy.rs +++ b/src/tools/genpolicy/src/policy.rs @@ -380,6 +380,9 @@ pub struct CommonData { /// Storage classes which mounts should be handled as smb mounts pub smb_storage_classes: Vec, + + /// Storage classes which mounts should be handled as encrypted and ephemeral devices. + pub coco_ephemeral_storage_classes: Vec, } /// Struct used to read data from the settings file and copy that data into the policy. diff --git a/src/tools/genpolicy/src/stateful_set.rs b/src/tools/genpolicy/src/stateful_set.rs index 4c55f59ec3e8..866ae747f4cd 100644 --- a/src/tools/genpolicy/src/stateful_set.rs +++ b/src/tools/genpolicy/src/stateful_set.rs @@ -226,6 +226,12 @@ impl StatefulSet { } else { false }; + // check if a storage class is set and if it is a coco ephemeral storage class + let is_coco_ephemeral_mount = if let Some(storage_class) = &claim.spec.storageClassName { + settings.common.coco_ephemeral_storage_classes.contains(storage_class) + } else { + false + }; let propagation = match &mount.mountPropagation { Some(p) if p == "Bidirectional" => "rshared", @@ -242,6 +248,7 @@ impl StatefulSet { mount_and_storage::handle_persistent_volume_claim( is_blk_mount, is_smb_mount, + is_coco_ephemeral_mount, mount, policy_mounts, storages, diff --git a/tools/osbuilder/node-builder/azure-linux/package_install.sh b/tools/osbuilder/node-builder/azure-linux/package_install.sh index ecc2b8bee02b..65a443457b96 100755 --- a/tools/osbuilder/node-builder/azure-linux/package_install.sh +++ b/tools/osbuilder/node-builder/azure-linux/package_install.sh @@ -48,8 +48,10 @@ if [ "${CONF_PODS}" == "yes" ]; then if [ "${SHIM_USE_DEBUG_CONFIG}" == "yes" ]; then # We simply override the release config with the debug config, - # which is probably fine when debugging. - ln -sf src/runtime/config/"${SHIM_DBG_CONFIG_FILE_NAME}" src/runtime/config/"${SHIM_CONFIG_FILE_NAME}" + # which is probably fine when debugging. Not symlinking as that + # would create cycles the next time this script is called. + echo "Overriding shim configuration with debug shim configuration" + cp -a --backup=numbered src/runtime/config/"${SHIM_DBG_CONFIG_FILE_NAME}" src/runtime/config/"${SHIM_CONFIG_FILE_NAME}" fi echo "Enabling and starting snapshotter service"