diff --git a/docs/ipc.md b/docs/ipc.md
index ba58e3e5..9f567767 100644
--- a/docs/ipc.md
+++ b/docs/ipc.md
@@ -24,7 +24,7 @@ cargo make --makefile infra/Makefile.toml \
     -e BOOTSTRAPS= -e PARENT_REGISTRY= \
     -e PARENT_GATEWAY= \
-    -e CMT_EXTERNAL_ADDR= \
+    -e CMT_P2P_EXTERNAL_ADDR= \
     bootstrap
 ```
 Toward the end of its output, this command prints the network address of your bootstrap. You can use this endpoint to include this bootstrap node as a seed in the `seeds` configuration of CometBFT.
@@ -47,7 +47,7 @@ cargo make --makefile infra/Makefile.toml \
 - `SUBNET_ID`: SubnetID the bootstrap is operating in.
 - `NODE_NAME` (optional): Node name information to attach to the containers of the deployment. This is needed to deploy more than one bootstrap in the same local environment.
 - `BOOTSTRAPS`: Comma-separated list of bootstraps (or seeds in CometBFT parlance) that we want this bootstrap to also be connected to.
-- `CMT_EXTERNAL_ADDR`: Address to advertise to peers for them to dial. If empty, the default CometBFT listening address is used (generally `0.0.0.0:26656`).
+- `CMT_P2P_EXTERNAL_ADDR`: Address to advertise to peers for them to dial. If empty, the default CometBFT listening address is used (generally `0.0.0.0:26656`).
 - `PARENT_ENDPOINT`: Public endpoint that the validator should use to connect to the parent.
 - `PARENT_REGISTRY`: Ethereum address of the IPC registry contract in the parent.
 - `PARENT_GATEWAY`: Ethereum address of the IPC gateway contract in the parent.
@@ -76,7 +76,7 @@ cargo make --makefile infra/Makefile.toml \
     -e BOOTSTRAPS= -e PARENT_REGISTRY= \
     -e PARENT_GATEWAY= \
-    -e CMT_EXTERNAL_ADDR= \
+    -e CMT_P2P_EXTERNAL_ADDR= \
     child-validator
 ```
 This command will run the infrastructure for a Fendermint validator in the child subnet. It will generate the genesis of the subnet from the information in its parent, and will run the validator's infrastructure with the specific configuration passed in the command.
@@ -89,7 +89,7 @@ This command will run the infrastructure for a Fendermint validator in the child
 - `PRIVATE_KEY_PATH`: Path of the hex-encoded private key for your validator (it should be the one used to join the subnet in the parent). This can be exported from the `ipc-cli` or any other wallet like Metamask.
 - `SUBNET_ID`: SubnetID for the child subnet.
 - `BOOTSTRAPS`: Comma-separated list of bootstraps (or seeds in CometBFT parlance).
-- `CMT_EXTERNAL_ADDR`: Address to advertise to peers for them to dial. If empty, the default CometBFT listening address is used (generally `0.0.0.0:26656`).
+- `CMT_P2P_EXTERNAL_ADDR`: Address to advertise to peers for them to dial. If empty, the default CometBFT listening address is used (generally `0.0.0.0:26656`).
 - `PARENT_ENDPOINT`: Public endpoint that the validator should use to connect to the parent.
 - `PARENT_REGISTRY`: Ethereum address of the IPC registry contract in the parent.
 - `PARENT_GATEWAY`: Ethereum address of the IPC gateway contract in the parent.
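The rename follows a convention used throughout this PR: a `CMT_`-prefixed variable embeds the CometBFT config section it overrides (`CMT_P2P_PEX`, `CMT_RPC_MAX_SUBSCRIPTION_CLIENTS`, `CMT_STATESYNC_ENABLE`, ...), so `CMT_EXTERNAL_ADDR` becomes `CMT_P2P_EXTERNAL_ADDR` for the `external_address` key in the `[p2p]` section. A minimal sketch of that naming convention — the helper function and the section list are hypothetical, not code from this PR, and note the key name itself is not always a verbatim lowercase of the variable (`EXTERNAL_ADDR` vs `external_address`):

```rust
/// Hypothetical illustration: split a `CMT_`-prefixed environment variable
/// into the CometBFT config section and key it targets, e.g.
/// `CMT_P2P_PEX` -> section `[p2p]`, key `pex`.
fn cmt_env_to_config_key(var: &str) -> Option<(String, String)> {
    // Sections assumed for this sketch; CometBFT's config.toml has more.
    const SECTIONS: &[&str] = &["P2P", "RPC", "STATESYNC", "CONSENSUS"];
    let rest = var.strip_prefix("CMT_")?;
    for section in SECTIONS {
        if let Some(key) = rest.strip_prefix(*section).and_then(|r| r.strip_prefix('_')) {
            return Some((section.to_lowercase(), key.to_lowercase()));
        }
    }
    // Top-level keys such as `CMT_PROXY_APP` have no section prefix.
    Some((String::new(), rest.to_lowercase()))
}

fn main() {
    assert_eq!(
        cmt_env_to_config_key("CMT_P2P_PEX"),
        Some(("p2p".into(), "pex".into()))
    );
    assert_eq!(
        cmt_env_to_config_key("CMT_STATESYNC_ENABLE"),
        Some(("statesync".into(), "enable".into()))
    );
}
```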
@@ -114,7 +114,7 @@ cargo make --makefile infra/Makefile.toml \
     -e BOOTSTRAPS= -e PARENT_REGISTRY= \
     -e PARENT_GATEWAY= \
-    -e CMT_EXTERNAL_ADDR= \
+    -e CMT_P2P_EXTERNAL_ADDR= \
     child-fullnode
 ```
 The full node also has its corresponding commands to kill and restart the node:
@@ -122,4 +122,3 @@ The full node also has its corresponding commands to kill and restart the node:
 cargo make --makefile infra/Makefile.toml child-fullnode-down
 cargo make --makefile infra/Makefile.toml child-fullnode-restart
 ```
-
diff --git a/fendermint/app/settings/src/lib.rs b/fendermint/app/settings/src/lib.rs
index 767b9354..d28afeb5 100644
--- a/fendermint/app/settings/src/lib.rs
+++ b/fendermint/app/settings/src/lib.rs
@@ -175,6 +175,14 @@ pub struct SnapshotSettings {
     /// How often to poll CometBFT to see whether it has caught up with the chain.
     #[serde_as(as = "DurationSeconds<u64>")]
     pub sync_poll_interval: Duration,
+    /// Temporary directory for downloads.
+    download_dir: Option<PathBuf>,
+}
+
+impl SnapshotSettings {
+    pub fn download_dir(&self) -> PathBuf {
+        self.download_dir.clone().unwrap_or(std::env::temp_dir())
+    }
 }
 
 #[derive(Debug, Deserialize, Clone)]
diff --git a/fendermint/app/src/app.rs b/fendermint/app/src/app.rs
index 82bc463e..0aacaa42 100644
--- a/fendermint/app/src/app.rs
+++ b/fendermint/app/src/app.rs
@@ -690,25 +690,18 @@ where
             tendermint::Hash::None => return Err(anyhow!("empty block hash").into()),
         };
 
+        let db = self.state_store_clone();
+        let state = self.committed_state()?;
+        let mut state_params = state.state_params.clone();
+
         tracing::debug!(
             height = block_height,
+            timestamp = request.header.time.unix_timestamp(),
             app_hash = request.header.app_hash.to_string(),
+            //app_state_hash = to_app_hash(&state_params).to_string(), // should be the same as `app_hash`
             "begin block"
         );
 
-        let db = self.state_store_clone();
-        let state = self.committed_state()?;
-        let mut state_params = state.state_params.clone();
-
-        // Notify the snapshotter. We don't do this in `commit` because *this* is the height at which
-        // this state has been officially associated with the application hash, which is something
-        // we will receive in `offer_snapshot` and we can compare. If we did it in `commit` we'd
-        // have to associate the snapshot with `block_height + 1`. But this way we also know that
-        // others have agreed with our results.
-        if let Some(ref snapshots) = self.snapshots {
-            atomically(|| snapshots.notify(block_height as u64, state_params.clone())).await;
-        }
-
         state_params.timestamp = to_timestamp(request.header.time);
 
         let state = FvmExecState::new(db, self.multi_engine.as_ref(), block_height, state_params)
@@ -801,6 +794,13 @@ where
         let app_hash = state.app_hash();
         let block_height = state.block_height;
 
+        // Tell CometBFT how much of the block history it can forget.
+        let retain_height = if self.state_hist_size == 0 {
+            Default::default()
+        } else {
+            block_height.saturating_sub(self.state_hist_size)
+        };
+
         tracing::debug!(
             block_height,
             state_root = state_root.to_string(),
@@ -824,6 +824,24 @@ where
         // notified about), we could add it to the `ChainMessageInterpreter` as a constructor argument,
         // a sort of "ambient state", and not worry about it in the `App` at all.
 
+        // Notify the snapshotter. It wasn't clear whether this should be done in `commit` or `begin_block`,
+        // that is, whether the _height_ of the snapshot should be `block_height` or `block_height+1`.
+        // When CometBFT calls `offer_snapshot` it sends an `app_hash` in it that we compare to the CID
+        // of the `state_params`.
+        // Based on end-to-end testing it looks like it gives the `app_hash` from
+        // the *next* block, so we have to do it here.
+        // For example:
+        // a) Notify in `begin_block`: say we are at committing block 899, then we notify in `begin_block`
+        //    that block 900 has this state (so we use `block_height+1` in the notification);
+        //    CometBFT is going to offer it with the `app_hash` of block 901, which won't match, because
+        //    by then the timestamp will be different in the state params after committing block 900.
+        // b) Notify in `commit`: say we are committing block 900 and notify immediately that it has this state
+        //    (even though this state will only be available to query from the next height);
+        //    CometBFT is going to offer it with the `app_hash` of 901, but in this case that's good, because
+        //    that hash reflects the changes made by block 900, which this state param is the result of.
+        if let Some(ref snapshots) = self.snapshots {
+            atomically(|| snapshots.notify(block_height, state.state_params.clone())).await;
+        }
+
         // Commit app state to the datastore.
         self.set_committed_state(state)?;
@@ -831,20 +849,20 @@ where
         let mut guard = self.check_state.lock().await;
         *guard = None;
 
-        let response = response::Commit {
+        Ok(response::Commit {
             data: app_hash.into(),
-            // We have to retain blocks until we can support Snapshots.
-            retain_height: Default::default(),
-        };
-        Ok(response)
+            retain_height: retain_height.try_into().expect("height is valid"),
+        })
     }
 
     /// List the snapshots available on this node to be served to remote peers.
     async fn list_snapshots(&self) -> AbciResult<response::ListSnapshots> {
         if let Some(ref client) = self.snapshots {
             let snapshots = atomically(|| client.list_snapshots()).await;
+            tracing::info!(snapshot_count = snapshots.len(), "listing snapshots");
             Ok(to_snapshots(snapshots)?)
         } else {
+            tracing::info!("listing snapshots disabled");
             Ok(Default::default())
         }
     }
@@ -882,6 +900,11 @@ where
         request: request::OfferSnapshot,
     ) -> AbciResult<response::OfferSnapshot> {
         if let Some(ref client) = self.snapshots {
+            tracing::info!(
+                height = request.snapshot.height.value(),
+                "received snapshot offer"
+            );
+
             match from_snapshot(request).context("failed to parse snapshot") {
                 Ok(manifest) => {
                     tracing::info!(?manifest, "received snapshot offer");
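For context on the `app_hash` comparison discussed above: the app hash is derived deterministically from the state params (`to_app_hash` in `tmconv.rs` turns them into a CID), so `offer_snapshot` can check a manifest against what CometBFT presents. A rough sketch of the idea under simplified assumptions — the `StateParams` fields here are hypothetical, and SHA-256 over CBOR stands in for the real CID construction:

```rust
// Hypothetical sketch, not the PR's implementation: the app hash is a
// deterministic digest of the canonical encoding of the state params.
use serde::Serialize;
use sha2::{Digest, Sha256};

#[derive(Serialize)]
struct StateParams {
    state_root: Vec<u8>,
    timestamp: u64, // the timestamp is why hashes of adjacent heights differ
    network_version: u32,
}

fn to_app_hash(params: &StateParams) -> Vec<u8> {
    let bytes = serde_cbor::to_vec(params).expect("params are serializable");
    Sha256::digest(&bytes).to_vec()
}

fn main() {
    let params = StateParams { state_root: vec![0; 32], timestamp: 1, network_version: 18 };
    // `offer_snapshot` accepts a snapshot only if the hash derived from the
    // manifest's state params matches the `app_hash` CometBFT sends.
    let offered = to_app_hash(&params);
    assert_eq!(offered, to_app_hash(&params));
}
```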
@@ -955,6 +978,8 @@ where
         // Now insert the new state into the history.
         let mut state = self.committed_state()?;
+
+        // The height reflects that it was produced in `commit`.
         state.block_height = snapshot.manifest.block_height;
         state.state_params = snapshot.manifest.state_params;
         self.set_committed_state(state)?;
diff --git a/fendermint/app/src/cmd/run.rs b/fendermint/app/src/cmd/run.rs
index 8638ce42..c0253ae8 100644
--- a/fendermint/app/src/cmd/run.rs
+++ b/fendermint/app/src/cmd/run.rs
@@ -15,7 +15,7 @@ use fendermint_vm_interpreter::{
     signed::SignedMessageInterpreter,
 };
 use fendermint_vm_resolver::ipld::IpldResolver;
-use fendermint_vm_snapshot::SnapshotManager;
+use fendermint_vm_snapshot::{SnapshotManager, SnapshotParams};
 use fendermint_vm_topdown::proxy::IPCProviderProxy;
 use fendermint_vm_topdown::sync::launch_polling_syncer;
 use fendermint_vm_topdown::{CachedFinalityProvider, Toggle};
@@ -187,12 +187,15 @@ async fn run(settings: Settings) -> anyhow::Result<()> {
     let snapshots = if settings.snapshots.enabled {
         let (manager, client) = SnapshotManager::new(
             state_store.clone(),
-            settings.snapshots_dir(),
-            settings.snapshots.block_interval,
-            settings.snapshots.chunk_size_bytes,
-            settings.snapshots.hist_size,
-            settings.snapshots.last_access_hold,
-            settings.snapshots.sync_poll_interval,
+            SnapshotParams {
+                snapshots_dir: settings.snapshots_dir(),
+                download_dir: settings.snapshots.download_dir(),
+                block_interval: settings.snapshots.block_interval,
+                chunk_size: settings.snapshots.chunk_size_bytes,
+                hist_size: settings.snapshots.hist_size,
+                last_access_hold: settings.snapshots.last_access_hold,
+                sync_poll_interval: settings.snapshots.sync_poll_interval,
+            },
         )
         .context("failed to create snapshot manager")?;
diff --git a/fendermint/app/src/tmconv.rs b/fendermint/app/src/tmconv.rs
index a09b45bf..4ef0a605 100644
--- a/fendermint/app/src/tmconv.rs
+++ b/fendermint/app/src/tmconv.rs
@@ -400,7 +400,11 @@ pub fn from_snapshot(
     let app_hash = to_app_hash(&metadata.state_params);
 
     if app_hash != offer.app_hash {
-        bail!("the application hash does not match the metadata");
+        bail!(
+            "the application hash does not match the metadata; from-meta = {}, from-offer = {}",
+            app_hash,
+            offer.app_hash,
+        );
     }
 
     let checksum = tendermint::hash::Hash::try_from(offer.snapshot.hash)
diff --git a/fendermint/testing/snapshot-test/Makefile.toml b/fendermint/testing/snapshot-test/Makefile.toml
index 0b09d560..3acb916c 100644
--- a/fendermint/testing/snapshot-test/Makefile.toml
+++ b/fendermint/testing/snapshot-test/Makefile.toml
@@ -1,38 +1,48 @@
-# cd fendermint/testing/snapshot-test
-# - then -
-# cargo make
-# - or -
-# cargo make setup
-# cargo make test
-# docker logs snapshot-fendermint
-# cargo make teardown
+# See fendermint/testing/snapshot-test/src/lib.rs for a description.
 
 extend = [
     { path = "../Makefile/common.toml" },
 ]
 
 env_files = [
+    # `snapshot.env` is the environment for `cargo make`.
     { path = "./scripts/snapshot.env" },
     { path = "../Makefile/common.env" },
     { path = "../Makefile/ci.env", profile = "ci" },
 ]
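The test below injects `FM_SNAPSHOTS__*` variables into each container's `.env`. The double underscore separates nesting levels, so `FM_SNAPSHOTS__BLOCK_INTERVAL` lands on the `block_interval` field of `SnapshotSettings`. A sketch of that convention using the `config` crate — the actual Fendermint settings loader may be wired differently, and the struct fields here are trimmed down for illustration:

```rust
use config::{Config, Environment};
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct Snapshots {
    enabled: bool,
    block_interval: u64,
}

#[derive(Debug, Deserialize)]
struct Settings {
    snapshots: Snapshots,
}

fn main() -> Result<(), config::ConfigError> {
    // FM_SNAPSHOTS__BLOCK_INTERVAL=10 -> settings.snapshots.block_interval = 10
    std::env::set_var("FM_SNAPSHOTS__ENABLED", "true");
    std::env::set_var("FM_SNAPSHOTS__BLOCK_INTERVAL", "10");

    let settings: Settings = Config::builder()
        .add_source(
            Environment::with_prefix("FM")
                .separator("__")   // "__" splits nesting levels
                .try_parsing(true), // parse "true"/"10" into bool/u64
        )
        .build()?
        .try_deserialize()?;

    assert!(settings.snapshots.enabled);
    assert_eq!(settings.snapshots.block_interval, 10);
    Ok(())
}
```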
 # Overriding the env file to enable snapshotting.
+# This one is applied on every *container*.
+# The other env files are for `cargo make` itself;
+# their values are only available inside TOML files.
+# Disabling PEX so nodes only connect to what they are told about.
 [tasks.test-data-env]
 script = """
 cat << EOF > ${TEST_DATA_DIR}/.env
-FM_DB__HIST_SIZE=100
 FM_SNAPSHOTS__ENABLED=true
 FM_SNAPSHOTS__BLOCK_INTERVAL=10
 FM_SNAPSHOTS__HIST_SIZE=10
 FM_SNAPSHOTS__CHUNK_SIZE_BYTES=1048576
 FM_SNAPSHOTS__SYNC_POLL_INTERVAL=10
+CMT_P2P_PEX=false
+CMT_P2P_MAX_NUM_OUTBOUND_PEERS=3
+CMT_CONSENSUS_TIMEOUT_COMMIT=1s
 EOF
 """
 
+# This is the test workflow.
 [tasks.test]
 clear = true
-dependencies = ["snapshot-wait", "snapshot-created", "fullnode-sync"]
+run_task = { name = [
+    "node-1-setup",
+    "node-1-sync-test",
+    "node-2-setup",
+    "node-2-sync-test",
+    "snapshot-wait",
+    "snapshot-created",
+    "node-3-setup",
+    "node-3-sync-test",
+], fork = true, cleanup_task = "snapshot-teardown" }
 
 # Wait enough time that some snapshots should be exported.
 [tasks.snapshot-wait]
@@ -49,52 +59,58 @@
 if [ -z "$(ls -A $FM_SNAPSOTS_DIR)" ]; then
 fi
 """
 
-
-# Set up a full node that syncs with the default one, then stop it.
-[tasks.fullnode-sync]
+# Shut down all non-default nodes.
+[tasks.snapshot-teardown]
 run_task = { name = [
-    "fullnode-setup",
-    "fullnode-test",
-], fork = true, cleanup_task = "fullnode-teardown" }
+    "node-1-teardown",
+    "node-2-teardown",
+    "node-3-teardown",
+] }
+
+# ### General tasks for node-1 and node-2
 
-[tasks.fullnode-setup]
+[tasks.node-setup]
+# Export node-0 ID.
 dependencies = ["cometbft-export-node-id"]
-env_files = [{ path = "./scripts/fullnode.env" }]
 run_task = { name = [
     "test-node-dir",
+    "node-env",
     "cometbft-init",
-    "fullnode-set-seed",
-    "fullnode-copy-genesis",
+    "node-set-seed",
+    "node-copy-genesis",
     "fendermint-start",
     "cometbft-start",
     "cometbft-wait",
+    "cometbft-export-node-id",
     "fendermint-logs",
     "cometbft-logs",
 ] }
 
 # Set the persistent peer address to that of the default node-0.
-[tasks.fullnode-set-seed]
-env_files = [{ path = "./scripts/fullnode.env" }]
+[tasks.node-set-seed]
 script = """
 CMT_SEED_ID=$(cat $BASE_DIR/$SEED_NODE_NAME/node-id)
-CMT_PERSISTENT_PEERS=$CMT_SEED_ID@$SEED_CMT_CONTAINER_NAME:26656
-sed -i'' -e "s|persistent_peers = \\"\\"|persistent_peers = \\"$CMT_PERSISTENT_PEERS\\"|" $BASE_DIR/${NODE_NAME}/cometbft/config/config.toml
+CMT_P2P_PERSISTENT_PEERS=$CMT_SEED_ID@$SEED_CMT_CONTAINER_NAME:26656
+sed -i'' -e "s|persistent_peers = \\"\\"|persistent_peers = \\"$CMT_P2P_PERSISTENT_PEERS\\"|" $BASE_DIR/${NODE_NAME}/cometbft/config/config.toml
 """
-
 # Get the genesis from node-0
-[tasks.fullnode-copy-genesis]
-env_files = [{ path = "./scripts/fullnode.env" }]
+[tasks.node-copy-genesis]
 script = """
 cp $BASE_DIR/${SEED_NODE_NAME}/cometbft/config/genesis.json \
    $BASE_DIR/${NODE_NAME}/cometbft/config/genesis.json
 """
 
+[tasks.node-teardown]
+run_task = { name = [
+    "cometbft-destroy",
+    "fendermint-destroy",
+    "test-node-dir-rm",
+] }
+
 # See if it managed to sync.
-[tasks.fullnode-test]
-env_files = [{ path = "./scripts/fullnode.env" }]
+[tasks.node-sync-test]
 script = """
 EARLIEST=$(curl -s localhost:${CMT_RPC_HOST_PORT}/status | jq -r ".result.sync_info.earliest_block_height")
 LATEST=$(curl -s localhost:${CMT_RPC_HOST_PORT}/status | jq -r ".result.sync_info.latest_block_height")
@@ -105,11 +121,77 @@
 if [ "$EARLIEST" = "$LATEST" ]; then
 fi
 """
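State sync needs a trusted height and the hash of the block at that height, and a block's hash only appears on chain in the *next* header's `last_block_id`. The `node-env` task below computes these with `curl` and `jq`; here is the same arithmetic in Rust, with the 30-block offset taken from the script:

```rust
/// Pick a trust height a safe distance behind the tip, and the height whose
/// header carries its hash. Mirrors the shell arithmetic in `node-env`:
/// TRUST_HEIGHT = LATEST - 30, and the hash of block H is found in the
/// header at H + 1 as `last_block_id.hash`.
fn statesync_trust_params(latest_height: u64) -> (u64, u64) {
    let trust_height = latest_height.saturating_sub(30);
    let query_height = trust_height + 1;
    (trust_height, query_height)
}

fn main() {
    let (trust, query) = statesync_trust_params(100);
    assert_eq!((trust, query), (70, 71));
}
```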
+# Tell node-3 to statesync from node-1 and node-2.
+# Set the trusted height to a recent block (the latest minus 30) and fetch its block hash
+# (which appears in the header at the trusted height + 1), so that it can sync freely after that.
+# Tell node-1 and node-2 to prune their states so node-3 (which only knows about node-1) has no choice
+# but to use snapshots to sync itself.
+[tasks.node-env]
+script = """
+cat ${TEST_DATA_DIR}/.env > ${TEST_DATA_DIR}/${NODE_NAME}/.env
+
+cat << EOL >> ${TEST_DATA_DIR}/${NODE_NAME}/.env
+FM_DB__STATE_HIST_SIZE=100
+EOL
+
+if [ $NODE_NAME = "node-3" ]; then
+
+LATEST_HEIGHT=$(curl -s http://localhost:26657/commit | jq -r ".result.signed_header.header.height")
+TRUST_HEIGHT=$(($LATEST_HEIGHT-30))
+QUERY_HEIGHT=$(($TRUST_HEIGHT+1))
+TRUST_HASH=$(curl -s "http://localhost:26657/header?height=$QUERY_HEIGHT" | jq -r ".result.header.last_block_id.hash")
+
+cat << EOL >> ${TEST_DATA_DIR}/${NODE_NAME}/.env
+CMT_STATESYNC_ENABLE=true
+CMT_STATESYNC_RPC_SERVERS=http://snapshot-cometbft-1:26657,http://snapshot-cometbft-2:26657
+CMT_STATESYNC_TRUST_HEIGHT=$TRUST_HEIGHT
+CMT_STATESYNC_TRUST_HASH=$TRUST_HASH
+CMT_STATESYNC_TEMP_DIR=/cometbft
+CMT_STATESYNC_DISCOVERY_TIME=5s
+FM_SNAPSHOTS__DOWNLOAD_DIR=/data/${NODE_NAME}/fendermint/data
+EOL
+fi
+"""
 
-[tasks.fullnode-teardown]
-env_files = [{ path = "./scripts/fullnode.env" }]
-run_task = { name = [
-    "cometbft-destroy",
-    "fendermint-destroy",
-    "test-node-dir-rm",
-] }
+# ### node-1 tasks
+
+[tasks.node-1-setup]
+env_files = [{ path = "./scripts/node-1.env" }]
+extend = "node-setup"
+
+[tasks.node-1-teardown]
+env_files = [{ path = "./scripts/node-1.env" }]
+extend = "node-teardown"
+
+[tasks.node-1-sync-test]
+env_files = [{ path = "./scripts/node-1.env" }]
+extend = "node-sync-test"
+
+# ### node-2 tasks
+
+[tasks.node-2-setup]
+env_files = [{ path = "./scripts/node-2.env" }]
+extend = "node-setup"
+
+[tasks.node-2-teardown]
+env_files = [{ path = "./scripts/node-2.env" }]
+extend = "node-teardown"
+
+[tasks.node-2-sync-test]
+env_files = [{ path = "./scripts/node-2.env" }]
+extend = "node-sync-test"
+
+
+# ### node-3 tasks
+
+[tasks.node-3-setup]
+env_files = [{ path = "./scripts/node-3.env" }]
+extend = "node-setup"
+
+[tasks.node-3-teardown]
+env_files = [{ path = "./scripts/node-3.env" }]
+extend = "node-teardown"
+
+[tasks.node-3-sync-test]
+env_files = [{ path = "./scripts/node-3.env" }]
+extend = "node-sync-test"
diff --git a/fendermint/testing/snapshot-test/scripts/fullnode.env b/fendermint/testing/snapshot-test/scripts/node-1.env
similarity index 71%
rename from fendermint/testing/snapshot-test/scripts/fullnode.env
rename to fendermint/testing/snapshot-test/scripts/node-1.env
index b6a21f47..1fcb7f70 100644
--- a/fendermint/testing/snapshot-test/scripts/fullnode.env
+++ b/fendermint/testing/snapshot-test/scripts/node-1.env
@@ -1,10 +1,10 @@
 SEED_NODE_NAME=node-0
 SEED_CMT_CONTAINER_NAME=snapshot-cometbft
 NODE_NAME=node-1
+ENV_FILE=${TEST_DATA_DIR}/${NODE_NAME}/.env
 FM_CONTAINER_NAME=snapshot-fendermint-1
 CMT_CONTAINER_NAME=snapshot-cometbft-1
 CMT_DIR=${TEST_DATA_DIR}/${NODE_NAME}/cometbft
-CMT_P2P_HOST_PORT=26666
-CMT_RPC_HOST_PORT=26667
-CMT_MAX_NUM_OUTBOUND_PEERS=1
+CMT_P2P_HOST_PORT=26156
+CMT_RPC_HOST_PORT=26157
 CMT_WAIT_MILLIS=20000
diff --git a/fendermint/testing/snapshot-test/scripts/node-2.env b/fendermint/testing/snapshot-test/scripts/node-2.env
new file mode 100644
index 00000000..7380f715
--- /dev/null
+++ b/fendermint/testing/snapshot-test/scripts/node-2.env
@@ -0,0 +1,10 @@
+SEED_NODE_NAME=node-0
+SEED_CMT_CONTAINER_NAME=snapshot-cometbft
+NODE_NAME=node-2
+ENV_FILE=${TEST_DATA_DIR}/${NODE_NAME}/.env
+FM_CONTAINER_NAME=snapshot-fendermint-2
+CMT_CONTAINER_NAME=snapshot-cometbft-2
+CMT_DIR=${TEST_DATA_DIR}/${NODE_NAME}/cometbft
+CMT_P2P_HOST_PORT=26256
+CMT_RPC_HOST_PORT=26257
+CMT_WAIT_MILLIS=20000
diff --git a/fendermint/testing/snapshot-test/scripts/node-3.env
b/fendermint/testing/snapshot-test/scripts/node-3.env
new file mode 100644
index 00000000..62f8705b
--- /dev/null
+++ b/fendermint/testing/snapshot-test/scripts/node-3.env
@@ -0,0 +1,10 @@
+SEED_NODE_NAME=node-1
+SEED_CMT_CONTAINER_NAME=snapshot-cometbft-1
+NODE_NAME=node-3
+ENV_FILE=${TEST_DATA_DIR}/${NODE_NAME}/.env
+FM_CONTAINER_NAME=snapshot-fendermint-3
+CMT_CONTAINER_NAME=snapshot-cometbft-3
+CMT_DIR=${TEST_DATA_DIR}/${NODE_NAME}/cometbft
+CMT_P2P_HOST_PORT=26356
+CMT_RPC_HOST_PORT=26357
+CMT_WAIT_MILLIS=20000
diff --git a/fendermint/testing/snapshot-test/src/lib.rs b/fendermint/testing/snapshot-test/src/lib.rs
index 43d95313..6b754bc7 100644
--- a/fendermint/testing/snapshot-test/src/lib.rs
+++ b/fendermint/testing/snapshot-test/src/lib.rs
@@ -1,14 +1,35 @@
 // Copyright 2022-2023 Protocol Labs
 // SPDX-License-Identifier: Apache-2.0, MIT
-//! Run tests against multiple Fendermint+CometBFT docker container pairs locally,
-//! where one is allowed to run for a while and export some snapshots, then another
-//! is started to sync its state directly with it.
+//! Run tests against multiple Fendermint+CometBFT docker container pairs locally:
+//! 0. The default `snapshot-fendermint` and `snapshot-cometbft` pair
+//! 1. A `snapshot-cometbft-1` and a `snapshot-cometbft-2`, using `scripts/node-1.env` and `scripts/node-2.env`,
+//!    which sync with the default node from genesis on a block-by-block basis and clear out their history
+//!    to force others who sync with them to use snapshots.
+//! 2. A `snapshot-cometbft-3` using `scripts/node-3.env`,
+//!    which syncs with `node-1` and `node-2` using snapshots (a.k.a. state sync).
 //!
-//! Example:
+//! Note that CometBFT state sync requires 2 RPC servers, which is why we need 3 nodes.
 //!
+//! See the CometBFT documentation on state sync.
+//!
+//! Examples:
+//!
+//! 1. All in one go
 //! ```text
 //! cd fendermint/testing/snapshot-test
 //! cargo make
 //! ```
 //!
+//! 2. One by one
+//! ```text
+//! cd fendermint/testing/snapshot-test
+//! cargo make setup
+//! cargo make node-1-setup
+//! cargo make node-2-setup
+//! cargo make node-3-setup
+//! docker logs snapshot-cometbft-3
+//! cargo make snapshot-teardown
+//! cargo make teardown
+//! ```
+//!
 //! Make sure you installed cargo-make by running `cargo install cargo-make` first.
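The `SnapshotClient` changes below route downloads through a configurable directory and centralize the `parts` path logic. The invariant the client enforces is that chunks are accepted strictly in order. A stripped-down sketch of that sequencing — a plain struct instead of the STM state, with a hypothetical error type:

```rust
use std::path::PathBuf;

#[derive(Debug)]
enum ChunkError {
    Unexpected { expected: u32, got: u32 },
    Io(std::io::Error),
}

struct Download {
    parts_dir: PathBuf,
    next_index: u32,
    chunks: u32,
}

impl Download {
    /// Accept chunk `index` only if it is the next one expected; write it as
    /// `<parts_dir>/<index>.part` and report whether the download is complete.
    fn save_chunk(&mut self, index: u32, contents: &[u8]) -> Result<bool, ChunkError> {
        if index != self.next_index {
            return Err(ChunkError::Unexpected { expected: self.next_index, got: index });
        }
        let path = self.parts_dir.join(format!("{index}.part"));
        std::fs::write(&path, contents).map_err(ChunkError::Io)?;
        self.next_index += 1;
        // Once every chunk has arrived, the caller verifies a checksum over
        // the parts, as `manifest::parts_checksum` does in the real client.
        Ok(self.next_index == self.chunks)
    }
}

fn main() -> Result<(), ChunkError> {
    let dir = std::env::temp_dir().join("snapshot-parts-demo");
    std::fs::create_dir_all(&dir).map_err(ChunkError::Io)?;
    let mut dl = Download { parts_dir: dir, next_index: 0, chunks: 2 };
    assert!(!dl.save_chunk(0, b"first")?); // not done yet
    assert!(dl.save_chunk(1, b"second")?); // all chunks received
    Ok(())
}
```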
diff --git a/fendermint/vm/snapshot/src/client.rs b/fendermint/vm/snapshot/src/client.rs
index 2d75093f..2b9471c0 100644
--- a/fendermint/vm/snapshot/src/client.rs
+++ b/fendermint/vm/snapshot/src/client.rs
@@ -8,7 +8,6 @@ use fendermint_vm_interpreter::fvm::state::{
     snapshot::{BlockHeight, SnapshotVersion},
     FvmStateParams,
 };
-use tempfile::tempdir;
 
 use crate::{
     manifest,
@@ -19,14 +18,20 @@ use crate::{
 /// Interface to snapshot state for the application.
 #[derive(Clone)]
 pub struct SnapshotClient {
+    download_dir: PathBuf,
     /// The client will only notify the manager of snapshottable heights.
     snapshot_interval: BlockHeight,
     state: SnapshotState,
 }
 
 impl SnapshotClient {
-    pub fn new(snapshot_interval: BlockHeight, state: SnapshotState) -> Self {
+    pub fn new(
+        download_dir: PathBuf,
+        snapshot_interval: BlockHeight,
+        state: SnapshotState,
+    ) -> Self {
         Self {
+            download_dir,
             snapshot_interval,
             state,
         }
@@ -78,13 +83,8 @@ impl SnapshotClient {
         if manifest.version != 1 {
             abort(SnapshotError::IncompatibleVersion(manifest.version))
         } else {
-            match tempdir() {
+            match tempfile::tempdir_in(&self.download_dir) {
                 Ok(dir) => {
-                    // Create a `parts` sub-directory for the chunks.
-                    if let Err(e) = std::fs::create_dir(dir.path().join("parts")) {
-                        return abort(SnapshotError::from(e));
-                    };
-
                     // Save the manifest into the temp directory;
                     // that way we can always see on the file system what's happening.
                     let json = match serde_json::to_string_pretty(&manifest)
@@ -93,17 +93,23 @@ impl SnapshotClient {
                         Ok(json) => json,
                         Err(e) => return abort(SnapshotError::from(e)),
                     };
-                    if let Err(e) = std::fs::write(dir.path().join(MANIFEST_FILE_NAME), json) {
-                        return abort(SnapshotError::from(e));
-                    }
 
-                    let download_path = dir.path().into();
+                    let download_path: PathBuf = dir.path().into();
                     let download = SnapshotDownload {
                         manifest,
                         download_dir: Arc::new(dir),
                         next_index: TVar::new(0),
                     };
 
+                    // Create a `parts` sub-directory for the chunks.
+                    if let Err(e) = std::fs::create_dir(download.parts_dir()) {
+                        return abort(SnapshotError::from(e));
+                    };
+
+                    if let Err(e) = std::fs::write(download_path.join(MANIFEST_FILE_NAME), json) {
+                        return abort(SnapshotError::from(e));
+                    }
+
                     self.state.current_download.write(Some(download))?;
 
                     Ok(download_path)
@@ -129,12 +135,7 @@ impl SnapshotClient {
                 if index != next_index {
                     abort(SnapshotError::UnexpectedChunk(next_index, index))
                 } else {
-                    let part_path = cd
-                        .download_dir
-                        .as_ref()
-                        .path()
-                        .join("parts")
-                        .join(format!("{}.part", index));
+                    let part_path = cd.parts_dir().join(format!("{}.part", index));
 
                     // We are doing IO inside the STM transaction, but that's okay because there is no contention on the download.
                     match std::fs::write(part_path, contents) {
@@ -144,7 +145,7 @@ impl SnapshotClient {
                         if next_index == cd.manifest.chunks {
                             // Verify the checksum then load the snapshot and remove the current download from memory.
-                            match manifest::parts_checksum(cd.download_dir.as_ref()) {
+                            match manifest::parts_checksum(cd.parts_dir()) {
                                 Ok(checksum) => {
                                     if checksum == cd.manifest.checksum {
                                         let item = SnapshotItem::new(
diff --git a/fendermint/vm/snapshot/src/lib.rs b/fendermint/vm/snapshot/src/lib.rs
index cd9e6b5c..aad78c79 100644
--- a/fendermint/vm/snapshot/src/lib.rs
+++ b/fendermint/vm/snapshot/src/lib.rs
@@ -18,6 +18,6 @@ const PARTS_DIR_NAME: &str = "parts";
 pub use client::SnapshotClient;
 pub use error::SnapshotError;
-pub use manager::SnapshotManager;
+pub use manager::{SnapshotManager, SnapshotParams};
 pub use manifest::SnapshotManifest;
 pub use state::SnapshotItem;
diff --git a/fendermint/vm/snapshot/src/manager.rs b/fendermint/vm/snapshot/src/manager.rs
index 3e3715fd..89f3b5c6 100644
--- a/fendermint/vm/snapshot/src/manager.rs
+++ b/fendermint/vm/snapshot/src/manager.rs
@@ -14,22 +14,31 @@ use fendermint_vm_interpreter::fvm::state::FvmStateParams;
 use fvm_ipld_blockstore::Blockstore;
 use tendermint_rpc::Client;
 
-/// Create snapshots at regular block intervals.
-pub struct SnapshotManager<BS> {
-    /// Blockstore
-    store: BS,
+pub struct SnapshotParams {
     /// Location to store completed snapshots.
-    snapshots_dir: PathBuf,
+    pub snapshots_dir: PathBuf,
+    pub download_dir: PathBuf,
+    pub block_interval: BlockHeight,
     /// Target size in bytes for snapshot chunks.
-    chunk_size: usize,
+    pub chunk_size: usize,
     /// Number of snapshots to keep.
     ///
     /// 0 means unlimited.
-    hist_size: usize,
+    pub hist_size: usize,
     /// Time to hold off from purging a snapshot after a remote client
     /// asked for a chunk from it.
-    last_access_hold: Duration,
+    pub last_access_hold: Duration,
     /// How often to check CometBFT whether it has finished syncing.
+    pub sync_poll_interval: Duration,
+}
+
+/// Create snapshots at regular block intervals.
+pub struct SnapshotManager<BS> {
+    store: BS,
+    snapshots_dir: PathBuf,
+    chunk_size: usize,
+    hist_size: usize,
+    last_access_hold: Duration,
     sync_poll_interval: Duration,
     /// Shared state of snapshots.
     state: SnapshotState,
@@ -43,35 +52,29 @@ where
     BS: Blockstore + Clone + Send + Sync + 'static,
 {
     /// Create a new manager.
-    pub fn new(
-        store: BS,
-        snapshots_dir: PathBuf,
-        block_interval: BlockHeight,
-        chunk_size: usize,
-        hist_size: usize,
-        last_access_hold: Duration,
-        sync_poll_interval: Duration,
-    ) -> anyhow::Result<(Self, SnapshotClient)> {
+    pub fn new(store: BS, params: SnapshotParams) -> anyhow::Result<(Self, SnapshotClient)> {
         // Make sure the target directory exists.
-        std::fs::create_dir_all(&snapshots_dir).context("failed to create snapshots directory")?;
+        std::fs::create_dir_all(&params.snapshots_dir)
+            .context("failed to create snapshots directory")?;
 
-        let snapshot_items = list_manifests(&snapshots_dir).context("failed to list manifests")?;
+        let snapshot_items =
+            list_manifests(&params.snapshots_dir).context("failed to list manifests")?;
 
         let state = SnapshotState::new(snapshot_items);
 
-        let manager = Self {
+        let manager: SnapshotManager<BS> = Self {
             store,
-            snapshots_dir,
-            chunk_size,
-            hist_size,
-            last_access_hold,
-            sync_poll_interval,
+            snapshots_dir: params.snapshots_dir,
+            chunk_size: params.chunk_size,
+            hist_size: params.hist_size,
+            last_access_hold: params.last_access_hold,
+            sync_poll_interval: params.sync_poll_interval,
             state: state.clone(),
             // Assume we are syncing until we can determine otherwise.
             is_syncing: TVar::new(true),
         };
 
-        let client = SnapshotClient::new(block_interval, state);
+        let client = SnapshotClient::new(params.download_dir, params.block_interval, state);
 
         Ok((manager, client))
     }
@@ -225,7 +228,9 @@ where
         // Create a checksum over the CAR file.
         let checksum_bytes = file_checksum(&snapshot_path).context("failed to compute checksum")?;
-        std::fs::write(&checksum_path, checksum_bytes).context("failed to write checksum file")?;
+
+        std::fs::write(&checksum_path, checksum_bytes.to_string())
+            .context("failed to write checksum file")?;
 
         // Create a directory for the parts.
         std::fs::create_dir(&parts_path).context("failed to create parts dir")?;
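Both retention knobs in this PR use zero as a "keep everything" sentinel: `SnapshotParams::hist_size` for snapshots, and `state_hist_size` for the `retain_height` reported to CometBFT in `commit` (see `app.rs` above). A tiny sketch of that shared convention — the function name is ours, the semantics are from the diff:

```rust
/// Compute the height below which history may be pruned, where
/// `hist_size == 0` means "retain everything" (the `retain_height`
/// reported to CometBFT stays 0, and the purger keeps every snapshot).
fn retain_height(block_height: u64, hist_size: u64) -> u64 {
    if hist_size == 0 {
        0
    } else {
        block_height.saturating_sub(hist_size)
    }
}

fn main() {
    assert_eq!(retain_height(900, 0), 0);     // unlimited history
    assert_eq!(retain_height(900, 100), 800); // CometBFT may forget blocks below 800
    assert_eq!(retain_height(50, 100), 0);    // not enough history yet
}
```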
@@ -320,7 +325,7 @@ mod tests {
     use fvm::engine::MultiEngine;
     use quickcheck::Arbitrary;
 
-    use crate::{manifest, PARTS_DIR_NAME};
+    use crate::{manager::SnapshotParams, manifest, PARTS_DIR_NAME};
 
     use super::SnapshotManager;
@@ -343,7 +348,8 @@ mod tests {
         let (state_params, store) = init_genesis().await;
 
         // Now we have one store initialized with genesis, let's create a manager and snapshot it.
-        let temp_dir = tempfile::tempdir().expect("failed to create tmp dir");
+        let snapshots_dir = tempfile::tempdir().expect("failed to create tmp dir");
+        let download_dir = tempfile::tempdir().expect("failed to create tmp dir");
 
         // Not polling because it's cumbersome to mock it.
         let never_poll_sync = Duration::ZERO;
@@ -351,12 +357,15 @@ mod tests {
         let (snapshot_manager, snapshot_client) = SnapshotManager::new(
             store.clone(),
-            temp_dir.path().into(),
-            1,
-            10000,
-            1,
-            Duration::ZERO,
-            never_poll_sync,
+            SnapshotParams {
+                snapshots_dir: snapshots_dir.path().into(),
+                download_dir: download_dir.path().into(),
+                block_interval: 1,
+                chunk_size: 10000,
+                hist_size: 1,
+                last_access_hold: Duration::ZERO,
+                sync_poll_interval: never_poll_sync,
+            },
         )
         .expect("failed to create snapshot manager");
@@ -393,13 +402,13 @@ mod tests {
         assert_eq!(snapshot.manifest.state_params, state_params);
         assert_eq!(
             snapshot.snapshot_dir.as_path(),
-            temp_dir.path().join("snapshot-0")
+            snapshots_dir.path().join("snapshot-0")
         );
 
         let _ = std::fs::File::open(snapshot.snapshot_dir.join("manifest.json"))
             .expect("manifests file exists");
 
-        let snapshots = manifest::list_manifests(temp_dir.path()).unwrap();
+        let snapshots = manifest::list_manifests(snapshots_dir.path()).unwrap();
 
         assert_eq!(snapshots.len(), 1, "can list manifests");
         assert_eq!(snapshots[0], snapshot);
@@ -416,12 +425,15 @@ mod tests {
         // Create a new manager instance
         let (_, new_client) = SnapshotManager::new(
             store,
-            temp_dir.path().into(),
-            1,
-            10000,
-            1,
-            Duration::ZERO,
-            never_poll_sync,
+            SnapshotParams {
+                snapshots_dir: snapshots_dir.path().into(),
+                download_dir: download_dir.path().into(),
+                block_interval: 1,
+                chunk_size: 10000,
+                hist_size: 1,
+                last_access_hold: Duration::ZERO,
+                sync_poll_interval: never_poll_sync,
+            },
         )
         .expect("failed to create snapshot manager");
diff --git a/fendermint/vm/snapshot/src/state.rs b/fendermint/vm/snapshot/src/state.rs
index 6b0d944a..016201c6 100644
--- a/fendermint/vm/snapshot/src/state.rs
+++ b/fendermint/vm/snapshot/src/state.rs
@@ -57,6 +57,11 @@ impl SnapshotItem {
             last_access: SystemTime::UNIX_EPOCH,
         }
     }
+
+    fn parts_dir(&self) -> PathBuf {
+        self.snapshot_dir.join(PARTS_DIR_NAME)
+    }
+
     /// Load the data from disk.
     ///
     /// Returns an error if the chunk isn't within range or if the file doesn't exist any more.
@@ -67,7 +72,7 @@ impl SnapshotItem {
                 self.manifest.chunks
             );
         }
-        let chunk_file = self.snapshot_dir.join("{chunk}.part");
+        let chunk_file = self.parts_dir().join(format!("{chunk}.part"));
 
         let content = std::fs::read(&chunk_file)
             .with_context(|| format!("failed to read chunk {}", chunk_file.to_string_lossy()))?;
@@ -80,8 +85,8 @@ impl SnapshotItem {
     where
         BS: Blockstore + Send + 'static,
     {
-        let parts = manifest::list_parts(self.snapshot_dir.join(PARTS_DIR_NAME))
-            .context("failed to list snapshot parts")?;
+        let parts =
+            manifest::list_parts(self.parts_dir()).context("failed to list snapshot parts")?;
 
         // 1. Restore the snapshots into a complete `snapshot.car` file.
         let car_path = self.snapshot_dir.join(SNAPSHOT_FILE_NAME);
@@ -166,6 +171,12 @@ pub struct SnapshotDownload {
     pub next_index: TVar<u32>,
 }
 
+impl SnapshotDownload {
+    pub fn parts_dir(&self) -> PathBuf {
+        self.download_dir.path().join(PARTS_DIR_NAME)
+    }
+}
+
 #[cfg(feature = "arb")]
 mod arb {
     use std::{path::PathBuf, time::SystemTime};
diff --git a/infra/Makefile.toml b/infra/Makefile.toml
index 91a979c1..fc4b3a79 100644
--- a/infra/Makefile.toml
+++ b/infra/Makefile.toml
@@ -19,7 +19,7 @@ SUBNET_ID = { value = "/r0", condition = { env_not_set = ["SUBNET_ID"] } }
 # The network name is derived from the SUBNET_ID, replacing slashes with dashes, and dropping the first dash if any.
 NETWORK_NAME = { script = ["echo $SUBNET_ID | sed -e 's|/|-|g' -e 's|^-||1'"] }
 
 # External P2P address advertised by CometBFT to other peers.
-CMT_EXTERNAL_ADDR = { value = "", condition = { env_not_set = ["CMT_EXTERNAL_ADDR"] } }
+CMT_P2P_EXTERNAL_ADDR = { value = "", condition = { env_not_set = ["CMT_P2P_EXTERNAL_ADDR"] } }
 
 BALANCE = { value = "1000", condition = { env_not_set = ["BALANCE"] } }
 BASE_FEE = { value = "1000", condition = { env_not_set = ["BASE_FEE"] } }
diff --git a/infra/docker-compose.yml b/infra/docker-compose.yml
index 3cb19357..8a1489f4 100644
--- a/infra/docker-compose.yml
+++ b/infra/docker-compose.yml
@@ -24,9 +24,9 @@ services:
     environment:
       - ID=${NODE_ID}
       - LOG=${LOG:-cometbft-node${NODE_ID}.log}
-      - CMT_PEX=true
       - CMT_PROXY_APP=tcp://fendermint-node${NODE_ID}:26658
-      - CMT_PERSISTENT_PEERS="${CMT_PERSISTENT_PEERS}"
+      - CMT_P2P_PEX=true
+      - CMT_P2P_PERSISTENT_PEERS="${CMT_P2P_PERSISTENT_PEERS}"
     volumes:
       - $BASE_DIR/node${NODE_ID}/cometbft:/cometbft
     healthcheck:
diff --git a/infra/scripts/cometbft.toml b/infra/scripts/cometbft.toml
index b34b8ba8..085db788 100644
--- a/infra/scripts/cometbft.toml
+++ b/infra/scripts/cometbft.toml
@@ -26,9 +26,8 @@ docker run \
   --volume ${CMT_DIR}:/cometbft \
   --env-file ${ENV_FILE} \
   --env CMT_PROXY_APP=tcp://${FM_CONTAINER_NAME}:26658 \
-  --env CMT_PEX=false \
-  --env CMT_MAX_SUBSCRIPTION_CLIENTS=10 \
-  --env CMT_MAX_SUBSCRIPTIONS_PER_CLIENT=1000 \
+  --env CMT_RPC_MAX_SUBSCRIPTION_CLIENTS=10 \
+  --env CMT_RPC_MAX_SUBSCRIPTIONS_PER_CLIENT=1000 \
   ${CMT_DOCKER_IMAGE} \
   ${CMD}
"""
@@ -197,7 +196,7 @@
[tasks.set-external-addr]
script = """
sed -i'' -e 's|^external_address = ""$|external_address = "{{PLACEHOLDER}}"|g' ${CMT_DIR}/config/config.toml
-sed -i'' -e "s/{{PLACEHOLDER}}/$CMT_EXTERNAL_ADDR/g" ${CMT_DIR}/config/config.toml
+sed -i'' -e "s/{{PLACEHOLDER}}/$CMT_P2P_EXTERNAL_ADDR/g" ${CMT_DIR}/config/config.toml
"""

# This is required to run several validators locally. You may want to disable it when running
diff --git a/infra/scripts/testnet.toml b/infra/scripts/testnet.toml
index 974b2685..20519960 100644
--- a/infra/scripts/testnet.toml
+++ b/infra/scripts/testnet.toml
@@ -6,7 +6,7 @@ dependencies = [
     "testnet-down",
     "testnet-init",
     "fendermint-deps",
-    "testnet-up"
+    "testnet-up",
 ]
 
[tasks.testnet-up]
script = """
if [ -z $GID ]; then GID=$(id -g); fi
if [ -z $UID ]; then UID=$(id -u); fi
export UID
export GID
-export CMT_PERSISTENT_PEERS=`cat $BASE_DIR/peers`
+export CMT_P2P_PERSISTENT_PEERS=`cat $BASE_DIR/peers`
export SUBNET_ID=$SUBNET_ID
export BASE_DIR=$BASE_DIR
./infra/run.sh start
"""
 
[tasks.testnet-down]
script = """
-export CMT_PERSISTENT_PEERS="UNDEFINED"
+export CMT_P2P_PERSISTENT_PEERS="UNDEFINED"
if [ -z $GID ]; then GID=$(id -g); fi
if [ -z $UID ]; then UID=$(id -u); fi
export UID
@@ -78,19 +78,19 @@
release ${nodes}
"""

[tasks.testnet-clear]
-script="""
+script = """
echo clearing all IPC data
rm -rf ${BASE_DIR}
"""

[tasks.testnet-mkdir]
-script="""
+script = """
mkdir -p ${BASE_DIR}
"""

[tasks.testnet-cometbft-init]
extend = "cometbft-init"
-env = { "CMD" = "init", "NETWORK_NAME"="${NETWORK_NAME}", "CMT_DIR" = "${BASE_DIR}/${NODE_NAME}/cometbft", "CMT_CONTAINER_NAME" = "cometbft-node${NUMBER}", "FLAGS" = "-a STDOUT -a STDERR --rm"}
+env = { "CMD" = "init", "NETWORK_NAME" = "${NETWORK_NAME}", "CMT_DIR" = "${BASE_DIR}/${NODE_NAME}/cometbft", "CMT_CONTAINER_NAME" = "cometbft-node${NUMBER}", "FLAGS" = "-a STDOUT -a STDERR --rm" }

[tasks.testnet-add-peer]
extend = "fendermint-tool"
env = { "ENTRY" = "fendermint", "CMD" = """key add-peer \
""" }

[tasks.testnet-setup-persistent-peers]
-script="""
-unset CMT_PERSISTENT_PEERS
-export CMT_PERSISTENT_PEERS=`cat $BASE_DIR/peers`
-echo Persistent peers: $CMT_PERSISTENT_PEERS
+script = """
+unset CMT_P2P_PERSISTENT_PEERS
+export CMT_P2P_PERSISTENT_PEERS=`cat $BASE_DIR/peers`
+echo Persistent peers: $CMT_P2P_PERSISTENT_PEERS
for i in $(seq 0 3); do
-    sed -i'' -e "s|persistent_peers = \\"\\"|persistent_peers = \\"$CMT_PERSISTENT_PEERS\\"|" $BASE_DIR/node${i}/cometbft/config/config.toml
+    sed -i'' -e "s|persistent_peers = \\"\\"|persistent_peers = \\"$CMT_P2P_PERSISTENT_PEERS\\"|" $BASE_DIR/node${i}/cometbft/config/config.toml
done
"""
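A recurring detail in these scripts is the CometBFT peer string: each entry is `<node-id>@<host>:<port>`, and entries are comma-separated, as in `CMT_P2P_PERSISTENT_PEERS` above. A hypothetical helper that renders such a list — not part of this PR, just the format the `sed` substitutions expect:

```rust
/// Render CometBFT's peer list: comma-separated `<node-id>@<host>:<port>`
/// entries, the format expected by `persistent_peers` in config.toml.
fn persistent_peers(peers: &[(&str, &str, u16)]) -> String {
    peers
        .iter()
        .map(|(id, host, port)| format!("{id}@{host}:{port}"))
        .collect::<Vec<_>>()
        .join(",")
}

fn main() {
    // Node ID shortened for illustration; real IDs are hex digests.
    let peers = [("d1a...", "snapshot-cometbft", 26656)];
    assert_eq!(persistent_peers(&peers), "d1a...@snapshot-cometbft:26656");
}
```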