Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

simpler heartbeat based on health #599

Merged
merged 6 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion deploy/prod/smpcv2-0-prod/values-iris-mpc.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
env:
- name: RUST_LOG
value: "debug"
value: "info"

- name: NCCL_SOCKET_IFNAME
value: "eth0"
Expand Down Expand Up @@ -100,6 +100,9 @@ env:
- name: SMPC__DISABLE_PERSISTENCE
value: "true"

- name: SMPC__NODE_HOSTNAMES
value: '["iris-mpc-node.1.smpcv2.worldcoin.org","iris-mpc-node.2.smpcv2.worldcoin.org","iris-mpc-node.3.smpcv2.worldcoin.org"]'

initContainer:
enabled: true
image: "amazon/aws-cli:2.17.62"
Expand Down
5 changes: 4 additions & 1 deletion deploy/prod/smpcv2-1-prod/values-iris-mpc.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
env:
- name: RUST_LOG
value: "debug"
value: "info"

- name: RUST_BACKTRACE
value: "1"
Expand Down Expand Up @@ -100,6 +100,9 @@ env:
- name: SMPC__DISABLE_PERSISTENCE
value: "true"

- name: SMPC__NODE_HOSTNAMES
value: '["iris-mpc-node.1.smpcv2.worldcoin.org","iris-mpc-node.2.smpcv2.worldcoin.org","iris-mpc-node.3.smpcv2.worldcoin.org"]'

initContainer:
enabled: true
image: "amazon/aws-cli:2.17.62"
Expand Down
5 changes: 4 additions & 1 deletion deploy/prod/smpcv2-2-prod/values-iris-mpc.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
env:
- name: RUST_LOG
value: "debug"
value: "info"

- name: RUST_BACKTRACE
value: "1"
Expand Down Expand Up @@ -100,6 +100,9 @@ env:
- name: SMPC__DISABLE_PERSISTENCE
value: "true"

- name: SMPC__NODE_HOSTNAMES
value: '["iris-mpc-node.1.smpcv2.worldcoin.org","iris-mpc-node.2.smpcv2.worldcoin.org","iris-mpc-node.3.smpcv2.worldcoin.org"]'

initContainer:
enabled: true
image: "amazon/aws-cli:2.17.62"
Expand Down
2 changes: 1 addition & 1 deletion deploy/stage/common-values-iris-mpc.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
image: "ghcr.io/worldcoin/iris-mpc:v0.8.31"
image: "ghcr.io/worldcoin/iris-mpc:v0.8.33"

environment: stage
replicaCount: 1
Expand Down
3 changes: 3 additions & 0 deletions iris-mpc-common/src/config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ pub struct Config {

#[serde(default)]
pub disable_persistence: bool,

#[serde(default)]
pub node_hostnames: Vec<String>,
}

fn default_processing_timeout_secs() -> u64 {
Expand Down
44 changes: 22 additions & 22 deletions iris-mpc-gpu/src/server/actor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -548,7 +548,7 @@ impl ServerActor {
///////////////////////////////////////////////////////////////////
// SYNC BATCH CONTENTS AND FILTER OUT INVALID ENTRIES
///////////////////////////////////////////////////////////////////
tracing::debug!("Syncing batch entries");
tracing::info!("Syncing batch entries");
let valid_entries = self.sync_batch_entries(&batch.valid_entries)?;
let valid_entry_idxs = valid_entries.iter().positions(|&x| x).collect::<Vec<_>>();
batch_size = valid_entry_idxs.len();
Expand All @@ -557,7 +557,7 @@ impl ServerActor {
///////////////////////////////////////////////////////////////////
// COMPARE LEFT EYE QUERIES
///////////////////////////////////////////////////////////////////
tracing::debug!("Comparing left eye queries");
tracing::info!("Comparing left eye queries");
// *Query* variant including Lagrange interpolation.
let compact_query_left = {
let code_query = preprocess_query(
Expand Down Expand Up @@ -618,7 +618,7 @@ impl ServerActor {
&self.cublas_handles[0],
)?;

tracing::debug!("Comparing left eye queries against DB and self");
tracing::info!("Comparing left eye queries against DB and self");
self.compare_query_against_db_and_self(
&compact_device_queries_left,
&compact_device_sums_left,
Expand All @@ -629,7 +629,7 @@ impl ServerActor {
///////////////////////////////////////////////////////////////////
// COMPARE RIGHT EYE QUERIES
///////////////////////////////////////////////////////////////////
tracing::debug!("Comparing right eye queries");
tracing::info!("Comparing right eye queries");
// *Query* variant including Lagrange interpolation.
let compact_query_right = {
let code_query = preprocess_query(
Expand Down Expand Up @@ -690,7 +690,7 @@ impl ServerActor {
&self.cublas_handles[0],
)?;

tracing::debug!("Comparing right eye queries against DB and self");
tracing::info!("Comparing right eye queries against DB and self");
self.compare_query_against_db_and_self(
&compact_device_queries_right,
&compact_device_sums_right,
Expand All @@ -701,7 +701,7 @@ impl ServerActor {
///////////////////////////////////////////////////////////////////
// MERGE LEFT & RIGHT results
///////////////////////////////////////////////////////////////////
tracing::debug!("Joining both sides");
tracing::info!("Joining both sides");
// Merge results and fetch matching indices
// Format: host_results[device_index][query_index]
self.distance_comparator.join_db_matches(
Expand Down Expand Up @@ -870,7 +870,7 @@ impl ServerActor {
self.current_db_sizes[i] += 1;
}

tracing::debug!(
tracing::info!(
"Updating DB size on device {}: {:?}",
i,
self.current_db_sizes[i]
Expand Down Expand Up @@ -979,10 +979,10 @@ impl ServerActor {
};

// ---- START BATCH DEDUP ----
tracing::debug!(party_id = self.party_id, "Starting batch deduplication");
tracing::info!(party_id = self.party_id, "Starting batch deduplication");

record_stream_time!(&self.device_manager, batch_streams, events, "batch_dot", {
tracing::debug!(party_id = self.party_id, "batch_dot start");
tracing::info!(party_id = self.party_id, "batch_dot start");

compact_device_queries.compute_dot_products(
&mut self.batch_codes_engine,
Expand All @@ -992,7 +992,7 @@ impl ServerActor {
batch_streams,
batch_cublas,
);
tracing::debug!(party_id = self.party_id, "compute_dot_reducers start");
tracing::info!(party_id = self.party_id, "compute_dot_reducers start");

compact_device_sums.compute_dot_reducers(
&mut self.batch_codes_engine,
Expand All @@ -1001,7 +1001,7 @@ impl ServerActor {
0,
batch_streams,
);
tracing::debug!(party_id = self.party_id, "batch_dot end");
tracing::info!(party_id = self.party_id, "batch_dot end");
});

record_stream_time!(
Expand All @@ -1010,13 +1010,13 @@ impl ServerActor {
events,
"batch_reshare",
{
tracing::debug!(party_id = self.party_id, "batch_reshare start");
tracing::info!(party_id = self.party_id, "batch_reshare start");
self.batch_codes_engine
.reshare_results(&self.query_db_size, batch_streams);
tracing::debug!(party_id = self.party_id, "batch_reshare masks start");
tracing::info!(party_id = self.party_id, "batch_reshare masks start");
self.batch_masks_engine
.reshare_results(&self.query_db_size, batch_streams);
tracing::debug!(party_id = self.party_id, "batch_reshare end");
tracing::info!(party_id = self.party_id, "batch_reshare end");
}
);

Expand All @@ -1031,17 +1031,17 @@ impl ServerActor {
events,
"batch_threshold",
{
tracing::debug!(party_id = self.party_id, "batch_threshold start");
tracing::info!(party_id = self.party_id, "batch_threshold start");
self.phase2_batch.compare_threshold_masked_many(
&code_dots_batch,
&mask_dots_batch,
batch_streams,
);
tracing::debug!(party_id = self.party_id, "batch_threshold end");
tracing::info!(party_id = self.party_id, "batch_threshold end");
}
);

tracing::debug!(party_id = self.party_id, "phase2_batch start");
tracing::info!(party_id = self.party_id, "phase2_batch start");

let res = self.phase2_batch.take_result_buffer();
let chunk_size = self.phase2_batch.chunk_size();
Expand All @@ -1060,7 +1060,7 @@ impl ServerActor {
);
self.phase2_batch.return_result_buffer(res);

tracing::debug!(party_id = self.party_id, "Finished batch deduplication");
tracing::info!(party_id = self.party_id, "Finished batch deduplication");
// ---- END BATCH DEDUP ----

// Create new initial events
Expand All @@ -1072,7 +1072,7 @@ impl ServerActor {
let mut next_phase2_event = self.device_manager.create_events();

// ---- START DATABASE DEDUP ----
tracing::debug!(party_id = self.party_id, "Start DB deduplication");
tracing::info!(party_id = self.party_id, "Start DB deduplication");
let ignore_device_results: Vec<bool> =
self.current_db_sizes.iter().map(|&s| s == 0).collect();
let mut db_chunk_idx = 0;
Expand Down Expand Up @@ -1229,10 +1229,10 @@ impl ServerActor {
// ---- END DATABASE DEDUP ----

// Wait for protocol to finish
tracing::debug!(party_id = self.party_id, "waiting for db search to finish");
tracing::info!(party_id = self.party_id, "waiting for db search to finish");
self.device_manager.await_streams(&self.streams[0]);
self.device_manager.await_streams(&self.streams[1]);
tracing::debug!(party_id = self.party_id, "db search finished");
tracing::info!(party_id = self.party_id, "db search finished");

// Reset the results buffers for reuse
for dst in &[&self.results, &self.batch_results, &self.final_results] {
Expand Down Expand Up @@ -1416,7 +1416,7 @@ fn get_merged_results(host_results: &[Vec<u32>], n_devices: usize) -> Vec<u32> {

results.push(match_entry);

tracing::debug!(
tracing::info!(
"Query {}: match={} [index: {}]",
j,
match_entry != NON_MATCH_ID,
Expand Down
99 changes: 0 additions & 99 deletions iris-mpc-gpu/src/server/heartbeat_nccl.rs

This file was deleted.

1 change: 0 additions & 1 deletion iris-mpc-gpu/src/server/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
mod actor;
pub mod heartbeat_nccl;
pub mod sync_nccl;

use crate::dot::ROTATIONS;
Expand Down
2 changes: 1 addition & 1 deletion iris-mpc/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ dotenvy.workspace = true
rand.workspace = true
base64.workspace = true
uuid.workspace = true

reqwest.workspace = true
sodiumoxide = "0.2.7"
iris-mpc-gpu = { path = "../iris-mpc-gpu" }
iris-mpc-common = { path = "../iris-mpc-common" }
Expand Down
Loading
Loading