Skip to content

Commit

Permalink
Prepare v0.3.2 (#90)
Browse files Browse the repository at this point in the history
* clippy

* bump version

* write changelog and update image tag in docs

* rebuild ui

* bump deps in examples

* various stability, logging and test improvements
  • Loading branch information
sebadob authored Dec 12, 2024
1 parent 05a89f8 commit 156def1
Show file tree
Hide file tree
Showing 61 changed files with 183 additions and 130 deletions.
19 changes: 19 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,24 @@
# Changelog

## v0.3.2

This version will make Raft cluster formation and re-joins of nodes after restarts more robust. Additional checks and
logic have been added to figure out the correct behavior automatically. The tricky part about this for cache nodes is,
that they don't persist their state, which is usually needed for Raft. This makes the auto-setup of cache Raft's more
challenging, but this version seems pretty good in that regard so far.

Additionally, a new optional config variable has been added to set an initial delay for `/health` checks:

```
# Configures the initial delay in seconds that should be applied
# to `<API>/health` checks. During the first X seconds after node
# start, health checks will always return true to solve a chicken
# and egg problem when you want to cold-start a cluster while
# relying on `readinessProbe` checks.
# default: 30
HQL_HEALTH_CHECK_DELAY_SECS=30
```

## v0.3.1

This version only bumps the `svelte` dependency for the prebuilt dashboard to fix some build steps and bugs.
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ members = ["hiqlite"]
exclude = ["examples"]

[workspace.package]
version = "0.3.1"
version = "0.3.2"
edition = "2021"
license = "Apache-2.0"
authors = ["Sebastian Dobe <[email protected]"]
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,7 @@ spec:
spec:
containers:
- name: hiqlite
image: ghcr.io/sebadob/hiqlite:0.3.1
image: ghcr.io/sebadob/hiqlite:0.3.2
imagePullPolicy: Always
securityContext:
allowPrivilegeEscalation: false
Expand Down Expand Up @@ -639,7 +639,7 @@ spec:
spec:
containers:
- name: hiqlite-proxy
image: ghcr.io/sebadob/hiqlite:0.3.1
image: ghcr.io/sebadob/hiqlite:0.3.2
command: [ "/app/hiqlite", "proxy" ]
imagePullPolicy: Always
securityContext:
Expand Down
8 changes: 8 additions & 0 deletions config
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,14 @@ HQL_LOGS_UNTIL_SNAPSHOT=10000
HQL_SECRET_RAFT=SuperSecureSecret1337
HQL_SECRET_API=SuperSecureSecret1337

# Configures the initial delay in seconds that should be applied
# to `<API>/health` checks. During the first X seconds after node
# start, health checks will always return true to solve a chicken
# and egg problem when you want to cold-start a cluster while
# relying on `readinessProbe` checks.
# default: 30
HQL_HEALTH_CHECK_DELAY_SECS=30

# You can either parse `ENC_KEYS` and `ENC_KEY_ACTIVE` from the
# environment with setting this value to `env`, or parse them from
# a file on disk with `file:path/to/enc/keys/file`
Expand Down
3 changes: 1 addition & 2 deletions examples/bench/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions examples/cache-only/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions examples/sqlite-only/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions examples/walkthrough/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions hiqlite/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,7 @@ spec:
spec:
containers:
- name: hiqlite
image: ghcr.io/sebadob/hiqlite:0.3.1
image: ghcr.io/sebadob/hiqlite:0.3.2
imagePullPolicy: Always
securityContext:
allowPrivilegeEscalation: false
Expand Down Expand Up @@ -639,7 +639,7 @@ spec:
spec:
containers:
- name: hiqlite-proxy
image: ghcr.io/sebadob/hiqlite:0.3.1
image: ghcr.io/sebadob/hiqlite:0.3.2
command: [ "/app/hiqlite", "proxy" ]
imagePullPolicy: Always
securityContext:
Expand Down
8 changes: 4 additions & 4 deletions hiqlite/src/backup.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::app_state::AppState;
use crate::helpers::fn_access;
use crate::helpers::set_path_access;
use crate::s3::S3Config;
use crate::store::logs;
use crate::store::state_machine::sqlite::state_machine::{
Expand Down Expand Up @@ -300,7 +300,7 @@ pub async fn restore_backup(node_config: &NodeConfig, src: BackupSource) -> Resu
let path_logs = logs::logs_dir(&node_config.data_dir);

fs::create_dir_all(&path_backups).await?;
fn_access(&path_backups, 0o700).await?;
set_path_access(&path_backups, 0o700).await?;

let (path_backup, remove_src) = match src {
BackupSource::S3(s3_obj) => {
Expand Down Expand Up @@ -340,15 +340,15 @@ pub async fn restore_backup(node_config: &NodeConfig, src: BackupSource) -> Resu
let _ = fs::remove_dir_all(&path_logs).await;

fs::create_dir_all(&path_db).await?;
fn_access(&path_db, 0o700).await?;
set_path_access(&path_db, 0o700).await?;

let path_db_full = format!("{}/{}", path_db, node_config.filename_db);
info!(
"Given backup check ok - copying into its final place: {} -> {}",
path_backup, path_db_full
);
fs::copy(&path_backup, &path_db_full).await?;
fn_access(&path_db_full, 0o700).await?;
set_path_access(&path_db_full, 0o700).await?;

if remove_src {
info!("Cleaning up S3 backup from {}", path_backup);
Expand Down
9 changes: 5 additions & 4 deletions hiqlite/src/client/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -228,10 +228,12 @@ impl Client {
lock: &Arc<RwLock<(NodeId, String)>>,
tx: &flume::Sender<ClientStreamReq>,
) -> bool {
let mut has_changed = false;
let mut was_leader_error = false;

if let Some((id, node)) = err.is_forward_to_leader() {
if id.is_some() && node.is_some() {
was_leader_error = true;

let api_addr = node.as_ref().unwrap().addr_api.clone();
let leader_id = id.unwrap();
{
Expand All @@ -240,18 +242,17 @@ impl Client {
// re-connect triggers
if lock.0 != leader_id {
*lock = (leader_id, api_addr.clone());
has_changed = true;
}
}

if has_changed {
if was_leader_error {
tx.send_async(ClientStreamReq::LeaderChange((id, node.clone())))
.await
.expect("the Client API WebSocket Manager to always be running");
}
}
}

has_changed
was_leader_error
}
}
6 changes: 3 additions & 3 deletions hiqlite/src/client/mgmt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use std::sync::Arc;
use std::time::Duration;
use tokio::sync::watch;
use tokio::time;
use tracing::{error, info};
use tracing::{debug, info};

#[cfg(feature = "sqlite")]
use crate::store::{logs::rocksdb::ActionWrite, state_machine::sqlite::writer::WriterRequest};
Expand Down Expand Up @@ -136,8 +136,8 @@ impl Client {
return;
}
Err(err) => {
debug!("Waiting for healthy Raft DB: {:?}", err);
info!("Waiting for healthy Raft DB");
error!("\n\n{}\n", err);
time::sleep(Duration::from_millis(500)).await;
}
}
Expand All @@ -153,8 +153,8 @@ impl Client {
return;
}
Err(err) => {
debug!("Waiting for healthy Raft cache: {:?}", err);
info!("Waiting for healthy Raft cache");
error!("{}", err);
time::sleep(Duration::from_millis(500)).await;
}
}
Expand Down
2 changes: 1 addition & 1 deletion hiqlite/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ impl From<std::io::Error> for Error {

impl From<Box<bincode::ErrorKind>> for Error {
fn from(value: Box<ErrorKind>) -> Self {
trace!("\n\nbincode::ErrorKind: {}\n", value);
trace!("bincode::ErrorKind: {}", value);
Self::Bincode(value.to_string())
}
}
Expand Down
42 changes: 40 additions & 2 deletions hiqlite/src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,43 @@ pub async fn is_raft_initialized(
) -> Result<bool, Error> {
match raft_type {
#[cfg(feature = "sqlite")]
RaftType::Sqlite => Ok(state.raft_db.raft.is_initialized().await?),
RaftType::Sqlite => {
if !state.raft_db.raft.is_initialized().await? {
Ok(false)
} else {
/*
We can get in a tricky situation here.
In most cases, the `.is_initialized()` gives just the information we want.
But, if *this* node lost its volume and therefore membership state, and another
leader is still running and trying to reach *this* node before it can fully start
up (race condition), the raft will report being initialized via this check, while
it actually is not, because it lost all its state.
If we get into this situation, we will have a committed leader vote, but no other
data like logs and membership config.
*/

let metrics = state.raft_db.raft.server_metrics().borrow().clone();

#[cfg(debug_assertions)]
if metrics.current_leader.is_none() && metrics.vote.leader_id().node_id == state.id
{
panic!(
"current_leader.is_none() && metrics.vote.leader_id().node_id == state.id"
)
}

if metrics.vote.committed
&& metrics.vote.leader_id.node_id != state.id
&& metrics.current_leader.is_none()
{
// If we get here, we have a race condition and a remote leader initialized this
// node after a data volume loss before it had a change to re-join and sync data.
Ok(false)
} else {
Ok(true)
}
}
}
#[cfg(feature = "cache")]
RaftType::Cache => Ok(state.raft_cache.raft.is_initialized().await?),
RaftType::Unknown => panic!("neither `sqlite` nor `cache` feature enabled"),
Expand Down Expand Up @@ -41,6 +77,8 @@ pub async fn get_raft_metrics(
}
}

/// Raft locking - necessary for auto-cluster-join scenarios of remote notes to prevent
/// race conditions.
pub async fn lock_raft<'a>(
state: &'a Arc<AppState>,
raft_type: &'a RaftType,
Expand Down Expand Up @@ -110,7 +148,7 @@ pub async fn change_membership(
/// Restricts the access for the given path.
#[cfg(feature = "sqlite")]
#[inline]
pub async fn fn_access(path: &str, mode: u32) -> Result<(), Error> {
pub async fn set_path_access(path: &str, mode: u32) -> Result<(), Error> {
#[cfg(target_family = "unix")]
{
use std::fs::Permissions;
Expand Down
15 changes: 13 additions & 2 deletions hiqlite/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,10 @@

#[cfg(feature = "sqlite")]
use crate::store::state_machine::sqlite::state_machine::Response;
pub use openraft::SnapshotPolicy;
use serde::{Deserialize, Serialize};
use std::fmt::{Debug, Display};

pub use openraft::SnapshotPolicy;

#[cfg(any(feature = "sqlite", feature = "cache"))]
pub use crate::{client::Client, error::Error};
#[cfg(any(feature = "sqlite", feature = "cache"))]
Expand Down Expand Up @@ -81,6 +80,18 @@ pub mod server;

type NodeId = u64;

#[cfg(any(feature = "cache", feature = "sqlite"))]
pub(crate) static START_TS: std::sync::LazyLock<chrono::DateTime<chrono::Utc>> =
std::sync::LazyLock::new(chrono::Utc::now);
#[cfg(any(feature = "cache", feature = "sqlite"))]
pub(crate) static HEALTH_CHECK_DELAY_SECS: std::sync::LazyLock<u16> =
std::sync::LazyLock::new(|| {
std::env::var("HQL_HEALTH_CHECK_DELAY_SECS")
.unwrap_or_else(|_| String::from("30"))
.parse::<u16>()
.expect("Cannot parse HQL_HEALTH_CHECK_DELAY_SECS as u16")
});

/// Helper macro to created Owned Params which can be serialized and sent
/// over the Raft network between nodes.
#[macro_export]
Expand Down
11 changes: 10 additions & 1 deletion hiqlite/src/network/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use fastwebsockets::{upgrade, FragmentCollectorRead, Frame, OpCode, Payload};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::fmt::Debug;
use std::ops::Deref;
use std::ops::{Deref, Sub};
use std::time::Duration;
use tokio::{task, time};
use tracing::{error, info, warn};
Expand All @@ -33,8 +33,10 @@ use crate::{

#[cfg(feature = "listen_notify")]
use crate::store::state_machine::memory::notify_handler::NotifyRequest;
use crate::{HEALTH_CHECK_DELAY_SECS, START_TS};
#[cfg(feature = "listen_notify")]
use axum::response::sse;
use chrono::Utc;
#[cfg(feature = "listen_notify")]
use futures_util::stream::Stream;
// pub(crate) async fn write(
Expand Down Expand Up @@ -117,6 +119,13 @@ pub async fn health(state: AppStateExt) -> Result<(), Error> {

#[cfg(any(feature = "sqlite", feature = "cache"))]
async fn check_health(state: &AppStateExt) -> Result<(), Error> {
if Utc::now().sub(*START_TS).num_seconds() < *HEALTH_CHECK_DELAY_SECS as i64 {
info!(
"Early health check within the HQL_HEALTH_CHECK_DELAY_SECS timeframe - returning true"
);
return Ok(());
}

#[cfg(feature = "sqlite")]
{
let metrics = state.raft_db.raft.metrics().borrow().clone();
Expand Down
3 changes: 2 additions & 1 deletion hiqlite/src/network/raft_client_split.rs
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,8 @@ impl NetworkStreaming {
}
});

time::sleep(Duration::from_millis(heartbeat_interval)).await;
// if there is a network error, don't try too hard to connect
time::sleep(Duration::from_millis(heartbeat_interval * 3)).await;
continue;
}
}
Expand Down
Loading

0 comments on commit 156def1

Please sign in to comment.