diff --git a/Cargo.toml b/Cargo.toml index e198ff7..e799aee 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ rustdoc-args = ["--cfg", "docsrs"] ahash = "0.8.11" bincode = "1.3.3" chashmap = { version = "2.2.2", optional = true } -clap = { version = "=4.5.13", features = ["derive"] } +clap = { version = "=4.5.14", features = ["derive"] } contrie = { version = "0.1.4", optional = true } core_affinity = "0.8.1" ctrlc = "3.4.4" @@ -37,13 +37,13 @@ parking_lot = "0.12.3" quanta = "0.12.3" rand = "0.8.5" rocksdb = { version = "0.22.0", optional = true } -scc = { version = "2.1.6", optional = true } -serde = { version = "1.0.204", features = ["derive"] } +scc = { version = "2.1.9", optional = true } +serde = { version = "1.0.205", features = ["derive"] } toml = "0.8.19" zipf = "7.0.1" [dev-dependencies] -tempfile = "3.10.1" +tempfile = "3.12.0" [features] chashmap = ["dep:chashmap"] diff --git a/README.md b/README.md index fd7c267..7d245c1 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ The [documentation](https://docs.rs/kvbench) provides detailed usage guidelines. ## Development -The missing pieces that are currently under active development: +This project is being actively developed. The following tasks are currently being worked on: - Read-modify-write (RMW) support. - More built-in stores and benchmark parameters. diff --git a/src/bench.rs b/src/bench.rs index 984d9e2..3dae38f 100644 --- a/src/bench.rs +++ b/src/bench.rs @@ -1,133 +1,4 @@ //! The core benchmark functionality. -//! -//! A benchmark in this crate actually refers to a group of benchmark runs, named **phases**. Users -//! can provide one or multiple phases that will be run sequentially, each with different -//! configurations. -//! -//! ## Configuration Format -//! -//! A benchmark configuration file is formatted in TOML. It consists of the definition of multiple -//! phases, each is defined in a dictionary named `benchmark`. Phases are organized in an array, so -//! the configuration of each phase starts with `[[benchmark]]`. It also supports a `[global]` -//! section in the configuration file that will override the missing field in each phase. This can -//! reduce the number of repeated options in each phase (e.g., shared options). -//! -//! A configuration file generally looks like the following: -//! -//! ```toml -//! [global] -//! # global options -//! -//! [[benchmark]] -//! # phase 1 configuration -//! -//! [[benchmark]] -//! # phase 2 configuration -//! -//! ... -//! ``` -//! -//! Available options and their usage can be found in [`BenchmarkOpt`] and [`GlobalOpt`], for phase -//! and global options, respectively. -//! -//! Options in `[global]` section can be overwritten via environment variables without changing the -//! content in the TOML file. -//! For example, if the user needs to override `x` in `[global]`, setting the environment variable -//! `global.x` will get the job done. -//! -//! ## Output Format -//! -//! Currently, all outputs are in plain text format. This makes the output easy to process using -//! shell scripts and tools including gnuplot. If there are new data added to the output, it -//! will be appended at the end of existing entries (but before `cdf` if it exists, see below) -//! to make sure outputs from old versions can still be processed without changes. -//! -//! ### Throughput-only Output (default case) -//! -//! When measuring throughput, an output may look like the following: -//! ```txt -//! phase 0 repeat 0 duration 1.00 elapsed 1.00 total 1000000 mops 1.00 -//! 
phase 0 repeat 1 duration 1.00 elapsed 2.00 total 1000000 mops 1.00 -//! phase 0 repeat 2 duration 1.00 elapsed 3.00 total 1000000 mops 1.00 -//! phase 0 finish . duration 1.00 elapsed 3.00 total 3000000 mops 1.00 -//! ``` -//! -//! The general format is: -//! -//! ```txt -//! phase <pid> repeat <rid> duration <duration> elapsed <elapsed> total <total> mops <mops> -//! ``` -//! -//! Where: -//! -//! - `<pid>`: phase id. -//! - `<rid>`: repeat id in a phase, or string `finish .`, if the line is the aggregated report -//! of a whole phase. -//! - `<duration>`: the duration of the repeat/phase, in seconds. -//! - `<elapsed>`: the total elapsed seconds since the starting of the program. -//! - `<total>`: the total key-value operations executed by all worker threads in the repeat/phase. -//! - `<mops>`: followed by the throughput in million operations per second of the repeat/phase. -//! -//! ### Throughput + Latency Output (when `latency` is `true`) -//! -//! When latency measurement is enabled, the latency metrics shall be printed at the end of each -//! benchmark. It is not shown after each repeat, because unlike throughput which is a singleton -//! value at a given time, latency is a set of values and it usually matters only when we aggregate -//! a lot of them. The output format in this case is generally the same as throughput-only -//! measurements, but the `finish` line has extra output like the following: -//! -//! ```txt -//! phase 0 repeat 0 duration 1.00 elapsed 1.00 total 1000000 mops 1.00 -//! phase 0 repeat 1 duration 1.00 elapsed 2.00 total 1000000 mops 1.00 -//! phase 0 repeat 2 duration 1.00 elapsed 3.00 total 1000000 mops 1.00 -//! phase 0 finish . duration 1.00 elapsed 3.00 total 3000000 mops 1.00 min_us 0.05 max_us 100.00 avg_us 50.00 p50_us 50.00 p95_us 95.00 p99_us 99.00 p999_us 100.00 -//! ``` -//! -//! The extra output on the last line has a format of: -//! -//! ```txt -//! min_us <min> max_us <max> avg_us <avg> p50_us <p50> p95_us <p95> p99_us <p99> p999_us <p999> -//! ``` -//! -//! Where (all units are microseconds): -//! -//! - `<min>`: minimum latency -//! - `<max>`: maximum latency -//! - `<avg>`: mean latency -//! - `<p50>`: median latency (50% percentile) -//! - `<p95>`: P95 latency -//! - `<p99>
`: P99 latency -//! - ``: P999 latency (99.9%) -//! -//! ### Throughput + Latency + Latency CDF Mode (when both `latency` and `cdf` are `true`) -//! -//! When `cdf` is enabled, the latency CDF data will be printed at the end of the same line as the -//! latency metrics above. In that case, the output will be like the following: -//! -//! ```txt -//! phase 0 repeat 0 duration 1.00 elapsed 1.00 total 1000000 mops 1.00 -//! phase 0 repeat 1 duration 1.00 elapsed 2.00 total 1000000 mops 1.00 -//! phase 0 repeat 2 duration 1.00 elapsed 3.00 total 1000000 mops 1.00 -//! phase 0 finish . duration 1.00 elapsed 3.00 total 3000000 mops 1.00 min_us 0.05 max_us 100.00 avg_us 50.00 p50_us 50.00 p95_us 95.00 p99_us 99.00 p999_us 100.00 cdf_us percentile ... -//! ``` -//! Since the latency metrics vary a lot between different benchmarks/runs, the number of data -//! points of the CDF is different. Therefore, it is printed at the end of the output only. It is -//! printed as a tuple of ` ` where `` is the latency in microseconds and -//! `` is the percentile of the accumulated operations with latency higher than between -//! ` - 1` and ``, inclusively, ranging from 0 to 100 (two digit precision). -//! There can be arbitrary number of tuples. The output ends when the maximum recorded latency is -//! reached. -//! -//! An example of the CDF data will look like: -//! -//! ```txt -//! cdf_us percentile 1 0.00 2 0.00 3 0.00 4 10.00 5 20.00 6 20.00 ... -//! ``` -//! -//! It means there are not data points at 1/2/3 microseconds. At 4 microseconds, there are 10% data -//! points. At 5 microseconds, there are another 10% data points which makes the total percentile -//! 20.00. At 6 microseconds, there are no data points so the percentile is still 20.00. Users can -//! post-process the output and make a smooth CDF plot out of it. use crate::stores::{BenchKVMap, BenchKVMapOpt}; use crate::workload::{Workload, WorkloadOpt}; @@ -174,7 +45,12 @@ enum ReportMode { /// The configuration of a single benchmark deserialized from a TOML string. /// /// The fields are optional to ease parsing from TOML, as there can be global parameters that are -/// set for them. +/// set for them. The default value will be applied if an option is not specified by both the file +/// and the global option. +/// +/// **Note**: If an option not explicitly marked optional and it is not specified by both the file +/// and the global option, its default value will be applied. If it has no default value, an error +/// will be raised. The precedence of a value is: file > global (after env overridden) > default. #[derive(Deserialize, Clone, Debug)] pub struct BenchmarkOpt { /// Number of threads that runs this benchmark. @@ -182,10 +58,11 @@ pub struct BenchmarkOpt { /// Default: 1. pub threads: Option, - /// How many times this benchmark will be repeated. This option is useful when user would like - /// to plot the performance trend over time in the same benchmark. For example, setting this - /// option to 100 with one second timeout for each repeat can provide 100 data points over a - /// 100 second period. + /// How many times this benchmark will be repeated. + /// + /// This option is useful when user would like to plot the performance trend over time in the + /// same benchmark. For example, setting this option to 100 with one second timeout for each + /// repeat can provide 100 data points over a 100 second period. /// /// Default: 1. pub repeat: Option, @@ -194,23 +71,29 @@ pub struct BenchmarkOpt { /// option will be ignored. 
/// /// Note: see `ops`. + /// + /// *This value is optional.* pub timeout: Option, /// How many operations each worker will execute. Only used if `timeout` is not given. /// /// Note: if both `timeout` and `ops` are not given, the run is only stopped when all possible /// keys are generated. + /// + /// *This value is optional.* pub ops: Option, - /// Report mode: + /// Report mode. /// /// - "hidden": not reported. /// - "repeat": after each repeat, the metrics for that repeat is printed. /// - "finish": after all repeats are finished, the metrics of the whole phase is printed. /// - "all": equals to "repeat" + "finish". + /// + /// Default: "all". pub report: Option, - /// Max depth of queue for each worker. Only useful with [`AsyncKVMap`]. + /// Max depth of queue for each worker (only used with async stores). /// /// When the pending requests are less than `qd`, the worker will not attempt to get more /// responses. @@ -218,7 +101,7 @@ pub struct BenchmarkOpt { /// Default: 1. pub qd: Option, - /// Batch size for each request. Only useful with [`AsyncKVMap`]. + /// Batch size for each request (only used with async stores). /// /// Default: 1. pub batch: Option, @@ -368,8 +251,8 @@ impl Benchmark { /// The global options that go to the `[global]` section. /// -/// They will override missing fields in each `[[benchmark]]` section, if the corresponding option -/// is missing. For the usage of each option, please refer to [`BenchmarkOpt`]. +/// They will override the unspecified fields in each `[[benchmark]]` section with the same name. +/// For the usage of each option, please refer to [`BenchmarkOpt`]. #[derive(Deserialize, Clone, Debug)] pub struct GlobalOpt { // benchmark diff --git a/src/cmdline.rs b/src/cmdline.rs index 3cf7f36..c59b862 100644 --- a/src/cmdline.rs +++ b/src/cmdline.rs @@ -120,8 +120,7 @@ fn list_cli() { /// ``` /// /// Where `STORE_CONFIG` and `BENCH_CONFIG` are the paths to the key-value store and benchmark -/// configuration files, respectively. For their format, you can refer to the documentations of -/// [`crate::stores`] and [`crate::bench`]. +/// configuration files, respectively. /// /// ### Server mode /// @@ -131,8 +130,7 @@ fn list_cli() { /// kvbench server -s -a -p -n /// ``` /// -/// Where `STORE_CONFIG` is the path of the key-value store configuration file. Its format is -/// documented in [`crate::stores`]. +/// Where `STORE_CONFIG` is the path of the key-value store configuration file. /// /// The default `HOST` and `PORT` are `0.0.0.0` and `9000`. By default, the server will spawn one /// worker thread only for incoming connections. You can adjust the number of worker threads by diff --git a/src/lib.rs b/src/lib.rs index 8f2587a..f17ea00 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,30 +3,188 @@ //! A benchmark framework designed for testing key-value stores with easily customizable //! workloads. //! -//! With `kvbench`, you can define the details of a benchmark using the TOML format, such as the -//! proportions of mixed operations, the key access pattern, and key space size, just to name a -//! few. In addition to regular single-process benchmarks, `kvbench` also integrates a key-value -//! client/server implementation that works with a dedicated server thread/machine. +//! Key features: //! -//! You can also incorporate `kvbench` into your own key-value store implementations and run it -//! against the built-in stores. All you need is implementing the [`KVMap`] or the [`AsyncKVMap`] -//! trait, depending on the type of the store. 
After registering your store, simply reuse the -//! exported [`cmdline()`] in your `main` function and it will work seamlessly with your own store. +//! 1. Flexible and ergonomic control over benchmark specifications using TOML configuration files. +//! 2. Collecting diverse metrics, including throughput, latency (w/ CDF), and rate-limited latency. +//! 3. One-shot execution of multiple benchmark steps with different properties. +//! 4. Various built-in key-value stores in place as well as a client/server implementation. +//! 5. Highly extensible and can be seamlessly integrated into your own store. //! -//! A few key design choices include: +//! # Benchmark Configuration //! -//! - Each key-value store exclusively stores a single type of key/value pair: variable-sized byte -//! arrays represented as [`u8`] slices on the heap. No generics over the key's type. -//! - The key-value store and the benchmark configurations are black boxes. They are created -//! dynamically from a TOML file, and dynamically dispatched. -//! - Benchmark functionalities can be reused in users' own crates: new key-value stores can be -//! dynamically registered without touching the source code of this crate. +//! A benchmark in kvbench consists of one or more benchmark runs, termed as *phases*. +//! Phases will be run sequentially following their order in the configuration file. //! -//! More detailed usage could be found in the module-level rustdocs: +//! A benchmark configuration file is formatted in TOML. It consists of the definition of each +//! phase in an array named `benchmark`, so the configuration of each phase starts with +//! `[[benchmark]]`. The file also optionally contains a `[global]` section which will override the +//! unspecified field in each phase. This can eliminate redundant options in each phase, for +//! example, when those options are the same across the board. //! -//! - [`mod@bench`] for the config format of a benchmark. -//! - [`mod@stores`] for the config format of a built-in key-value store. -//! - [`cmdline()`] for the usage of the default command line interface. +//! A configuration file generally looks like the following: +//! +//! ```toml +//! [global] +//! # global options +//! +//! [[benchmark]] +//! # phase 1 configuration +//! +//! [[benchmark]] +//! # phase 2 configuration +//! +//! ... +//! ``` +//! Options in `[global]` section can also be overwritten via environment variables without +//! modifying the TOML file. For example, if the user needs to override `x` in `[global]`, one can +//! set the environment variable `global.x` (case insensitive). This is helpful when the user would +//! like to run different benchmarks when changing only a few options using a shell script. +//! +//! **Reference** +//! +//! - [`BenchmarkOpt`]: the available options for benchmark phase configuration. +//! - [`GlobalOpt`]: the available options for global configuration. +//! +//! # Key-Value Store Configuration +//! +//! In addition to the specification of the benchmark itself, kvbench also requires the +//! parameters of the key-value store it runs against. Only one key-value store runs at a time. +//! +//! The configuration of a key-value store is stored in a dictionary `map`. +//! A store's configuration file looks like the following: +//! +//! ```toml +//! [map] +//! name = "..." +//! # option1 = ... +//! # option2 = ... +//! +//! ... +//! ``` +//! The field `name` must be given and it should be equal to the name registered by the store. +//! 
Other than `name`, all the fields are parsed as a string map and will be passed to the +//! store's constructor function. The options in `[map]` section can also be overwritten via +//! environment variables (e.g., setting `map.x` overrides property `x`). +//! +//! **Reference** +//! +//! - [`mod@stores`]: the available options for built-in stores and how to register new stores. +//! +//! # Run a Benchmark +//! +//! Once the configuration files of the benchmark along with the key-value store are ready, a +//! benchmark can be started by using the `bench` mode of the built-in command-line interface. +//! +//! **Reference** +//! +//! - [`cmdline()`]: the usage of the default command-line interface. +//! +//! # Metrics Collection +//! +//! Currently, all outputs are in plain text format. This makes the output easy to process using +//! shell scripts and tools including gnuplot. If there are new data added to the output, it +//! will be appended at the end of existing entries (but before `cdf` if it exists, see below) +//! to make sure outputs from old versions can still be processed without changes. +//! +//! ## Throughput-only Output (default case) +//! +//! When measuring throughput, an output may look like the following: +//! ```txt +//! phase 0 repeat 0 duration 1.00 elapsed 1.00 total 1000000 mops 1.00 +//! phase 0 repeat 1 duration 1.00 elapsed 2.00 total 1000000 mops 1.00 +//! phase 0 repeat 2 duration 1.00 elapsed 3.00 total 1000000 mops 1.00 +//! phase 0 finish . duration 1.00 elapsed 3.00 total 3000000 mops 1.00 +//! ``` +//! +//! The general format is: +//! +//! ```txt +//! phase

repeat <rid> duration <duration> elapsed <elapsed> total <total> mops <mops> +//! ``` +//! +//! Where: +//! +//! - `<pid>`: phase id. +//! - `<rid>`: repeat id in a phase, or string `finish .`, if the line is the aggregated report +//! of a whole phase. +//! - `<duration>`: the duration of the repeat/phase, in seconds. +//! - `<elapsed>`: the total elapsed seconds since the starting of the program. +//! - `<total>`: the total key-value operations executed by all worker threads in the repeat/phase. +//! - `<mops>`: followed by the throughput in million operations per second of the repeat/phase. +//! +//! ## Throughput + Latency Output (when `latency` is `true`) +//! +//! When latency measurement is enabled, the latency metrics shall be printed at the end of each +//! benchmark. It is not shown after each repeat, because unlike throughput which is a singleton +//! value at a given time, latency is a set of values and it usually matters only when we aggregate +//! a lot of them. The output format in this case is generally the same as throughput-only +//! measurements, but the `finish` line has extra output like the following: +//! +//! ```txt +//! phase 0 repeat 0 duration 1.00 elapsed 1.00 total 1000000 mops 1.00 +//! phase 0 repeat 1 duration 1.00 elapsed 2.00 total 1000000 mops 1.00 +//! phase 0 repeat 2 duration 1.00 elapsed 3.00 total 1000000 mops 1.00 +//! phase 0 finish . duration 1.00 elapsed 3.00 total 3000000 mops 1.00 min_us 0.05 max_us 100.00 avg_us 50.00 p50_us 50.00 p95_us 95.00 p99_us 99.00 p999_us 100.00 +//! ``` +//! +//! The extra output on the last line has a format of: +//! +//! ```txt +//! min_us <min> max_us <max> avg_us <avg> p50_us <p50> p95_us <p95> p99_us <p99> p999_us <p999> +//! ``` +//! +//! Where (all units are microseconds): +//! +//! - `<min>`: minimum latency +//! - `<max>`: maximum latency +//! - `<avg>`: mean latency +//! - `<p50>`: median latency (50% percentile) +//! - `<p95>`: P95 latency +//! - `<p99>
`: P99 latency +//! - ``: P999 latency (99.9%) +//! +//! ## Throughput + Latency + Latency CDF Mode (when both `latency` and `cdf` are `true`) +//! +//! When `cdf` is enabled, the latency CDF data will be printed at the end of the same line as the +//! latency metrics above. In that case, the output will be like the following: +//! +//! ```txt +//! phase 0 repeat 0 duration 1.00 elapsed 1.00 total 1000000 mops 1.00 +//! phase 0 repeat 1 duration 1.00 elapsed 2.00 total 1000000 mops 1.00 +//! phase 0 repeat 2 duration 1.00 elapsed 3.00 total 1000000 mops 1.00 +//! phase 0 finish . duration 1.00 elapsed 3.00 total 3000000 mops 1.00 min_us 0.05 max_us 100.00 avg_us 50.00 p50_us 50.00 p95_us 95.00 p99_us 99.00 p999_us 100.00 cdf_us percentile ... +//! ``` +//! Since the latency metrics vary a lot between different benchmarks/runs, the number of data +//! points of the CDF is different. Therefore, it is printed at the end of the output only. It is +//! printed as a tuple of ` ` where `` is the latency in microseconds and +//! `` is the percentile of the accumulated operations with latency higher than between +//! ` - 1` and ``, inclusively, ranging from 0 to 100 (two digit precision). +//! There can be arbitrary number of tuples. The output ends when the maximum recorded latency is +//! reached. +//! +//! An example of the CDF data will look like: +//! +//! ```txt +//! cdf_us percentile 1 0.00 2 0.00 3 0.00 4 10.00 5 20.00 6 20.00 ... +//! ``` +//! +//! It means there are not data points at 1/2/3 microseconds. At 4 microseconds, there are 10% data +//! points. At 5 microseconds, there are another 10% data points which makes the total percentile +//! 20.00. At 6 microseconds, there are no data points so the percentile is still 20.00. Users can +//! post-process the output and make a smooth CDF plot out of it. +//! +//! # Server Mode +//! A key-value client/server implementation is available in kvbench. The server can be backed by +//! an arbitrary key-value store defined by a TOML file as in a benchmark, and the server can be +//! started using the `server` mode of the built-in command-line interface. +//! +//! To benchmark the server's performance, users can use the built-in client implementation. +//! +//! **Reference** +//! +//! - [`cmdline()`]: the usage of the default command-line interface. +//! - [`stores::remote`]: the available options of the key-value store client. use serde::{Deserialize, Serialize}; use std::cell::RefCell; @@ -83,7 +241,7 @@ pub enum Operation { Scan { key: Box<[u8]>, n: usize }, } -/// A request sent by a client to a server. +/// A request submitted by an asynchronous store. #[derive(Serialize, Deserialize, Eq, PartialEq, Clone, Debug)] pub struct Request { /// The (usually unique) identifier of the request, or custom data. @@ -93,7 +251,7 @@ pub struct Request { pub op: Operation, } -/// A response sent by a server to a client. +/// A response received by an asynchronous store. #[derive(Serialize, Deserialize, Eq, PartialEq, Clone, Debug)] pub struct Response { /// The `id` of the corresponding request. 
@@ -153,14 +311,16 @@ impl AsyncResponder for RefCell> { } } -pub mod bench; +mod bench; mod cmdline; -pub mod server; +mod server; pub mod stores; pub mod thread; -pub mod workload; +mod workload; +pub use bench::{BenchmarkOpt, GlobalOpt}; pub use cmdline::cmdline; +pub use workload::WorkloadOpt; pub extern crate inventory; pub extern crate toml; diff --git a/src/stores.rs b/src/stores.rs index f8b7469..e9cf6f6 100644 --- a/src/stores.rs +++ b/src/stores.rs @@ -1,27 +1,10 @@ //! Adapters for built-in and external key-value stores. //! -//! ## Configuration Format +//! ## Built-in Stores //! -//! The configuration of a key-value store is stored in a dictionary named `map`. Therefore, a -//! store's configuration file looks like the following: -//! -//! ```toml -//! [map] -//! name = "..." -//! # option1 = ... -//! # option2 = ... -//! -//! ... -//! ``` -//! The field `name` must be given and it should be equal to the name registered by the store. -//! Other than `name`, all the fields are parsed as a string map and will be hand over to the -//! constructor of the store's constructor function. For available options other than `name`, one -//! can refer to the module-level documentation of a specific store. -//! -//! Similar to the `[global]` secition of a benchmark, the options in a `[map]` section can also -//! be overwritten via environment variables. -//! For example, if the user needs to override `x` in `[map]`, setting the environment variable -//! `map.x` will get the job done. +//! The usage of built-in stores can be found in the module-level documentations. Please note that +//! it may be necessary to enable specific features of the crate to enable a certain built-in +//! store. //! //! ## Registering New Stores //! diff --git a/src/thread.rs b/src/thread.rs index 92fa562..fe480d3 100644 --- a/src/thread.rs +++ b/src/thread.rs @@ -11,22 +11,30 @@ //! it is with the [`JoinHandle`]. Because the purpose is not general spawn-join but solely for //! benchmark code, which does not use any return values. +/// A join handle returned by a spawn function. pub trait JoinHandle { + /// Join the thread, consume the boxed self. fn join(self: Box); } +/// A thread management abstraction. pub trait Thread { + /// Spawn a new thread using a boxed closure. fn spawn(&self, f: Box) -> Box; + /// Yield the current thread. fn yield_now(&self); + /// Pin the current thread to a certain CPU core. fn pin(&self, core: usize); } +/// A zero-sized wrapper for [`std::thread`] functions. #[derive(Clone)] -pub(crate) struct DefaultThread; +pub struct DefaultThread; -pub(crate) struct DefaultJoinHandle(std::thread::JoinHandle<()>); +/// A wrapper for [`std::thread::JoinHandle`]. +pub struct DefaultJoinHandle(std::thread::JoinHandle<()>); impl JoinHandle for DefaultJoinHandle { fn join(self: Box) { diff --git a/src/workload.rs b/src/workload.rs index 333b406..7f31ecf 100644 --- a/src/workload.rs +++ b/src/workload.rs @@ -148,37 +148,66 @@ impl KeyGenerator { /// A set of workload parameters that can be deserialized from a TOML string. /// -/// This struct is used for interacting with workload configuration files and also create new -/// [`Workload`] instances. Some options are wrapped in an `Option` type to ease writing -/// configuration files. If users would like to create a [`Workload`] instance directly using these -/// options, all fields must be present. 
+/// **Note 1**: If an option is not explicitly marked as optional and it is specified by neither the file +/// nor the global option, its default value will be applied. If it has no default value, an error +/// will be raised. The precedence of a value is: file > global (after environment variable overrides) > default. +/// +/// **Note 2**: the sum of all `*_perc` options must be equal to 100. #[derive(Deserialize, Clone, Debug, PartialEq)] pub struct WorkloadOpt { - /// Percentage of `SET` operations (optional, default 0). + /// Percentage of `SET` operations. + /// + /// Must be a non-negative integer if given. + /// + /// Default: 0. pub set_perc: Option, - /// Percentage of `GET` operations (optional, default 0). + /// Percentage of `GET` operations. + /// + /// Must be a non-negative integer if given. + /// + /// Default: 0. pub get_perc: Option, - /// Percentage of `DELETE` operations (optional, default 0). + /// Percentage of `DELETE` operations. + /// + /// Must be a non-negative integer if given. + /// + /// Default: 0. pub del_perc: Option, - /// Percentage of `SCAN` operations (optional, default 0). + /// Percentage of `SCAN` operations. + /// + /// Must be a non-negative integer if given. + /// + /// Default: 0. pub scan_perc: Option, - /// The number of iterations per `SCAN` (only used when `scan_perc` is non-zero, default 10). + /// The number of iterations per `SCAN`. + /// + /// Must be a positive integer if given. + /// + /// Default: 10. pub scan_n: Option, /// Key length in bytes. + /// + /// Must be a positive integer. pub klen: Option, /// Value length in bytes. + /// + /// Must be a positive integer. pub vlen: Option, /// Minimum key. + /// + /// Must be a non-negative integer. pub kmin: Option, /// Maximum key. + /// + /// Must be greater than `kmin`. pub kmax: Option, /// Key distribution. @@ -196,12 +225,16 @@ pub struct WorkloadOpt { /// - "latest": just like Zipfian but the hotspot is the latest key written to the store. pub dist: String, - /// The theta parameter for Zipfian distribution. (Optional, default 1.0) + /// The theta parameter for Zipfian distribution. + /// + /// Default: 1.0. pub zipf_theta: Option, - /// The hotspot location for Zipfian distribution. (Optional, default 0.0) + /// The hotspot location for Zipfian distribution. /// /// 0.0 means the first key. 0.5 means approximately the middle in the key space. + /// + /// Default: 0.0. pub zipf_hotspot: Option, }
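As a concrete illustration of the options documented above, a minimal two-phase benchmark configuration might look like the sketch below. It assumes that the [`WorkloadOpt`] fields (such as `set_perc` and `klen`) are written directly inside each `[[benchmark]]` table next to the [`BenchmarkOpt`] fields, that `timeout` takes a number of seconds, and that `"zipfian"` is an accepted spelling for the Zipfian key distribution; all concrete values are illustrative. Note that the `*_perc` fields in each phase sum to 100 and `kmax` is greater than `kmin`, as required by the notes above.

```toml
[global]
# Shared by both phases below; any of these can be overridden per phase,
# or from the outside via environment variables such as `global.threads`.
threads = 4
repeat = 5
timeout = 1.0   # assumed to be seconds per repeat
report = "finish"
klen = 8
vlen = 64
kmin = 0
kmax = 1000000

# Phase 1: fill the key space with writes only.
[[benchmark]]
set_perc = 100
get_perc = 0
del_perc = 0
dist = "zipfian"   # assumed name of the Zipfian distribution
zipf_theta = 1.0

# Phase 2: read-mostly traffic, with more threads than the global default.
[[benchmark]]
threads = 8
set_perc = 5
get_perc = 95
del_perc = 0
dist = "latest"
```

Saved as, for example, `bench.toml`, such a file would be passed to the `bench` mode of the command-line interface together with a separate store configuration file containing the `[map]` table, and an option like `repeat` could then be changed per run by setting the environment variable `global.repeat` rather than editing the file.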