diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d379a6b..129fc94 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -18,7 +18,7 @@ repos:
   - id: fix-encoding-pragma
     args: [--remove]
 - repo: https://github.com/pycqa/flake8
-  rev: 4.0.1
+  rev: 7.0.0
   hooks:
   - id: flake8
     args: ["--config=.flake8"]
diff --git a/llm_analysis/__init__.py b/llm_analysis/__init__.py
index e3f684f..bcff761 100644
--- a/llm_analysis/__init__.py
+++ b/llm_analysis/__init__.py
@@ -10,4 +10,4 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License.
\ No newline at end of file
+# limitations under the License.
diff --git a/llm_analysis/analysis.py b/llm_analysis/analysis.py
index 5079e58..0b19ea3 100644
--- a/llm_analysis/analysis.py
+++ b/llm_analysis/analysis.py
@@ -756,7 +756,7 @@ def get_activation_memory_per_layernorm(
                 self.parallelism_config.sp_size) * dtype_bytes
 
     def get_activation_memory_input_embedding(self, batch_size: int,
-                                             seq_len: int) -> float:
+                                              seq_len: int) -> float:
         """Get the memory (in bytes) required to store the activations of
         the input embedding"""
         return self.model_config.hidden_dim * batch_size * seq_len * self.dtype_config.activation_bits / BITS_PER_BYTE / self.parallelism_config.tp_size
@@ -1182,7 +1182,8 @@ def get_latency_fwd_per_layernorm(
             float: the latency in seconds for the forward pass of a single layernorm in a transformer layer
         """
         input_numel = seq_len * batch_size * self.model_config.hidden_dim
-        compute_latency = input_numel * 5 / (self.get_TFLOPS_per_gpu() * 10**12)
+        compute_latency = input_numel * 5 / (self.get_TFLOPS_per_gpu() *
+                                             10**12)
         activation_memory = self.get_activation_memory_per_layernorm(
             batch_size,
             seq_len,
@@ -2069,8 +2070,9 @@ def training(
             3 + int(fwd_prefetch) +
             int(bwd_prefetch)) * (unsharded_weight_memory_per_layer)
 
-        estimated_prefetch_memory_per_gpu = max(estimated_fwd_prefetch_memory_per_gpu,
-                                                estimated_bwd_prefetch_memory_per_gpu)
+        estimated_prefetch_memory_per_gpu = max(
+            estimated_fwd_prefetch_memory_per_gpu,
+            estimated_bwd_prefetch_memory_per_gpu)
 
         memory_left = (self.gpu_config.mem_per_GPU_in_GB * 1024**3 -
                        weight_memory_per_gpu - optimizer_state_memory_per_gpu)
@@ -2079,22 +2081,20 @@ def training(
            f"weight_memory_per_gpu: {_num_to_string(weight_memory_per_gpu)}B (embedding_memory: {_num_to_string(weight_memory_embedding_per_gpu)}B), optimizer_state_memory_per_gpu: {_num_to_string(optimizer_state_memory_per_gpu)}B, gradient_memory_per_gpu: {_num_to_string(gradient_memory_per_gpu)}B, estimated_fwd_prefetch_memory_per_gpu: {_num_to_string(estimated_fwd_prefetch_memory_per_gpu)}B, estimated_bwd_prefetch_memory_per_gpu: {_num_to_string(estimated_bwd_prefetch_memory_per_gpu)}B"
         )
-
         if memory_left < 0:
             logger.warning(
                 "model weight/optimizer state memory usage is too large to fit in GPU memory"
             )
 
-        if memory_left - max(estimated_prefetch_memory_per_gpu, gradient_memory_per_gpu) < 0:
+        if memory_left - max(estimated_prefetch_memory_per_gpu,
+                             gradient_memory_per_gpu) < 0:
             logger.warning(
                 "model gradient or bwd prefetch memory usage is too large to fit in GPU memory"
             )
 
         loss_bwd_memory_batch_size_1 = self.get_loss_bwd_memory(1, seq_len)
         if memory_left - loss_bwd_memory_batch_size_1 < 0:
-            logger.warning(
-                "loss_bwd_memory is too large to fit in GPU memory"
-            )
+            logger.warning("loss_bwd_memory is too large to fit in GPU memory")
memory") # With pipeline parallelism, each stage contains L/p layers so the first stage must store p ×L/p = L layers worth of activations regardless of the pipeline parallel size p; activation memory required for the input embeddings, the last layer-norm, and the output layer are ignored here. Refer to https://arxiv.org/abs/2205.05198 for more details. @@ -2121,7 +2121,8 @@ def training( layernorm_activation_memory_per_layer_batch_size_1) ] - activation_memory_input_embedding_batch_size_1 = self.get_activation_memory_input_embedding(1, seq_len) + activation_memory_input_embedding_batch_size_1 = self.get_activation_memory_input_embedding( + 1, seq_len) activation_memory_batch_size_1 += activation_memory_input_embedding_batch_size_1 activation_memory_output_embedding_batch_size_1 = self.get_activation_memory_output_embedding( 1, seq_len) @@ -2132,7 +2133,9 @@ def training( layernorm_dtype_bytes, ) - if memory_left - max(estimated_prefetch_memory_per_gpu, loss_bwd_memory_batch_size_1) < activation_memory_batch_size_1: + if memory_left - max( + estimated_prefetch_memory_per_gpu, + loss_bwd_memory_batch_size_1) < activation_memory_batch_size_1: logger.warning( f"memory_left {_num_to_string(memory_left)} < activation_memory_batch_size_1 {_num_to_string(activation_memory_batch_size_1)}" ) @@ -2143,7 +2146,10 @@ def training( max_batch_size_per_gpu = int(memory_left // activation_memory_batch_size_1) - while memory_left < max(estimated_prefetch_memory_per_gpu, self.get_loss_bwd_memory(max_batch_size_per_gpu, seq_len)) + activation_memory_batch_size_1 * max_batch_size_per_gpu: + while memory_left < max( + estimated_prefetch_memory_per_gpu, + self.get_loss_bwd_memory(max_batch_size_per_gpu, seq_len) + ) + activation_memory_batch_size_1 * max_batch_size_per_gpu: max_batch_size_per_gpu -= 1 logger.info( @@ -2201,7 +2207,8 @@ def training( loss_bwd_memory = self.get_loss_bwd_memory(batch_size_per_gpu, seq_len) - if memory_left < activation_memory_per_gpu + max(estimated_prefetch_memory_per_gpu, loss_bwd_memory): + if memory_left < activation_memory_per_gpu + max( + estimated_prefetch_memory_per_gpu, loss_bwd_memory): logger.warning( "activation_memory_per_gpu memory or loss_bwd_memory is too large with batch_size_per_gpu =" f" {batch_size_per_gpu} to fit in GPU memory (requiring" @@ -2210,7 +2217,8 @@ def training( f" {_num_to_string(memory_left)}B, max_batch_size_per_gpu =" f" {max_batch_size_per_gpu})") - memory_left = memory_left - activation_memory_per_gpu - max(estimated_prefetch_memory_per_gpu, loss_bwd_memory) + memory_left = memory_left - activation_memory_per_gpu - max( + estimated_prefetch_memory_per_gpu, loss_bwd_memory) num_flops_fwd_total = self.get_num_flops_fwd_total( batch_size_per_gpu, seq_len) @@ -2271,11 +2279,16 @@ def training( num_layers_per_gpu = int(self.model_config.num_layers / self.parallelism_config.pp_size) if activation_recomputation == ActivationRecomputation.FULL: - latency_recompute = num_layers_per_gpu * (latency_fwd_per_layer_attn_compute + latency_fwd_per_layer_mlp_compute + 2 * latency_fwd_per_layernorm_compute) + latency_recompute = num_layers_per_gpu * ( + latency_fwd_per_layer_attn_compute + + latency_fwd_per_layer_mlp_compute + + 2 * latency_fwd_per_layernorm_compute) elif activation_recomputation == ActivationRecomputation.NORM_ATTN_NORM: - latency_recompute = num_layers_per_gpu * (latency_fwd_per_layer_attn_compute + 2 * latency_fwd_per_layernorm_compute) + latency_recompute = num_layers_per_gpu * ( + latency_fwd_per_layer_attn_compute + + 2 * 
         elif activation_recomputation == ActivationRecomputation.ATTN:
-            latency_recompute = num_layers_per_gpu * latency_fwd_per_layer_attn_compute
+            latency_recompute = num_layers_per_gpu * latency_fwd_per_layer_attn_compute
         elif activation_recomputation == ActivationRecomputation.ATTN_COMPUTE:
             latency_recompute = num_layers_per_gpu * self.get_num_flops_total_attn_compute(
                 batch_size_per_gpu, seq_len) / (
@@ -2448,14 +2461,17 @@ def training(
             "(weight+op_state+grad)_memory_per_gpu":
             self.weight_grad_op_state_memory_per_gpu,
             "estimated_peak_memory_per_gpu":
-            optimizer_state_memory_per_gpu + weight_memory_per_gpu + max(activation_memory_per_gpu, gradient_memory_per_gpu) + max(estimated_bwd_prefetch_memory_per_gpu, loss_bwd_memory),
+            optimizer_state_memory_per_gpu + weight_memory_per_gpu +
+            max(activation_memory_per_gpu, gradient_memory_per_gpu) +
+            max(estimated_bwd_prefetch_memory_per_gpu, loss_bwd_memory),
             "latency_per_micro_batch":
             latency_per_micro_batch,
             "latency_fwd":
             latency_fwd,
         }
         summary_dict.update(latency_fwd_breakdown)
-        device_tokens_per_sec = round(seq_len * batch_size_per_gpu / latency_per_iter, 2)
+        device_tokens_per_sec = round(
+            seq_len * batch_size_per_gpu / latency_per_iter, 2)
         summary_dict.update({
             "latency_per_iter": latency_per_iter,
             "device_tokens_per_sec": device_tokens_per_sec,
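
For reviewers: the "estimated_peak_memory_per_gpu" entry reformatted in the last hunk is the one-line summary of the patch's memory model. It keeps optimizer states and weights always resident, then adds the larger of activation and gradient memory (the two peak in different phases of a step, so only the maximum counts) and the larger of backward-prefetch and loss-backward scratch memory. Below is a minimal standalone Python sketch of that formula; the function and argument names are illustrative stand-ins, not llm_analysis API.

# Standalone sketch (hypothetical names) of the peak-memory estimate
# assembled into summary_dict in training() above.
def estimate_peak_memory_per_gpu(optimizer_state_memory: float,
                                 weight_memory: float,
                                 activation_memory: float,
                                 gradient_memory: float,
                                 bwd_prefetch_memory: float,
                                 loss_bwd_memory: float) -> float:
    # Optimizer states and weights are always resident; of the remaining
    # terms, only the phase-wise maxima contribute to the peak.
    return (optimizer_state_memory + weight_memory +
            max(activation_memory, gradient_memory) +
            max(bwd_prefetch_memory, loss_bwd_memory))

# Round-number example (bytes): 20 GB optimizer states + 10 GB weights +
# max(30 GB activations, 10 GB gradients) + max(2 GB prefetch, 1 GB loss
# bwd) = 62 GB estimated peak.
print(estimate_peak_memory_per_gpu(20e9, 10e9, 30e9, 10e9, 2e9, 1e9))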