Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(monitor): send exception when feishu alert is enabled && remove light monitoring address #373

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion configs/1.8B_MoE16_sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,6 @@
alert=dict(
enable_feishu_alert=DO_ALERT,
feishu_alert_address=None, # feishu webhook to send alert message
light_monitor_address=None, # light_monitor address to send heartbeat
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
),
tensorboard=dict(
Expand Down
3 changes: 1 addition & 2 deletions configs/57B_qwen2_MoE.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,10 +220,9 @@
alert=dict(
enable_feishu_alert=DO_ALERT,
feishu_alert_address=None, # feishu webhook to send alert message
light_monitor_address=None, # light_monitor address to send heartbeat
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
),
tensorboard=dict(
queue_max_length=10,
),
)
)
1 change: 0 additions & 1 deletion configs/7B_MoE4_sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,6 @@
alert=dict(
enable_feishu_alert=DO_ALERT,
feishu_alert_address=None, # feishu webhook to send alert message
light_monitor_address=None, # light_monitor address to send heartbeat
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
),
tensorboard=dict(
Expand Down
3 changes: 1 addition & 2 deletions configs/7B_baichuan2.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
CHECKPOINT_EVERY = 50
ckpt = dict(
enable_save_ckpt=False, # enable ckpt save.
enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format.
enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format.
save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
# 'load_ckpt_info' setting guide:
# 1. the 'path' indicates the ckpt path,
Expand Down Expand Up @@ -196,7 +196,6 @@
alert=dict(
enable_feishu_alert=DO_ALERT,
feishu_alert_address=None, # feishu webhook to send alert message
light_monitor_address=None, # light_monitor address to send heartbeat
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
),
tensorboard=dict(
Expand Down
3 changes: 1 addition & 2 deletions configs/7B_gemma.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
CHECKPOINT_EVERY = 50
ckpt = dict(
enable_save_ckpt=False, # enable ckpt save.
enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format.
enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format.
save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
# 'load_ckpt_info' setting guide:
# 1. the 'path' indicates the ckpt path,
Expand Down Expand Up @@ -203,7 +203,6 @@
alert=dict(
enable_feishu_alert=DO_ALERT,
feishu_alert_address=None, # feishu webhook to send alert message
light_monitor_address=None, # light_monitor address to send heartbeat
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
),
tensorboard=dict(
Expand Down
1 change: 0 additions & 1 deletion configs/7B_internlm2.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,6 @@
alert=dict(
enable_feishu_alert=DO_ALERT,
feishu_alert_address=None, # feishu webhook to send alert message
light_monitor_address=None, # light_monitor address to send heartbeat
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
),
tensorboard=dict(
Expand Down
9 changes: 4 additions & 5 deletions configs/7B_isp_sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
# 'load_ckpt_info' setting guide:
# 1. the 'path' indicates the ckpt path,
# 2. the 'content' specifies which states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
# 3. the 'ckpt_type' specifies the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined
# 3. the 'ckpt_type' specifies the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined
# load function such as "llama"
load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internevo"),
# 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
Expand Down Expand Up @@ -188,17 +188,17 @@
2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
sequence_2D (dict):
1. enable: bool, whether enable the 2D sequence parallel or not.
2. head_size: int, the parallel degree of head parallelism (DeepSpeed Ulysses).
2. head_size: int, the parallel degree of head parallelism (DeepSpeed Ulysses).
head_size * context_size should be equal to the tensor size.
3. context_size: int, the parallel degree of context parallelism.
head_size * context_size should be equal to the tensor size.
4. window_size: int, the sliding window size in context parallelism.
5. device_placement_strategy: dict,
head_first: bool, if `True`, ranks of the same head parallel group are
head_first: bool, if `True`, ranks of the same head parallel group are
given high priority for colocation on the same node;
if `False`, ranks of the same context parallel group are
given high priority for colocation on the same node;
interleaved: bool, if `head_first` is `False` and `window_size` > 1, this config could
interleaved: bool, if `head_first` is `False` and `window_size` > 1, this config could
interleave the ranks in the same window to make full use of NIC as much as possible.
"""
parallel = dict(
Expand All @@ -223,7 +223,6 @@
alert=dict(
enable_feishu_alert=DO_ALERT,
feishu_alert_address=None, # feishu webhook to send alert message
light_monitor_address=None, # light_monitor address to send heartbeat
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
),
tensorboard=dict(
Expand Down
1 change: 0 additions & 1 deletion configs/7B_llama2.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,6 @@
alert=dict(
enable_feishu_alert=DO_ALERT,
feishu_alert_address=None, # feishu webhook to send alert message
light_monitor_address=None, # light_monitor address to send heartbeat
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
),
tensorboard=dict(
Expand Down
3 changes: 1 addition & 2 deletions configs/7B_qwen2.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
CHECKPOINT_EVERY = 50
ckpt = dict(
enable_save_ckpt=False, # enable ckpt save.
enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format.
enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format.
save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
# 'load_ckpt_info' setting guide:
# 1. the 'path' indicates the ckpt path,
Expand Down Expand Up @@ -203,7 +203,6 @@
alert=dict(
enable_feishu_alert=DO_ALERT,
feishu_alert_address=None, # feishu webhook to send alert message
light_monitor_address=None, # light_monitor address to send heartbeat
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
),
tensorboard=dict(
Expand Down
1 change: 0 additions & 1 deletion configs/7B_sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,6 @@
alert=dict(
enable_feishu_alert=DO_ALERT,
feishu_alert_address=None, # feishu webhook to send alert message
light_monitor_address=None, # light_monitor address to send heartbeat
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
),
tensorboard=dict(
Expand Down
1 change: 0 additions & 1 deletion configs/8x22B_mixtral.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,6 @@
alert=dict(
enable_feishu_alert=DO_ALERT,
feishu_alert_address=None, # feishu webhook to send alert message
light_monitor_address=None, # light_monitor address to send heartbeat
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
),
tensorboard=dict(
Expand Down
1 change: 0 additions & 1 deletion configs/8x7B_mixtral.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,6 @@
alert=dict(
enable_feishu_alert=DO_ALERT,
feishu_alert_address=None, # feishu webhook to send alert message
light_monitor_address=None, # light_monitor address to send heartbeat
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
),
tensorboard=dict(
Expand Down
3 changes: 1 addition & 2 deletions configs/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
# 'load_ckpt_info' setting guide:
# 1. the 'path' indicates the ckpt path,
# 2. the 'content' specifies which states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
# 3. the 'ckpt_type' specifies the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined
# 3. the 'ckpt_type' specifies the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined
# load function such as "llama"
load_ckpt_info=dict(path=LOAD_CKPT_FOLDER, content=("model",), ckpt_type="internevo"),
# 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
Expand Down Expand Up @@ -141,7 +141,6 @@
alert=dict(
enable_feishu_alert=DO_ALERT,
feishu_alert_address=None, # feishu webhook to send alert message
light_monitor_address=None, # light_monitor address to send heartbeat
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
),
tensorboard=dict(
Expand Down
1 change: 0 additions & 1 deletion configs/demo_llava.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,6 @@
alert=dict(
enable_feishu_alert=DO_ALERT,
feishu_alert_address=None, # feishu webhook to send alert message
light_monitor_address=None, # light_monitor address to send heartbeat
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
),
tensorboard=dict(
Expand Down
94 changes: 9 additions & 85 deletions doc/code-docs/locales/en/LC_MESSAGES/monitor.po
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2024-08-30 16:07+0800\n"
"POT-Creation-Date: 2024-11-20 15:01+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
Expand All @@ -16,7 +16,7 @@ msgstr ""
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.15.0\n"
"Generated-By: Babel 2.14.0\n"

#: ../../source/monitor.rst:2
msgid "监控和告警"
Expand Down Expand Up @@ -56,25 +56,12 @@ msgstr ""
"``internlm.monitor.alert.send_feishu_msg_with_webhook()``."

#: ../../source/monitor.rst:25
msgid "轻量监控"
msgstr "Light Monitoring"
msgid "监控告警配置"
msgstr "Monitor Config"

#: ../../source/monitor.rst:27
#: ../../source/monitor.rst:28
msgid ""
"InternEvo轻量级监控工具采用心跳机制实时监测训练过程中的各项指标,如loss、grad_norm、训练阶段的耗时等。同时,InternEvo还可以通过"
" `grafana dashboard <https://grafana.com/grafana/dashboards/>`_ "
"直观地呈现这些指标信息,以便用户进行更加全面和深入的训练分析。"
msgstr ""
"The InternEvo light monitoring tool employs a heartbeat mechanism to "
"real-time monitor various metrics during the training process, such as "
"loss, grad_norm, and training phase duration. Additionally, InternEvo can"
" present these metric details through a `grafana dashboard "
"<https://grafana.com/grafana/dashboards/>`_, allowing users to conduct "
"more comprehensive and in-depth training analysis in an intuitive manner."

#: ../../source/monitor.rst:29
msgid ""
"轻量监控的配置由配置文件中的 ``monitor`` 字段指定, 用户可以通过修改配置文件 `config file "
"配置由配置文件中的 ``monitor`` 字段指定, 用户可以通过修改配置文件 `config file "
"<https://github.com/InternLM/InternEvo/blob/develop/configs/7B_sft.py>`_ "
"来更改监控配置。以下是一个监控配置的示例:"
msgstr ""
Expand All @@ -84,23 +71,17 @@ msgstr ""
"<https://github.com/InternLM/InternEvo/blob/develop/configs/7B_sft.py>`_."
" Here is an example of a monitoring configuration:"

#: ../../source/monitor.rst:42
#: ../../source/monitor.rst:40
msgid "enable_feishu_alert (bool):是否启用飞书告警。默认值:False。"
msgstr "enable_feishu_alert: Whether to enable Feishu alerts. Defaults: False."

#: ../../source/monitor.rst:43
#: ../../source/monitor.rst:41
msgid "feishu_alert_address (str):飞书告警的 Webhook 地址。默认值:None。"
msgstr ""
"feishu_alert_address: The webhook address for Feishu alerts. Defaults: "
"None."

#: ../../source/monitor.rst:44
msgid "light_monitor_address (str):轻量监控的地址。默认值:None。"
msgstr ""
"light_monitor_address: The address for lightweight monitoring. Defaults: "
"None."

#: ../../source/monitor.rst:45
#: ../../source/monitor.rst:42
msgid "alert_file_path (str):告警存储路径。默认值:None。"
msgstr "alert_file_path: The path where alerts are stored. Defaults: None."

Expand Down Expand Up @@ -213,60 +194,3 @@ msgstr "alert_file_path: path of alert. Defaults: None."

#~ msgid "示例"
#~ msgstr "Example"

#~ msgid ""
#~ "Initialize the monitoring module with "
#~ "the default address ``initialize_light_monitor()``"
#~ msgstr ""

#~ msgid "Send a heartbeat message to a monitoring server."
#~ msgstr ""

#~ msgid ""
#~ "The type of heartbeat message, e.g., "
#~ "\"train_metrics\", \"init_time\", \"stage_time\"."
#~ msgstr ""

#~ msgid "A dictionary containing message data to be included in the heartbeat."
#~ msgstr ""

#~ msgid ""
#~ "Sending a heartbeat message for training"
#~ " metrics ``send_heartbeat(\"train_metrics\", {\"loss\":"
#~ " 0.1, \"accuracy\": 0.95})``"
#~ msgstr ""

#~ msgid ""
#~ "Sending a heartbeat message for "
#~ "initialization time ``send_heartbeat(\"init_time\", "
#~ "{\"import_time\": 0.25})``"
#~ msgstr ""

#~ msgid ""
#~ "Sending a heartbeat message for stage"
#~ " time ``send_heartbeat(\"stage_time\", {\"fwd_time\":"
#~ " 2.3, \"bwd_time\": 6.2})``"
#~ msgstr ""

#~ msgid ""
#~ "InternEvo 使用 "
#~ "``internlm.monitor.alert.initialize_light_monitor`` "
#~ "来初始化轻量监控客户端。一旦初始化完成,它会建立与监控服务器的连接。在训练过程中,使用 "
#~ "``internlm.monitor.alert.send_heartbeat`` "
#~ "来发送不同类型的心跳信息至监控服务器。监控服务器会根据这些心跳信息来检测训练是否出现异常,并在需要时发送警报消息。"
#~ msgstr ""
#~ "InternEvo uses "
#~ "``internlm.monitor.alert.initialize_light_monitor`` to "
#~ "initialize the lightweight monitoring client."
#~ " Once initialization is complete, it "
#~ "establishes a connection with the "
#~ "monitoring server. During the training "
#~ "process, it uses "
#~ "``internlm.monitor.alert.send_heartbeat`` to send "
#~ "various types of heartbeat messages to"
#~ " the monitoring server. The monitoring "
#~ "server uses these heartbeat messages to"
#~ " detect if the training encounters "
#~ "any abnormalities and sends alert "
#~ "messages as needed."

Loading
Loading