Skip to content

Commit

Permalink
[INLONG-10889][Agent] When the oom is detected, the process exits (#1…
Browse files Browse the repository at this point in the history
…0891)

* [INLONG-10889][Agent] When the oom is detected, the process exits

* Update inlong-agent/agent-common/src/main/java/org/apache/inlong/agent/utils/file/FileUtils.java

Co-authored-by: AloysZhang <[email protected]>

* Update inlong-agent/agent-common/src/main/java/org/apache/inlong/agent/utils/file/FileUtils.java

Co-authored-by: AloysZhang <[email protected]>

* [INLONG-10889][Agent] When the oom is detected, the process exits

* [INLONG-10889][Agent] When the oom is detected, the process exits

* [INLONG-10889][Agent] When the oom is detected, the process exits

* [INLONG-10889][Agent] When the oom is detected, the process exits

* [INLONG-10889][Agent] When the oom is detected, the process exits

* [INLONG-10889][Agent] When the oom is detected, the process exits

* [INLONG-10889][Agent] When the oom is detected, the process exits

---------

Co-authored-by: AloysZhang <[email protected]>
  • Loading branch information
justinwwhuang and aloyszhang authored Aug 28, 2024
1 parent 04971c3 commit eca3e64
Show file tree
Hide file tree
Showing 10 changed files with 92 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,15 @@ public static void threadThrowableHandler(Thread t, Throwable e) {
}

private static void handleOOM(Thread t, Throwable e) {
if (ExceptionUtils.indexOfThrowable(e, java.lang.OutOfMemoryError.class) != -1) {
if (ExceptionUtils.indexOfThrowable(e, OutOfMemoryError.class) != -1) {
LOGGER.error("Agent exit caused by {} OutOfMemory: ", t.getName(), e);
forceShutDown();
}
}

private static void forceShutDown() {
try {
Runtime.getRuntime().exit(-1);
Runtime.getRuntime().halt(-1);
} catch (Throwable e) {
LOGGER.error("exit failed, just halt, exception: ", e);
Runtime.getRuntime().halt(-2);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public static long getFileCreationTime(String fileName) {
creationTime = Files.readAttributes(Paths.get(fileName), BasicFileAttributes.class).creationTime()
.toMillis();
} catch (IOException e) {
LOGGER.error("getFileCreationTime error {}", e.getMessage());
LOGGER.error("getFileCreationTime error.", e);
}
return creationTime;
}
Expand All @@ -68,7 +68,7 @@ public static long getFileLastModifyTime(String fileName) {
try {
lastModify = Files.getLastModifiedTime(Paths.get(fileName)).toMillis();
} catch (IOException e) {
LOGGER.error("getFileLastModifyTime error {}", e.getMessage());
LOGGER.error("getFileLastModifyTime error.", e);
}
return lastModify;
}
Expand Down
10 changes: 5 additions & 5 deletions inlong-agent/agent-installer/bin/installer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ CONSOLE_OUTPUT_FILE="${LOG_DIR}/agent-out.log"

function help() {
echo "Usage: agent.sh {status|start|stop|restart|clean}" >&2
echo " status: the status of inlong agent"
echo " start: start the inlong agent"
echo " stop: stop the inlong agent"
echo " restart: restart the inlong agent"
echo " status: the status of agent installer"
echo " start: start the agent installer"
echo " stop: stop the agent installer"
echo " restart: restart the agent installer"
echo " clean: unregister this node in manager"
echo " help: get help from inlong agent"
echo " help: get help from agent installer"
}

function running() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ public void run() {
doRun();
} catch (Throwable e) {
LOGGER.error("do run error: ", e);
ThreadUtils.threadThrowableHandler(Thread.currentThread(), e);
}
running = false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ private void sendBatchWithRetryCount(SenderMessage message, int retry) {
}
retry++;
AgentUtils.silenceSleepInMs(retrySleepTime);
ThreadUtils.threadThrowableHandler(Thread.currentThread(), exception);
}
}
}
Expand Down Expand Up @@ -299,10 +300,9 @@ private Runnable flushResendQueue() {
message.getTotalSize(), auditVersion);
sendBatchWithRetryCount(callback.message, callback.retry + 1);
}
} catch (Exception ex) {
LOGGER.error("error caught", ex);
} catch (Throwable t) {
ThreadUtils.threadThrowableHandler(Thread.currentThread(), t);
} catch (Exception e) {
LOGGER.error("error caught", e);
ThreadUtils.threadThrowableHandler(Thread.currentThread(), e);
} finally {
AgentUtils.silenceSleepInMs(batchFlushInterval);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,9 @@ protected void releaseSource() {
data.setReadBytes(String.valueOf(bytePosition));
data.setReadLines(String.valueOf(linePosition));
OffsetProfile offsetProfile = OffsetManager.getInstance().getOffset(taskId, instanceId);
if (offsetProfile == null) {
return;
}
data.setSendLines(offsetProfile.getOffset());
FileStaticManager.getInstance().putStaticMsg(data);
randomAccessFile.close();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.apache.inlong.agent.plugin.file.Source;
import org.apache.inlong.agent.plugin.sources.file.extend.ExtendedHandler;
import org.apache.inlong.agent.utils.AgentUtils;
import org.apache.inlong.agent.utils.ThreadUtils;
import org.apache.inlong.common.metric.MetricRegister;

import lombok.AllArgsConstructor;
Expand Down Expand Up @@ -153,6 +154,7 @@ private Runnable run() {
doRun();
} catch (Throwable e) {
LOGGER.error("do run error maybe file deleted: ", e);
ThreadUtils.threadThrowableHandler(Thread.currentThread(), e);
}
running = false;
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.apache.inlong.agent.state.State;
import org.apache.inlong.agent.store.Store;
import org.apache.inlong.agent.utils.AgentUtils;
import org.apache.inlong.agent.utils.ThreadUtils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -111,6 +112,7 @@ public void run() {
doRun();
} catch (Throwable e) {
LOGGER.error("do run error: ", e);
ThreadUtils.threadThrowableHandler(Thread.currentThread(), e);
}
running = false;
}
Expand Down
13 changes: 8 additions & 5 deletions inlong-agent/bin/agent-env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ export OTEL_LOGS_EXPORTER=otlp
export ENABLE_OBSERVABILITY=false
# OTEL_EXPORTER_OTLP_ENDPOINT must be configured as a URL when ENABLE_OBSERVABILITY=true.
export OTEL_EXPORTER_OTLP_ENDPOINT=
export TDW_SECURITY_URL_NULL

#project directory
BASE_DIR=$(cd "$(dirname "$0")"/../;pwd)
Expand All @@ -45,17 +46,19 @@ else
fi

if [ -z "$AGENT_JVM_HEAP_OPTS" ]; then
HEAP_OPTS="-Xmx512m -Xss512k"
HEAP_OPTS=" -Xmx2048m -Xms512m -Xss512k "
else
HEAP_OPTS="$AGENT_JVM_HEAP_OPTS"
fi
GC_OPTS="-XX:+UseG1GC -XX:MaxGCPauseMillis=200 -XX:InitiatingHeapOccupancyPercent=60 -Djava.net.preferIPv4Stack=true -Dfile.encoding=UTF-8"
LOG_OPTS="-Xloggc:$BASE_DIR/logs/gc.log -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=20M"

GVM_OPTS=" -Djava.net.preferIPv4Stack=true -Dfile.encoding=UTF-8 "
OOM_HANDLER=" -XX:OnOutOfMemoryError=$BASE_DIR/bin/oom.sh"
GC_OPTS=" -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -XX:+TraceClassLoading -XX:InitiatingHeapOccupancyPercent=45 -XX:G1HeapRegionSize=16m -XX:G1MixedGCCountTarget=16 -XX:G1HeapWastePercent=10 -XX:+PrintGCDetails -XX:+PrintGCDateStamps"
LOG_OPTS=" -Xloggc:$BASE_DIR/logs/gc.log -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=20M"
if [ -n "$NEED_TRACK_NATIVE_MEMORY" ] && [ "$NEED_TRACK_NATIVE_MEMORY" = "true" ]; then
GC_OPTS="$GC_OPTS -XX:NativeMemoryTracking"
fi
AGENT_JVM_ARGS="$HEAP_OPTS $GC_OPTS $LOG_OPTS"

AGENT_JVM_ARGS="$HEAP_OPTS $GVM_OPTS $GC_OPTS $LOG_OPTS $OOM_HANDLER"
# Add Agent Rmi Args when necessary
AGENT_RMI_ARGS="-Dcom.sun.management.jmxremote \
-Dcom.sun.management.jmxremote.port=18080 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false"
Expand Down
63 changes: 63 additions & 0 deletions inlong-agent/bin/oom.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Resolve the install root relative to this script. "$0" is quoted so the
# path survives an install directory that contains whitespace.
BASE_DIR="$(dirname "$0")/.."

# Agent configuration file; running() reads the agent.uniq.id entry from it.
AGENT_CONF="${BASE_DIR}/conf/agent.properties"
# Load the shared environment settings.
# NOTE(review): LOG_DIR is presumably defined by agent-env.sh -- confirm.
source "${BASE_DIR}/bin/agent-env.sh"
# Log file that stop_agent appends its progress messages to.
CONSOLE_OUTPUT_FILE="${LOG_DIR}/oom.log"

# Reports whether the agent java process is currently listed by ps.
#
# Reads the agent.uniq.id entry from ${AGENT_CONF}, falling back to
# "agent.uniq.id=1" when no entry is found, and stores it in the global
# check_agent_uniq. The variable is intentionally global: stop_agent reuses
# it to look up the pid(s) of the agent.
#
# Returns:
#   0 when at least one matching process is found, 1 otherwise.
function running() {
  local agent_uniq process
  # Ignore indented lines and comment lines. The config path is quoted so an
  # install directory containing whitespace does not break the lookup.
  agent_uniq=$(grep -Ev '^[[:space:]].*|^#' "${AGENT_CONF}" | grep -E 'agent.uniq.id')
  check_agent_uniq="${agent_uniq:-"agent.uniq.id=1"}"
  # Second column of `ps -aux` output is the pid.
  process=$(ps -aux | grep 'java' | grep 'inlong-agent' | grep "$check_agent_uniq" | awk '{print $2}')
  if [ -z "${process}" ]; then
    return 1
  fi
  return 0
}

# Stops the agent process and records every step in ${CONSOLE_OUTPUT_FILE}.
#
# Exits the script with status 1 when no agent process is running. Otherwise
# it signals the agent repeatedly, waiting 6 seconds between attempts, until
# running() reports that the process is gone. The first 10 attempts use a
# plain `kill`; later attempts escalate to `kill -9`.
#
# Relies on running() having set the global check_agent_uniq.
function stop_agent() {
  local now pids count=0
  now=$(date "+%Y-%m-%d %H:%M:%S")
  if ! running; then
    echo "$now oom agent is not running." >> "$CONSOLE_OUTPUT_FILE"
    exit 1
  fi
  while running; do
    count=$((count + 1))
    now=$(date "+%Y-%m-%d %H:%M:%S")
    # Join all matching pids with spaces. ps may list more than one process,
    # and a newline-separated list passed as a single argument makes kill
    # fail, which would keep this loop running forever.
    pids=$(ps -aux | grep 'java' | grep 'inlong-agent' | grep "$check_agent_uniq" \
      | awk '{printf "%s%s", sep, $2; sep=" "}')
    echo "$now oom stopping agent($pids) $count times" >> "$CONSOLE_OUTPUT_FILE"
    # The process can disappear between the running() check and the ps call
    # above; only signal when there is something to signal.
    if [ -n "$pids" ]; then
      if [ "${count}" -gt 10 ]; then
        echo "$now oom kill -9 $pids" >> "$CONSOLE_OUTPUT_FILE"
        # Deliberately unquoted: every pid must be its own argument.
        kill -9 $pids
      else
        echo "$now oom kill $pids" >> "$CONSOLE_OUTPUT_FILE"
        # Deliberately unquoted: every pid must be its own argument.
        kill $pids
      fi
    fi
    sleep 6
  done
  now=$(date "+%Y-%m-%d %H:%M:%S")
  echo "$now oom stop agent($pids) successfully." >> "$CONSOLE_OUTPUT_FILE"
}

# Script entry point: stop the agent process as soon as this script is run.
stop_agent;

0 comments on commit eca3e64

Please sign in to comment.