Who watches the watchdog by woz5999 · Pull Request #16 · logicmonitor/docker_lm_collector · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content
This repository was archived by the owner on Jan 16, 2023. It is now read-only.

Who watches the watchdog #16

Merged
merged 10 commits into from
Aug 10, 2017
Merged
68 changes: 58 additions & 10 deletions startup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

INSTALL_PATH=/usr/local/logicmonitor/agent
AGENT_BIN=$INSTALL_PATH/bin/logicmonitor-agent
PID_PATH=$INSTALL_PATH/bin/logicmonitor-agent.java.pid
AGENT_PID_PATH=$INSTALL_PATH/bin/logicmonitor-agent.java.pid
WATCHDOG_BIN=$INSTALL_PATH/bin/logicmonitor-watchdog
WATCHDOG_PID_PATH=$INSTALL_PATH/bin/logicmonitor-watchdog.java.pid
LOG_PATH=$INSTALL_PATH/logs/wrapper.log
UNCLEAN_SHUTDOWN_PATH=$INSTALL_PATH/shutdown.lck
UNCLEAN_SHUTDOWN_PATH=$INSTALL_PATH/unclean_shutdown.lck

# setup handlers
trap 'signal_handler' SIGTERM
Expand All @@ -16,8 +18,50 @@ watch_pid() {
while true
do
if ! $(ps -p $1 > /dev/null); then
# we want cleanup scripts since the collector failed unexpectedly
echo -e "Collector crashed\nExiting"
# we want to skip cleanup scripts since the collector failed unexpectedly
echo -e "Watchdog crashed\nExiting"
touch $UNCLEAN_SHUTDOWN_PATH
kill -INT $2
fi
sleep 10
done
}

watch_agent() {
# $1 = pid of startup script

while true
do
timeout 10 bash -c -- "\
while [ ! -e $AGENT_PID_PATH ]; do \
echo 'Waiting for agent to start'; \
sleep 1; \
done"

# make sure the PID file exists
if [ -e $AGENT_PID_PATH ]; then
# get the current PID of the collector agent
AGENT_PID=$(cat $AGENT_PID_PATH)
fi

# make sure we grabbed a PID
if [ -z "$AGENT_PID" ]; then
sleep 10
continue
fi

# check if the agent is running, and if not, make a note
FAIL=0
if ! $(ps -p $AGENT_PID > /dev/null); then
FAIL=$(($FAIL+1))
else
FAIL=0
fi

# if the agent has been down for 6 iterations (1m), it's time to fail
if [ "$FAIL" -ge 6 ]; then
# we want to skip cleanup scripts since the collector failed unexpectedly
echo -e "Agent crashed\nExiting"
touch $UNCLEAN_SHUTDOWN_PATH
kill -INT $2
fi
Expand All @@ -43,16 +87,20 @@ set -e
python /collector/startup.py
# ensure the collector is stopped so that we can control startup
$AGENT_BIN stop > /dev/null
$AGENT_BIN start
$WATCHDOG_BIN stop > /dev/null
$WATCHDOG_BIN start

# monitor the collector process and kill the container if it crashes
# monitor the watchdog process and kill the container if it crashes
timeout 10 bash -c -- "\
while [ ! -e $PID_PATH ]; do \
echo 'Waiting for collector to start'; \
while [ ! -e $WATCHDOG_PID_PATH ]; do \
echo 'Waiting for watchdog to start'; \
sleep 1; \
done"
echo "Collector started"
watch_pid $(cat $PID_PATH) $$ &
echo "Watchdog started"
watch_pid $(cat $WATCHDOG_PID_PATH) $$ &

# monitor the agent process and kill the container if it is down for 60s
watch_agent $$ &

while true
do
Expand Down
0