mirror of
https://github.com/clearlinux/cloud-native-setup.git
synced 2026-04-28 11:03:40 +00:00
metrics: change collectd output to host /opt/collectd/run
Currently we lose collectd data from a node when scaling ends in a system failure on the node - yet this data can be very helpful in root-causing the failure. This patch changes the collectd configuration so that the output will be continuously written to the host filesystem instead of the collectd container overlay, which would be lost unless scaling reaches a graceful exit. Signed-off-by: Antti Kervinen <antti.kervinen@intel.com>
This commit is contained in:
committed by
Graham Whaley
parent
07fd8412da
commit
696861ce66
@@ -12,15 +12,25 @@ collectd_pod="collectd"
|
||||
init_stats() {
|
||||
local wait_time=$1
|
||||
|
||||
# create collectd-config configmap
|
||||
# create collectd-config configmap, delete old if there is one
|
||||
kubectl get configmap collectd-config >/dev/null 2>&1 && kubectl delete configmap collectd-config
|
||||
kubectl create configmap collectd-config --from-file=${COLLECTD_DIR}/collectd.conf
|
||||
|
||||
# if there is collectd daemonset already running, delete it
|
||||
# to make sure that the latest configmap will be used.
|
||||
kubectl get daemonset collectd >/dev/null 2>&1 && kubectl delete daemonset --wait=true --timeout=${delete_wait_time}s "${collectd_pod}"
|
||||
|
||||
# Launch our stats gathering pod
|
||||
kubectl apply -f ${COLLECTD_DIR}/${collectd_pod}.yaml
|
||||
kubectl rollout status --timeout=${wait_time}s daemonset/${collectd_pod}
|
||||
|
||||
# clear existing collectd output
|
||||
while read -u 3 name node; do
|
||||
kubectl exec -ti $name -- sh -c "rm -rf /mnt/opt/collectd/run/localhost/*"
|
||||
done 3< <(kubectl get pods --selector name=collectd-pods -o json | jq -r '.items[] | "\(.metadata.name) \(.spec.nodeName)"')
|
||||
|
||||
# attempting to provide buffer for collectd to be installed and running,
|
||||
# and CPU collection to build adequate history
|
||||
# and CPU collection to build adequate history
|
||||
sleep 12
|
||||
}
|
||||
|
||||
@@ -30,11 +40,9 @@ cleanup_stats() {
|
||||
|
||||
# get logs before shutting down stats daemonset
|
||||
while read -u 3 name node; do
|
||||
kubectl exec -ti $name -- sh -c "cd /opt/collectd; tar -czvf localhost.tar.gz localhost"
|
||||
# make a backup on the host in-case collection fail
|
||||
kubectl exec -ti $name -- sh -c "mkdir -p /mnt/opt/collectd"
|
||||
kubectl exec -ti $name -- sh -c "cp /opt/collectd/localhost.tar.gz /mnt/opt/collectd/localhost.tar.gz"
|
||||
kubectl cp $name:/opt/collectd/localhost.tar.gz ${RESULT_DIR}/${node}.tar.gz
|
||||
kubectl exec -ti $name -- sh -c "cd /mnt/opt/collectd/run; rm -f ../localhost.tar.gz; tar -czvf ../localhost.tar.gz localhost"
|
||||
kubectl cp $name:/mnt/opt/collectd/localhost.tar.gz ${RESULT_DIR}/${node}.tar.gz
|
||||
kubectl exec -ti $name -- sh -c "rm -rf /mnt/opt/collectd/run"
|
||||
done 3< <(kubectl get pods --selector name=collectd-pods -o json | jq -r '.items[] | "\(.metadata.name) \(.spec.nodeName)"')
|
||||
|
||||
kubectl delete daemonset --wait=true --timeout=${delete_wait_time}s "${collectd_pod}" || true
|
||||
|
||||
@@ -17,7 +17,7 @@ Hostname localhost
|
||||
ValuesPercentage true
|
||||
</Plugin>
|
||||
<Plugin "csv">
|
||||
DataDir "/opt/collectd"
|
||||
DataDir "/mnt/opt/collectd/run"
|
||||
StoreRates true
|
||||
</Plugin>
|
||||
<Plugin "interface">
|
||||
|
||||
Reference in New Issue
Block a user