metrics: change collectd output to host /opt/collectd/run

Currently we lose collectd data from a node when scaling ends in a
system failure on the node - yet this data can be very helpful in root
causing the failure. This patch changes collectd configuration so that
the output will be continuously written to host filesystem instead of
the collectd container overlay that will be lost unless scaling
reaches graceful exit.

Signed-off-by: Antti Kervinen <antti.kervinen@intel.com>
This commit is contained in:
Antti Kervinen
2019-11-29 15:38:30 +02:00
committed by Graham Whaley
parent 07fd8412da
commit 696861ce66
2 changed files with 16 additions and 8 deletions

View File

@@ -12,15 +12,25 @@ collectd_pod="collectd"
# Set up the collectd stats-gathering daemonset on the cluster.
# Globals:   COLLECTD_DIR (read), collectd_pod (read), delete_wait_time (read)
# Arguments: $1 - seconds to wait for the daemonset rollout to complete
# Outputs:   kubectl status/progress messages to stdout
init_stats() {
  local wait_time=$1
  # create collectd-config configmap, delete old if there is one
  kubectl get configmap collectd-config >/dev/null 2>&1 && kubectl delete configmap collectd-config
  kubectl create configmap collectd-config --from-file="${COLLECTD_DIR}/collectd.conf"
  # if there is collectd daemonset already running, delete it
  # to make sure that the latest configmap will be used.
  kubectl get daemonset collectd >/dev/null 2>&1 && kubectl delete daemonset --wait=true --timeout="${delete_wait_time}"s "${collectd_pod}"
  # Launch our stats gathering pod
  kubectl apply -f "${COLLECTD_DIR}/${collectd_pod}.yaml"
  kubectl rollout status --timeout="${wait_time}"s "daemonset/${collectd_pod}"
  # Clear existing collectd output. The output lives on the host
  # filesystem (bind-mounted at /mnt/opt/collectd/run inside the pod)
  # so it survives pod restarts and must be wiped before a fresh run.
  while read -u 3 name node; do
    kubectl exec -ti "$name" -- sh -c "rm -rf /mnt/opt/collectd/run/localhost/*"
  done 3< <(kubectl get pods --selector name=collectd-pods -o json | jq -r '.items[] | "\(.metadata.name) \(.spec.nodeName)"')
  # attempting to provide buffer for collectd to be installed and running,
  # and CPU collection to build adequate history
  sleep 12
}
@@ -30,11 +40,9 @@ cleanup_stats() {
# get logs before shutting down stats daemonset
while read -u 3 name node; do
kubectl exec -ti $name -- sh -c "cd /opt/collectd; tar -czvf localhost.tar.gz localhost"
# make a backup on the host in-case collection fail
kubectl exec -ti $name -- sh -c "mkdir -p /mnt/opt/collectd"
kubectl exec -ti $name -- sh -c "cp /opt/collectd/localhost.tar.gz /mnt/opt/collectd/localhost.tar.gz"
kubectl cp $name:/opt/collectd/localhost.tar.gz ${RESULT_DIR}/${node}.tar.gz
kubectl exec -ti $name -- sh -c "cd /mnt/opt/collectd/run; rm -f ../localhost.tar.gz; tar -czvf ../localhost.tar.gz localhost"
kubectl cp $name:/mnt/opt/collectd/localhost.tar.gz ${RESULT_DIR}/${node}.tar.gz
kubectl exec -ti $name -- sh -c "rm -rf /mnt/opt/collectd/run"
done 3< <(kubectl get pods --selector name=collectd-pods -o json | jq -r '.items[] | "\(.metadata.name) \(.spec.nodeName)"')
kubectl delete daemonset --wait=true --timeout=${delete_wait_time}s "${collectd_pod}" || true

View File

@@ -17,7 +17,7 @@ Hostname localhost
ValuesPercentage true
</Plugin>
# Write CSV output under the host bind mount (/mnt/opt/collectd/run) instead
# of the container overlay, so data survives an ungraceful node failure.
<Plugin "csv">
  DataDir "/mnt/opt/collectd/run"
  StoreRates true
</Plugin>
<Plugin "interface">