metrics: change collectd output to host /opt/collectd/run

Currently we lose collectd data from a node when scaling ends in a
system failure on the node - yet this data can be very helpful in root
causing the failure. This patch changes collectd configuration so that
the output will be continuously written to host filesystem instead of
the collectd container overlay that will be lost unless scaling
reaches graceful exit.

Signed-off-by: Antti Kervinen <antti.kervinen@intel.com>
This commit is contained in:
Antti Kervinen
2019-11-29 15:38:30 +02:00
committed by Graham Whaley
parent 07fd8412da
commit 696861ce66
2 changed files with 16 additions and 8 deletions

View File

@@ -12,15 +12,25 @@ collectd_pod="collectd"
# Set up the collectd stats-gathering daemonset on the cluster.
# Globals:   COLLECTD_DIR (read), collectd_pod (read), delete_wait_time (read)
# Arguments: $1 - seconds to wait for the daemonset rollout to complete
# Outputs:   kubectl status/progress messages to stdout
init_stats() {
  local wait_time=$1
  # create collectd-config configmap, delete old if there is one
  kubectl get configmap collectd-config >/dev/null 2>&1 && kubectl delete configmap collectd-config
  kubectl create configmap collectd-config --from-file="${COLLECTD_DIR}/collectd.conf"
  # if there is collectd daemonset already running, delete it
  # to make sure that the latest configmap will be used.
  kubectl get daemonset collectd >/dev/null 2>&1 && kubectl delete daemonset --wait=true --timeout="${delete_wait_time}"s "${collectd_pod}"
  # Launch our stats gathering pod
  kubectl apply -f "${COLLECTD_DIR}/${collectd_pod}.yaml"
  kubectl rollout status --timeout="${wait_time}"s "daemonset/${collectd_pod}"
  # Clear existing collectd output. The output lives on the host
  # filesystem (bind-mounted at /mnt/opt/collectd/run inside the pod)
  # so it survives pod restarts and must be wiped before a fresh run.
  while read -u 3 name node; do
    kubectl exec -ti "$name" -- sh -c "rm -rf /mnt/opt/collectd/run/localhost/*"
  done 3< <(kubectl get pods --selector name=collectd-pods -o json | jq -r '.items[] | "\(.metadata.name) \(.spec.nodeName)"')
  # attempting to provide buffer for collectd to be installed and running,
  # and CPU collection to build adequate history
  sleep 12
}
@@ -30,11 +40,9 @@ cleanup_stats() {
# get logs before shutting down stats daemonset
while read -u 3 name node; do
kubectl exec -ti $name -- sh -c "cd /opt/collectd; tar -czvf localhost.tar.gz localhost"
# make a backup on the host in-case collection fail
kubectl exec -ti $name -- sh -c "mkdir -p /mnt/opt/collectd"
kubectl exec -ti $name -- sh -c "cp /opt/collectd/localhost.tar.gz /mnt/opt/collectd/localhost.tar.gz"
kubectl cp $name:/opt/collectd/localhost.tar.gz ${RESULT_DIR}/${node}.tar.gz
kubectl exec -ti $name -- sh -c "cd /mnt/opt/collectd/run; rm -f ../localhost.tar.gz; tar -czvf ../localhost.tar.gz localhost"
kubectl cp $name:/mnt/opt/collectd/localhost.tar.gz ${RESULT_DIR}/${node}.tar.gz
kubectl exec -ti $name -- sh -c "rm -rf /mnt/opt/collectd/run"
done 3< <(kubectl get pods --selector name=collectd-pods -o json | jq -r '.items[] | "\(.metadata.name) \(.spec.nodeName)"')
kubectl delete daemonset --wait=true --timeout=${delete_wait_time}s "${collectd_pod}" || true

View File

@@ -17,7 +17,7 @@ Hostname localhost
ValuesPercentage true
</Plugin>
# Write CSV output under the host bind mount (/mnt/opt/collectd/run) instead
# of the container overlay, so data survives an ungraceful node failure.
<Plugin "csv">
  DataDir "/mnt/opt/collectd/run"
  StoreRates true
</Plugin>
<Plugin "interface">