I'm encountering an issue with my Spring Boot application where it fails to push metrics to Prometheus Pushgateway. The issue happens intermittently.
The error message indicates a SocketTimeoutException. Below is the relevant part of the stack trace:
org.springframework.boot.actuate.metrics.export.prometheus.PrometheusPushGatewayManager$PushGatewayTaskScheduler:218 - Shutting down ExecutorService
2024-07-01T17:12:03,782 ERROR org.springframework.boot.actuate.metrics.export.prometheus.PrometheusPushGatewayManager:119 - Unable to push metrics to Prometheus Pushgateway
java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method) ~[?:1.8.0_392]
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116) ~[?:1.8.0_392]
at java.net.SocketInputStream.read(SocketInputStream.java:171) ~[?:1.8.0_392]
at java.net.SocketInputStream.read(SocketInputStream.java:141) ~[?:1.8.0_392]
at java.io.BufferedInputStream.fill(BufferedInputStream.java:246) ~[?:1.8.0_392]
at java.io.BufferedInputStream.read1(BufferedInputStream.java:286) ~[?:1.8.0_392]
at java.io.BufferedInputStream.read(BufferedInputStream.java:345) ~[?:1.8.0_392]
at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:743) ~[?:1.8.0_392]
at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:678) ~[?:1.8.0_392]
at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1595) ~[?:1.8.0_392]
at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1500) ~[?:1.8.0_392]
at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:480) ~[?:1.8.0_392]
at io.prometheus.client.exporter.PushGateway.doRequest(PushGateway.java:315) ~[hadoop-unjar3434452571338587139/:?]
at io.prometheus.client.exporter.PushGateway.pushAdd(PushGateway.java:182) ~[hadoop-unjar3434452571338587139/:?]
at org.springframework.boot.actuate.metrics.export.prometheus.PrometheusPushGatewayManager.push(PrometheusPushGatewayManager.java:108) ~[hadoop-unjar3434452571338587139/:?]
at org.springframework.boot.actuate.metrics.export.prometheus.PrometheusPushGatewayManager.shutdown(PrometheusPushGatewayManager.java:146) ~[hadoop-unjar3434452571338587139/:?]
at org.springframework.boot.actuate.metrics.export.prometheus.PrometheusPushGatewayManager.shutdown(PrometheusPushGatewayManager.java:136) ~[hadoop-unjar3434452571338587139/:?]
...
My Reporter class
package com.hotels.bdp.cloverleaf.metrics;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.ImmutableTag;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Tag;
import io.micrometer.core.instrument.Tags;
/**
* MetricReporterService is responsible for reporting metrics to configured meter registries,
* including Graphite and Prometheus. It manages a collection of metrics and tags, and
* provides methods to report both general and partition-specific metrics.
*
* The class utilizes Micrometer's MeterRegistry to register and send metrics.
* It supports adding metrics with or without partition tags, and ensures metrics are
* properly tagged before reporting.
*/
/**
 * MetricReporterService reports metrics to the configured {@link MeterRegistry}
 * (e.g. Graphite or Prometheus). Metrics are collected via {@link #addMetric(Metric)}
 * and {@link #addPartitionMetric(Metric, String)} and published on a fixed schedule
 * by a single daemon thread.
 *
 * <p>Note on the observed {@code SocketTimeoutException}: this class only registers
 * and increments counters in the in-memory registry. The network push to the
 * Prometheus Pushgateway is performed separately by Spring's
 * {@code PrometheusPushGatewayManager} on its own scheduler (see the stack trace),
 * so no {@code SocketTimeoutException} can ever originate here. The previous
 * retry/backoff logic around {@code report(..)} guarded an exception that was
 * unreachable and has been removed.
 *
 * <p>Thread-safety: the metric collections are concurrent so callers may add
 * metrics while the reporter thread is iterating them.
 */
public class MetricReporterService {

    private static final Logger log = LoggerFactory.getLogger(MetricReporterService.class);
    private static final String PARTITION_TAG_KEY = "partition";
    /** Delay between scheduled reporting runs. */
    private static final long REPORT_PERIOD_MINUTES = 1;

    private final MeterRegistry meterRegistry;
    // Concurrent collections: addMetric()/addPartitionMetric() may be called from any
    // thread while the scheduler thread iterates in reportAllMetrics(). The previous
    // ArrayList/HashMap risked ConcurrentModificationException.
    private final List<Metric> metrics = new CopyOnWriteArrayList<>();
    private final Map<Metric, String> partitionMetrics = new ConcurrentHashMap<>();
    private final Tags tags;
    private final ScheduledExecutorService executorService;

    @Autowired
    public MetricReporterService(MeterRegistry meterRegistry, List<ImmutableTag> cloverleafTags) {
        this.meterRegistry = meterRegistry;
        this.tags = Tags.of(cloverleafTags);
        // Single daemon thread so metric reporting never prevents JVM shutdown.
        this.executorService = Executors.newSingleThreadScheduledExecutor(r -> {
            Thread thread = new Thread(r, "metric-reporter");
            thread.setDaemon(true);
            return thread;
        });
        // Report all metrics at a fixed interval, starting immediately.
        this.executorService.scheduleAtFixedRate(
                this::reportAllMetrics, 0, REPORT_PERIOD_MINUTES, TimeUnit.MINUTES);
    }

    /** Registers a metric to be reported on every scheduled run. */
    public void addMetric(Metric metric) {
        metrics.add(metric);
    }

    /** Registers a metric that is reported with an additional partition tag. */
    public void addPartitionMetric(Metric metric, String partition) {
        partitionMetrics.put(metric, partition);
    }

    /**
     * Reports all metrics to the meter registry, including both general and
     * partition-specific metrics. A failure on one metric is logged and does not
     * stop the others; this also keeps exceptions from escaping the scheduled
     * task (an uncaught exception would silently cancel future runs of
     * scheduleAtFixedRate).
     */
    public void reportAllMetrics() {
        metrics.forEach(this::reportSingleMetric);
        partitionMetrics.forEach((metric, partition) -> {
            try {
                report(metric, tags.and(PARTITION_TAG_KEY, partition));
            } catch (RuntimeException e) {
                log.error("Failed to report partition metric '{}'", metric.getName(), e);
            }
        });
    }

    /** Reports one metric with the base tag set, logging (not propagating) failures. */
    private void reportSingleMetric(Metric metric) {
        try {
            report(metric, tags);
        } catch (RuntimeException e) {
            log.error("Failed to report metric '{}'", metric.getName(), e);
        }
    }

    /**
     * Registers (or looks up) a counter for the metric in the meter registry and
     * increments it by the metric's value. Purely in-memory — register() returns
     * the existing counter when one with the same name and tags is already
     * registered, so calling this repeatedly is safe.
     */
    private void report(Metric metric, Iterable<Tag> tags) {
        String metricName = metric.getName();
        double metricValue = metric.getValue();
        Counter counter = Counter.builder(metricName).tags(tags).register(meterRegistry);
        counter.increment(metricValue);
        // Log after the increment so the message reflects completed work.
        log.info("Recorded metric '{}' with value {} and tags {}", metricName, metricValue, tags);
    }

    /**
     * Shuts down the reporting scheduler gracefully, waiting up to 10 seconds for
     * in-flight work before forcing termination. Safe to call from a shutdown hook.
     */
    public void shutdown() {
        log.info("Shutting down MetricReporterService...");
        executorService.shutdown();
        try {
            if (!executorService.awaitTermination(10, TimeUnit.SECONDS)) {
                log.warn("Executor service did not terminate in time. Attempting forced shutdown...");
                executorService.shutdownNow();
                if (!executorService.awaitTermination(10, TimeUnit.SECONDS)) {
                    log.error("Executor service could not be terminated.");
                }
            }
        } catch (InterruptedException e) {
            log.error("Interrupted while waiting for executor service termination.", e);
            executorService.shutdownNow();
            // Restore the interrupt flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
        }
        log.info("MetricReporterService shutdown complete.");
    }
}
My PushGateway Deployment in Kubernetes
kind: Deployment
apiVersion: apps/v1
metadata:
name: prometheus-pushgateway
namespace: monitoring
uid: 0e452c81
resourceVersion: '1316386'
generation: 14
creationTimestamp: '2023-11-21T21:39:50Z'
labels:
app.kubernetes.io/instance: prometheus-pushgateway
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: prometheus-pushgateway
app.kubernetes.io/version: v1.5.1
helm.sh/chart: prometheus-pushgateway-2.0.3
annotations:
deployment.kubernetes.io/revision: '11'
meta.helm.sh/release-name: prometheus-pushgateway
meta.helm.sh/release-namespace: monitoring
managedFields:
- manager: Go-http-client
operation: Update
apiVersion: apps/v1
time: '2024-06-24T22:04:13Z'
fieldsType: FieldsV1
fieldsV1:
'f:metadata':
'f:annotations':
.: {}
'f:meta.helm.sh/release-name': {}
'f:meta.helm.sh/release-namespace': {}
'f:labels':
.: {}
'f:app.kubernetes.io/instance': {}
'f:app.kubernetes.io/managed-by': {}
'f:app.kubernetes.io/name': {}
'f:app.kubernetes.io/version': {}
'f:helm.sh/chart': {}
'f:spec':
'f:progressDeadlineSeconds': {}
'f:revisionHistoryLimit': {}
'f:selector': {}
'f:strategy':
'f:type': {}
'f:template':
'f:metadata':
'f:annotations':
.: {}
'f:eks.amazonaws.com/role-arn': {}
'f:labels':
.: {}
'f:app.kubernetes.io/instance': {}
'f:app.kubernetes.io/managed-by': {}
'f:app.kubernetes.io/name': {}
'f:app.kubernetes.io/version': {}
'f:helm.sh/chart': {}
'f:spec':
'f:containers':
'k:{"name":"pushgateway"}':
.: {}
'f:image': {}
'f:imagePullPolicy': {}
'f:livenessProbe':
.: {}
'f:failureThreshold': {}
'f:httpGet':
.: {}
'f:path': {}
'f:port': {}
'f:scheme': {}
'f:initialDelaySeconds': {}
'f:periodSeconds': {}
'f:successThreshold': {}
'f:name': {}
'f:ports':
.: {}
'k:{"containerPort":9091,"protocol":"TCP"}':
.: {}
'f:containerPort': {}
'f:name': {}
'f:protocol': {}
'f:readinessProbe':
.: {}
'f:failureThreshold': {}
'f:httpGet':
.: {}
'f:path': {}
'f:port': {}
'f:scheme': {}
'f:initialDelaySeconds': {}
'f:periodSeconds': {}
'f:successThreshold': {}
'f:resources': {}
'f:terminationMessagePath': {}
'f:terminationMessagePolicy': {}
'f:volumeMounts':
.: {}
'k:{"mountPath":"/data"}':
.: {}
'f:mountPath': {}
'f:name': {}
'f:dnsPolicy': {}
'f:imagePullSecrets':
.: {}
'k:{"name":"artifactory-hub-docker-remote"}': {}
'f:restartPolicy': {}
'f:schedulerName': {}
'f:securityContext':
.: {}
'f:fsGroup': {}
'f:runAsNonRoot': {}
'f:runAsUser': {}
'f:serviceAccount': {}
'f:serviceAccountName': {}
'f:terminationGracePeriodSeconds': {}
'f:volumes':
.: {}
'k:{"name":"storage-volume"}':
.: {}
'f:name': {}
'f:persistentVolumeClaim':
.: {}
'f:claimName': {}
- manager: dashboard
operation: Update
apiVersion: apps/v1
time: '2024-06-28T12:03:48Z'
fieldsType: FieldsV1
fieldsV1:
'f:spec':
'f:replicas': {}
'f:template':
'f:metadata':
'f:annotations':
'f:ad.datadoghq.com/pushgateway.check_names': {}
'f:ad.datadoghq.com/pushgateway.init_configs': {}
'f:ad.datadoghq.com/pushgateway.instances': {}
'f:prometheus.io/path': {}
'f:prometheus.io/port': {}
'f:prometheus.io/scrape': {}
'f:spec':
'f:containers':
'k:{"name":"pushgateway"}':
'f:livenessProbe':
'f:timeoutSeconds': {}
'f:readinessProbe':
'f:timeoutSeconds': {}
- manager: kube-controller-manager
operation: Update
apiVersion: apps/v1
time: '2024-07-01T18:19:37Z'
fieldsType: FieldsV1
fieldsV1:
'f:metadata':
'f:annotations':
'f:deployment.kubernetes.io/revision': {}
'f:status':
'f:availableReplicas': {}
'f:conditions':
.: {}
'k:{"type":"Available"}':
.: {}
'f:lastTransitionTime': {}
'f:lastUpdateTime': {}
'f:message': {}
'f:reason': {}
'f:status': {}
'f:type': {}
'k:{"type":"Progressing"}':
.: {}
'f:lastTransitionTime': {}
'f:lastUpdateTime': {}
'f:message': {}
'f:reason': {}
'f:status': {}
'f:type': {}
'f:observedGeneration': {}
'f:readyReplicas': {}
'f:replicas': {}
'f:updatedReplicas': {}
subresource: status
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/instance: prometheus-pushgateway
app.kubernetes.io/name: prometheus-pushgateway
template:
metadata:
creationTimestamp: null
labels:
app.kubernetes.io/instance: prometheus-pushgateway
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: prometheus-pushgateway
app.kubernetes.io/version: v1.5.1
helm.sh/chart: prometheus-pushgateway-2.0.3
annotations:
ad.datadoghq.com/pushgateway.check_names: '["openmetrics"]'
ad.datadoghq.com/pushgateway.init_configs: '[{}]'
ad.datadoghq.com/pushgateway.instances: >-
[{ "prometheus_url": "http://%%host%%:9091/metrics", "namespace":
"cloverleaf", "metrics": [
"current_lag*","jvm_*","system_*","timer_*","backfill_lag*","partition*","populate_temp_tables*","push*","source*","touched_partition*","triggered_by*","update_table*","triggered*","schema_evolutio*","input-parti*","bytes*","new_partition*","touched*","modified*","source*","update*","populate*","execution*",
"long_running_jobs_killed*" , "update-table-statistics-duration*" ],
"max_returned_metrics":"30000" }]
eks.amazonaws.com/role-arn: 'arn:....'
prometheus.io/path: /metrics
prometheus.io/port: '9091'
prometheus.io/scrape: 'true'
spec:
volumes:
- name: storage-volume
persistentVolumeClaim:
claimName: prometheus-pushgateway
containers:
- name: pushgateway
image: 'hub-docker-remote.artylab.expedia.biz/prom/pushgateway:v1.0.1'
ports:
- name: metrics
containerPort: 9091
protocol: TCP
resources: {}
volumeMounts:
- name: storage-volume
mountPath: /data
livenessProbe:
httpGet:
path: /-/ready
port: 9091
scheme: HTTP
initialDelaySeconds: 10
timeoutSeconds: 30
periodSeconds: 10
successThreshold: 1
failureThreshold: 3
readinessProbe:
httpGet:
path: /-/ready
port: 9091
scheme: HTTP
initialDelaySeconds: 10
timeoutSeconds: 30
periodSeconds: 10
successThreshold: 1
failureThreshold: 3
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
imagePullPolicy: IfNotPresent
restartPolicy: Always
terminationGracePeriodSeconds: 30
dnsPolicy: ClusterFirst
serviceAccountName: prometheus-pushgateway
serviceAccount: prometheus-pushgateway
securityContext:
runAsUser: 65534
runAsNonRoot: true
fsGroup: 65534
imagePullSecrets:
- name: artifactory-hub-docker-remote
schedulerName: default-scheduler
strategy:
type: Recreate
revisionHistoryLimit: 10
progressDeadlineSeconds: 600
status:
observedGeneration: 14
replicas: 1
updatedReplicas: 1
readyReplicas: 1
availableReplicas: 1
conditions:
- type: Progressing
status: 'True'
lastUpdateTime: '2024-06-27T21:09:03Z'
lastTransitionTime: '2023-11-21T21:39:50Z'
reason: NewReplicaSetAvailable
message: >-
ReplicaSet "prometheus-pushgateway-6db8" has successfully
progressed.
- type: Available
status: 'True'
lastUpdateTime: '2024-07-01T18:19:37Z'
lastTransitionTime: '2024-07-01T18:19:37Z'
reason: MinimumReplicasAvailable
message: Deployment has minimum availability.
What I’ve Tried
Network issues: verified that the Pushgateway is accessible from the application server. Timeout settings: checked and adjusted the connect/read timeout settings to see if that mitigates the issue:
.properties("management.metrics.export.prometheus.pushgateway.connect-timeout=20s") .properties("management.metrics.export.prometheus.pushgateway.read-timeout=40s")
Questions
- What could be causing the SocketTimeoutException when pushing metrics to Prometheus Pushgateway?
- Are there any specific configurations in Spring Boot or Prometheus Pushgateway that I might be missing or need to adjust?
- Could there be any underlying network or resource constraints contributing to this issue?
- Any guidance or suggestions to resolve this issue would be greatly appreciated.