0

I'm encountering an issue with my Spring Boot application where it fails to push metrics to the Prometheus Pushgateway. The issue happens intermittently.

The error message indicates a SocketTimeoutException. Below is the relevant part of the stack trace:

org.springframework.boot.actuate.metrics.export.prometheus.PrometheusPushGatewayManager$PushGatewayTaskScheduler:218 - Shutting down ExecutorService
2024-07-01T17:12:03,782 ERROR org.springframework.boot.actuate.metrics.export.prometheus.PrometheusPushGatewayManager:119 - Unable to push metrics to Prometheus Pushgateway
java.net.SocketTimeoutException: Read timed out
    at java.net.SocketInputStream.socketRead0(Native Method) ~[?:1.8.0_392]
    at java.net.SocketInputStream.socketRead(SocketInputStream.java:116) ~[?:1.8.0_392]
    at java.net.SocketInputStream.read(SocketInputStream.java:171) ~[?:1.8.0_392]
    at java.net.SocketInputStream.read(SocketInputStream.java:141) ~[?:1.8.0_392]
    at java.io.BufferedInputStream.fill(BufferedInputStream.java:246) ~[?:1.8.0_392]
    at java.io.BufferedInputStream.read1(BufferedInputStream.java:286) ~[?:1.8.0_392]
    at java.io.BufferedInputStream.read(BufferedInputStream.java:345) ~[?:1.8.0_392]
    at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:743) ~[?:1.8.0_392]
    at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:678) ~[?:1.8.0_392]
    at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1595) ~[?:1.8.0_392]
    at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1500) ~[?:1.8.0_392]
    at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:480) ~[?:1.8.0_392]
    at io.prometheus.client.exporter.PushGateway.doRequest(PushGateway.java:315) ~[hadoop-unjar3434452571338587139/:?]
    at io.prometheus.client.exporter.PushGateway.pushAdd(PushGateway.java:182) ~[hadoop-unjar3434452571338587139/:?]
    at org.springframework.boot.actuate.metrics.export.prometheus.PrometheusPushGatewayManager.push(PrometheusPushGatewayManager.java:108) ~[hadoop-unjar3434452571338587139/:?]
    at org.springframework.boot.actuate.metrics.export.prometheus.PrometheusPushGatewayManager.shutdown(PrometheusPushGatewayManager.java:146) ~[hadoop-unjar3434452571338587139/:?]
    at org.springframework.boot.actuate.metrics.export.prometheus.PrometheusPushGatewayManager.shutdown(PrometheusPushGatewayManager.java:136) ~[hadoop-unjar3434452571338587139/:?]
    ...

My Reporter class

package com.hotels.bdp.cloverleaf.metrics;

import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.ImmutableTag;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Tag;
import io.micrometer.core.instrument.Tags;

/**
 * Records application metrics as counters in the configured Micrometer {@link MeterRegistry}.
 *
 * <p>Metrics are queued with {@link #addMetric(Metric)} or {@link #addPartitionMetric(Metric, String)}.
 * A single daemon thread calls {@link #reportAllMetrics()} once a minute, starting immediately after
 * construction. Every counter carries the tags passed to the constructor; partition metrics carry an
 * additional {@code partition} tag.
 *
 * <p>Thread-safety: the add methods and {@link #reportAllMetrics()} may be called from different threads.
 * The metric collections are guarded by an internal lock and each reporting run works on a snapshot.
 *
 * <p>This class only registers and increments counters; none of those calls declares a checked
 * exception, so there is no timeout to catch or retry here.
 * NOTE(review): delivery to a remote backend (e.g. a Pushgateway) is presumably done by the registry's
 * exporter, not by this class - look there when diagnosing push timeouts.
 */
public class MetricReporterService {
  private static final Logger log = LoggerFactory.getLogger(MetricReporterService.class);
  private static final String PARTITION_TAG_KEY = "partition";
  private static final long REPORT_INTERVAL_MINUTES = 1;
  private static final long SHUTDOWN_TIMEOUT_SECONDS = 10;

  private final MeterRegistry meterRegistry;
  private final Tags tags;

  /** Guards {@link #metrics} and {@link #partitionMetrics}. */
  private final Object lock = new Object();
  private final List<Metric> metrics = new ArrayList<>();
  private final Map<Metric, String> partitionMetrics = new HashMap<>();

  private final ScheduledExecutorService executorService;

  /**
   * Creates the service and starts the periodic reporting task.
   *
   * @param meterRegistry registry in which counters are registered
   * @param cloverleafTags tags attached to every reported metric
   */
  @Autowired
  public MetricReporterService(MeterRegistry meterRegistry, List<ImmutableTag> cloverleafTags) {
    this.meterRegistry = meterRegistry;
    this.tags = Tags.of(cloverleafTags);

    // Single daemon thread so the reporter never prevents JVM shutdown.
    this.executorService = Executors.newSingleThreadScheduledExecutor(r -> {
      Thread thread = new Thread(r);
      thread.setDaemon(true);
      thread.setName("metric-reporter");
      return thread;
    });

    // Scheduled last, after every field is assigned, because the first run starts immediately.
    // The wrapper is essential: an exception escaping a scheduleAtFixedRate task cancels all
    // subsequent runs without any log output.
    this.executorService.scheduleAtFixedRate(
        this::reportAllMetricsSafely, 0, REPORT_INTERVAL_MINUTES, TimeUnit.MINUTES);
  }

  /**
   * Queues a metric to be reported, with the common tags, on every reporting run.
   *
   * @param metric metric to report
   */
  public void addMetric(Metric metric) {
    synchronized (lock) {
      metrics.add(metric);
    }
  }

  /**
   * Queues a metric to be reported with the common tags plus a {@code partition} tag.
   * Adding the same metric again replaces its partition.
   *
   * @param metric metric to report
   * @param partition value of the {@code partition} tag
   */
  public void addPartitionMetric(Metric metric, String partition) {
    synchronized (lock) {
      partitionMetrics.put(metric, partition);
    }
  }

  /**
   * Reports all the metrics to the meter registry, including both general and partition-specific metrics.
   *
   * <p>A failure on one metric is logged and does not stop the remaining metrics from being reported.
   * NOTE(review): metrics are never removed, so every run increments each counter by the metric's
   * current value again - confirm this accumulation is intended.
   */
  public void reportAllMetrics() {
    List<Metric> metricSnapshot;
    Map<Metric, String> partitionSnapshot;
    // Copy under the lock, report outside it, so callers adding metrics are never blocked by reporting.
    synchronized (lock) {
      metricSnapshot = new ArrayList<>(metrics);
      partitionSnapshot = new HashMap<>(partitionMetrics);
    }

    metricSnapshot.forEach(this::reportSingleMetric);
    partitionSnapshot.forEach(this::reportPartitionedMetric);
  }

  /** Scheduler entry point: runs one reporting pass and never lets an exception reach the executor. */
  private void reportAllMetricsSafely() {
    try {
      reportAllMetrics();
    } catch (RuntimeException e) {
      log.error("Unexpected failure while reporting metrics", e);
    }
  }

  /** Reports one general metric with the common tags, logging (not propagating) any failure. */
  private void reportSingleMetric(Metric metric) {
    try {
      report(metric, tags);
    } catch (RuntimeException e) {
      log.error("RuntimeException while reporting metric '{}'", metric.getName(), e);
    }
  }

  /** Reports one partition metric with the extra partition tag, logging (not propagating) any failure. */
  private void reportPartitionedMetric(Metric metric, String partition) {
    try {
      // Building the tags stays inside the try so an invalid partition value is logged like any other failure.
      report(metric, tags.and(PARTITION_TAG_KEY, partition));
    } catch (RuntimeException e) {
      log.error("RuntimeException while reporting partition metric '{}'", metric.getName(), e);
    }
  }

  /**
   * Registers (or looks up) the counter for the metric and increments it by the metric's value.
   *
   * @param metric metric supplying the counter name and increment
   * @param metricTags tags identifying the counter
   */
  private void report(Metric metric, Iterable<Tag> metricTags) {
    String metricName = metric.getName();
    double metricValue = metric.getValue();

    Counter counter = Counter.builder(metricName).tags(metricTags).register(meterRegistry);
    counter.increment(metricValue);

    // Logged only after the increment so the message is never emitted for a failed update.
    log.info("\nSuccessfully recorded metric '{}' with value {} and tags {}\n",
        metricName,
        metricValue,
        metricTags);
  }

  /**
   * Stops the periodic reporting task, waiting up to 10 seconds for a running pass to finish and,
   * if it has not, interrupting it and waiting up to 10 seconds more.
   */
  public void shutdown() {
    log.info("Shutting down MetricReporterService...");

    executorService.shutdown();
    try {
      if (!executorService.awaitTermination(SHUTDOWN_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
        log.warn("Executor service did not terminate in time. Attempting forced shutdown...");
        executorService.shutdownNow();
        if (!executorService.awaitTermination(SHUTDOWN_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
          log.error("Executor service could not be terminated.");
        }
      }
    } catch (InterruptedException e) {
      log.error("Interrupted while waiting for executor service termination.", e);
      executorService.shutdownNow();
      // Restore the flag so the caller can observe the interruption.
      Thread.currentThread().interrupt();
    }
    log.info("MetricReporterService shutdown complete.");
  }
}

My PushGateway Deployment in Kubernetes

kind: Deployment
apiVersion: apps/v1
metadata:
  name: prometheus-pushgateway
  namespace: monitoring
  uid: 0e452c81
  resourceVersion: '1316386'
  generation: 14
  creationTimestamp: '2023-11-21T21:39:50Z'
  labels:
    app.kubernetes.io/instance: prometheus-pushgateway
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/name: prometheus-pushgateway
    app.kubernetes.io/version: v1.5.1
    helm.sh/chart: prometheus-pushgateway-2.0.3
  annotations:
    deployment.kubernetes.io/revision: '11'
    meta.helm.sh/release-name: prometheus-pushgateway
    meta.helm.sh/release-namespace: monitoring
  managedFields:
    - manager: Go-http-client
      operation: Update
      apiVersion: apps/v1
      time: '2024-06-24T22:04:13Z'
      fieldsType: FieldsV1
      fieldsV1:
        'f:metadata':
          'f:annotations':
            .: {}
            'f:meta.helm.sh/release-name': {}
            'f:meta.helm.sh/release-namespace': {}
          'f:labels':
            .: {}
            'f:app.kubernetes.io/instance': {}
            'f:app.kubernetes.io/managed-by': {}
            'f:app.kubernetes.io/name': {}
            'f:app.kubernetes.io/version': {}
            'f:helm.sh/chart': {}
        'f:spec':
          'f:progressDeadlineSeconds': {}
          'f:revisionHistoryLimit': {}
          'f:selector': {}
          'f:strategy':
            'f:type': {}
          'f:template':
            'f:metadata':
              'f:annotations':
                .: {}
                'f:eks.amazonaws.com/role-arn': {}
              'f:labels':
                .: {}
                'f:app.kubernetes.io/instance': {}
                'f:app.kubernetes.io/managed-by': {}
                'f:app.kubernetes.io/name': {}
                'f:app.kubernetes.io/version': {}
                'f:helm.sh/chart': {}
            'f:spec':
              'f:containers':
                'k:{"name":"pushgateway"}':
                  .: {}
                  'f:image': {}
                  'f:imagePullPolicy': {}
                  'f:livenessProbe':
                    .: {}
                    'f:failureThreshold': {}
                    'f:httpGet':
                      .: {}
                      'f:path': {}
                      'f:port': {}
                      'f:scheme': {}
                    'f:initialDelaySeconds': {}
                    'f:periodSeconds': {}
                    'f:successThreshold': {}
                  'f:name': {}
                  'f:ports':
                    .: {}
                    'k:{"containerPort":9091,"protocol":"TCP"}':
                      .: {}
                      'f:containerPort': {}
                      'f:name': {}
                      'f:protocol': {}
                  'f:readinessProbe':
                    .: {}
                    'f:failureThreshold': {}
                    'f:httpGet':
                      .: {}
                      'f:path': {}
                      'f:port': {}
                      'f:scheme': {}
                    'f:initialDelaySeconds': {}
                    'f:periodSeconds': {}
                    'f:successThreshold': {}
                  'f:resources': {}
                  'f:terminationMessagePath': {}
                  'f:terminationMessagePolicy': {}
                  'f:volumeMounts':
                    .: {}
                    'k:{"mountPath":"/data"}':
                      .: {}
                      'f:mountPath': {}
                      'f:name': {}
              'f:dnsPolicy': {}
              'f:imagePullSecrets':
                .: {}
                'k:{"name":"artifactory-hub-docker-remote"}': {}
              'f:restartPolicy': {}
              'f:schedulerName': {}
              'f:securityContext':
                .: {}
                'f:fsGroup': {}
                'f:runAsNonRoot': {}
                'f:runAsUser': {}
              'f:serviceAccount': {}
              'f:serviceAccountName': {}
              'f:terminationGracePeriodSeconds': {}
              'f:volumes':
                .: {}
                'k:{"name":"storage-volume"}':
                  .: {}
                  'f:name': {}
                  'f:persistentVolumeClaim':
                    .: {}
                    'f:claimName': {}
    - manager: dashboard
      operation: Update
      apiVersion: apps/v1
      time: '2024-06-28T12:03:48Z'
      fieldsType: FieldsV1
      fieldsV1:
        'f:spec':
          'f:replicas': {}
          'f:template':
            'f:metadata':
              'f:annotations':
                'f:ad.datadoghq.com/pushgateway.check_names': {}
                'f:ad.datadoghq.com/pushgateway.init_configs': {}
                'f:ad.datadoghq.com/pushgateway.instances': {}
                'f:prometheus.io/path': {}
                'f:prometheus.io/port': {}
                'f:prometheus.io/scrape': {}
            'f:spec':
              'f:containers':
                'k:{"name":"pushgateway"}':
                  'f:livenessProbe':
                    'f:timeoutSeconds': {}
                  'f:readinessProbe':
                    'f:timeoutSeconds': {}
    - manager: kube-controller-manager
      operation: Update
      apiVersion: apps/v1
      time: '2024-07-01T18:19:37Z'
      fieldsType: FieldsV1
      fieldsV1:
        'f:metadata':
          'f:annotations':
            'f:deployment.kubernetes.io/revision': {}
        'f:status':
          'f:availableReplicas': {}
          'f:conditions':
            .: {}
            'k:{"type":"Available"}':
              .: {}
              'f:lastTransitionTime': {}
              'f:lastUpdateTime': {}
              'f:message': {}
              'f:reason': {}
              'f:status': {}
              'f:type': {}
            'k:{"type":"Progressing"}':
              .: {}
              'f:lastTransitionTime': {}
              'f:lastUpdateTime': {}
              'f:message': {}
              'f:reason': {}
              'f:status': {}
              'f:type': {}
          'f:observedGeneration': {}
          'f:readyReplicas': {}
          'f:replicas': {}
          'f:updatedReplicas': {}
      subresource: status
# Desired state of the Deployment: one "pushgateway" container listening on port 9091,
# with a PersistentVolumeClaim mounted at /data.
spec:
  # Single replica. Combined with the Recreate strategy at the bottom of this spec, the old pod
  # is stopped before the new one starts, so no pod serves requests while a rollout is in progress.
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/instance: prometheus-pushgateway
      app.kubernetes.io/name: prometheus-pushgateway
  template:
    metadata:
      creationTimestamp: null
      labels:
        app.kubernetes.io/instance: prometheus-pushgateway
        app.kubernetes.io/managed-by: Helm
        app.kubernetes.io/name: prometheus-pushgateway
        # NOTE(review): this label says v1.5.1 but the container image below is tagged v1.0.1 -
        # confirm which Pushgateway version is actually meant to be running.
        app.kubernetes.io/version: v1.5.1
        helm.sh/chart: prometheus-pushgateway-2.0.3
      annotations:
        # Datadog autodiscovery: an openmetrics check that scrapes this pod's /metrics endpoint
        # on port 9091, reporting under the "cloverleaf" namespace with the metric-name filters
        # listed in the "metrics" array.
        ad.datadoghq.com/pushgateway.check_names: '["openmetrics"]'
        ad.datadoghq.com/pushgateway.init_configs: '[{}]'
        ad.datadoghq.com/pushgateway.instances: >-
          [{ "prometheus_url": "http://%%host%%:9091/metrics", "namespace":
          "cloverleaf", "metrics": [
          "current_lag*","jvm_*","system_*","timer_*","backfill_lag*","partition*","populate_temp_tables*","push*","source*","touched_partition*","triggered_by*","update_table*","triggered*","schema_evolutio*","input-parti*","bytes*","new_partition*","touched*","modified*","source*","update*","populate*","execution*",
          "long_running_jobs_killed*" , "update-table-statistics-duration*"   ],
          "max_returned_metrics":"30000"  }]
        eks.amazonaws.com/role-arn: 'arn:....'
        # Prometheus scrape annotations: the same /metrics endpoint on port 9091 is also marked
        # for Prometheus scraping.
        prometheus.io/path: /metrics
        prometheus.io/port: '9091'
        prometheus.io/scrape: 'true'
    spec:
      volumes:
        - name: storage-volume
          persistentVolumeClaim:
            claimName: prometheus-pushgateway
      containers:
        - name: pushgateway
          image: 'hub-docker-remote.artylab.expedia.biz/prom/pushgateway:v1.0.1'
          ports:
            - name: metrics
              containerPort: 9091
              protocol: TCP
          # No CPU or memory requests/limits are set.
          # NOTE(review): without requests the pod has no guaranteed resources; if the gateway is
          # slow to answer pushes under load, this is worth checking - confirm against node usage.
          resources: {}
          volumeMounts:
            - name: storage-volume
              mountPath: /data
          # NOTE(review): the liveness probe uses the same /-/ready path as the readiness probe.
          # Pushgateway also exposes /-/healthy, which is the usual liveness target - confirm intended.
          # NOTE(review): timeoutSeconds (30) is longer than periodSeconds (10) on both probes -
          # confirm this is deliberate.
          livenessProbe:
            httpGet:
              path: /-/ready
              port: 9091
              scheme: HTTP
            initialDelaySeconds: 10
            timeoutSeconds: 30
            periodSeconds: 10
            successThreshold: 1
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9091
              scheme: HTTP
            initialDelaySeconds: 10
            timeoutSeconds: 30
            periodSeconds: 10
            successThreshold: 1
            failureThreshold: 3
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          imagePullPolicy: IfNotPresent
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      dnsPolicy: ClusterFirst
      serviceAccountName: prometheus-pushgateway
      serviceAccount: prometheus-pushgateway
      # Runs as the unprivileged user 65534, with the same id as fsGroup for the mounted volume.
      securityContext:
        runAsUser: 65534
        runAsNonRoot: true
        fsGroup: 65534
      imagePullSecrets:
        - name: artifactory-hub-docker-remote
      schedulerName: default-scheduler
  # Recreate: existing pods are terminated before replacements are created.
  strategy:
    type: Recreate
  revisionHistoryLimit: 10
  progressDeadlineSeconds: 600
status:
  observedGeneration: 14
  replicas: 1
  updatedReplicas: 1
  readyReplicas: 1
  availableReplicas: 1
  conditions:
    - type: Progressing
      status: 'True'
      lastUpdateTime: '2024-06-27T21:09:03Z'
      lastTransitionTime: '2023-11-21T21:39:50Z'
      reason: NewReplicaSetAvailable
      message: >-
        ReplicaSet "prometheus-pushgateway-6db8" has successfully
        progressed.
    - type: Available
      status: 'True'
      lastUpdateTime: '2024-07-01T18:19:37Z'
      lastTransitionTime: '2024-07-01T18:19:37Z'
      reason: MinimumReplicasAvailable
      message: Deployment has minimum availability.

What I’ve Tried

Network issues: verified that the Pushgateway is reachable from the application server. Timeout settings: checked and adjusted the timeout settings shown below to see whether that mitigates the issue.

.properties("management.metrics.export.prometheus.pushgateway.connect-timeout=20s") .properties("management.metrics.export.prometheus.pushgateway.read-timeout=40s")

Questions

  • What could be causing the SocketTimeoutException when pushing metrics to Prometheus Pushgateway?
  • Are there any specific configurations in Spring Boot or Prometheus Pushgateway that I might be missing or need to adjust?
  • Could there be any underlying network or resource constraints contributing to this issue?
  • Any guidance or suggestions to resolve this issue would be greatly appreciated.

0