96 changes: 96 additions & 0 deletions nova/bigvm/exporter.py
@@ -0,0 +1,96 @@
# Copyright 2022 SAP SE
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from oslo_log import log as logging
from prometheus_client import CollectorRegistry
from prometheus_client import Counter
from prometheus_client import Gauge
from prometheus_client import start_http_server

import nova.conf

CONF = nova.conf.CONF

LOG = logging.getLogger(__name__)

REGISTRY = CollectorRegistry(auto_describe=True)

ERROR_FREEING = 'freeing'


class _BigVmPrometheusMetrics:

def __init__(self, registry):
self.host_errors_counter = \
Counter('nova_bigvm_host_errors',
'Counts errors that happened while reconciling '
'a host. The "error" is a short code meaning: '
'freeing = Error while freeing up a host',
labelnames=['error', 'vc', 'host', 'rp'],
registry=registry)

self.no_candidate_error_counter = \
Counter('nova_bigvm_no_candidate_error',
'Counter that increments each time the '
'reconciliation loop cannot find a '
'resource-provider for freeing-up a host.',
labelnames=['hv_size'],
registry=registry)

self.host_freeing_up_gauge = \
Gauge('nova_bigvm_host_freeing_up',
'Gauge for each BigVM host that is currently '
'being freed up.',
labelnames=['vc', 'host', 'rp'],
registry=registry)

self.free_hosts_count_gauge = \
Gauge('nova_bigvm_free_hosts_count',
'The total amount of available BigVM hosts '
'in the region.',
labelnames=['hv_size'],
registry=registry)

def bigvm_host_error(self, error, rp):
self.host_errors_counter.labels(
error, rp['vc'], rp['host'], rp['rp']['name']).inc()

def error_freeing(self, rp):
self.bigvm_host_error(ERROR_FREEING, rp)

def no_candidate_error(self, hv_size):
self.no_candidate_error_counter.labels(hv_size).inc()

def set_freeing_provider(self, rp):
self.host_freeing_up_gauge.labels(
rp['vc'], rp['host'], rp['rp']['name']).set(1)

def remove_freeing_provider(self, rp):
try:
self.host_freeing_up_gauge.remove(
rp['vc'], rp['host'], rp['rp']['name'])
except KeyError:
pass

def set_free_hosts_count(self, hv_size, count):
self.free_hosts_count_gauge.labels(hv_size).set(count)


bigvm_metrics = _BigVmPrometheusMetrics(REGISTRY)


def start_bigvm_exporter():
port = CONF.bigvm_exporter_listen_port
start_http_server(port, registry=REGISTRY)
Does this start a new process or a new thread?

If it starts a thread, does it start an eventlet greenthread (because eventlet patched the threading module) or does it spawn a native thread?

If it spawns a greenthread, does the prometheus_client library do anything blocking that could hinder the manager from running properly?

If it spawns a native thread, we cannot use logging (or anything else that takes a threading.Lock) anywhere inside that native thread, or we risk a hanging service.
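
One way to settle the first question at runtime (a minimal sketch, assuming the nova-bigvm entry point monkey-patches eventlet on startup like the other nova binaries do) is to ask eventlet which modules were patched before start_http_server() creates its serving thread:

import eventlet
eventlet.monkey_patch()  # normally already done by nova's cmd package on import

from eventlet import patcher  # noqa: E402

# If 'thread' is patched, threading.Thread (and therefore the daemon thread
# prometheus_client spawns) is backed by a greenthread; otherwise a native
# OS thread is created.
for mod in ('thread', 'socket', 'select'):
    print(mod, patcher.is_monkey_patched(mod))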

Author

Everything behind it is based on a daemonic threading.Thread, which to my understanding is patched by eventlet.

You missed one question: does prometheus_client use anything that would block the process? eventlet greenthreads are not preempted; they give up the CPU when they would perform blocking operations. That needs library support (or use of one of the eventlet-patched functions). If the greenthread doesn't give up the CPU on blocking operations, no other greenthread will run.

Author

This code adds one more greenthread to the pgt() output. Here it is:

2 <greenlet.greenlet object at 0x7f802a44d510 (otid=0x7f8031506a00) suspended active started> 
  File "/var/lib/openstack/lib/python3.8/site-packages/eventlet/green/thread.py", line 42, in __thread_body
    func(*args, **kwargs)
  File "/usr/lib/python3.8/threading.py", line 890, in _bootstrap
    self._bootstrap_inner()
  File "/var/lib/openstack/lib/python3.8/site-packages/eventlet/green/thread.py", line 63, in wrap_bootstrap_inner
    bootstrap_inner()
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/socketserver.py", line 232, in serve_forever
    ready = selector.select(poll_interval)
  File "/usr/lib/python3.8/selectors.py", line 323, in select
    r, w, _ = self._select(self._readers, self._writers, [], timeout)
  File "/var/lib/openstack/lib/python3.8/site-packages/eventlet/green/select.py", line 80, in select
    return hub.switch()
  File "/var/lib/openstack/lib/python3.8/site-packages/eventlet/hubs/hub.py", line 313, in switch
    return self.greenlet.switch()

Looking at the stack trace, there is socketserver.py, which is the module the prometheus library uses to expose the endpoint.

So this runs in a greenthread.
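
The same conclusion can be reproduced outside of nova with a minimal sketch (the port is the new option's default; the sleep interval is arbitrary): monkey-patch first, start the exporter's HTTP server, and confirm that other greenthreads still get scheduled while serve_forever() waits in the green select:

import eventlet
eventlet.monkey_patch()

from prometheus_client import start_http_server  # noqa: E402

start_http_server(9847)  # serve_forever() runs in a daemonic (green) thread

def heartbeat():
    while True:
        print('other greenthreads still get the CPU')
        eventlet.sleep(5)

eventlet.spawn(heartbeat).wait()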

LOG.info("Started BigVM prometheus exporter on port %s", port)
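
Separately from the threading discussion, a quick way to exercise the new module locally (a sketch; the host, vc, and rp label values below are made up, and the dict mirrors the shape manager.py passes in) is to drive the metric helpers directly and render the registry the same way the /metrics endpoint would:

from prometheus_client import generate_latest

from nova.bigvm.exporter import REGISTRY, bigvm_metrics

# Made-up provider record with the keys the metric helpers expect.
rp = {'host': 'node001', 'vc': 'vc-a', 'rp': {'name': 'bigvm-node001'}}

bigvm_metrics.set_freeing_provider(rp)       # gauge set to 1 for this host
bigvm_metrics.error_freeing(rp)              # increments the 'freeing' error counter
bigvm_metrics.set_free_hosts_count(1024, 3)  # three free hosts for hv_size 1024

# Same text exposition format the HTTP endpoint serves.
print(generate_latest(REGISTRY).decode())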
29 changes: 23 additions & 6 deletions nova/bigvm/manager.py
@@ -15,6 +15,7 @@
"""
BigVM service
"""
import collections
import itertools

import os_resource_classes as orc
@@ -23,6 +24,7 @@
from oslo_messaging import exceptions as oslo_exceptions
from oslo_service import periodic_task

from nova.bigvm.exporter import bigvm_metrics
import nova.conf
from nova import context as nova_context
from nova import exception
@@ -167,6 +169,7 @@ def _flatten(list_of_lists):
'max_used': CONF.bigvm_cluster_max_usage_percent,
'max_reserved':
CONF.bigvm_cluster_max_reservation_percent})
bigvm_metrics.no_candidate_error(hv_size)
continue

# filter out providers that are disabled in general or for bigVMs
@@ -184,6 +187,7 @@
'host for hypervisor size %(hv_size)d, because '
'all providers with enough space are disabled.',
{'hv_size': hv_size})
bigvm_metrics.no_candidate_error(hv_size)
continue

candidates[hv_size] = (alloc_reqs, filtered_provider_summaries)
@@ -195,6 +199,7 @@
'up a host for hypervisor size %(hv_size)d in '
'%(vc)s.',
{'hv_size': hv_size, 'vc': vc})
bigvm_metrics.no_candidate_error(hv_size)
continue
alloc_reqs, provider_summaries = candidates[hv_size]

@@ -217,7 +222,7 @@ def _free_memory(p):
cm = vmware_providers[rp_uuid]['cell_mapping']
with nova_context.target_cell(context, cm) as cctxt:
if self._free_host_for_provider(cctxt, rp_uuid,
host):
host, vc):
break
except oslo_exceptions.MessagingTimeout as e:
# we don't know if the timeout happened after we started
@@ -630,6 +635,7 @@ def _get_missing_hv_sizes(self, context, vcenters,
"""
found_hv_sizes_per_vc = {vc: set() for vc in vcenters}

free_hosts = collections.defaultdict(int)
for rp_uuid, rp in bigvm_providers.items():
host_rp_uuid = rp['host_rp_uuid']
hv_size = vmware_providers[host_rp_uuid]['hv_size']
@@ -645,16 +651,26 @@

if state == special_spawning.FREE_HOST_STATE_DONE:
self._add_resources_to_provider(context, rp_uuid, rp)
bigvm_metrics.remove_freeing_provider(rp)
free_hosts[hv_size] += 1
elif state == special_spawning.FREE_HOST_STATE_ERROR:
LOG.warning('Freeing a host for spawning failed on '
'%(host)s.',
{'host': rp['host']})
# do some cleanup, so another compute-node is used
found_hv_sizes_per_vc[rp['vc']].remove(hv_size)
bigvm_metrics.remove_freeing_provider(rp)
bigvm_metrics.error_freeing(rp)
self._clean_up_consumed_provider(context, rp_uuid, rp)
else:
LOG.info('Waiting for host on %(host)s to free up.',
{'host': rp['host']})
bigvm_metrics.set_freeing_provider(rp)
else:
free_hosts[hv_size] += 1

for hv_size, count in free_hosts.items():
bigvm_metrics.set_free_hosts_count(hv_size, count)

hv_sizes_per_vc = {
vc: set(rp['hv_size'] for rp in vmware_providers.values()
@@ -692,7 +708,7 @@ def _add_resources_to_provider(self, context, rp_uuid, rp):
'on %(host)s.',
{'host': rp['host']})

def _free_host_for_provider(self, context, rp_uuid, host):
def _free_host_for_provider(self, context, rp_uuid, host, vc):
"""Takes care of creating a child resource provider in placement to
"claim" a resource-provider/host for freeing up a host. Then calls the
driver to actually free up the host in the cluster.
@@ -764,17 +780,18 @@ def _free_host_for_provider(self, context, rp_uuid, host):

# find a host and let DRS free it up
state = self.special_spawn_rpc.free_host(context, host)

new_rp = {'host': host,
'vc': vc,
'rp': {'name': new_rp_name},
'host_rp_uuid': rp_uuid}
if state == special_spawning.FREE_HOST_STATE_DONE:
# there were free resources available immediately
needs_cleanup = False
new_rp = {'host': host,
'rp': {'name': new_rp_name},
'host_rp_uuid': rp_uuid}
self._add_resources_to_provider(context, new_rp_uuid, new_rp)
elif state == special_spawning.FREE_HOST_STATE_STARTED:
# it started working on it. we have to check back later
# if it's done
bigvm_metrics.set_freeing_provider(new_rp)
needs_cleanup = False
finally:
# clean up placement, if something went wrong
3 changes: 3 additions & 0 deletions nova/cmd/bigvm.py
@@ -21,6 +21,7 @@
from oslo_reports import guru_meditation_report as gmr
from oslo_reports import opts as gmr_opts

from nova.bigvm.exporter import start_bigvm_exporter
import nova.conf
from nova import config
from nova import objects
@@ -39,6 +40,8 @@ def main():

gmr.TextGuruMeditation.setup_autorun(version, conf=CONF)

start_bigvm_exporter()

server = service.Service.create(binary='nova-bigvm')
service.serve(server)
service.wait()
6 changes: 6 additions & 0 deletions nova/conf/base.py
@@ -192,6 +192,12 @@
Compare the values of conf.vmware.memory_reservation_cluster_hosts_max_fail and
conf.vmware.memory_reservation_max_ratio_fallback to see how much of total
memory is actually reservable.
"""),
cfg.IntOpt(
'bigvm_exporter_listen_port',
default=9847,
help="""
Port on which the BigVM prometheus exporter listens for HTTP requests.
"""),
]

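For completeness, a hedged example of overriding the new option in nova.conf (assuming it registers in the [DEFAULT] group, as the other options in base.py do):

[DEFAULT]
# Move the BigVM prometheus exporter off the default port 9847.
bigvm_exporter_listen_port = 9090
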
1 change: 1 addition & 0 deletions requirements.txt
@@ -68,3 +68,4 @@ futurist>=1.8.0 # Apache-2.0
openstacksdk>=0.35.0 # Apache-2.0
dataclasses>=0.7;python_version=='3.6' # Apache 2.0 License
PyYAML>=5.1 # MIT
prometheus_client