From b5b24e63209e4a24f9777bfc6c7f7bf72ff7299f Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Tue, 2 Dec 2025 17:29:53 +0100 Subject: [PATCH 01/18] move a prometheus counter to otel --- poetry.lock | 32 +++++++++++----- pyproject.toml | 1 + synapse/federation/federation_client.py | 51 +++++++++++++++---------- synapse/metrics/__init__.py | 6 +++ 4 files changed, 61 insertions(+), 29 deletions(-) diff --git a/poetry.lock b/poetry.lock index 27fcf1fd74..f3f5aa0a7e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -960,7 +960,7 @@ files = [ {file = "importlib_metadata-6.7.0-py3-none-any.whl", hash = "sha256:cb52082e659e97afc5dac71e79de97d8681de3aa07ff18578330904a9d18e5b5"}, {file = "importlib_metadata-6.7.0.tar.gz", hash = "sha256:1aaf550d4f73e5d6783e7acb77aec43d49da8017410afae93822cc9cca98c4d4"}, ] -markers = {main = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"", dev = "platform_machine != \"ppc64le\" and platform_machine != \"s390x\" and python_version < \"3.12\""} +markers = {dev = "platform_machine != \"ppc64le\" and platform_machine != \"s390x\" and python_version < \"3.12\""} [package.dependencies] zipp = ">=0.5" @@ -1673,10 +1673,9 @@ nicer-shell = ["ipython"] name = "opentelemetry-api" version = "1.34.1" description = "OpenTelemetry Python API" -optional = true +optional = false python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"" files = [ {file = "opentelemetry_api-1.34.1-py3-none-any.whl", hash = "sha256:b7df4cb0830d5a6c29ad0c0691dbae874d8daefa934b8b1d642de48323d32a8c"}, {file = "opentelemetry_api-1.34.1.tar.gz", hash = "sha256:64f0bd06d42824843731d05beea88d4d4b6ae59f9fe347ff7dfa2cc14233bbb3"}, @@ -1766,6 +1765,23 @@ opentelemetry-sdk = ">=1.34.1,<1.35.0" requests = ">=2.7,<3.0" typing-extensions = ">=4.5.0" +[[package]] +name = "opentelemetry-exporter-prometheus" +version = "0.55b1" +description = "Prometheus Metric Exporter for OpenTelemetry" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "opentelemetry_exporter_prometheus-0.55b1-py3-none-any.whl", hash = "sha256:f364fbbff9e5de37a112ff104d1185fb1d7e2046c5ab5911e5afebc7ab3ddf0e"}, + {file = "opentelemetry_exporter_prometheus-0.55b1.tar.gz", hash = "sha256:d13ec0b22bf394113ff1ada5da98133a4b051779b803dae183188e26c4bd9ee0"}, +] + +[package.dependencies] +opentelemetry-api = ">=1.12,<2.0" +opentelemetry-sdk = ">=1.34.1,<1.35.0" +prometheus-client = ">=0.5.0,<1.0.0" + [[package]] name = "opentelemetry-opentracing-shim" version = "0.55b1" @@ -1804,10 +1820,9 @@ protobuf = ">=5.0,<6.0" name = "opentelemetry-sdk" version = "1.34.1" description = "OpenTelemetry Python SDK" -optional = true +optional = false python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"" files = [ {file = "opentelemetry_sdk-1.34.1-py3-none-any.whl", hash = "sha256:308effad4059562f1d92163c61c8141df649da24ce361827812c40abb2a1e96e"}, {file = "opentelemetry_sdk-1.34.1.tar.gz", hash = "sha256:8091db0d763fcd6098d4781bbc80ff0971f94e260739aa6afe6fd379cdf3aa4d"}, @@ -1822,10 +1837,9 @@ typing-extensions = ">=4.5.0" name = "opentelemetry-semantic-conventions" version = "0.55b1" description = "OpenTelemetry Semantic Conventions" -optional = true +optional = false python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"" files = [ {file = "opentelemetry_semantic_conventions-0.55b1-py3-none-any.whl", hash = "sha256:5da81dfdf7d52e3d37f8fe88d5e771e191de924cfff5f550ab0b8f7b2409baed"}, {file = "opentelemetry_semantic_conventions-0.55b1.tar.gz", hash = "sha256:ef95b1f009159c28d7a7849f5cbc71c4c34c845bb514d66adfdf1b3fff3598b3"}, @@ -3517,7 +3531,7 @@ files = [ {file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"}, {file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"}, ] -markers = {main = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"", dev = "platform_machine != \"ppc64le\" and platform_machine != \"s390x\" and python_version < \"3.12\""} +markers = {dev = "platform_machine != \"ppc64le\" and platform_machine != \"s390x\" and python_version < \"3.12\""} [package.extras] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] @@ -3638,4 +3652,4 @@ url-preview = ["lxml"] [metadata] lock-version = "2.1" python-versions = "^3.10.0" -content-hash = "7f5d25ff7f67ce8283b4d3bf831e20b1a11367c0996160a333abb7df5569c9c7" +content-hash = "194de82d864503514051cf1361cefca7f1d3a3ae613305b45b0dea2e740ec90a" diff --git a/pyproject.toml b/pyproject.toml index e4afc89312..f3a19ee3c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -225,6 +225,7 @@ packaging = ">=20.0" # We support pydantic v1 and pydantic v2 via the pydantic.v1 compat module. # See https://github.com/matrix-org/synapse/issues/15858 pydantic = ">=1.7.4, <3" +opentelemetry-exporter-prometheus = "0.55b1" # This is for building the rust components during "poetry install", which # currently ignores the `build-system.requires` directive (c.f. diff --git a/synapse/federation/federation_client.py b/synapse/federation/federation_client.py index 8c91336dbc..22d5664296 100644 --- a/synapse/federation/federation_client.py +++ b/synapse/federation/federation_client.py @@ -44,7 +44,6 @@ ) import attr -from prometheus_client import Counter from synapse.api.constants import Direction, EventContentFields, EventTypes, Membership from synapse.api.errors import ( @@ -74,7 +73,7 @@ from synapse.http.client import is_unknown_endpoint from synapse.http.types import QueryParams from synapse.logging.opentracing import SynapseTags, log_kv, set_tag, tag_args, trace -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.types import JsonDict, StrCollection, UserID, get_domain_from_id from synapse.types.handlers.policy_server import RECOMMENDATION_OK, RECOMMENDATION_SPAM from synapse.util.async_helpers import concurrently_execute @@ -86,8 +85,8 @@ logger = logging.getLogger(__name__) -sent_queries_counter = Counter( - "synapse_federation_client_sent_queries", "", labelnames=["type", SERVER_NAME_LABEL] +sent_queries_counter = meter.create_counter( + "synapse_federation_client_sent_queries", description="" ) @@ -214,10 +213,13 @@ async def make_query( Returns: The JSON object from the response """ - sent_queries_counter.labels( - type=query_type, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + sent_queries_counter.add( + 1, + { + "type": query_type, + SERVER_NAME_LABEL: self.server_name, + }, + ) return await self.transport_layer.make_query( destination, @@ -239,10 +241,13 @@ async def query_client_keys( Returns: The JSON object from the response """ - sent_queries_counter.labels( - type="client_device_keys", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + sent_queries_counter.add( + 1, + { + "type": "client_device_keys", + SERVER_NAME_LABEL: self.server_name, + }, + ) return await self.transport_layer.query_client_keys( destination, content, timeout ) @@ -253,10 +258,13 @@ async def query_user_devices( """Query the device keys for a list of user ids hosted on a remote server. """ - sent_queries_counter.labels( - type="user_devices", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + sent_queries_counter.add( + 1, + { + "type": "user_devices", + SERVER_NAME_LABEL: self.server_name, + }, + ) return await self.transport_layer.query_user_devices( destination, user_id, timeout ) @@ -278,10 +286,13 @@ async def claim_client_keys( Returns: The JSON object from the response """ - sent_queries_counter.labels( - type="client_one_time_keys", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + sent_queries_counter.add( + 1, + { + "type": "client_one_time_keys", + SERVER_NAME_LABEL: self.server_name, + }, + ) # Convert the query with counts into a stable and unstable query and check # if attempting to claim more than 1 OTK. diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index fe673eea19..8b775f71f5 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -43,6 +43,7 @@ ) import attr +from opentelemetry.metrics import get_meter_provider from packaging.version import parse as parse_version from prometheus_client import ( CollectorRegistry, @@ -139,6 +140,10 @@ def _set_prometheus_client_use_created_metrics(new_value: bool) -> None: _set_prometheus_client_use_created_metrics(False) +# Global meter for registering otel metrics +meter = get_meter_provider().get_meter("synapse") + + class _RegistryProxy: @staticmethod def collect() -> Iterable[Metric]: @@ -775,4 +780,5 @@ def render_GET(self, request: Request) -> bytes: "GaugeBucketCollector", "MIN_TIME_BETWEEN_GCS", "install_gc_manager", + "meter", ] From 975e1cee1c8f597ae0912601fe1f273a4492b7de Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Wed, 3 Dec 2025 11:16:18 +0100 Subject: [PATCH 02/18] convert some counters --- synapse/appservice/api.py | 38 ++++++++--------- synapse/federation/federation_server.py | 33 +++++++-------- synapse/federation/sender/__init__.py | 40 ++++++++++-------- .../sender/per_destination_queue.py | 40 +++++++++--------- synapse/handlers/appservice.py | 35 +++++++++------- synapse/handlers/auth.py | 41 +++++++++++-------- synapse/metrics/__init__.py | 19 ++++----- 7 files changed, 124 insertions(+), 122 deletions(-) diff --git a/synapse/appservice/api.py b/synapse/appservice/api.py index 55069cc5d3..904cffc6ed 100644 --- a/synapse/appservice/api.py +++ b/synapse/appservice/api.py @@ -34,7 +34,6 @@ Union, ) -from prometheus_client import Counter from typing_extensions import ParamSpec, TypeGuard from synapse.api.constants import EventTypes, Membership, ThirdPartyEntityKind @@ -48,7 +47,7 @@ from synapse.events.utils import SerializeEventConfig, serialize_event from synapse.http.client import SimpleHttpClient, is_unknown_endpoint from synapse.logging import opentracing -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.types import DeviceListUpdates, JsonDict, JsonMapping, ThirdPartyInstanceID from synapse.util.caches.response_cache import ResponseCache @@ -57,34 +56,29 @@ logger = logging.getLogger(__name__) -sent_transactions_counter = Counter( +sent_transactions_counter = meter.create_counter( "synapse_appservice_api_sent_transactions", - "Number of /transactions/ requests sent", - labelnames=["service", SERVER_NAME_LABEL], + description="Number of /transactions/ requests sent", ) -failed_transactions_counter = Counter( +failed_transactions_counter = meter.create_counter( "synapse_appservice_api_failed_transactions", - "Number of /transactions/ requests that failed to send", - labelnames=["service", SERVER_NAME_LABEL], + description="Number of /transactions/ requests that failed to send", ) -sent_events_counter = Counter( +sent_events_counter = meter.create_counter( "synapse_appservice_api_sent_events", - "Number of events sent to the AS", - labelnames=["service", SERVER_NAME_LABEL], + description="Number of events sent to the AS", ) -sent_ephemeral_counter = Counter( +sent_ephemeral_counter = meter.create_counter( "synapse_appservice_api_sent_ephemeral", - "Number of ephemeral events sent to the AS", - labelnames=["service", SERVER_NAME_LABEL], + description="Number of ephemeral events sent to the AS", ) -sent_todevice_counter = Counter( +sent_todevice_counter = meter.create_counter( "synapse_appservice_api_sent_todevice", - "Number of todevice messages sent to the AS", - labelnames=["service", SERVER_NAME_LABEL], + description="Number of todevice messages sent to the AS", ) HOUR_IN_MS = 60 * 60 * 1000 @@ -403,10 +397,10 @@ async def push_bulk( service.url, [event.get("event_id") for event in events], ) - sent_transactions_counter.labels(**labels).inc() - sent_events_counter.labels(**labels).inc(len(serialized_events)) - sent_ephemeral_counter.labels(**labels).inc(len(ephemeral)) - sent_todevice_counter.labels(**labels).inc(len(to_device_messages)) + sent_transactions_counter.add(1, labels) + sent_events_counter.add(len(serialized_events), labels) + sent_ephemeral_counter.add(len(ephemeral), labels) + sent_todevice_counter.add(len(to_device_messages), labels) return True except CodeMessageException as e: logger.warning( @@ -425,7 +419,7 @@ async def push_bulk( ex.args, exc_info=logger.isEnabledFor(logging.DEBUG), ) - failed_transactions_counter.labels(**labels).inc() + failed_transactions_counter.add(1, labels) return False async def claim_client_keys( diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py index ea905cdaaa..33a7d904f0 100644 --- a/synapse/federation/federation_server.py +++ b/synapse/federation/federation_server.py @@ -35,7 +35,7 @@ Union, ) -from prometheus_client import Counter, Gauge, Histogram +from prometheus_client import Gauge, Histogram from twisted.python import failure @@ -82,7 +82,7 @@ tag_args, trace, ) -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.metrics.background_process_metrics import wrap_as_background_process from synapse.replication.http.federation import ( ReplicationFederationSendEduRestServlet, @@ -105,18 +105,12 @@ logger = logging.getLogger(__name__) -received_pdus_counter = Counter( - "synapse_federation_server_received_pdus", "", labelnames=[SERVER_NAME_LABEL] -) +received_pdus_counter = meter.create_counter("synapse_federation_server_received_pdus") -received_edus_counter = Counter( - "synapse_federation_server_received_edus", "", labelnames=[SERVER_NAME_LABEL] -) +received_edus_counter = meter.create_counter("synapse_federation_server_received_edus") -received_queries_counter = Counter( +received_queries_counter = meter.create_counter( "synapse_federation_server_received_queries", - "", - labelnames=["type", SERVER_NAME_LABEL], ) pdu_process_time = Histogram( @@ -442,8 +436,8 @@ async def _handle_pdus_in_txn( report back to the sending server. """ - received_pdus_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc( - len(transaction.pdus) + received_pdus_counter.add( + len(transaction.pdus), {SERVER_NAME_LABEL: self.server_name} ) origin_host, _ = parse_server_name(origin) @@ -565,7 +559,7 @@ async def _handle_edus_in_txn(self, origin: str, transaction: Transaction) -> No """Process the EDUs in a received transaction.""" async def _process_edu(edu_dict: JsonDict) -> None: - received_edus_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() + received_edus_counter.add(1, {SERVER_NAME_LABEL: self.server_name}) edu = Edu( origin=origin, @@ -680,10 +674,13 @@ async def on_pdu_request( async def on_query_request( self, query_type: str, args: Dict[str, str] ) -> Tuple[int, Dict[str, Any]]: - received_queries_counter.labels( - type=query_type, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + received_queries_counter.add( + 1, + { + "type": query_type, + SERVER_NAME_LABEL: self.server_name, + }, + ) resp = await self.registry.on_query(query_type, args) return 200, resp diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index 4410ffc5c5..b656cc5101 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -145,7 +145,6 @@ ) import attr -from prometheus_client import Counter from twisted.internet import defer @@ -166,6 +165,7 @@ event_processing_loop_counter, event_processing_loop_room_count, events_processed_counter, + meter, ) from synapse.metrics.background_process_metrics import ( wrap_as_background_process, @@ -187,16 +187,14 @@ logger = logging.getLogger(__name__) -sent_pdus_destination_dist_count = Counter( +sent_pdus_destination_dist_count = meter.create_counter( "synapse_federation_client_sent_pdu_destinations_count", - "Number of PDUs queued for sending to one or more destinations", - labelnames=[SERVER_NAME_LABEL], + description="Number of PDUs queued for sending to one or more destinations", ) -sent_pdus_destination_dist_total = Counter( +sent_pdus_destination_dist_total = meter.create_counter( "synapse_federation_client_sent_pdu_destinations", - "Total number of PDUs queued for sending across all destinations", - labelnames=[SERVER_NAME_LABEL], + description="Total number of PDUs queued for sending across all destinations", ) transaction_queue_pending_destinations_gauge = LaterGauge( @@ -773,19 +771,25 @@ async def handle_room_events(events: List[EventBase]) -> None: **{SERVER_NAME_LABEL: self.server_name}, ).set(ts) - events_processed_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc(len(event_entries)) + events_processed_counter.add( + len(event_entries), {SERVER_NAME_LABEL: self.server_name} + ) - event_processing_loop_room_count.labels( - name="federation_sender", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc(len(events_by_room)) + event_processing_loop_room_count.add( + len(events_by_room), + { + "name": "federation_sender", + SERVER_NAME_LABEL: self.server_name, + }, + ) - event_processing_loop_counter.labels( - name="federation_sender", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + event_processing_loop_counter.add( + 1, + { + "name": "federation_sender", + SERVER_NAME_LABEL: self.server_name, + }, + ) synapse.metrics.event_processing_positions.labels( name="federation_sender", **{SERVER_NAME_LABEL: self.server_name} diff --git a/synapse/federation/sender/per_destination_queue.py b/synapse/federation/sender/per_destination_queue.py index 845af92fac..53cee22bc9 100644 --- a/synapse/federation/sender/per_destination_queue.py +++ b/synapse/federation/sender/per_destination_queue.py @@ -26,7 +26,6 @@ from typing import TYPE_CHECKING, Dict, Hashable, Iterable, List, Optional, Tuple, Type import attr -from prometheus_client import Counter from twisted.internet import defer @@ -42,7 +41,7 @@ from synapse.handlers.presence import format_user_presence_state from synapse.logging import issue9533_logger from synapse.logging.opentracing import SynapseTags, set_tag -from synapse.metrics import SERVER_NAME_LABEL, sent_transactions_counter +from synapse.metrics import SERVER_NAME_LABEL, meter, sent_transactions_counter from synapse.types import JsonDict, ReadReceipt from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter from synapse.visibility import filter_events_for_server @@ -56,16 +55,14 @@ logger = logging.getLogger(__name__) -sent_edus_counter = Counter( +sent_edus_counter = meter.create_counter( "synapse_federation_client_sent_edus", - "Total number of EDUs successfully sent", - labelnames=[SERVER_NAME_LABEL], + description="Total number of EDUs successfully sent", ) -sent_edus_by_type = Counter( +sent_edus_by_type = meter.create_counter( "synapse_federation_client_sent_edus_by_type", - "Number of sent EDUs successfully sent, by event type", - labelnames=["type", SERVER_NAME_LABEL], + description="Number of sent EDUs successfully sent, by event type", ) @@ -392,17 +389,20 @@ async def _transaction_transmission_loop(self) -> None: self._destination, pending_pdus, pending_edus ) - sent_transactions_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc() - sent_edus_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc(len(pending_edus)) + sent_transactions_counter.add( + 1, {SERVER_NAME_LABEL: self.server_name} + ) + sent_edus_counter.add( + len(pending_edus), {SERVER_NAME_LABEL: self.server_name} + ) for edu in pending_edus: - sent_edus_by_type.labels( - type=edu.edu_type, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + sent_edus_by_type.add( + 1, + { + "type": edu.edu_type, + SERVER_NAME_LABEL: self.server_name, + }, + ) except NotRetryingDestination as e: logger.debug( @@ -628,9 +628,7 @@ async def _catch_up_transmission_loop(self) -> None: self._destination, room_catchup_pdus, [] ) - sent_transactions_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc() + sent_transactions_counter.add(1, {SERVER_NAME_LABEL: self.server_name}) # We pulled this from the DB, so it'll be non-null assert pdu.internal_metadata.stream_ordering diff --git a/synapse/handlers/appservice.py b/synapse/handlers/appservice.py index 6536d9fe51..267765a3a9 100644 --- a/synapse/handlers/appservice.py +++ b/synapse/handlers/appservice.py @@ -31,8 +31,6 @@ Union, ) -from prometheus_client import Counter - from twisted.internet import defer import synapse @@ -45,6 +43,7 @@ SERVER_NAME_LABEL, event_processing_loop_counter, event_processing_loop_room_count, + meter, ) from synapse.metrics.background_process_metrics import ( wrap_as_background_process, @@ -68,8 +67,8 @@ logger = logging.getLogger(__name__) -events_processed_counter = Counter( - "synapse_handlers_appservice_events_processed", "", labelnames=[SERVER_NAME_LABEL] +events_processed_counter = meter.create_counter( + "synapse_handlers_appservice_events_processed" ) @@ -210,19 +209,25 @@ async def handle_room_events(events: Iterable[EventBase]) -> None: **{SERVER_NAME_LABEL: self.server_name}, ).set(upper_bound) - events_processed_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc(len(events)) + events_processed_counter.add( + len(events), {SERVER_NAME_LABEL: self.server_name} + ) - event_processing_loop_room_count.labels( - name="appservice_sender", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc(len(events_by_room)) + event_processing_loop_room_count.add( + len(events_by_room), + { + "name": "appservice_sender", + SERVER_NAME_LABEL: self.server_name, + }, + ) - event_processing_loop_counter.labels( - name="appservice_sender", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + event_processing_loop_counter.add( + 1, + { + "name": "appservice_sender", + SERVER_NAME_LABEL: self.server_name, + }, + ) if events: now = self.clock.time_msec() diff --git a/synapse/handlers/auth.py b/synapse/handlers/auth.py index 66bfa53ade..e29e502590 100644 --- a/synapse/handlers/auth.py +++ b/synapse/handlers/auth.py @@ -45,7 +45,6 @@ import attr import bcrypt import unpaddedbase64 -from prometheus_client import Counter from twisted.internet.defer import CancelledError from twisted.web.server import Request @@ -70,7 +69,7 @@ from synapse.http.server import finish_request, respond_with_html from synapse.http.site import SynapseRequest from synapse.logging.context import defer_to_thread -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.metrics.background_process_metrics import run_as_background_process from synapse.storage.databases.main.registration import ( LoginTokenExpired, @@ -93,10 +92,9 @@ INVALID_USERNAME_OR_PASSWORD = "Invalid username or password" -invalid_login_token_counter = Counter( +invalid_login_token_counter = meter.create_counter( "synapse_user_login_invalid_login_tokens", - "Counts the number of rejected m.login.token on /login", - labelnames=["reason", SERVER_NAME_LABEL], + description="Counts the number of rejected m.login.token on /login", ) @@ -1482,20 +1480,29 @@ async def consume_login_token(self, login_token: str) -> LoginTokenLookupResult: try: return await self.store.consume_login_token(login_token) except LoginTokenExpired: - invalid_login_token_counter.labels( - reason="expired", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + invalid_login_token_counter.add( + 1, + { + "reason": "expired", + SERVER_NAME_LABEL: self.server_name, + }, + ) except LoginTokenReused: - invalid_login_token_counter.labels( - reason="reused", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + invalid_login_token_counter.add( + 1, + { + "reason": "reused", + SERVER_NAME_LABEL: self.server_name, + }, + ) except NotFoundError: - invalid_login_token_counter.labels( - reason="not found", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + invalid_login_token_counter.add( + 1, + { + "reason": "not found", + SERVER_NAME_LABEL: self.server_name, + }, + ) raise AuthError(403, "Invalid login token", errcode=Codes.FORBIDDEN) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 8b775f71f5..af36d8d87c 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -47,7 +47,6 @@ from packaging.version import parse as parse_version from prometheus_client import ( CollectorRegistry, - Counter, Gauge, Histogram, Metric, @@ -609,24 +608,22 @@ def collect(self) -> Iterable[Metric]: # Federation Metrics # -sent_transactions_counter = Counter( - "synapse_federation_client_sent_transactions", "", labelnames=[SERVER_NAME_LABEL] +sent_transactions_counter = meter.create_counter( + "synapse_federation_client_sent_transactions" ) -events_processed_counter = Counter( - "synapse_federation_client_events_processed", "", labelnames=[SERVER_NAME_LABEL] +events_processed_counter = meter.create_counter( + "synapse_federation_client_events_processed" ) -event_processing_loop_counter = Counter( +event_processing_loop_counter = meter.create_counter( "synapse_event_processing_loop_count", - "Event processing loop iterations", - labelnames=["name", SERVER_NAME_LABEL], + description="Event processing loop iterations", ) -event_processing_loop_room_count = Counter( +event_processing_loop_room_count = meter.create_counter( "synapse_event_processing_loop_room_count", - "Rooms seen per event processing loop iteration", - labelnames=["name", SERVER_NAME_LABEL], + description="Rooms seen per event processing loop iteration", ) From 5dae38ee1afdeae607ed58711203e56072715574 Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Wed, 3 Dec 2025 12:30:21 +0100 Subject: [PATCH 03/18] convert most counters --- synapse/federation/sender/__init__.py | 10 +- synapse/handlers/federation_event.py | 13 +- synapse/handlers/presence.py | 162 +++++++++--------- synapse/handlers/register.py | 56 +++--- synapse/handlers/sync.py | 21 +-- synapse/http/client.py | 77 +++++---- synapse/http/matrixfederationclient.py | 35 ++-- synapse/http/request_metrics.py | 115 +++++-------- synapse/http/site.py | 13 +- synapse/metrics/background_process_metrics.py | 90 ++++------ synapse/notifier.py | 32 ++-- synapse/push/bulk_push_rule_evaluator.py | 16 +- synapse/push/httppusher.py | 42 ++--- synapse/push/mailer.py | 44 ++--- synapse/replication/http/_base.py | 48 +++--- synapse/replication/tcp/external_cache.py | 33 ++-- synapse/replication/tcp/handler.py | 30 ++-- synapse/replication/tcp/protocol.py | 65 +++---- synapse/replication/tcp/resource.py | 21 ++- synapse/state/__init__.py | 19 +- synapse/storage/controllers/persist_events.py | 59 +++---- synapse/storage/database.py | 28 ++- .../databases/main/event_federation.py | 15 +- synapse/storage/databases/main/events.py | 32 ++-- 24 files changed, 488 insertions(+), 588 deletions(-) diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index b656cc5101..4f123a6c0c 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -810,12 +810,10 @@ async def _send_pdu(self, pdu: EventBase, destinations: Iterable[str]) -> None: if not destinations: return - sent_pdus_destination_dist_total.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc(len(destinations)) - sent_pdus_destination_dist_count.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc() + sent_pdus_destination_dist_total.add( + len(destinations), {SERVER_NAME_LABEL: self.server_name} + ) + sent_pdus_destination_dist_count.add(1, {SERVER_NAME_LABEL: self.server_name}) assert pdu.internal_metadata.stream_ordering diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index d6390b79c7..82f8f0d0e9 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -36,7 +36,7 @@ Tuple, ) -from prometheus_client import Counter, Histogram +from prometheus_client import Histogram from synapse import event_auth from synapse.api.constants import ( @@ -80,7 +80,7 @@ tag_args, trace, ) -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.replication.http.federation import ( ReplicationFederationSendEventsRestServlet, ) @@ -106,10 +106,9 @@ logger = logging.getLogger(__name__) -soft_failed_event_counter = Counter( +soft_failed_event_counter = meter.create_counter( "synapse_federation_soft_failed_events_total", - "Events received over federation that we marked as soft_failed", - labelnames=[SERVER_NAME_LABEL], + description="Events received over federation that we marked as soft_failed", ) # Added to debug performance and track progress on optimizations @@ -2099,9 +2098,7 @@ async def _check_for_soft_fail( "hs": origin, }, ) - soft_failed_event_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc() + soft_failed_event_counter.add(1, {SERVER_NAME_LABEL: self.server_name}) event.internal_metadata.soft_failed = True async def _load_or_fetch_auth_events_for_event( diff --git a/synapse/handlers/presence.py b/synapse/handlers/presence.py index 1610683066..cc7cc83253 100644 --- a/synapse/handlers/presence.py +++ b/synapse/handlers/presence.py @@ -96,8 +96,6 @@ Type, ) -from prometheus_client import Counter - import synapse.metrics from synapse.api.constants import EduTypes, EventTypes, Membership, PresenceState from synapse.api.errors import SynapseError @@ -105,7 +103,7 @@ from synapse.appservice import ApplicationService from synapse.events.presence_router import PresenceRouter from synapse.logging.context import run_in_background -from synapse.metrics import SERVER_NAME_LABEL, LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge, meter from synapse.metrics.background_process_metrics import ( wrap_as_background_process, ) @@ -136,40 +134,28 @@ logger = logging.getLogger(__name__) -notified_presence_counter = Counter( - "synapse_handler_presence_notified_presence", "", labelnames=[SERVER_NAME_LABEL] -) -federation_presence_out_counter = Counter( - "synapse_handler_presence_federation_presence_out", - "", - labelnames=[SERVER_NAME_LABEL], +notified_presence_counter = meter.create_counter( + "synapse_handler_presence_notified_presence" ) -presence_updates_counter = Counter( - "synapse_handler_presence_presence_updates", "", labelnames=[SERVER_NAME_LABEL] +federation_presence_out_counter = meter.create_counter( + "synapse_handler_presence_federation_presence_out" ) -timers_fired_counter = Counter( - "synapse_handler_presence_timers_fired", "", labelnames=[SERVER_NAME_LABEL] +presence_updates_counter = meter.create_counter( + "synapse_handler_presence_presence_updates" ) -federation_presence_counter = Counter( - "synapse_handler_presence_federation_presence", "", labelnames=[SERVER_NAME_LABEL] +timers_fired_counter = meter.create_counter("synapse_handler_presence_timers_fired") +federation_presence_counter = meter.create_counter( + "synapse_handler_presence_federation_presence" ) -bump_active_time_counter = Counter( - "synapse_handler_presence_bump_active_time", "", labelnames=[SERVER_NAME_LABEL] +bump_active_time_counter = meter.create_counter( + "synapse_handler_presence_bump_active_time" ) -get_updates_counter = Counter( - "synapse_handler_presence_get_updates", "", labelnames=["type", SERVER_NAME_LABEL] -) +get_updates_counter = meter.create_counter("synapse_handler_presence_get_updates") -notify_reason_counter = Counter( - "synapse_handler_presence_notify_reason", - "", - labelnames=["locality", "reason", SERVER_NAME_LABEL], -) -state_transition_counter = Counter( - "synapse_handler_presence_state_transition", - "", - labelnames=["locality", "from", "to", SERVER_NAME_LABEL], +notify_reason_counter = meter.create_counter("synapse_handler_presence_notify_reason") +state_transition_counter = meter.create_counter( + "synapse_handler_presence_state_transition" ) presence_user_to_current_state_size_gauge = LaterGauge( @@ -1019,14 +1005,14 @@ async def _update_states( # TODO: We should probably ensure there are no races hereafter - presence_updates_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc(len(new_states)) + presence_updates_counter.add( + len(new_states), {SERVER_NAME_LABEL: self.server_name} + ) if to_notify: - notified_presence_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc(len(to_notify)) + notified_presence_counter.add( + len(to_notify), {SERVER_NAME_LABEL: self.server_name} + ) await self._persist_and_notify(list(to_notify.values())) self.unpersisted_users_changes |= {s.user_id for s in new_states} @@ -1045,9 +1031,9 @@ async def _update_states( if user_id not in to_notify } if to_federation_ping: - federation_presence_out_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc(len(to_federation_ping)) + federation_presence_out_counter.add( + len(to_federation_ping), {SERVER_NAME_LABEL: self.server_name} + ) hosts_to_states = await get_interested_remotes( self.store, @@ -1097,9 +1083,7 @@ async def _handle_timeouts(self) -> None: for user_id in users_to_check ] - timers_fired_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc( - len(states) - ) + timers_fired_counter.add(len(states), {SERVER_NAME_LABEL: self.server_name}) # Set of user ID & device IDs which are currently syncing. syncing_user_devices = { @@ -1133,7 +1117,7 @@ async def bump_presence_active_time( user_id = user.to_string() - bump_active_time_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() + bump_active_time_counter.add(1, {SERVER_NAME_LABEL: self.server_name}) now = self.clock.time_msec() @@ -1385,9 +1369,9 @@ async def incoming_presence(self, origin: str, content: JsonDict) -> None: updates.append(prev_state.copy_and_replace(**new_fields)) if updates: - federation_presence_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc(len(updates)) + federation_presence_counter.add( + len(updates), {SERVER_NAME_LABEL: self.server_name} + ) await self._update_states(updates) async def set_state( @@ -1714,21 +1698,28 @@ def should_notify( return False if old_state.status_msg != new_state.status_msg: - notify_reason_counter.labels( - locality=user_location, - reason="status_msg_change", - **{SERVER_NAME_LABEL: our_server_name}, - ).inc() + notify_reason_counter.add( + 1, + { + "locality": user_location, + "reason": "status_msg_change", + SERVER_NAME_LABEL: our_server_name, + }, + ) return True if old_state.state != new_state.state: - notify_reason_counter.labels( - locality=user_location, - reason="state_change", - **{SERVER_NAME_LABEL: our_server_name}, - ).inc() - state_transition_counter.labels( - **{ + notify_reason_counter.add( + 1, + { + "locality": user_location, + "reason": "state_change", + SERVER_NAME_LABEL: our_server_name, + }, + ) + state_transition_counter.add( + 1, + { "locality": user_location, # `from` is a reserved word in Python so we have to label it this way if # we want to use keyword args. @@ -1736,16 +1727,19 @@ def should_notify( "to": new_state.state, SERVER_NAME_LABEL: our_server_name, }, - ).inc() + ) return True if old_state.state == PresenceState.ONLINE: if new_state.currently_active != old_state.currently_active: - notify_reason_counter.labels( - locality=user_location, - reason="current_active_change", - **{SERVER_NAME_LABEL: our_server_name}, - ).inc() + notify_reason_counter.add( + 1, + { + "locality": user_location, + "reason": "current_active_change", + SERVER_NAME_LABEL: our_server_name, + }, + ) return True if ( @@ -1754,20 +1748,26 @@ def should_notify( ): # Only notify about last active bumps if we're not currently active if not new_state.currently_active: - notify_reason_counter.labels( - locality=user_location, - reason="last_active_change_online", - **{SERVER_NAME_LABEL: our_server_name}, - ).inc() + notify_reason_counter.add( + 1, + { + "locality": user_location, + "reason": "last_active_change_online", + SERVER_NAME_LABEL: our_server_name, + }, + ) return True elif new_state.last_active_ts - old_state.last_active_ts > LAST_ACTIVE_GRANULARITY: # Always notify for a transition where last active gets bumped. - notify_reason_counter.labels( - locality=user_location, - reason="last_active_change_not_online", - **{SERVER_NAME_LABEL: our_server_name}, - ).inc() + notify_reason_counter.add( + 1, + { + "locality": user_location, + "reason": "last_active_change_not_online", + SERVER_NAME_LABEL: our_server_name, + }, + ) return True return False @@ -1945,10 +1945,9 @@ async def get_new_events( # If we have the full list of changes for presence we can # simply check which ones share a room with the user. - get_updates_counter.labels( - type="stream", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + get_updates_counter.add( + 1, {"type": "stream", SERVER_NAME_LABEL: self.server_name} + ) sharing_users = await self.store.do_users_share_a_room( user_id, updated_users @@ -1961,10 +1960,9 @@ async def get_new_events( else: # Too many possible updates. Find all users we can see and check # if any of them have changed. - get_updates_counter.labels( - type="full", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + get_updates_counter.add( + 1, {"type": "full", SERVER_NAME_LABEL: self.server_name} + ) users_interested_in = ( await self.store.get_users_who_share_room_with_user(user_id) diff --git a/synapse/handlers/register.py b/synapse/handlers/register.py index c3ff0cfaf8..7caab6fcac 100644 --- a/synapse/handlers/register.py +++ b/synapse/handlers/register.py @@ -32,8 +32,6 @@ TypedDict, ) -from prometheus_client import Counter - from synapse import types from synapse.api.constants import ( MAX_USERID_LENGTH, @@ -52,7 +50,7 @@ from synapse.appservice import ApplicationService from synapse.config.server import is_threepid_reserved from synapse.http.servlet import assert_params_in_dict -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.replication.http.login import RegisterDeviceReplicationServlet from synapse.replication.http.register import ( ReplicationPostRegisterActionsServlet, @@ -67,16 +65,14 @@ logger = logging.getLogger(__name__) -registration_counter = Counter( +registration_counter = meter.create_counter( "synapse_user_registrations_total", - "Number of new users registered (since restart)", - labelnames=["guest", "shadow_banned", "auth_provider", SERVER_NAME_LABEL], + description="Number of new users registered (since restart)", ) -login_counter = Counter( +login_counter = meter.create_counter( "synapse_user_logins_total", - "Number of user logins (since restart)", - labelnames=["guest", "auth_provider", SERVER_NAME_LABEL], + description="Number of new users logged in (since restart)", ) @@ -90,19 +86,7 @@ def init_counters_for_auth_provider(auth_provider_id: str, server_name: str) -> auth_provider_id: The ID of the auth provider to initialise counters for. server_name: Our server name (used to label metrics) (this should be `hs.hostname`). """ - for is_guest in (True, False): - login_counter.labels( - guest=is_guest, - auth_provider=auth_provider_id, - **{SERVER_NAME_LABEL: server_name}, - ) - for shadow_banned in (True, False): - registration_counter.labels( - guest=is_guest, - shadow_banned=shadow_banned, - auth_provider=auth_provider_id, - **{SERVER_NAME_LABEL: server_name}, - ) + # OTEL counters don't need pre-initialization class LoginDict(TypedDict): @@ -377,12 +361,15 @@ async def register_user( # if user id is taken, just generate another fail_count += 1 - registration_counter.labels( - guest=make_guest, - shadow_banned=shadow_banned, - auth_provider=(auth_provider_id or ""), - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + registration_counter.add( + 1, + { + "guest": str(make_guest), + "shadow_banned": str(shadow_banned), + "auth_provider": (auth_provider_id or ""), + SERVER_NAME_LABEL: self.server_name, + }, + ) # If the user does not need to consent at registration, auto-join any # configured rooms. @@ -807,11 +794,14 @@ async def register_device( auth_provider_session_id=auth_provider_session_id, ) - login_counter.labels( - guest=is_guest, - auth_provider=(auth_provider_id or ""), - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + login_counter.add( + 1, + { + "guest": str(is_guest), + "auth_provider": (auth_provider_id or ""), + SERVER_NAME_LABEL: self.server_name, + }, + ) return ( res["device_id"], diff --git a/synapse/handlers/sync.py b/synapse/handlers/sync.py index 2a6652b585..0dbf40860a 100644 --- a/synapse/handlers/sync.py +++ b/synapse/handlers/sync.py @@ -35,7 +35,6 @@ ) import attr -from prometheus_client import Counter from synapse.api.constants import ( AccountDataTypes, @@ -59,7 +58,7 @@ start_active_span, trace, ) -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.storage.databases.main.event_push_actions import RoomNotifCounts from synapse.storage.databases.main.roommember import extract_heroes_from_room_summary from synapse.storage.databases.main.stream import PaginateFunction @@ -96,12 +95,11 @@ # "initial_sync", "full_state_sync" or "incremental_sync", `lazy_loaded` is # "true" or "false" depending on if the request asked for lazy loaded members or # not. -non_empty_sync_counter = Counter( +non_empty_sync_counter = meter.create_counter( "synapse_handlers_sync_nonempty_total", - "Count of non empty sync responses. type is initial_sync/full_state_sync" + description="Count of non empty sync responses. type is initial_sync/full_state_sync" "/incremental_sync. lazy_loaded indicates if lazy loaded members were " "enabled for that request.", - labelnames=["type", "lazy_loaded", SERVER_NAME_LABEL], ) # Store the cache that tracks which lazy-loaded members have been sent to a given @@ -496,11 +494,14 @@ async def current_sync_callback( lazy_loaded = "true" else: lazy_loaded = "false" - non_empty_sync_counter.labels( - type=sync_label, - lazy_loaded=lazy_loaded, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + non_empty_sync_counter.add( + 1, + { + "type": sync_label, + "lazy_loaded": lazy_loaded, + SERVER_NAME_LABEL: self.server_name, + }, + ) return result diff --git a/synapse/http/client.py b/synapse/http/client.py index 370cdc3568..68937ccda2 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -40,7 +40,6 @@ import treq from canonicaljson import encode_canonical_json from netaddr import AddrFormatError, IPAddress, IPSet -from prometheus_client import Counter from zope.interface import implementer from OpenSSL import SSL @@ -84,7 +83,7 @@ from synapse.http.types import QueryParams from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.logging.opentracing import set_tag, start_active_span, tags -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.types import ISynapseReactor, StrSequence from synapse.util.async_helpers import timeout_deferred from synapse.util.clock import Clock @@ -109,14 +108,8 @@ logger = logging.getLogger(__name__) -outgoing_requests_counter = Counter( - "synapse_http_client_requests", "", labelnames=["method", SERVER_NAME_LABEL] -) -incoming_responses_counter = Counter( - "synapse_http_client_responses", - "", - labelnames=["method", "code", SERVER_NAME_LABEL], -) +outgoing_requests_counter = meter.create_counter("synapse_http_client_requests") +incoming_responses_counter = meter.create_counter("synapse_http_client_responses") # the type of the headers map, to be passed to the t.w.h.Headers. # @@ -391,9 +384,9 @@ async def request( RequestTimedOutError if the request times out before the headers are read """ - outgoing_requests_counter.labels( - method=method, **{SERVER_NAME_LABEL: self.server_name} - ).inc() + outgoing_requests_counter.add( + 1, {"method": method, SERVER_NAME_LABEL: self.server_name} + ) # log request but strip `access_token` (AS requests for example include this) logger.debug("Sending request %s %s", method, redact_uri(uri)) @@ -447,11 +440,14 @@ async def request( response = await make_deferred_yieldable(request_deferred) - incoming_responses_counter.labels( - method=method, - code=response.code, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + incoming_responses_counter.add( + 1, + { + "method": method, + "code": str(response.code), + SERVER_NAME_LABEL: self.server_name, + }, + ) logger.info( "Received response to %s %s: %s", method, @@ -460,11 +456,14 @@ async def request( ) return response except Exception as e: - incoming_responses_counter.labels( - method=method, - code="ERR", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + incoming_responses_counter.add( + 1, + { + "method": method, + "code": "ERR", + SERVER_NAME_LABEL: self.server_name, + }, + ) logger.info( "Error sending request to %s %s: %s %s", method, @@ -913,9 +912,9 @@ async def request( RequestTimedOutError if the request times out before the headers are read """ - outgoing_requests_counter.labels( - method=method, **{SERVER_NAME_LABEL: self.server_name} - ).inc() + outgoing_requests_counter.add( + 1, {"method": method, SERVER_NAME_LABEL: self.server_name} + ) logger.debug("Sending request %s %s", method, uri) @@ -972,11 +971,14 @@ async def request( response = await make_deferred_yieldable(request_deferred) - incoming_responses_counter.labels( - method=method, - code=response.code, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + incoming_responses_counter.add( + 1, + { + "method": method, + "code": str(response.code), + SERVER_NAME_LABEL: self.server_name, + }, + ) logger.info( "Received response to %s %s: %s", method, @@ -985,11 +987,14 @@ async def request( ) return response except Exception as e: - incoming_responses_counter.labels( - method=method, - code="ERR", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + incoming_responses_counter.add( + 1, + { + "method": method, + "code": "ERR", + SERVER_NAME_LABEL: self.server_name, + }, + ) logger.info( "Error sending request to %s %s: %s %s", method, diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py index 4d72c72d01..2558ac7d2a 100644 --- a/synapse/http/matrixfederationclient.py +++ b/synapse/http/matrixfederationclient.py @@ -47,7 +47,6 @@ import attr import treq from canonicaljson import encode_canonical_json -from prometheus_client import Counter from signedjson.sign import sign_json from twisted.internet import defer @@ -87,7 +86,7 @@ from synapse.logging import opentracing from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.logging.opentracing import set_tag, start_active_span, tags -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.types import JsonDict from synapse.util.async_helpers import AwakenableSleeper, Linearizer, timeout_deferred from synapse.util.clock import Clock @@ -100,15 +99,11 @@ logger = logging.getLogger(__name__) -outgoing_requests_counter = Counter( - "synapse_http_matrixfederationclient_requests", - "", - labelnames=["method", SERVER_NAME_LABEL], +outgoing_requests_counter = meter.create_counter( + "synapse_http_matrixfederationclient_requests" ) -incoming_responses_counter = Counter( - "synapse_http_matrixfederationclient_responses", - "", - labelnames=["method", "code", SERVER_NAME_LABEL], +incoming_responses_counter = meter.create_counter( + "synapse_http_matrixfederationclient_responses" ) @@ -716,9 +711,10 @@ async def _send_request( _sec_timeout, ) - outgoing_requests_counter.labels( - method=request.method, **{SERVER_NAME_LABEL: self.server_name} - ).inc() + outgoing_requests_counter.add( + 1, + {"method": request.method, SERVER_NAME_LABEL: self.server_name}, + ) try: with Measure( @@ -756,11 +752,14 @@ async def _send_request( except Exception as e: raise RequestSendFailed(e, can_retry=True) from e - incoming_responses_counter.labels( - method=request.method, - code=response.code, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + incoming_responses_counter.add( + 1, + { + "method": request.method, + "code": str(response.code), + SERVER_NAME_LABEL: self.server_name, + }, + ) set_tag(tags.HTTP_STATUS_CODE, response.code) response_phrase = response.phrase.decode("ascii", errors="replace") diff --git a/synapse/http/request_metrics.py b/synapse/http/request_metrics.py index 83f52edb7c..c882aa9edc 100644 --- a/synapse/http/request_metrics.py +++ b/synapse/http/request_metrics.py @@ -24,31 +24,25 @@ import traceback from typing import Dict, Mapping, Set, Tuple -from prometheus_client.core import Counter, Histogram +from prometheus_client.core import Histogram from synapse.logging.context import current_context -from synapse.metrics import SERVER_NAME_LABEL, LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge, meter logger = logging.getLogger(__name__) # total number of responses served, split by method/servlet/tag -response_count = Counter( +response_count = meter.create_counter( "synapse_http_server_response_count", - "", - labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) -requests_counter = Counter( +requests_counter = meter.create_counter( "synapse_http_server_requests_received", - "", - labelnames=["method", "servlet", SERVER_NAME_LABEL], ) -outgoing_responses_counter = Counter( +outgoing_responses_counter = meter.create_counter( "synapse_http_server_responses", - "", - labelnames=["method", "code", SERVER_NAME_LABEL], ) response_timer = Histogram( @@ -57,80 +51,60 @@ labelnames=["method", "servlet", "tag", "code", SERVER_NAME_LABEL], ) -response_ru_utime = Counter( +response_ru_utime = meter.create_counter( "synapse_http_server_response_ru_utime_seconds", - "sec", - labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], + description="sec", ) -response_ru_stime = Counter( +response_ru_stime = meter.create_counter( "synapse_http_server_response_ru_stime_seconds", - "sec", - labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], + description="sec", ) -response_db_txn_count = Counter( +response_db_txn_count = meter.create_counter( "synapse_http_server_response_db_txn_count", - "", - labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) # seconds spent waiting for db txns, excluding scheduling time, when processing # this request -response_db_txn_duration = Counter( +response_db_txn_duration = meter.create_counter( "synapse_http_server_response_db_txn_duration_seconds", - "", - labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) # seconds spent waiting for a db connection, when processing this request -response_db_sched_duration = Counter( +response_db_sched_duration = meter.create_counter( "synapse_http_server_response_db_sched_duration_seconds", - "", - labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) # size in bytes of the response written -response_size = Counter( +response_size = meter.create_counter( "synapse_http_server_response_size", - "", - labelnames=["method", "servlet", "tag", SERVER_NAME_LABEL], ) # In flight metrics are incremented while the requests are in flight, rather # than when the response was written. -in_flight_requests_ru_utime = Counter( +in_flight_requests_ru_utime = meter.create_counter( "synapse_http_server_in_flight_requests_ru_utime_seconds", - "", - labelnames=["method", "servlet", SERVER_NAME_LABEL], ) -in_flight_requests_ru_stime = Counter( +in_flight_requests_ru_stime = meter.create_counter( "synapse_http_server_in_flight_requests_ru_stime_seconds", - "", - labelnames=["method", "servlet", SERVER_NAME_LABEL], ) -in_flight_requests_db_txn_count = Counter( +in_flight_requests_db_txn_count = meter.create_counter( "synapse_http_server_in_flight_requests_db_txn_count", - "", - labelnames=["method", "servlet", SERVER_NAME_LABEL], ) # seconds spent waiting for db txns, excluding scheduling time, when processing # this request -in_flight_requests_db_txn_duration = Counter( +in_flight_requests_db_txn_duration = meter.create_counter( "synapse_http_server_in_flight_requests_db_txn_duration_seconds", - "", - labelnames=["method", "servlet", SERVER_NAME_LABEL], ) # seconds spent waiting for a db connection, when processing this request -in_flight_requests_db_sched_duration = Counter( +in_flight_requests_db_sched_duration = meter.create_counter( "synapse_http_server_in_flight_requests_db_sched_duration_seconds", - "", - labelnames=["method", "servlet", SERVER_NAME_LABEL], ) _in_flight_requests: Set["RequestMetrics"] = set() @@ -227,11 +201,14 @@ def stop(self, time_sec: float, response_code: int, sent_bytes: int) -> None: response_code_str = str(response_code) - outgoing_responses_counter.labels( - method=self.method, - code=response_code_str, - **{SERVER_NAME_LABEL: self.our_server_name}, - ).inc() + outgoing_responses_counter.add( + 1, + { + "method": self.method, + "code": response_code_str, + SERVER_NAME_LABEL: self.our_server_name, + }, + ) response_base_labels = { "method": self.method, @@ -240,7 +217,7 @@ def stop(self, time_sec: float, response_code: int, sent_bytes: int) -> None: SERVER_NAME_LABEL: self.our_server_name, } - response_count.labels(**response_base_labels).inc() + response_count.add(1, response_base_labels) response_timer.labels( code=response_code_str, @@ -249,18 +226,16 @@ def stop(self, time_sec: float, response_code: int, sent_bytes: int) -> None: resource_usage = context.get_resource_usage() - response_ru_utime.labels(**response_base_labels).inc(resource_usage.ru_utime) - response_ru_stime.labels(**response_base_labels).inc(resource_usage.ru_stime) - response_db_txn_count.labels(**response_base_labels).inc( - resource_usage.db_txn_count - ) - response_db_txn_duration.labels(**response_base_labels).inc( - resource_usage.db_txn_duration_sec + response_ru_utime.add(resource_usage.ru_utime, response_base_labels) + response_ru_stime.add(resource_usage.ru_stime, response_base_labels) + response_db_txn_count.add(resource_usage.db_txn_count, response_base_labels) + response_db_txn_duration.add( + resource_usage.db_txn_duration_sec, response_base_labels ) - response_db_sched_duration.labels(**response_base_labels).inc( - resource_usage.db_sched_duration_sec + response_db_sched_duration.add( + resource_usage.db_sched_duration_sec, response_base_labels ) - response_size.labels(**response_base_labels).inc(sent_bytes) + response_size.add(sent_bytes, response_base_labels) # We always call this at the end to ensure that we update the metrics # regardless of whether a call to /metrics while the request was in @@ -289,21 +264,15 @@ def update_metrics(self) -> None: # max() is used since rapid use of ru_stime/ru_utime can end up with the # count going backwards due to NTP, time smearing, fine-grained # correction, or floating points. Who knows, really? - in_flight_requests_ru_utime.labels(**in_flight_labels).inc( - max(diff.ru_utime, 0) - ) - in_flight_requests_ru_stime.labels(**in_flight_labels).inc( - max(diff.ru_stime, 0) - ) + in_flight_requests_ru_utime.add(max(diff.ru_utime, 0), in_flight_labels) + in_flight_requests_ru_stime.add(max(diff.ru_stime, 0), in_flight_labels) - in_flight_requests_db_txn_count.labels(**in_flight_labels).inc( - diff.db_txn_count - ) + in_flight_requests_db_txn_count.add(diff.db_txn_count, in_flight_labels) - in_flight_requests_db_txn_duration.labels(**in_flight_labels).inc( - diff.db_txn_duration_sec + in_flight_requests_db_txn_duration.add( + diff.db_txn_duration_sec, in_flight_labels ) - in_flight_requests_db_sched_duration.labels(**in_flight_labels).inc( - diff.db_sched_duration_sec + in_flight_requests_db_sched_duration.add( + diff.db_sched_duration_sec, in_flight_labels ) diff --git a/synapse/http/site.py b/synapse/http/site.py index cf31b64d80..2d844645f3 100644 --- a/synapse/http/site.py +++ b/synapse/http/site.py @@ -343,11 +343,14 @@ def render(self, resrc: Resource) -> None: # dispatching to the handler, so that the handler # can update the servlet name in the request # metrics - requests_counter.labels( - method=self.get_method(), - servlet=self.request_metrics.name, - **{SERVER_NAME_LABEL: self.our_server_name}, - ).inc() + requests_counter.add( + 1, + { + "method": self.get_method(), + "servlet": self.request_metrics.name, + SERVER_NAME_LABEL: self.our_server_name, + }, + ) @contextlib.contextmanager def processing(self) -> Generator[None, None, None]: diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py index 05e84038ac..1ba1ed71a1 100644 --- a/synapse/metrics/background_process_metrics.py +++ b/synapse/metrics/background_process_metrics.py @@ -41,7 +41,7 @@ ) from prometheus_client import Metric -from prometheus_client.core import REGISTRY, Counter, Gauge +from prometheus_client.core import REGISTRY, Gauge from typing_extensions import Concatenate, ParamSpec from twisted.internet import defer @@ -57,7 +57,7 @@ start_active_span, start_active_span_follows_from, ) -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.metrics._types import Collector if TYPE_CHECKING: @@ -77,10 +77,9 @@ logger = logging.getLogger(__name__) -_background_process_start_count = Counter( +_background_process_start_count = meter.create_counter( "synapse_background_process_start_count", - "Number of background processes started", - labelnames=["name", SERVER_NAME_LABEL], + description="Number of background processes started", ) _background_process_in_flight_count = Gauge( @@ -93,42 +92,32 @@ # the default registry. Instead we collect them all via the CustomCollector, # which ensures that we can update them before they are collected. # -_background_process_ru_utime = Counter( +_background_process_ru_utime = meter.create_counter( "synapse_background_process_ru_utime_seconds", - "User CPU time used by background processes, in seconds", - labelnames=["name", SERVER_NAME_LABEL], - registry=None, + description="User CPU time used by background processes, in seconds", ) -_background_process_ru_stime = Counter( +_background_process_ru_stime = meter.create_counter( "synapse_background_process_ru_stime_seconds", - "System CPU time used by background processes, in seconds", - labelnames=["name", SERVER_NAME_LABEL], - registry=None, + description="System CPU time used by background processes, in seconds", ) -_background_process_db_txn_count = Counter( +_background_process_db_txn_count = meter.create_counter( "synapse_background_process_db_txn_count", - "Number of database transactions done by background processes", - labelnames=["name", SERVER_NAME_LABEL], - registry=None, + description="Number of database transactions done by background processes", ) -_background_process_db_txn_duration = Counter( +_background_process_db_txn_duration = meter.create_counter( "synapse_background_process_db_txn_duration_seconds", - ( + description=( "Seconds spent by background processes waiting for database " "transactions, excluding scheduling time" ), - labelnames=["name", SERVER_NAME_LABEL], - registry=None, ) -_background_process_db_sched_duration = Counter( +_background_process_db_sched_duration = meter.create_counter( "synapse_background_process_db_sched_duration_seconds", - "Seconds spent by background processes waiting for database connections", - labelnames=["name", SERVER_NAME_LABEL], - registry=None, + description="Seconds spent by background processes waiting for database connections", ) # map from description to a counter, so that we can name our logcontexts @@ -169,16 +158,7 @@ def collect(self) -> Iterable[Metric]: for process in _background_processes_copy: process.update_metrics() - # now we need to run collect() over each of the static Counters, and - # yield each metric they return. - for m in ( - _background_process_ru_utime, - _background_process_ru_stime, - _background_process_db_txn_count, - _background_process_db_txn_duration, - _background_process_db_sched_duration, - ): - yield from m.collect() + return [] # The `SERVER_NAME_LABEL` is included in the individual metrics added to this registry, @@ -204,21 +184,25 @@ def update_metrics(self) -> None: # For unknown reasons, the difference in times can be negative. See comment in # synapse.http.request_metrics.RequestMetrics.update_metrics. - _background_process_ru_utime.labels( - name=self.desc, **{SERVER_NAME_LABEL: self.server_name} - ).inc(max(diff.ru_utime, 0)) - _background_process_ru_stime.labels( - name=self.desc, **{SERVER_NAME_LABEL: self.server_name} - ).inc(max(diff.ru_stime, 0)) - _background_process_db_txn_count.labels( - name=self.desc, **{SERVER_NAME_LABEL: self.server_name} - ).inc(diff.db_txn_count) - _background_process_db_txn_duration.labels( - name=self.desc, **{SERVER_NAME_LABEL: self.server_name} - ).inc(diff.db_txn_duration_sec) - _background_process_db_sched_duration.labels( - name=self.desc, **{SERVER_NAME_LABEL: self.server_name} - ).inc(diff.db_sched_duration_sec) + _background_process_ru_utime.add( + max(diff.ru_utime, 0), + {"name": self.desc, SERVER_NAME_LABEL: self.server_name}, + ) + _background_process_ru_stime.add( + max(diff.ru_stime, 0), + {"name": self.desc, SERVER_NAME_LABEL: self.server_name}, + ) + _background_process_db_txn_count.add( + diff.db_txn_count, {"name": self.desc, SERVER_NAME_LABEL: self.server_name} + ) + _background_process_db_txn_duration.add( + diff.db_txn_duration_sec, + {"name": self.desc, SERVER_NAME_LABEL: self.server_name}, + ) + _background_process_db_sched_duration.add( + diff.db_sched_duration_sec, + {"name": self.desc, SERVER_NAME_LABEL: self.server_name}, + ) R = TypeVar("R") @@ -278,9 +262,9 @@ async def run() -> Optional[R]: count = _background_process_counts.get(desc, 0) _background_process_counts[desc] = count + 1 - _background_process_start_count.labels( - name=desc, **{SERVER_NAME_LABEL: server_name} - ).inc() + _background_process_start_count.add( + 1, {"name": desc, SERVER_NAME_LABEL: server_name} + ) _background_process_in_flight_count.labels( name=desc, **{SERVER_NAME_LABEL: server_name} ).inc() diff --git a/synapse/notifier.py b/synapse/notifier.py index 9169f50c4d..e39b38ce05 100644 --- a/synapse/notifier.py +++ b/synapse/notifier.py @@ -39,7 +39,6 @@ ) import attr -from prometheus_client import Counter from twisted.internet import defer from twisted.internet.defer import Deferred @@ -51,7 +50,7 @@ from synapse.logging import issue9533_logger from synapse.logging.context import PreserveLoggingContext from synapse.logging.opentracing import log_kv, start_active_span -from synapse.metrics import SERVER_NAME_LABEL, LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge, meter from synapse.streams.config import PaginationConfig from synapse.types import ( ISynapseReactor, @@ -76,14 +75,10 @@ logger = logging.getLogger(__name__) # FIXME: Unused metric, remove if not needed. -notified_events_counter = Counter( - "synapse_notifier_notified_events", "", labelnames=[SERVER_NAME_LABEL] -) +notified_events_counter = meter.create_counter("synapse_notifier_notified_events") -users_woken_by_stream_counter = Counter( - "synapse_notifier_users_woken_by_stream", - "", - labelnames=["stream", SERVER_NAME_LABEL], +users_woken_by_stream_counter = meter.create_counter( + "synapse_notifier_users_woken_by_stream" ) @@ -386,10 +381,13 @@ async def on_un_partial_stated_room( for listener in listeners: listener.callback(current_token) - users_woken_by_stream_counter.labels( - stream=StreamKeyType.UN_PARTIAL_STATED_ROOMS, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc(len(user_streams)) + users_woken_by_stream_counter.add( + len(user_streams), + { + "stream": str(StreamKeyType.UN_PARTIAL_STATED_ROOMS), + SERVER_NAME_LABEL: self.server_name, + }, + ) # Poke the replication so that other workers also see the write to # the un-partial-stated rooms stream. @@ -613,10 +611,10 @@ def on_new_event( listener.callback(current_token) if user_streams: - users_woken_by_stream_counter.labels( - stream=stream_key, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc(len(user_streams)) + users_woken_by_stream_counter.add( + len(user_streams), + {"stream": str(stream_key), SERVER_NAME_LABEL: self.server_name}, + ) self.notify_replication() diff --git a/synapse/push/bulk_push_rule_evaluator.py b/synapse/push/bulk_push_rule_evaluator.py index ea9169aef0..4a36e1b32b 100644 --- a/synapse/push/bulk_push_rule_evaluator.py +++ b/synapse/push/bulk_push_rule_evaluator.py @@ -35,8 +35,6 @@ cast, ) -from prometheus_client import Counter - from twisted.internet.defer import Deferred from synapse.api.constants import ( @@ -51,7 +49,7 @@ from synapse.events import EventBase, relation_from_event from synapse.events.snapshot import EventContext, EventPersistencePair from synapse.logging.context import make_deferred_yieldable, run_in_background -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import meter from synapse.state import CREATE_KEY, POWER_KEY from synapse.storage.databases.main.roommember import EventIdMembership from synapse.storage.invite_rule import InviteRule @@ -71,16 +69,12 @@ logger = logging.getLogger(__name__) # FIXME: Unused metric, remove if not needed. -push_rules_invalidation_counter = Counter( - "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter", - "", - labelnames=[SERVER_NAME_LABEL], +push_rules_invalidation_counter = meter.create_counter( + "synapse_push_bulk_push_rule_evaluator_push_rules_invalidation_counter" ) # FIXME: Unused metric, remove if not needed. -push_rules_state_size_counter = Counter( - "synapse_push_bulk_push_rule_evaluator_push_rules_state_size_counter", - "", - labelnames=[SERVER_NAME_LABEL], +push_rules_state_size_counter = meter.create_counter( + "synapse_push_bulk_push_rule_evaluator_push_rules_state_size_counter" ) diff --git a/synapse/push/httppusher.py b/synapse/push/httppusher.py index 5cac5de8cb..bd784a9612 100644 --- a/synapse/push/httppusher.py +++ b/synapse/push/httppusher.py @@ -23,15 +23,13 @@ import urllib.parse from typing import TYPE_CHECKING, Dict, List, Optional, Union -from prometheus_client import Counter - from twisted.internet.error import AlreadyCalled, AlreadyCancelled from twisted.internet.interfaces import IDelayedCall from synapse.api.constants import EventTypes from synapse.events import EventBase from synapse.logging import opentracing -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.push import Pusher, PusherConfig, PusherConfigException from synapse.storage.databases.main.event_push_actions import HttpPushAction from synapse.types import JsonDict, JsonMapping @@ -43,28 +41,24 @@ logger = logging.getLogger(__name__) -http_push_processed_counter = Counter( +http_push_processed_counter = meter.create_counter( "synapse_http_httppusher_http_pushes_processed", - "Number of push notifications successfully sent", - labelnames=[SERVER_NAME_LABEL], + description="Number of push notifications successfully sent", ) -http_push_failed_counter = Counter( +http_push_failed_counter = meter.create_counter( "synapse_http_httppusher_http_pushes_failed", - "Number of push notifications which failed", - labelnames=[SERVER_NAME_LABEL], + description="Number of push notifications which failed", ) -http_badges_processed_counter = Counter( +http_badges_processed_counter = meter.create_counter( "synapse_http_httppusher_badge_updates_processed", - "Number of badge updates successfully sent", - labelnames=[SERVER_NAME_LABEL], + description="Number of badge updates successfully sent", ) -http_badges_failed_counter = Counter( +http_badges_failed_counter = meter.create_counter( "synapse_http_httppusher_badge_updates_failed", - "Number of badge updates which failed", - labelnames=[SERVER_NAME_LABEL], + description="Number of badge updates which failed", ) @@ -272,9 +266,9 @@ async def _unsafe_process(self) -> None: processed = await self._process_one(push_action) if processed: - http_push_processed_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc() + http_push_processed_counter.add( + 1, {SERVER_NAME_LABEL: self.server_name} + ) self.backoff_delay = HttpPusher.INITIAL_BACKOFF_SEC self.last_stream_ordering = push_action.stream_ordering pusher_still_exists = ( @@ -298,9 +292,7 @@ async def _unsafe_process(self) -> None: self.app_id, self.pushkey, self.user_id, self.failing_since ) else: - http_push_failed_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc() + http_push_failed_counter.add(1, {SERVER_NAME_LABEL: self.server_name}) if not self.failing_since: self.failing_since = self.clock.time_msec() await self.store.update_pusher_failing_since( @@ -552,13 +544,9 @@ async def _send_badge(self, badge: int) -> None: } try: await self.http_client.post_json_get_json(self.url, d) - http_badges_processed_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc() + http_badges_processed_counter.add(1, {SERVER_NAME_LABEL: self.server_name}) except Exception as e: logger.warning( "Failed to send badge count to %s: %s %s", self.name, type(e), e ) - http_badges_failed_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc() + http_badges_failed_counter.add(1, {SERVER_NAME_LABEL: self.server_name}) diff --git a/synapse/push/mailer.py b/synapse/push/mailer.py index d76cc8237b..e6aaf503f2 100644 --- a/synapse/push/mailer.py +++ b/synapse/push/mailer.py @@ -26,13 +26,12 @@ import bleach import jinja2 from markupsafe import Markup -from prometheus_client import Counter from synapse.api.constants import EventContentFields, EventTypes, Membership, RoomTypes from synapse.api.errors import StoreError from synapse.config.emailconfig import EmailSubjectConfig from synapse.events import EventBase -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.push.presentable_names import ( calculate_room_name, descriptor_from_member_events, @@ -58,10 +57,8 @@ T = TypeVar("T") -emails_sent_counter = Counter( - "synapse_emails_sent_total", - "Emails sent by type", - labelnames=["type", SERVER_NAME_LABEL], +emails_sent_counter = meter.create_counter( + "synapse_emails_sent_total", description="Emails sent by type" ) @@ -162,10 +159,9 @@ async def send_password_reset_mail( template_vars: TemplateVars = {"link": link} - emails_sent_counter.labels( - type="password_reset", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + emails_sent_counter.add( + 1, {"type": "password_reset", SERVER_NAME_LABEL: self.server_name} + ) await self.send_email( email_address, @@ -197,10 +193,9 @@ async def send_registration_mail( template_vars: TemplateVars = {"link": link} - emails_sent_counter.labels( - type="registration", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + emails_sent_counter.add( + 1, {"type": "registration", SERVER_NAME_LABEL: self.server_name} + ) await self.send_email( email_address, @@ -216,10 +211,9 @@ async def send_already_in_use_mail(self, email_address: str) -> None: email_address: Email address we're sending to the "already in use" mail """ - emails_sent_counter.labels( - type="already_in_use", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + emails_sent_counter.add( + 1, {"type": "already_in_use", SERVER_NAME_LABEL: self.server_name} + ) await self.send_email( email_address, @@ -252,10 +246,9 @@ async def send_add_threepid_mail( template_vars: TemplateVars = {"link": link} - emails_sent_counter.labels( - type="add_threepid", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + emails_sent_counter.add( + 1, {"type": "add_threepid", SERVER_NAME_LABEL: self.server_name} + ) await self.send_email( email_address, @@ -358,10 +351,9 @@ async def _fetch_room_state(room_id: str) -> None: "reason": reason, } - emails_sent_counter.labels( - type="notification", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + emails_sent_counter.add( + 1, {"type": "notification", SERVER_NAME_LABEL: self.server_name} + ) await self.send_email( email_address, summary_text, template_vars, unsubscribe_link diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py index 0850a99e0c..9c3e763ab3 100644 --- a/synapse/replication/http/_base.py +++ b/synapse/replication/http/_base.py @@ -25,7 +25,7 @@ from inspect import signature from typing import TYPE_CHECKING, Any, Awaitable, Callable, ClassVar, Dict, List, Tuple -from prometheus_client import Counter, Gauge +from prometheus_client import Gauge from twisted.internet.error import ConnectError, DNSLookupError from twisted.web.server import Request @@ -38,7 +38,7 @@ from synapse.http.site import SynapseRequest from synapse.logging import opentracing from synapse.logging.opentracing import trace_with_opname -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.types import JsonDict from synapse.util.caches.response_cache import ResponseCache from synapse.util.cancellation import is_function_cancellable @@ -55,10 +55,9 @@ labelnames=["name", SERVER_NAME_LABEL], ) -_outgoing_request_counter = Counter( +_outgoing_request_counter = meter.create_counter( "synapse_outgoing_replication_requests", - "Number of outgoing replication requests, by replication method name and result", - labelnames=["name", "code", SERVER_NAME_LABEL], + description="Number of outgoing replication requests, by replication method name and result", ) @@ -338,27 +337,36 @@ async def send_request( # We convert to SynapseError as we know that it was a SynapseError # on the main process that we should send to the client. (And # importantly, not stack traces everywhere) - _outgoing_request_counter.labels( - name=cls.NAME, - code=e.code, - **{SERVER_NAME_LABEL: server_name}, - ).inc() + _outgoing_request_counter.add( + 1, + { + "name": cls.NAME, + "code": e.code, + SERVER_NAME_LABEL: server_name, + }, + ) raise e.to_synapse_error() except Exception as e: - _outgoing_request_counter.labels( - name=cls.NAME, - code="ERR", - **{SERVER_NAME_LABEL: server_name}, - ).inc() + _outgoing_request_counter.add( + 1, + { + "name": cls.NAME, + "code": "ERR", + SERVER_NAME_LABEL: server_name, + }, + ) raise SynapseError( 502, f"Failed to talk to {instance_name} process" ) from e - _outgoing_request_counter.labels( - name=cls.NAME, - code=200, - **{SERVER_NAME_LABEL: server_name}, - ).inc() + _outgoing_request_counter.add( + 1, + { + "name": cls.NAME, + "code": 200, + SERVER_NAME_LABEL: server_name, + }, + ) # Wait on any streams that the remote may have written to. for stream_name, position in result.pop( diff --git a/synapse/replication/tcp/external_cache.py b/synapse/replication/tcp/external_cache.py index bcdd55d2e6..dcd10b15f2 100644 --- a/synapse/replication/tcp/external_cache.py +++ b/synapse/replication/tcp/external_cache.py @@ -22,11 +22,11 @@ import logging from typing import TYPE_CHECKING, Any, Optional -from prometheus_client import Counter, Histogram +from prometheus_client import Histogram from synapse.logging import opentracing from synapse.logging.context import make_deferred_yieldable -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.util.json import json_decoder, json_encoder if TYPE_CHECKING: @@ -34,16 +34,14 @@ from synapse.server import HomeServer -set_counter = Counter( +set_counter = meter.create_counter( "synapse_external_cache_set", - "Number of times we set a cache", - labelnames=["cache_name", SERVER_NAME_LABEL], + description="Number of times we set a cache", ) -get_counter = Counter( +get_counter = meter.create_counter( "synapse_external_cache_get", - "Number of times we get a cache", - labelnames=["cache_name", "hit", SERVER_NAME_LABEL], + description="Number of times we get a cache", ) response_timer = Histogram( @@ -96,9 +94,9 @@ async def set(self, cache_name: str, key: str, value: Any, expiry_ms: int) -> No if self._redis_connection is None: return - set_counter.labels( - cache_name=cache_name, **{SERVER_NAME_LABEL: self.server_name} - ).inc() + set_counter.add( + 1, {"cache_name": cache_name, SERVER_NAME_LABEL: self.server_name} + ) # txredisapi requires the value to be string, bytes or numbers, so we # encode stuff in JSON. @@ -140,11 +138,14 @@ async def get(self, cache_name: str, key: str) -> Optional[Any]: logger.debug("Got cache result %s %s: %r", cache_name, key, result) - get_counter.labels( - cache_name=cache_name, - hit=result is not None, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + get_counter.add( + 1, + { + "cache_name": cache_name, + "hit": result is not None, + SERVER_NAME_LABEL: self.server_name, + }, + ) if not result: return None diff --git a/synapse/replication/tcp/handler.py b/synapse/replication/tcp/handler.py index 4d0d3d44ab..d78f9c1bbb 100644 --- a/synapse/replication/tcp/handler.py +++ b/synapse/replication/tcp/handler.py @@ -36,11 +36,9 @@ Union, ) -from prometheus_client import Counter - from twisted.internet.protocol import ReconnectingClientFactory -from synapse.metrics import SERVER_NAME_LABEL, LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge, meter from synapse.replication.tcp.commands import ( ClearUserSyncsCommand, Command, @@ -83,26 +81,20 @@ # number of updates received for each RDATA stream -inbound_rdata_count = Counter( +inbound_rdata_count = meter.create_counter( "synapse_replication_tcp_protocol_inbound_rdata_count", - "", - labelnames=["stream_name", SERVER_NAME_LABEL], ) -user_sync_counter = Counter( - "synapse_replication_tcp_resource_user_sync", "", labelnames=[SERVER_NAME_LABEL] -) -federation_ack_counter = Counter( +user_sync_counter = meter.create_counter("synapse_replication_tcp_resource_user_sync") +federation_ack_counter = meter.create_counter( "synapse_replication_tcp_resource_federation_ack", - "", - labelnames=[SERVER_NAME_LABEL], ) # FIXME: Unused metric, remove if not needed. -remove_pusher_counter = Counter( - "synapse_replication_tcp_resource_remove_pusher", "", labelnames=[SERVER_NAME_LABEL] +remove_pusher_counter = meter.create_counter( + "synapse_replication_tcp_resource_remove_pusher" ) -user_ip_cache_counter = Counter( - "synapse_replication_tcp_resource_user_ip_cache", "", labelnames=[SERVER_NAME_LABEL] +user_ip_cache_counter = meter.create_counter( + "synapse_replication_tcp_resource_user_ip_cache" ) tcp_resource_total_connections_gauge = LaterGauge( @@ -480,7 +472,7 @@ def will_announce_positions(self) -> None: def on_USER_SYNC( self, conn: IReplicationConnection, cmd: UserSyncCommand ) -> Optional[Awaitable[None]]: - user_sync_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() + user_sync_counter.add(1, {SERVER_NAME_LABEL: self.server_name}) if self._is_presence_writer: return self._presence_handler.update_external_syncs_row( @@ -504,7 +496,7 @@ def on_CLEAR_USER_SYNC( def on_FEDERATION_ACK( self, conn: IReplicationConnection, cmd: FederationAckCommand ) -> None: - federation_ack_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() + federation_ack_counter.add(1, {SERVER_NAME_LABEL: self.server_name}) if self._federation_sender: self._federation_sender.federation_ack(cmd.instance_name, cmd.token) @@ -512,7 +504,7 @@ def on_FEDERATION_ACK( def on_USER_IP( self, conn: IReplicationConnection, cmd: UserIpCommand ) -> Optional[Awaitable[None]]: - user_ip_cache_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() + user_ip_cache_counter.add(1, {SERVER_NAME_LABEL: self.server_name}) if self._is_master or self._should_insert_client_ips: # We make a point of only returning an awaitable if there's actually diff --git a/synapse/replication/tcp/protocol.py b/synapse/replication/tcp/protocol.py index bcfc65c2c0..004197c4ee 100644 --- a/synapse/replication/tcp/protocol.py +++ b/synapse/replication/tcp/protocol.py @@ -30,7 +30,6 @@ from inspect import isawaitable from typing import TYPE_CHECKING, Any, Collection, List, Optional -from prometheus_client import Counter from zope.interface import Interface, implementer from twisted.internet import task @@ -39,7 +38,7 @@ from twisted.python.failure import Failure from synapse.logging.context import PreserveLoggingContext -from synapse.metrics import SERVER_NAME_LABEL, LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge, meter from synapse.metrics.background_process_metrics import ( BackgroundProcessLoggingContext, ) @@ -62,22 +61,18 @@ from synapse.server import HomeServer -connection_close_counter = Counter( +connection_close_counter = meter.create_counter( "synapse_replication_tcp_protocol_close_reason", - "", - labelnames=["reason_type", SERVER_NAME_LABEL], ) -tcp_inbound_commands_counter = Counter( +tcp_inbound_commands_counter = meter.create_counter( "synapse_replication_tcp_protocol_inbound_commands", - "Number of commands received from replication, by command and name of process connected to", - labelnames=["command", "name", SERVER_NAME_LABEL], + description="Number of commands received from replication, by command and name of process connected to", ) -tcp_outbound_commands_counter = Counter( +tcp_outbound_commands_counter = meter.create_counter( "synapse_replication_tcp_protocol_outbound_commands", - "Number of commands sent to replication, by command and name of process connected to", - labelnames=["command", "name", SERVER_NAME_LABEL], + description="Number of commands sent to replication, by command and name of process connected to", ) # A list of all connected protocols. This allows us to send metrics about the @@ -255,11 +250,14 @@ def _parse_and_dispatch_line(self, line: bytes) -> None: self.last_received_command = self.clock.time_msec() - tcp_inbound_commands_counter.labels( - command=cmd.NAME, - name=self.name, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + tcp_inbound_commands_counter.add( + 1, + { + "command": cmd.NAME, + "name": self.name, + SERVER_NAME_LABEL: self.server_name, + }, + ) self.handle_command(cmd) @@ -334,11 +332,14 @@ def send_command(self, cmd: Command, do_buffer: bool = True) -> None: self._queue_command(cmd) return - tcp_outbound_commands_counter.labels( - command=cmd.NAME, - name=self.name, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + tcp_outbound_commands_counter.add( + 1, + { + "command": cmd.NAME, + "name": self.name, + SERVER_NAME_LABEL: self.server_name, + }, + ) string = "%s %s" % (cmd.NAME, cmd.to_line()) if "\n" in string: @@ -410,15 +411,21 @@ def connectionLost(self, reason: Failure) -> None: # type: ignore[override] logger.info("[%s] Replication connection closed: %r", self.id(), reason) if isinstance(reason, Failure): assert reason.type is not None - connection_close_counter.labels( - reason_type=reason.type.__name__, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + connection_close_counter.add( + 1, + { + "reason_type": reason.type.__name__, + SERVER_NAME_LABEL: self.server_name, + }, + ) else: - connection_close_counter.labels( # type: ignore[unreachable] - reason_type=reason.__class__.__name__, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + connection_close_counter.add( # type: ignore[unreachable] + 1, + { + "reason_type": reason.__class__.__name__, + SERVER_NAME_LABEL: self.server_name, + }, + ) try: # Remove us from list of connections to be monitored diff --git a/synapse/replication/tcp/resource.py b/synapse/replication/tcp/resource.py index ef72a0a532..db2b06d974 100644 --- a/synapse/replication/tcp/resource.py +++ b/synapse/replication/tcp/resource.py @@ -24,12 +24,10 @@ import random from typing import TYPE_CHECKING, List, Optional, Tuple -from prometheus_client import Counter - from twisted.internet.interfaces import IAddress from twisted.internet.protocol import ServerFactory -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.replication.tcp.commands import PositionCommand from synapse.replication.tcp.protocol import ServerReplicationStreamProtocol from synapse.replication.tcp.streams import EventsStream @@ -39,10 +37,8 @@ if TYPE_CHECKING: from synapse.server import HomeServer -stream_updates_counter = Counter( - "synapse_replication_tcp_resource_stream_updates", - "", - labelnames=["stream_name", SERVER_NAME_LABEL], +stream_updates_counter = meter.create_counter( + "synapse_replication_tcp_resource_stream_updates" ) logger = logging.getLogger(__name__) @@ -230,10 +226,13 @@ async def _run_notifier_loop(self) -> None: len(updates), current_token, ) - stream_updates_counter.labels( - stream_name=stream.NAME, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc(len(updates)) + stream_updates_counter.add( + len(updates), + { + "stream_name": stream.NAME, + SERVER_NAME_LABEL: self.server_name, + }, + ) else: # The token has advanced but there is no data to diff --git a/synapse/state/__init__.py b/synapse/state/__init__.py index 394dc72fa6..76c3fe51d6 100644 --- a/synapse/state/__init__.py +++ b/synapse/state/__init__.py @@ -39,7 +39,8 @@ import attr from immutabledict import immutabledict -from prometheus_client import Counter, Histogram +from opentelemetry.metrics import Counter +from prometheus_client import Histogram from synapse.api.constants import EventTypes from synapse.api.room_versions import KNOWN_ROOM_VERSIONS, StateResolutionVersions @@ -51,7 +52,7 @@ ) from synapse.logging.context import ContextResourceUsage from synapse.logging.opentracing import tag_args, trace -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.replication.http.state import ReplicationUpdateCurrentStateRestServlet from synapse.state import v1, v2 from synapse.storage.databases.main.event_federation import StateDifference @@ -606,17 +607,15 @@ class _StateResMetrics: db_events: int = 0 -_biggest_room_by_cpu_counter = Counter( +_biggest_room_by_cpu_counter = meter.create_counter( "synapse_state_res_cpu_for_biggest_room_seconds", - "CPU time spent performing state resolution for the single most expensive " + description="CPU time spent performing state resolution for the single most expensive " "room for state resolution", - labelnames=[SERVER_NAME_LABEL], ) -_biggest_room_by_db_counter = Counter( +_biggest_room_by_db_counter = meter.create_counter( "synapse_state_res_db_for_biggest_room_seconds", - "Database time spent performing state resolution for the single most " + description="Database time spent performing state resolution for the single most " "expensive room for state resolution", - labelnames=[SERVER_NAME_LABEL], ) _cpu_times = Histogram( @@ -896,8 +895,8 @@ def _report_biggest( # report info on the single biggest to prometheus _, biggest_metrics = biggest[0] - prometheus_counter_metric.labels(**{SERVER_NAME_LABEL: self.server_name}).inc( - extract_key(biggest_metrics) + prometheus_counter_metric.add( + extract_key(biggest_metrics), {SERVER_NAME_LABEL: self.server_name} ) diff --git a/synapse/storage/controllers/persist_events.py b/synapse/storage/controllers/persist_events.py index 646e2cf115..e1e951ff21 100644 --- a/synapse/storage/controllers/persist_events.py +++ b/synapse/storage/controllers/persist_events.py @@ -45,7 +45,7 @@ ) import attr -from prometheus_client import Counter, Histogram +from prometheus_client import Histogram from twisted.internet import defer @@ -61,7 +61,7 @@ start_active_span_follows_from, trace, ) -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.storage.controllers.state import StateStorageController from synapse.storage.databases import Databases from synapse.storage.databases.main.events import DeltaState @@ -82,23 +82,19 @@ logger = logging.getLogger(__name__) # The number of times we are recalculating the current state -state_delta_counter = Counter( - "synapse_storage_events_state_delta", "", labelnames=[SERVER_NAME_LABEL] -) +state_delta_counter = meter.create_counter("synapse_storage_events_state_delta") # The number of times we are recalculating state when there is only a # single forward extremity -state_delta_single_event_counter = Counter( - "synapse_storage_events_state_delta_single_event", - "", - labelnames=[SERVER_NAME_LABEL], +state_delta_single_event_counter = meter.create_counter( + "synapse_storage_events_state_delta_single_event" ) # The number of times we are reculating state when we could have resonably # calculated the delta when we calculated the state for an event we were # persisting. -state_delta_reuse_delta_counter = Counter( - "synapse_storage_events_state_delta_reuse_delta", "", labelnames=[SERVER_NAME_LABEL] +state_delta_reuse_delta_counter = meter.create_counter( + "synapse_storage_events_state_delta_reuse_delta" ) # The number of forward extremities for each new event. @@ -118,22 +114,19 @@ buckets=(0, 1, 2, 3, 5, 7, 10, 15, 20, 50, 100, 200, 500, "+Inf"), ) -state_resolutions_during_persistence = Counter( +state_resolutions_during_persistence = meter.create_counter( "synapse_storage_events_state_resolutions_during_persistence", - "Number of times we had to do state res to calculate new current state", - labelnames=[SERVER_NAME_LABEL], + description="Number of times we had to do state res to calculate new current state", ) -potential_times_prune_extremities = Counter( +potential_times_prune_extremities = meter.create_counter( "synapse_storage_events_potential_times_prune_extremities", - "Number of times we might be able to prune extremities", - labelnames=[SERVER_NAME_LABEL], + description="Number of times we might be able to prune extremities", ) -times_pruned_extremities = Counter( +times_pruned_extremities = meter.create_counter( "synapse_storage_events_times_pruned_extremities", - "Number of times we were actually be able to prune extremities", - labelnames=[SERVER_NAME_LABEL], + description="Number of times we were actually be able to prune extremities", ) @@ -720,11 +713,11 @@ async def _calculate_new_forward_extremities_and_state_delta( if all_single_prev_not_state: return (new_forward_extremities, None) - state_delta_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() + state_delta_counter.add(1, {SERVER_NAME_LABEL: self.server_name}) if len(new_latest_event_ids) == 1: - state_delta_single_event_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc() + state_delta_single_event_counter.add( + 1, {SERVER_NAME_LABEL: self.server_name} + ) # This is a fairly handwavey check to see if we could # have guessed what the delta would have been when @@ -739,9 +732,9 @@ async def _calculate_new_forward_extremities_and_state_delta( for ev, _ in ev_ctx_rm: prev_event_ids = set(ev.prev_event_ids()) if latest_event_ids == prev_event_ids: - state_delta_reuse_delta_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc() + state_delta_reuse_delta_counter.add( + 1, {SERVER_NAME_LABEL: self.server_name} + ) break logger.debug("Calculating state delta for room %s", room_id) @@ -1015,9 +1008,9 @@ async def _get_new_state_after_events( ), ) - state_resolutions_during_persistence.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc() + state_resolutions_during_persistence.add( + 1, {SERVER_NAME_LABEL: self.server_name} + ) # If the returned state matches the state group of one of the new # forward extremities then we check if we are able to prune some state @@ -1045,9 +1038,7 @@ async def _prune_extremities( """See if we can prune any of the extremities after calculating the resolved state. """ - potential_times_prune_extremities.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc() + potential_times_prune_extremities.add(1, {SERVER_NAME_LABEL: self.server_name}) # We keep all the extremities that have the same state group, and # see if we can drop the others. @@ -1145,7 +1136,7 @@ async def _prune_extremities( return new_latest_event_ids - times_pruned_extremities.labels(**{SERVER_NAME_LABEL: self.server_name}).inc() + times_pruned_extremities.add(1, {SERVER_NAME_LABEL: self.server_name}) logger.info( "Pruning forward extremities in room %s: from %s -> %s", diff --git a/synapse/storage/database.py b/synapse/storage/database.py index a4b2b26795..a24a62ed66 100644 --- a/synapse/storage/database.py +++ b/synapse/storage/database.py @@ -47,7 +47,7 @@ ) import attr -from prometheus_client import Counter, Histogram +from prometheus_client import Histogram from typing_extensions import Concatenate, ParamSpec from twisted.enterprise import adbapi @@ -61,7 +61,7 @@ current_context, make_deferred_yieldable, ) -from synapse.metrics import SERVER_NAME_LABEL, register_threadpool +from synapse.metrics import SERVER_NAME_LABEL, meter, register_threadpool from synapse.storage.background_updates import BackgroundUpdater from synapse.storage.engines import BaseDatabaseEngine, PostgresEngine, Sqlite3Engine from synapse.storage.types import Connection, Cursor, SQLQueryParameters @@ -88,15 +88,11 @@ sql_query_timer = Histogram( "synapse_storage_query_time", "sec", labelnames=["verb", SERVER_NAME_LABEL] ) -sql_txn_count = Counter( - "synapse_storage_transaction_time_count", - "sec", - labelnames=["desc", SERVER_NAME_LABEL], +sql_txn_count = meter.create_counter( + "synapse_storage_transaction_time_count", description="sec" ) -sql_txn_duration = Counter( - "synapse_storage_transaction_time_sum", - "sec", - labelnames=["desc", SERVER_NAME_LABEL], +sql_txn_duration = meter.create_counter( + "synapse_storage_transaction_time_sum", description="sec" ) @@ -909,14 +905,10 @@ def new_transaction( self._current_txn_total_time += duration self._txn_perf_counters.update(desc, duration) - sql_txn_count.labels( - desc=desc, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc(1) - sql_txn_duration.labels( - desc=desc, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc(duration) + sql_txn_count.add(1, {"desc": desc, SERVER_NAME_LABEL: self.server_name}) + sql_txn_duration.add( + duration, {"desc": desc, SERVER_NAME_LABEL: self.server_name} + ) async def runInteraction( self, diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py index d77420ff47..191224aa90 100644 --- a/synapse/storage/databases/main/event_federation.py +++ b/synapse/storage/databases/main/event_federation.py @@ -38,14 +38,14 @@ ) import attr -from prometheus_client import Counter, Gauge +from prometheus_client import Gauge from synapse.api.constants import MAX_DEPTH from synapse.api.errors import StoreError from synapse.api.room_versions import EventFormatVersions, RoomVersion from synapse.events import EventBase, make_event_from_dict from synapse.logging.opentracing import tag_args, trace -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.metrics.background_process_metrics import wrap_as_background_process from synapse.storage._base import db_to_json, make_in_list_sql_clause from synapse.storage.background_updates import ForeignKeyConstraint @@ -80,11 +80,10 @@ labelnames=[SERVER_NAME_LABEL], ) -pdus_pruned_from_federation_queue = Counter( +pdus_pruned_from_federation_queue = meter.create_counter( "synapse_federation_server_number_inbound_pdu_pruned", - "The number of events in the inbound federation staging that have been " + description="The number of events in the inbound federation staging that have been " "pruned due to the queue getting too long", - labelnames=[SERVER_NAME_LABEL], ) logger = logging.getLogger(__name__) @@ -2253,9 +2252,9 @@ async def prune_staged_events_in_room( if not to_delete: return False - pdus_pruned_from_federation_queue.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).inc(len(to_delete)) + pdus_pruned_from_federation_queue.add( + len(to_delete), {SERVER_NAME_LABEL: self.server_name} + ) logger.info( "Pruning %d events in room %s from federation queue", len(to_delete), diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index b6037468b3..6d9c799e6d 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -40,7 +40,6 @@ ) import attr -from prometheus_client import Counter import synapse.metrics from synapse.api.constants import ( @@ -60,7 +59,7 @@ from synapse.events.snapshot import EventPersistencePair from synapse.events.utils import parse_stripped_state_event from synapse.logging.opentracing import trace -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.storage._base import db_to_json, make_in_list_sql_clause from synapse.storage.database import ( DatabasePool, @@ -95,14 +94,8 @@ logger = logging.getLogger(__name__) -persist_event_counter = Counter( - "synapse_storage_events_persisted_events", "", labelnames=[SERVER_NAME_LABEL] -) -event_counter = Counter( - "synapse_storage_events_persisted_events_sep", - "", - labelnames=["type", "origin_type", "origin_entity", SERVER_NAME_LABEL], -) +persist_event_counter = meter.create_counter("synapse_storage_events_persisted_events") +event_counter = meter.create_counter("synapse_storage_events_persisted_events_sep") # State event type/key pairs that we need to gather to fill in the # `sliding_sync_joined_rooms`/`sliding_sync_membership_snapshots` tables. @@ -366,8 +359,8 @@ async def _persist_events_and_state_updates( new_event_links=new_event_links, sliding_sync_table_changes=sliding_sync_table_changes, ) - persist_event_counter.labels(**{SERVER_NAME_LABEL: self.server_name}).inc( - len(events_and_contexts) + persist_event_counter.add( + len(events_and_contexts), {SERVER_NAME_LABEL: self.server_name} ) if not use_negative_stream_ordering: @@ -388,12 +381,15 @@ async def _persist_events_and_state_updates( origin_type = "remote" origin_entity = get_domain_from_id(event.sender) - event_counter.labels( - type=event.type, - origin_type=origin_type, - origin_entity=origin_entity, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + event_counter.add( + 1, + { + "type": event.type, + "origin_type": origin_type, + "origin_entity": origin_entity, + SERVER_NAME_LABEL: self.server_name, + }, + ) if ( not self.hs.config.experimental.msc4293_enabled From 4f86555e8f539bedf7b4e95649e2f91843139fb5 Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Wed, 3 Dec 2025 14:36:44 +0100 Subject: [PATCH 04/18] finish updating Counters --- synapse/replication/tcp/handler.py | 6 ++-- synapse/replication/tcp/redis.py | 26 +++++++++------ synapse/util/metrics.py | 53 +++++++++++++----------------- synapse/util/ratelimitutils.py | 36 ++++++++++---------- 4 files changed, 61 insertions(+), 60 deletions(-) diff --git a/synapse/replication/tcp/handler.py b/synapse/replication/tcp/handler.py index d78f9c1bbb..3988e0b08c 100644 --- a/synapse/replication/tcp/handler.py +++ b/synapse/replication/tcp/handler.py @@ -544,9 +544,9 @@ def on_RDATA(self, conn: IReplicationConnection, cmd: RdataCommand) -> None: return stream_name = cmd.stream_name - inbound_rdata_count.labels( - stream_name=stream_name, **{SERVER_NAME_LABEL: self.server_name} - ).inc() + inbound_rdata_count.add( + 1, {"stream_name": stream_name, SERVER_NAME_LABEL: self.server_name} + ) # We put the received command into a queue here for two reasons: # 1. so we don't try and concurrently handle multiple rows for the diff --git a/synapse/replication/tcp/redis.py b/synapse/replication/tcp/redis.py index caffb2913e..6c4857e7b3 100644 --- a/synapse/replication/tcp/redis.py +++ b/synapse/replication/tcp/redis.py @@ -192,11 +192,14 @@ def _parse_and_dispatch_message(self, message: str) -> None: # We use "redis" as the name here as we don't have 1:1 connections to # remote instances. - tcp_inbound_commands_counter.labels( - command=cmd.NAME, - name="redis", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + tcp_inbound_commands_counter.add( + 1, + { + "command": cmd.NAME, + "name": "redis", + SERVER_NAME_LABEL: self.server_name, + }, + ) self.handle_command(cmd) @@ -267,11 +270,14 @@ async def _async_send_command(self, cmd: Command) -> None: # We use "redis" as the name here as we don't have 1:1 connections to # remote instances. - tcp_outbound_commands_counter.labels( - command=cmd.NAME, - name="redis", - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + tcp_outbound_commands_counter.add( + 1, + { + "command": cmd.NAME, + "name": "redis", + SERVER_NAME_LABEL: self.server_name, + }, + ) channel_name = cmd.redis_channel_name(self.synapse_stream_prefix) diff --git a/synapse/util/metrics.py b/synapse/util/metrics.py index f71380d689..1675e0d9ed 100644 --- a/synapse/util/metrics.py +++ b/synapse/util/metrics.py @@ -33,7 +33,7 @@ TypeVar, ) -from prometheus_client import CollectorRegistry, Counter, Metric +from prometheus_client import CollectorRegistry, Metric from typing_extensions import Concatenate, ParamSpec from synapse.logging.context import ( @@ -41,61 +41,54 @@ LoggingContext, current_context, ) -from synapse.metrics import SERVER_NAME_LABEL, InFlightGauge +from synapse.metrics import SERVER_NAME_LABEL, InFlightGauge, meter from synapse.util.clock import Clock logger = logging.getLogger(__name__) # Metrics to see the number of and how much time is spend in various blocks of code. # -block_counter = Counter( +block_counter = meter.create_counter( "synapse_util_metrics_block_count", - documentation="The number of times this block has been called.", - labelnames=["block_name", SERVER_NAME_LABEL], + description="The number of times this block has been called.", ) """The number of times this block has been called.""" -block_timer = Counter( +block_timer = meter.create_counter( "synapse_util_metrics_block_time_seconds", - documentation="The cumulative time spent executing this block across all calls, in seconds.", - labelnames=["block_name", SERVER_NAME_LABEL], + description="The cumulative time spent executing this block across all calls, in seconds.", ) """The cumulative time spent executing this block across all calls, in seconds.""" -block_ru_utime = Counter( +block_ru_utime = meter.create_counter( "synapse_util_metrics_block_ru_utime_seconds", - documentation="Resource usage: user CPU time in seconds used in this block", - labelnames=["block_name", SERVER_NAME_LABEL], + description="Resource usage: user CPU time in seconds used in this block", ) """Resource usage: user CPU time in seconds used in this block""" -block_ru_stime = Counter( +block_ru_stime = meter.create_counter( "synapse_util_metrics_block_ru_stime_seconds", - documentation="Resource usage: system CPU time in seconds used in this block", - labelnames=["block_name", SERVER_NAME_LABEL], + description="Resource usage: system CPU time in seconds used in this block", ) """Resource usage: system CPU time in seconds used in this block""" -block_db_txn_count = Counter( +block_db_txn_count = meter.create_counter( "synapse_util_metrics_block_db_txn_count", - documentation="Number of database transactions completed in this block", - labelnames=["block_name", SERVER_NAME_LABEL], + description="Number of database transactions completed in this block", ) """Number of database transactions completed in this block""" # seconds spent waiting for db txns, excluding scheduling time, in this block -block_db_txn_duration = Counter( +block_db_txn_duration = meter.create_counter( "synapse_util_metrics_block_db_txn_duration_seconds", - documentation="Seconds spent waiting for database txns, excluding scheduling time, in this block", - labelnames=["block_name", SERVER_NAME_LABEL], + description="Seconds spent waiting for database txns, excluding scheduling time, in this block", ) """Seconds spent waiting for database txns, excluding scheduling time, in this block""" # seconds spent waiting for a db connection, in this block -block_db_sched_duration = Counter( +block_db_sched_duration = meter.create_counter( "synapse_util_metrics_block_db_sched_duration_seconds", - documentation="Seconds spent waiting for a db connection, in this block", - labelnames=["block_name", SERVER_NAME_LABEL], + description="Seconds spent waiting for a db connection, in this block", ) """Seconds spent waiting for a db connection, in this block""" @@ -255,13 +248,13 @@ def __exit__( try: labels = {"block_name": self.name, SERVER_NAME_LABEL: self.server_name} - block_counter.labels(**labels).inc() - block_timer.labels(**labels).inc(duration) - block_ru_utime.labels(**labels).inc(usage.ru_utime) - block_ru_stime.labels(**labels).inc(usage.ru_stime) - block_db_txn_count.labels(**labels).inc(usage.db_txn_count) - block_db_txn_duration.labels(**labels).inc(usage.db_txn_duration_sec) - block_db_sched_duration.labels(**labels).inc(usage.db_sched_duration_sec) + block_counter.add(1, labels) + block_timer.add(duration, labels) + block_ru_utime.add(usage.ru_utime, labels) + block_ru_stime.add(usage.ru_stime, labels) + block_db_txn_count.add(usage.db_txn_count, labels) + block_db_txn_duration.add(usage.db_txn_duration_sec, labels) + block_db_sched_duration.add(usage.db_sched_duration_sec, labels) except ValueError as exc: logger.warning("Failed to save metrics! Usage: %s Error: %s", usage, exc) diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index 756677fe6c..1d2c8a0126 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -40,8 +40,6 @@ ) from weakref import WeakSet -from prometheus_client.core import Counter - from twisted.internet import defer from synapse.api.errors import LimitExceededError @@ -52,7 +50,7 @@ run_in_background, ) from synapse.logging.opentracing import start_active_span -from synapse.metrics import SERVER_NAME_LABEL, Histogram, LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, Histogram, LaterGauge, meter from synapse.util.clock import Clock if typing.TYPE_CHECKING: @@ -62,15 +60,13 @@ # Track how much the ratelimiter is affecting requests -rate_limit_sleep_counter = Counter( +rate_limit_sleep_counter = meter.create_counter( "synapse_rate_limit_sleep", - "Number of requests slept by the rate limiter", - labelnames=["rate_limiter_name", SERVER_NAME_LABEL], + description="Number of requests slept by the rate limiter", ) -rate_limit_reject_counter = Counter( +rate_limit_reject_counter = meter.create_counter( "synapse_rate_limit_reject", - "Number of requests rejected by the rate limiter", - labelnames=["rate_limiter_name", SERVER_NAME_LABEL], + description="Number of requests rejected by the rate limiter", ) queue_wait_timer = Histogram( "synapse_rate_limit_queue_wait_time_seconds", @@ -315,10 +311,13 @@ def _on_enter(self, request_id: object) -> "defer.Deferred[None]": if self.should_reject(): logger.debug("Ratelimiter(%s): rejecting request", self.host) if self.metrics_name: - rate_limit_reject_counter.labels( - rate_limiter_name=self.metrics_name, - **{SERVER_NAME_LABEL: self.our_server_name}, - ).inc() + rate_limit_reject_counter.add( + 1, + { + "rate_limiter_name": self.metrics_name, + SERVER_NAME_LABEL: self.our_server_name, + }, + ) raise LimitExceededError( limiter_name="rc_federation", retry_after_ms=int(self.window_size / self.sleep_limit), @@ -355,10 +354,13 @@ def queue_request() -> "defer.Deferred[None]": self.sleep_sec, ) if self.metrics_name: - rate_limit_sleep_counter.labels( - rate_limiter_name=self.metrics_name, - **{SERVER_NAME_LABEL: self.our_server_name}, - ).inc() + rate_limit_sleep_counter.add( + 1, + { + "rate_limiter_name": self.metrics_name, + SERVER_NAME_LABEL: self.our_server_name, + }, + ) ret_defer = run_in_background(self.clock.sleep, self.sleep_sec) self.sleeping_requests.add(request_id) From c812eb38feba99d3ad94b75dbc704fccdb22a5b2 Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Wed, 3 Dec 2025 16:44:45 +0100 Subject: [PATCH 05/18] converting Gauges --- synapse/app/phone_stats_home.py | 47 +++++++--------- synapse/federation/federation_server.py | 7 +-- .../federation/sender/transaction_manager.py | 9 +-- synapse/metrics/__init__.py | 56 +++++++------------ synapse/metrics/_gc.py | 7 ++- synapse/metrics/background_process_metrics.py | 7 +-- synapse/metrics/common_usage_metrics.py | 50 ++++++++--------- synapse/push/pusherpool.py | 9 +-- synapse/replication/http/_base.py | 7 +-- .../databases/main/event_federation.py | 11 ++-- .../storage/databases/main/events_worker.py | 24 ++++---- synapse/util/batching_queue.py | 19 +++---- synapse/util/caches/deferred_cache.py | 9 +-- 13 files changed, 111 insertions(+), 151 deletions(-) diff --git a/synapse/app/phone_stats_home.py b/synapse/app/phone_stats_home.py index 4bbc33cba2..63c09ae24f 100644 --- a/synapse/app/phone_stats_home.py +++ b/synapse/app/phone_stats_home.py @@ -24,11 +24,9 @@ import sys from typing import TYPE_CHECKING, List, Mapping, Sized, Tuple -from prometheus_client import Gauge - from twisted.internet import defer -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.types import JsonDict from synapse.util.constants import ( MILLISECONDS_PER_SECOND, @@ -57,25 +55,21 @@ _stats_process: List[Tuple[int, "resource.struct_rusage"]] = [] # Gauges to expose monthly active user control metrics -current_mau_gauge = Gauge( +current_mau_gauge = meter.create_gauge( "synapse_admin_mau_current", - "Current MAU", - labelnames=[SERVER_NAME_LABEL], + description="Current MAU", ) -current_mau_by_service_gauge = Gauge( +current_mau_by_service_gauge = meter.create_gauge( "synapse_admin_mau_current_mau_by_service", - "Current MAU by service", - labelnames=["app_service", SERVER_NAME_LABEL], + description="Current MAU by service", ) -max_mau_gauge = Gauge( +max_mau_gauge = meter.create_gauge( "synapse_admin_mau_max", - "MAU Limit", - labelnames=[SERVER_NAME_LABEL], + description="MAU Limit", ) -registered_reserved_users_mau_gauge = Gauge( +registered_reserved_users_mau_gauge = meter.create_gauge( "synapse_admin_mau_registered_reserved_users", - "Registered users with reserved threepids", - labelnames=[SERVER_NAME_LABEL], + description="Registered users with reserved threepids", ) @@ -244,20 +238,21 @@ async def _generate_monthly_active_users() -> None: await store.get_monthly_active_count_by_service() ) reserved_users = await store.get_registered_reserved_users() - current_mau_gauge.labels(**{SERVER_NAME_LABEL: server_name}).set( - float(current_mau_count) + current_mau_gauge.set( + float(current_mau_count), {SERVER_NAME_LABEL: server_name} ) for app_service, count in current_mau_count_by_service.items(): - current_mau_by_service_gauge.labels( - app_service=app_service, **{SERVER_NAME_LABEL: server_name} - ).set(float(count)) - - registered_reserved_users_mau_gauge.labels( - **{SERVER_NAME_LABEL: server_name} - ).set(float(len(reserved_users))) - max_mau_gauge.labels(**{SERVER_NAME_LABEL: server_name}).set( - float(hs.config.server.max_mau_value) + current_mau_by_service_gauge.set( + float(count), + {"app_service": app_service, SERVER_NAME_LABEL: server_name}, + ) + + registered_reserved_users_mau_gauge.set( + float(len(reserved_users)), {SERVER_NAME_LABEL: server_name} + ) + max_mau_gauge.set( + float(hs.config.server.max_mau_value), {SERVER_NAME_LABEL: server_name} ) return hs.run_as_background_process( diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py index 33a7d904f0..acd7e38232 100644 --- a/synapse/federation/federation_server.py +++ b/synapse/federation/federation_server.py @@ -35,7 +35,7 @@ Union, ) -from prometheus_client import Gauge, Histogram +from prometheus_client import Histogram from twisted.python import failure @@ -119,10 +119,9 @@ labelnames=[SERVER_NAME_LABEL], ) -last_pdu_ts_metric = Gauge( +last_pdu_ts_metric = meter.create_gauge( "synapse_federation_last_received_pdu_time", - "The timestamp of the last PDU which was successfully received from the given domain", - labelnames=("origin_server_name", SERVER_NAME_LABEL), + description="The timestamp of the last PDU which was successfully received from the given domain", ) diff --git a/synapse/federation/sender/transaction_manager.py b/synapse/federation/sender/transaction_manager.py index f47c011487..83a86b30bb 100644 --- a/synapse/federation/sender/transaction_manager.py +++ b/synapse/federation/sender/transaction_manager.py @@ -20,8 +20,6 @@ import logging from typing import TYPE_CHECKING, List -from prometheus_client import Gauge - from synapse.api.constants import EduTypes from synapse.api.errors import HttpResponseException from synapse.events import EventBase @@ -34,7 +32,7 @@ tags, whitelisted_homeserver, ) -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.types import JsonDict from synapse.util.json import json_decoder from synapse.util.metrics import measure_func @@ -45,10 +43,9 @@ logger = logging.getLogger(__name__) issue_8631_logger = logging.getLogger("synapse.8631_debug") -last_pdu_ts_metric = Gauge( +last_pdu_ts_metric = meter.create_gauge( "synapse_federation_last_sent_pdu_time", - "The timestamp of the last PDU which was successfully sent to the given domain", - labelnames=("destination_server_name", SERVER_NAME_LABEL), + description="The timestamp of the last PDU which was successfully sent to the given domain", ) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index af36d8d87c..1b8b6fcd2f 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -47,7 +47,6 @@ from packaging.version import parse as parse_version from prometheus_client import ( CollectorRegistry, - Gauge, Histogram, Metric, generate_latest, @@ -629,27 +628,21 @@ def collect(self) -> Iterable[Metric]: # Used to track where various components have processed in the event stream, # e.g. federation sending, appservice sending, etc. -event_processing_positions = Gauge( - "synapse_event_processing_positions", "", labelnames=["name", SERVER_NAME_LABEL] +event_processing_positions = meter.create_gauge( + "synapse_event_processing_positions", ) # Used to track the current max events stream position -event_persisted_position = Gauge( - "synapse_event_persisted_position", "", labelnames=[SERVER_NAME_LABEL] -) +event_persisted_position = meter.create_gauge("synapse_event_persisted_position") # Used to track the received_ts of the last event processed by various # components -event_processing_last_ts = Gauge( - "synapse_event_processing_last_ts", "", labelnames=["name", SERVER_NAME_LABEL] -) +event_processing_last_ts = meter.create_gauge("synapse_event_processing_last_ts") # Used to track the lag processing events. This is the time difference # between the last processed event's received_ts and the time it was # finished being processed. -event_processing_lag = Gauge( - "synapse_event_processing_lag", "", labelnames=["name", SERVER_NAME_LABEL] -) +event_processing_lag = meter.create_gauge("synapse_event_processing_lag") event_processing_lag_by_event = Histogram( "synapse_event_processing_lag_by_event", @@ -662,8 +655,8 @@ def collect(self) -> Iterable[Metric]: # This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. We # consider this process-level because all Synapse homeservers running in the process # will use the same Synapse version. -build_info = Gauge( # type: ignore[missing-server-name-label] - "synapse_build_info", "Build information", ["pythonversion", "version", "osversion"] +build_info = meter.create_gauge( # type: ignore[missing-server-name-label] + "synapse_build_info", description="Build information" ) build_info.labels( " ".join([platform.python_implementation(), platform.python_version()]), @@ -672,10 +665,9 @@ def collect(self) -> Iterable[Metric]: ).set(1) # Loaded modules info -module_instances_info = Gauge( +module_instances_info = meter.create_gauge( "synapse_module_info", - "Information about loaded modules", - labelnames=["package_name", "module_name", "module_version", SERVER_NAME_LABEL], + description="Information about loaded modules", ) # 3PID send info @@ -688,41 +680,35 @@ def collect(self) -> Iterable[Metric]: labelnames=("type", "reason", SERVER_NAME_LABEL), ) -threadpool_total_threads = Gauge( +threadpool_total_threads = meter.create_gauge( "synapse_threadpool_total_threads", - "Total number of threads currently in the threadpool", - labelnames=["name", SERVER_NAME_LABEL], + description="Total number of threads currently in the threadpool", ) -threadpool_total_working_threads = Gauge( +threadpool_total_working_threads = meter.create_gauge( "synapse_threadpool_working_threads", - "Number of threads currently working in the threadpool", - labelnames=["name", SERVER_NAME_LABEL], + description="Number of threads currently working in the threadpool", ) -threadpool_total_min_threads = Gauge( +threadpool_total_min_threads = meter.create_gauge( "synapse_threadpool_min_threads", - "Minimum number of threads configured in the threadpool", - labelnames=["name", SERVER_NAME_LABEL], + description="Minimum number of threads configured in the threadpool", ) -threadpool_total_max_threads = Gauge( +threadpool_total_max_threads = meter.create_gauge( "synapse_threadpool_max_threads", - "Maximum number of threads configured in the threadpool", - labelnames=["name", SERVER_NAME_LABEL], + description="Maximum number of threads configured in the threadpool", ) # Gauges for room counts -known_rooms_gauge = Gauge( +known_rooms_gauge = meter.create_gauge( "synapse_known_rooms_total", - "Total number of rooms", - labelnames=[SERVER_NAME_LABEL], + description="Total number of rooms", ) -locally_joined_rooms_gauge = Gauge( +locally_joined_rooms_gauge = meter.create_gauge( "synapse_locally_joined_rooms_total", - "Total number of locally joined rooms", - labelnames=[SERVER_NAME_LABEL], + description="Total number of locally joined rooms", ) diff --git a/synapse/metrics/_gc.py b/synapse/metrics/_gc.py index 1da871f18f..c80ead3611 100644 --- a/synapse/metrics/_gc.py +++ b/synapse/metrics/_gc.py @@ -29,7 +29,6 @@ from prometheus_client.core import ( REGISTRY, CounterMetricFamily, - Gauge, GaugeMetricFamily, Histogram, Metric, @@ -39,6 +38,8 @@ from synapse.metrics._types import Collector +from . import meter + """Prometheus metrics for garbage collection""" @@ -55,7 +56,9 @@ # # These are process-level metrics, so they do not have the `SERVER_NAME_LABEL`. -gc_unreachable = Gauge("python_gc_unreachable_total", "Unreachable GC objects", ["gen"]) # type: ignore[missing-server-name-label] +gc_unreachable = meter.create_gauge( + "python_gc_unreachable_total", description="Unreachable GC objects" +) # type: ignore[missing-server-name-label] gc_time = Histogram( # type: ignore[missing-server-name-label] "python_gc_time", "Time taken to GC (sec)", diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py index 1ba1ed71a1..ca73a9514f 100644 --- a/synapse/metrics/background_process_metrics.py +++ b/synapse/metrics/background_process_metrics.py @@ -41,7 +41,7 @@ ) from prometheus_client import Metric -from prometheus_client.core import REGISTRY, Gauge +from prometheus_client.core import REGISTRY from typing_extensions import Concatenate, ParamSpec from twisted.internet import defer @@ -82,10 +82,9 @@ description="Number of background processes started", ) -_background_process_in_flight_count = Gauge( +_background_process_in_flight_count = meter.create_gauge( "synapse_background_process_in_flight_count", - "Number of background processes in flight", - labelnames=["name", SERVER_NAME_LABEL], + description="Number of background processes in flight", ) # we set registry=None in all of these to stop them getting registered with diff --git a/synapse/metrics/common_usage_metrics.py b/synapse/metrics/common_usage_metrics.py index 0c3f380177..ec11afd681 100644 --- a/synapse/metrics/common_usage_metrics.py +++ b/synapse/metrics/common_usage_metrics.py @@ -23,38 +23,33 @@ import attr -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter if TYPE_CHECKING: from synapse.server import HomeServer -from prometheus_client import Gauge # Gauge to expose daily active users metrics -current_dau_gauge = Gauge( +current_dau_gauge = meter.create_gauge( "synapse_admin_daily_active_users", - "Current daily active users count", - labelnames=[SERVER_NAME_LABEL], + description="Current daily active users count", ) # Gauge for users -users_in_status_gauge = Gauge( +users_in_status_gauge = meter.create_gauge( "synapse_user_count", - "Number of users in active, deactivated, suspended, and locked status", - ["status", SERVER_NAME_LABEL], + description="Number of users in active, deactivated, suspended, and locked status", ) -users_in_time_ranges_gauge = Gauge( +users_in_time_ranges_gauge = meter.create_gauge( "synapse_active_users", - "Number of active users in time ranges in 24h, 7d, and 30d", - ["time_range", SERVER_NAME_LABEL], + description="Number of active users in time ranges in 24h, 7d, and 30d", ) # We may want to add additional ranges in the future. -retained_users_gauge = Gauge( +retained_users_gauge = meter.create_gauge( "synapse_retained_users", - "Number of retained users in 30d", - ["time_range", SERVER_NAME_LABEL], + description="Number of retained users in 30d", ) @@ -142,9 +137,10 @@ async def _update_gauges(self) -> None: """Update the Prometheus gauges.""" metrics = await self._collect() - current_dau_gauge.labels( - **{SERVER_NAME_LABEL: self.server_name}, - ).set(float(metrics.daily_active_users)) + current_dau_gauge.set( + float(metrics.daily_active_users), + {SERVER_NAME_LABEL: self.server_name}, + ) time_range_to_metric = { "24h": metrics.daily_active_users, @@ -152,9 +148,10 @@ async def _update_gauges(self) -> None: "30d": metrics.monthly_active_users, } for time_range, _metric in time_range_to_metric.items(): - users_in_time_ranges_gauge.labels( - time_range=time_range, **{SERVER_NAME_LABEL: self.server_name} - ).set(float(_metric)) + users_in_time_ranges_gauge.set( + float(_metric), + {"time_range": time_range, SERVER_NAME_LABEL: self.server_name}, + ) status_to_metric = { "active": metrics.active_users, @@ -163,10 +160,11 @@ async def _update_gauges(self) -> None: "locked": metrics.locked_users, } for status, _metric in status_to_metric.items(): - users_in_status_gauge.labels( - status=status, **{SERVER_NAME_LABEL: self.server_name} - ).set(float(_metric)) + users_in_status_gauge.set( + float(_metric), {"status": status, SERVER_NAME_LABEL: self.server_name} + ) - retained_users_gauge.labels( - time_range="30d", **{SERVER_NAME_LABEL: self.server_name} - ).set(float(metrics.monthly_retained_users)) + retained_users_gauge.set( + float(metrics.monthly_retained_users), + {"time_range": "30d", SERVER_NAME_LABEL: self.server_name}, + ) diff --git a/synapse/push/pusherpool.py b/synapse/push/pusherpool.py index 977c55b683..c0f87e71c9 100644 --- a/synapse/push/pusherpool.py +++ b/synapse/push/pusherpool.py @@ -22,10 +22,8 @@ import logging from typing import TYPE_CHECKING, Dict, Iterable, Optional -from prometheus_client import Gauge - from synapse.api.errors import Codes, SynapseError -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.metrics.background_process_metrics import ( wrap_as_background_process, ) @@ -46,10 +44,9 @@ logger = logging.getLogger(__name__) -synapse_pushers = Gauge( +synapse_pushers = meter.create_gauge( "synapse_pushers", - "Number of active synapse pushers", - labelnames=["kind", "app_id", SERVER_NAME_LABEL], + description="Number of active synapse pushers", ) diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py index 9c3e763ab3..7a93d3f0d2 100644 --- a/synapse/replication/http/_base.py +++ b/synapse/replication/http/_base.py @@ -25,8 +25,6 @@ from inspect import signature from typing import TYPE_CHECKING, Any, Awaitable, Callable, ClassVar, Dict, List, Tuple -from prometheus_client import Gauge - from twisted.internet.error import ConnectError, DNSLookupError from twisted.web.server import Request @@ -49,10 +47,9 @@ logger = logging.getLogger(__name__) -_pending_outgoing_requests = Gauge( +_pending_outgoing_requests = meter.create_gauge( "synapse_pending_outgoing_replication_requests", - "Number of active outgoing replication requests, by replication method name", - labelnames=["name", SERVER_NAME_LABEL], + description="Number of active outgoing replication requests, by replication method name", ) _outgoing_request_counter = meter.create_counter( diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py index 191224aa90..bbb3ec6f29 100644 --- a/synapse/storage/databases/main/event_federation.py +++ b/synapse/storage/databases/main/event_federation.py @@ -38,7 +38,6 @@ ) import attr -from prometheus_client import Gauge from synapse.api.constants import MAX_DEPTH from synapse.api.errors import StoreError @@ -68,16 +67,14 @@ if TYPE_CHECKING: from synapse.server import HomeServer -oldest_pdu_in_federation_staging = Gauge( +oldest_pdu_in_federation_staging = meter.create_gauge( "synapse_federation_server_oldest_inbound_pdu_in_staging", - "The age in seconds since we received the oldest pdu in the federation staging area", - labelnames=[SERVER_NAME_LABEL], + description="The age in seconds since we received the oldest pdu in the federation staging area", ) -number_pdus_in_federation_queue = Gauge( +number_pdus_in_federation_queue = meter.create_gauge( "synapse_federation_server_number_inbound_pdu_in_staging", - "The total number of events in the inbound federation staging", - labelnames=[SERVER_NAME_LABEL], + description="The total number of events in the inbound federation staging", ) pdus_pruned_from_federation_queue = meter.create_counter( diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py index 4f9a1a4f78..607444a267 100644 --- a/synapse/storage/databases/main/events_worker.py +++ b/synapse/storage/databases/main/events_worker.py @@ -41,7 +41,6 @@ ) import attr -from prometheus_client import Gauge from twisted.internet import defer @@ -68,7 +67,7 @@ tag_args, trace, ) -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.metrics.background_process_metrics import ( wrap_as_background_process, ) @@ -136,10 +135,9 @@ def __init__( EVENT_QUEUE_TIMEOUT_S = 0.1 # Timeout when waiting for requests for events -event_fetch_ongoing_gauge = Gauge( +event_fetch_ongoing_gauge = meter.create_gauge( "synapse_event_fetch_ongoing", - "The number of event fetchers that are running", - labelnames=[SERVER_NAME_LABEL], + description="The number of event fetchers that are running", ) @@ -315,8 +313,8 @@ def __init__( Tuple[Iterable[str], "defer.Deferred[Dict[str, _EventRow]]"] ] = [] self._event_fetch_ongoing = 0 - event_fetch_ongoing_gauge.labels(**{SERVER_NAME_LABEL: self.server_name}).set( - self._event_fetch_ongoing + event_fetch_ongoing_gauge.set( + self._event_fetch_ongoing, {SERVER_NAME_LABEL: self.server_name} ) # We define this sequence here so that it can be referenced from both @@ -1145,9 +1143,9 @@ def _maybe_start_fetch_thread(self) -> None: and self._event_fetch_ongoing < EVENT_QUEUE_THREADS ): self._event_fetch_ongoing += 1 - event_fetch_ongoing_gauge.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).set(self._event_fetch_ongoing) + event_fetch_ongoing_gauge.set( + self._event_fetch_ongoing, {SERVER_NAME_LABEL: self.server_name} + ) # `_event_fetch_ongoing` is decremented in `_fetch_thread`. should_start = True else: @@ -1169,9 +1167,9 @@ async def _fetch_thread(self) -> None: event_fetches_to_fail = [] with self._event_fetch_lock: self._event_fetch_ongoing -= 1 - event_fetch_ongoing_gauge.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).set(self._event_fetch_ongoing) + event_fetch_ongoing_gauge.set( + self._event_fetch_ongoing, {SERVER_NAME_LABEL: self.server_name} + ) # There may still be work remaining in `_event_fetch_list` if we # failed, or it was added in between us deciding to exit and diff --git a/synapse/util/batching_queue.py b/synapse/util/batching_queue.py index f77301afd8..01596f13ea 100644 --- a/synapse/util/batching_queue.py +++ b/synapse/util/batching_queue.py @@ -33,12 +33,12 @@ TypeVar, ) -from prometheus_client import Gauge +from opentelemetry.metrics._internal.instrument import Gauge from twisted.internet import defer from synapse.logging.context import PreserveLoggingContext, make_deferred_yieldable -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.util.clock import Clock if TYPE_CHECKING: @@ -50,22 +50,19 @@ V = TypeVar("V") R = TypeVar("R") -number_queued = Gauge( +number_queued = meter.create_gauge( "synapse_util_batching_queue_number_queued", - "The number of items waiting in the queue across all keys", - labelnames=("name", SERVER_NAME_LABEL), + description="The number of items waiting in the queue across all keys", ) -number_in_flight = Gauge( +number_in_flight = meter.create_gauge( "synapse_util_batching_queue_number_pending", - "The number of items across all keys either being processed or waiting in a queue", - labelnames=("name", SERVER_NAME_LABEL), + description="The number of items across all keys either being processed or waiting in a queue", ) -number_of_keys = Gauge( +number_of_keys = meter.create_gauge( "synapse_util_batching_queue_number_of_keys", - "The number of distinct keys that have items queued", - labelnames=("name", SERVER_NAME_LABEL), + description="The number of distinct keys that have items queued", ) diff --git a/synapse/util/caches/deferred_cache.py b/synapse/util/caches/deferred_cache.py index 016acbac71..a11b30876f 100644 --- a/synapse/util/caches/deferred_cache.py +++ b/synapse/util/caches/deferred_cache.py @@ -38,21 +38,18 @@ cast, ) -from prometheus_client import Gauge - from twisted.internet import defer from twisted.python.failure import Failure -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.util.async_helpers import ObservableDeferred from synapse.util.caches.lrucache import LruCache from synapse.util.caches.treecache import TreeCache, iterate_tree_cache_entry from synapse.util.clock import Clock -cache_pending_metric = Gauge( +cache_pending_metric = meter.create_gauge( "synapse_util_caches_cache_pending", - "Number of lookups currently pending for this cache", - labelnames=["name", SERVER_NAME_LABEL], + description="Number of lookups currently pending for this cache", ) T = TypeVar("T") From 68f2073e8d64a44193ec8a835853e01a995e82d7 Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Thu, 11 Dec 2025 10:27:58 +0100 Subject: [PATCH 06/18] convert simple histograms --- synapse/api/auth/__init__.py | 9 +++------ synapse/federation/federation_server.py | 7 ++----- synapse/handlers/sliding_sync/__init__.py | 8 +++----- synapse/http/request_metrics.py | 7 ++----- synapse/metrics/__init__.py | 5 ++--- synapse/state/__init__.py | 10 ++++------ synapse/storage/database.py | 9 +++------ 7 files changed, 19 insertions(+), 36 deletions(-) diff --git a/synapse/api/auth/__init__.py b/synapse/api/auth/__init__.py index d253938329..0b4beb08bc 100644 --- a/synapse/api/auth/__init__.py +++ b/synapse/api/auth/__init__.py @@ -20,13 +20,11 @@ # from typing import TYPE_CHECKING, Optional, Protocol, Tuple -from prometheus_client import Histogram - from twisted.web.server import Request from synapse.appservice import ApplicationService from synapse.http.site import SynapseRequest -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.types import Requester if TYPE_CHECKING: @@ -36,10 +34,9 @@ GUEST_DEVICE_ID = "guest_device" -introspection_response_timer = Histogram( +introspection_response_timer = meter.create_histogram( "synapse_api_auth_delegated_introspection_response", - "Time taken to get a response for an introspection request", - labelnames=["code", SERVER_NAME_LABEL], + description="Time taken to get a response for an introspection request", ) diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py index acd7e38232..162c8a80d6 100644 --- a/synapse/federation/federation_server.py +++ b/synapse/federation/federation_server.py @@ -35,8 +35,6 @@ Union, ) -from prometheus_client import Histogram - from twisted.python import failure from synapse.api.constants import ( @@ -113,10 +111,9 @@ "synapse_federation_server_received_queries", ) -pdu_process_time = Histogram( +pdu_process_time = meter.create_histogram( "synapse_federation_server_pdu_process_time", - "Time taken to process an event", - labelnames=[SERVER_NAME_LABEL], + description="Time taken to process an event", ) last_pdu_ts_metric = meter.create_gauge( diff --git a/synapse/handlers/sliding_sync/__init__.py b/synapse/handlers/sliding_sync/__init__.py index 255a041d0e..9188371453 100644 --- a/synapse/handlers/sliding_sync/__init__.py +++ b/synapse/handlers/sliding_sync/__init__.py @@ -17,7 +17,6 @@ from itertools import chain from typing import TYPE_CHECKING, AbstractSet, Dict, List, Mapping, Optional, Set, Tuple -from prometheus_client import Histogram from typing_extensions import assert_never from synapse.api.constants import Direction, EventTypes, Membership @@ -38,7 +37,7 @@ tag_args, trace, ) -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.storage.databases.main.roommember import extract_heroes_from_room_summary from synapse.storage.databases.main.state_deltas import StateDelta from synapse.storage.databases.main.stream import PaginateFunction @@ -77,10 +76,9 @@ logger = logging.getLogger(__name__) -sync_processing_time = Histogram( +sync_processing_time = meter.create_histogram( "synapse_sliding_sync_processing_time", - "Time taken to generate a sliding sync response, ignoring wait times.", - labelnames=["initial", SERVER_NAME_LABEL], + description="Time taken to generate a sliding sync response, ignoring wait times.", ) # Limit the number of state_keys we should remember sending down the connection for each diff --git a/synapse/http/request_metrics.py b/synapse/http/request_metrics.py index c882aa9edc..9450d6be62 100644 --- a/synapse/http/request_metrics.py +++ b/synapse/http/request_metrics.py @@ -24,8 +24,6 @@ import traceback from typing import Dict, Mapping, Set, Tuple -from prometheus_client.core import Histogram - from synapse.logging.context import current_context from synapse.metrics import SERVER_NAME_LABEL, LaterGauge, meter @@ -45,10 +43,9 @@ "synapse_http_server_responses", ) -response_timer = Histogram( +response_timer = meter.create_histogram( "synapse_http_server_response_time_seconds", - "sec", - labelnames=["method", "servlet", "tag", "code", SERVER_NAME_LABEL], + unit="sec", ) response_ru_utime = meter.create_counter( diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 1b8b6fcd2f..d65e637981 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -644,10 +644,9 @@ def collect(self) -> Iterable[Metric]: # finished being processed. event_processing_lag = meter.create_gauge("synapse_event_processing_lag") -event_processing_lag_by_event = Histogram( +event_processing_lag_by_event = meter.create_histogram( "synapse_event_processing_lag_by_event", - "Time between an event being persisted and it being queued up to be sent to the relevant remote servers", - labelnames=["name", SERVER_NAME_LABEL], + description="Time between an event being persisted and it being queued up to be sent to the relevant remote servers", ) # Build info of the running server. diff --git a/synapse/state/__init__.py b/synapse/state/__init__.py index 76c3fe51d6..087b07038e 100644 --- a/synapse/state/__init__.py +++ b/synapse/state/__init__.py @@ -618,15 +618,13 @@ class _StateResMetrics: "expensive room for state resolution", ) -_cpu_times = Histogram( +_cpu_times = meter.create_histogram( "synapse_state_res_cpu_for_all_rooms_seconds", - "CPU time (utime+stime) spent computing a single state resolution", - labelnames=[SERVER_NAME_LABEL], + description="CPU time (utime+stime) spent computing a single state resolution", ) -_db_times = Histogram( +_db_times = meter.create_histogram( "synapse_state_res_db_for_all_rooms_seconds", - "Database time spent computing a single state resolution", - labelnames=[SERVER_NAME_LABEL], + description="Database time spent computing a single state resolution", ) diff --git a/synapse/storage/database.py b/synapse/storage/database.py index a24a62ed66..438bb975c5 100644 --- a/synapse/storage/database.py +++ b/synapse/storage/database.py @@ -47,7 +47,6 @@ ) import attr -from prometheus_client import Histogram from typing_extensions import Concatenate, ParamSpec from twisted.enterprise import adbapi @@ -81,13 +80,11 @@ transaction_logger = logging.getLogger("synapse.storage.txn") perf_logger = logging.getLogger("synapse.storage.TIME") -sql_scheduling_timer = Histogram( - "synapse_storage_schedule_time", "sec", labelnames=[SERVER_NAME_LABEL] +sql_scheduling_timer = meter.create_histogram( + "synapse_storage_schedule_time", unit="sec" ) -sql_query_timer = Histogram( - "synapse_storage_query_time", "sec", labelnames=["verb", SERVER_NAME_LABEL] -) +sql_query_timer = meter.create_histogram("synapse_storage_query_time", unit="sec") sql_txn_count = meter.create_counter( "synapse_storage_transaction_time_count", description="sec" ) From bb60a79b7c5dff84ad8c51a4779424a831b84f4c Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Thu, 11 Dec 2025 11:21:40 +0100 Subject: [PATCH 07/18] more conversion --- synapse/api/auth/mas.py | 20 +++++---- synapse/api/auth/msc3861_delegated.py | 20 +++++---- synapse/app/_base.py | 17 ++++---- synapse/federation/federation_server.py | 14 ++++--- synapse/federation/sender/__init__.py | 40 +++++++++++------- .../federation/sender/transaction_manager.py | 11 +++-- synapse/handlers/appservice.py | 11 +++-- synapse/handlers/federation.py | 13 +++--- synapse/handlers/federation_event.py | 12 ++---- synapse/metrics/_gc.py | 8 ++-- synapse/replication/tcp/external_cache.py | 11 ++--- synapse/rest/client/room.py | 16 ++++---- synapse/state/__init__.py | 22 +++++----- synapse/storage/controllers/persist_events.py | 41 ++++++++++++------- synapse/util/ratelimitutils.py | 31 +++++++------- 15 files changed, 155 insertions(+), 132 deletions(-) diff --git a/synapse/api/auth/mas.py b/synapse/api/auth/mas.py index baa6b27336..5990727fb1 100644 --- a/synapse/api/auth/mas.py +++ b/synapse/api/auth/mas.py @@ -236,23 +236,25 @@ async def _introspect_token( ) except HttpResponseException as e: end_time = self._clock.time() - introspection_response_timer.labels( - code=e.code, **{SERVER_NAME_LABEL: self.server_name} - ).observe(end_time - start_time) + introspection_response_timer.record( + end_time - start_time, + {"code": e.code, SERVER_NAME_LABEL: self.server_name}, + ) raise except Exception: end_time = self._clock.time() - introspection_response_timer.labels( - code="ERR", **{SERVER_NAME_LABEL: self.server_name} - ).observe(end_time - start_time) + introspection_response_timer.record( + end_time - start_time, + {"code": "ERR", SERVER_NAME_LABEL: self.server_name}, + ) raise logger.debug("Fetched token from MAS") end_time = self._clock.time() - introspection_response_timer.labels( - code=200, **{SERVER_NAME_LABEL: self.server_name} - ).observe(end_time - start_time) + introspection_response_timer.record( + end_time - start_time, {"code": 200, SERVER_NAME_LABEL: self.server_name} + ) raw_response = json_decoder.decode(resp_body.decode("utf-8")) try: diff --git a/synapse/api/auth/msc3861_delegated.py b/synapse/api/auth/msc3861_delegated.py index b6adcc83dc..3d017b4969 100644 --- a/synapse/api/auth/msc3861_delegated.py +++ b/synapse/api/auth/msc3861_delegated.py @@ -334,23 +334,25 @@ async def _introspect_token( ) except HttpResponseException as e: end_time = self._clock.time() - introspection_response_timer.labels( - code=e.code, **{SERVER_NAME_LABEL: self.server_name} - ).observe(end_time - start_time) + introspection_response_timer.record( + end_time - start_time, + {"code": e.code, SERVER_NAME_LABEL: self.server_name}, + ) raise except Exception: end_time = self._clock.time() - introspection_response_timer.labels( - code="ERR", **{SERVER_NAME_LABEL: self.server_name} - ).observe(end_time - start_time) + introspection_response_timer.record( + end_time - start_time, + {"code": "ERR", SERVER_NAME_LABEL: self.server_name}, + ) raise logger.debug("Fetched token from MAS") end_time = self._clock.time() - introspection_response_timer.labels( - code=200, **{SERVER_NAME_LABEL: self.server_name} - ).observe(end_time - start_time) + introspection_response_timer.record( + end_time - start_time, {"code": 200, SERVER_NAME_LABEL: self.server_name} + ) resp = json_decoder.decode(resp_body.decode("utf-8")) diff --git a/synapse/app/_base.py b/synapse/app/_base.py index f91834e9cd..94d8a80a6a 100644 --- a/synapse/app/_base.py +++ b/synapse/app/_base.py @@ -643,13 +643,16 @@ def run_sighup(*args: Any, **kwargs: Any) -> None: module ) # Set module info metrics for prometheus - module_instances_info.labels( - package_name=package_name, - # what is given in the config - module_name=module_name, - module_version=module_version, - **{SERVER_NAME_LABEL: hs.hostname}, - ).set(1) + module_instances_info.set( + 1, + { + "package_name": package_name, + # what is given in the config + "module_name": module_name, + "module_version": module_version, + SERVER_NAME_LABEL: hs.hostname, + }, + ) logger.info("Loaded module %s", m) if hs.config.auto_accept_invites.enabled: diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py index 162c8a80d6..cc61c8cc6c 100644 --- a/synapse/federation/federation_server.py +++ b/synapse/federation/federation_server.py @@ -545,9 +545,10 @@ async def process_pdu(pdu: EventBase) -> JsonDict: ) if newest_pdu_ts and origin in self._federation_metrics_domains: - last_pdu_ts_metric.labels( - origin_server_name=origin, **{SERVER_NAME_LABEL: self.server_name} - ).set(newest_pdu_ts / 1000) + last_pdu_ts_metric.set( + newest_pdu_ts / 1000, + {"origin_server_name": origin, SERVER_NAME_LABEL: self.server_name}, + ) return pdu_results @@ -1347,9 +1348,10 @@ async def _process_incoming_pdus_in_room_inner( origin, event.event_id ) if received_ts is not None: - pdu_process_time.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).observe((self._clock.time_msec() - received_ts) / 1000) + pdu_process_time.record( + (self._clock.time_msec() - received_ts) / 1000, + {SERVER_NAME_LABEL: self.server_name}, + ) next = await self._get_next_nonspam_staged_event_for_room( room_id, room_version diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index 4f123a6c0c..35234ed3ca 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -717,10 +717,13 @@ async def handle_event(event: EventBase) -> None: now = self.clock.time_msec() ts = event_to_received_ts[event.event_id] assert ts is not None - synapse.metrics.event_processing_lag_by_event.labels( - name="federation_sender", - **{SERVER_NAME_LABEL: self.server_name}, - ).observe((now - ts) / 1000) + synapse.metrics.event_processing_lag_by_event.record( + (now - ts) / 1000, + { + "name": "federation_sender", + SERVER_NAME_LABEL: self.server_name, + }, + ) async def handle_room_events(events: List[EventBase]) -> None: logger.debug( @@ -762,14 +765,20 @@ async def handle_room_events(events: List[EventBase]) -> None: ts = max(t for t in event_to_received_ts.values() if t) assert ts is not None - synapse.metrics.event_processing_lag.labels( - name="federation_sender", - **{SERVER_NAME_LABEL: self.server_name}, - ).set(now - ts) - synapse.metrics.event_processing_last_ts.labels( - name="federation_sender", - **{SERVER_NAME_LABEL: self.server_name}, - ).set(ts) + synapse.metrics.event_processing_lag.set( + now - ts, + { + "name": "federation_sender", + SERVER_NAME_LABEL: self.server_name, + }, + ) + synapse.metrics.event_processing_last_ts.set( + ts, + { + "name": "federation_sender", + SERVER_NAME_LABEL: self.server_name, + }, + ) events_processed_counter.add( len(event_entries), {SERVER_NAME_LABEL: self.server_name} @@ -791,9 +800,10 @@ async def handle_room_events(events: List[EventBase]) -> None: }, ) - synapse.metrics.event_processing_positions.labels( - name="federation_sender", **{SERVER_NAME_LABEL: self.server_name} - ).set(next_token) + synapse.metrics.event_processing_positions.set( + next_token, + {"name": "federation_sender", SERVER_NAME_LABEL: self.server_name}, + ) finally: self._is_processing = False diff --git a/synapse/federation/sender/transaction_manager.py b/synapse/federation/sender/transaction_manager.py index 83a86b30bb..3f8d91475d 100644 --- a/synapse/federation/sender/transaction_manager.py +++ b/synapse/federation/sender/transaction_manager.py @@ -201,7 +201,10 @@ def json_data_cb() -> JsonDict: if pdus and destination in self._federation_metrics_domains: last_pdu = pdus[-1] - last_pdu_ts_metric.labels( - destination_server_name=destination, - **{SERVER_NAME_LABEL: self.server_name}, - ).set(last_pdu.origin_server_ts / 1000) + last_pdu_ts_metric.set( + last_pdu.origin_server_ts / 1000, + { + "destination_server_name": destination, + SERVER_NAME_LABEL: self.server_name, + }, + ) diff --git a/synapse/handlers/appservice.py b/synapse/handlers/appservice.py index 267765a3a9..db37d46103 100644 --- a/synapse/handlers/appservice.py +++ b/synapse/handlers/appservice.py @@ -183,10 +183,13 @@ async def start_scheduler() -> None: ts = event_to_received_ts[event.event_id] assert ts is not None - synapse.metrics.event_processing_lag_by_event.labels( - name="appservice_sender", - **{SERVER_NAME_LABEL: self.server_name}, - ).observe((now - ts) / 1000) + synapse.metrics.event_processing_lag_by_event.record( + (now - ts) / 1000, + { + "name": "appservice_sender", + SERVER_NAME_LABEL: self.server_name, + }, + ) async def handle_room_events(events: Iterable[EventBase]) -> None: for event in events: diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index 73505a0dc1..17197634da 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -40,7 +40,6 @@ ) import attr -from prometheus_client import Histogram from signedjson.key import decode_verify_key_bytes from signedjson.sign import verify_signed_json from unpaddedbase64 import decode_base64 @@ -71,7 +70,7 @@ from synapse.http.servlet import assert_params_in_dict from synapse.logging.context import nested_logging_context from synapse.logging.opentracing import SynapseTags, set_tag, tag_args, trace -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.module_api import NOT_SPAM from synapse.storage.databases.main.events_worker import EventRedactBehaviour from synapse.storage.invite_rule import InviteRule @@ -87,11 +86,10 @@ logger = logging.getLogger(__name__) # Added to debug performance and track progress on optimizations -backfill_processing_before_timer = Histogram( +backfill_processing_before_timer = meter.create_histogram( "synapse_federation_backfill_processing_before_time_seconds", - "sec", - labelnames=[SERVER_NAME_LABEL], - buckets=( + unit="sec", + explicit_bucket_boundaries_advisory=[ 0.1, 0.5, 1.0, @@ -105,8 +103,7 @@ 40.0, 60.0, 80.0, - "+Inf", - ), + ], ) diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index 82f8f0d0e9..028524386b 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -36,8 +36,6 @@ Tuple, ) -from prometheus_client import Histogram - from synapse import event_auth from synapse.api.constants import ( EventContentFields, @@ -112,11 +110,10 @@ ) # Added to debug performance and track progress on optimizations -backfill_processing_after_timer = Histogram( +backfill_processing_after_timer = meter.create_histogram( "synapse_federation_backfill_processing_after_time_seconds", - "sec", - labelnames=[SERVER_NAME_LABEL], - buckets=( + unit="sec", + explicit_bucket_boundaries_advisory=[ 0.1, 0.25, 0.5, @@ -137,8 +134,7 @@ 120.0, 150.0, 180.0, - "+Inf", - ), + ], ) diff --git a/synapse/metrics/_gc.py b/synapse/metrics/_gc.py index c80ead3611..d0ddae0258 100644 --- a/synapse/metrics/_gc.py +++ b/synapse/metrics/_gc.py @@ -30,7 +30,6 @@ REGISTRY, CounterMetricFamily, GaugeMetricFamily, - Histogram, Metric, ) @@ -59,11 +58,10 @@ gc_unreachable = meter.create_gauge( "python_gc_unreachable_total", description="Unreachable GC objects" ) # type: ignore[missing-server-name-label] -gc_time = Histogram( # type: ignore[missing-server-name-label] +gc_time = meter.create_histogram( # type: ignore[missing-server-name-label] "python_gc_time", - "Time taken to GC (sec)", - ["gen"], - buckets=[ + description="Time taken to GC (sec)", + explicit_bucket_boundaries_advisory=[ 0.0025, 0.005, 0.01, diff --git a/synapse/replication/tcp/external_cache.py b/synapse/replication/tcp/external_cache.py index dcd10b15f2..7a4087df5c 100644 --- a/synapse/replication/tcp/external_cache.py +++ b/synapse/replication/tcp/external_cache.py @@ -22,8 +22,6 @@ import logging from typing import TYPE_CHECKING, Any, Optional -from prometheus_client import Histogram - from synapse.logging import opentracing from synapse.logging.context import make_deferred_yieldable from synapse.metrics import SERVER_NAME_LABEL, meter @@ -44,18 +42,17 @@ description="Number of times we get a cache", ) -response_timer = Histogram( +response_timer = meter.create_histogram( "synapse_external_cache_response_time_seconds", - "Time taken to get a response from Redis for a cache get/set request", - labelnames=["method", SERVER_NAME_LABEL], - buckets=( + description="Time taken to get a response from Redis for a cache get/set request", + explicit_bucket_boundaries_advisory=[ 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, - ), + ], ) diff --git a/synapse/rest/client/room.py b/synapse/rest/client/room.py index 1084139df0..4d57908017 100644 --- a/synapse/rest/client/room.py +++ b/synapse/rest/client/room.py @@ -28,8 +28,6 @@ from typing import TYPE_CHECKING, Awaitable, Dict, List, Optional, Tuple from urllib import parse as urlparse -from prometheus_client.core import Histogram - from twisted.web.server import Request from synapse import event_auth @@ -65,7 +63,7 @@ from synapse.http.site import SynapseRequest from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.logging.opentracing import set_tag -from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics import SERVER_NAME_LABEL, meter from synapse.rest.client._base import client_patterns from synapse.rest.client.transactions import HttpTransactionCache from synapse.state import CREATE_KEY, POWER_KEY @@ -113,15 +111,16 @@ def from_member_count(member_count: int) -> "_RoomSize": # greater than 10s. We use a separate dedicated histogram with its own buckets # so that we don't increase the cardinality of the general one because it's # multiplied across hundreds of servlets. -messsages_response_timer = Histogram( +messsages_response_timer = meter.create_histogram( "synapse_room_message_list_rest_servlet_response_time_seconds", - "sec", + unit="sec", # We have a label for room size so we can try to see a more realistic # picture of /messages response time for bigger rooms. We don't want the # tiny rooms that can always respond fast skewing our results when we're trying # to optimize the bigger cases. - labelnames=["room_size", SERVER_NAME_LABEL], - buckets=( + # labelnames=["room_size", SERVER_NAME_LABEL], + # ^ not needed to be initialized with otel, keeping this here for the comment above + explicit_bucket_boundaries_advisory=[ 0.005, 0.01, 0.025, @@ -141,8 +140,7 @@ def from_member_count(member_count: int) -> "_RoomSize": 120.0, 150.0, 180.0, - "+Inf", - ), + ], ) diff --git a/synapse/state/__init__.py b/synapse/state/__init__.py index 087b07038e..dd1cf4622f 100644 --- a/synapse/state/__init__.py +++ b/synapse/state/__init__.py @@ -40,7 +40,6 @@ import attr from immutabledict import immutabledict from opentelemetry.metrics import Counter -from prometheus_client import Histogram from synapse.api.constants import EventTypes from synapse.api.room_versions import KNOWN_ROOM_VERSIONS, StateResolutionVersions @@ -74,11 +73,10 @@ metrics_logger = logging.getLogger("synapse.state.metrics") # Metrics for number of state groups involved in a resolution. -state_groups_histogram = Histogram( +state_groups_histogram = meter.create_histogram( "synapse_state_number_state_groups_in_resolution", - "Number of state groups used when performing a state resolution", - labelnames=[SERVER_NAME_LABEL], - buckets=(1, 2, 3, 5, 7, 10, 15, 20, 50, 100, 200, 500, "+Inf"), + description="Number of state groups used when performing a state resolution", + explicit_bucket_boundaries_advisory=[1, 2, 3, 5, 7, 10, 15, 20, 50, 100, 200, 500], ) @@ -742,9 +740,9 @@ async def resolve_state_groups( f"State groups have been deleted: {shortstr(missing_state_groups)}" ) - state_groups_histogram.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).observe(len(state_groups_ids)) + state_groups_histogram.record( + len(state_groups_ids), {SERVER_NAME_LABEL: self.server_name} + ) new_state = await self.resolve_events_with_store( room_id, @@ -831,11 +829,11 @@ def _record_state_res_metrics( room_metrics.db_time += rusage.db_txn_duration_sec room_metrics.db_events += rusage.evt_db_fetch_count - _cpu_times.labels(**{SERVER_NAME_LABEL: self.server_name}).observe( - rusage.ru_utime + rusage.ru_stime + _cpu_times.record( + rusage.ru_utime + rusage.ru_stime, {SERVER_NAME_LABEL: self.server_name} ) - _db_times.labels(**{SERVER_NAME_LABEL: self.server_name}).observe( - rusage.db_txn_duration_sec + _db_times.record( + rusage.db_txn_duration_sec, {SERVER_NAME_LABEL: self.server_name} ) def _report_metrics(self) -> None: diff --git a/synapse/storage/controllers/persist_events.py b/synapse/storage/controllers/persist_events.py index e1e951ff21..102626f21b 100644 --- a/synapse/storage/controllers/persist_events.py +++ b/synapse/storage/controllers/persist_events.py @@ -45,7 +45,6 @@ ) import attr -from prometheus_client import Histogram from twisted.internet import defer @@ -98,20 +97,32 @@ ) # The number of forward extremities for each new event. -forward_extremities_counter = Histogram( +forward_extremities_counter = meter.create_histogram( "synapse_storage_events_forward_extremities_persisted", - "Number of forward extremities for each new event", - labelnames=[SERVER_NAME_LABEL], - buckets=(1, 2, 3, 5, 7, 10, 15, 20, 50, 100, 200, 500, "+Inf"), + description="Number of forward extremities for each new event", + explicit_bucket_boundaries_advisory=[1, 2, 3, 5, 7, 10, 15, 20, 50, 100, 200, 500], ) # The number of stale forward extremities for each new event. Stale extremities # are those that were in the previous set of extremities as well as the new. -stale_forward_extremities_counter = Histogram( +stale_forward_extremities_counter = meter.create_histogram( "synapse_storage_events_stale_forward_extremities_persisted", - "Number of unchanged forward extremities for each new event", - labelnames=[SERVER_NAME_LABEL], - buckets=(0, 1, 2, 3, 5, 7, 10, 15, 20, 50, 100, 200, 500, "+Inf"), + description="Number of unchanged forward extremities for each new event", + explicit_bucket_boundaries_advisory=[ + 0, + 1, + 2, + 3, + 5, + 7, + 10, + 15, + 20, + 50, + 100, + 200, + 500, + ], ) state_resolutions_during_persistence = meter.create_counter( @@ -843,13 +854,13 @@ async def _calculate_new_extremities( # We only update metrics for events that change forward extremities # (e.g. we ignore backfill/outliers/etc) if result != latest_event_ids: - forward_extremities_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).observe(len(result)) + forward_extremities_counter.record( + len(result), {SERVER_NAME_LABEL: self.server_name} + ) stale = latest_event_ids & result - stale_forward_extremities_counter.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).observe(len(stale)) + stale_forward_extremities_counter.record( + len(stale), {SERVER_NAME_LABEL: self.server_name} + ) return result diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index 1d2c8a0126..7c8b2dbd43 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -23,6 +23,7 @@ import contextlib import logging import threading +import time import typing from typing import ( Any, @@ -50,7 +51,7 @@ run_in_background, ) from synapse.logging.opentracing import start_active_span -from synapse.metrics import SERVER_NAME_LABEL, Histogram, LaterGauge, meter +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge, meter from synapse.util.clock import Clock if typing.TYPE_CHECKING: @@ -68,11 +69,10 @@ "synapse_rate_limit_reject", description="Number of requests rejected by the rate limiter", ) -queue_wait_timer = Histogram( +queue_wait_timer = meter.create_histogram( "synapse_rate_limit_queue_wait_time_seconds", - "Amount of time spent waiting for the rate limiter to let our request through.", - labelnames=["rate_limiter_name", SERVER_NAME_LABEL], - buckets=( + description="Amount of time spent waiting for the rate limiter to let our request through.", + explicit_bucket_boundaries_advisory=[ 0.005, 0.01, 0.025, @@ -86,8 +86,7 @@ 5.0, 10.0, 20.0, - "+Inf", - ), + ], ) @@ -289,15 +288,19 @@ def should_sleep(self) -> bool: return len(self.request_times) > self.sleep_limit async def _on_enter_with_tracing(self, request_id: object) -> None: - maybe_metrics_cm: ContextManager = contextlib.nullcontext() - if self.metrics_name: - maybe_metrics_cm = queue_wait_timer.labels( - rate_limiter_name=self.metrics_name, - **{SERVER_NAME_LABEL: self.our_server_name}, - ).time() - with start_active_span("ratelimit wait"), maybe_metrics_cm: + with start_active_span("ratelimit wait"): + start = time.perf_counter() await self._on_enter(request_id) + if self.metrics_name: + queue_wait_timer.record( + time.perf_counter() - start, + { + "rate_limiter_name": self.metrics_name, + SERVER_NAME_LABEL: self.our_server_name, + }, + ) + def _on_enter(self, request_id: object) -> "defer.Deferred[None]": time_now = self.clock.time_msec() From 3d3d5467b039b31d4df4367006ee321039fef7ed Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Thu, 11 Dec 2025 14:55:02 +0100 Subject: [PATCH 08/18] updated almost all labels --- synapse/handlers/appservice.py | 33 +++++++++------ synapse/handlers/delayed_events.py | 14 ++++--- synapse/handlers/federation.py | 7 ++-- synapse/handlers/presence.py | 6 +-- synapse/handlers/room_member.py | 6 +-- synapse/handlers/sliding_sync/__init__.py | 7 ++-- synapse/handlers/stats.py | 16 ++++---- synapse/handlers/user_directory.py | 6 +-- synapse/http/request_metrics.py | 11 +++-- synapse/metrics/__init__.py | 41 +++++++++++-------- synapse/metrics/_gc.py | 4 +- synapse/push/pusherpool.py | 41 +++++++++++-------- synapse/replication/http/_base.py | 2 +- synapse/replication/tcp/external_cache.py | 15 ++++--- synapse/rest/client/room.py | 11 +++-- synapse/storage/database.py | 12 +++--- .../databases/main/event_federation.py | 10 ++--- synapse/storage/databases/main/events.py | 6 +-- synapse/util/batching_queue.py | 14 ++++--- synapse/util/caches/deferred_cache.py | 7 ++-- tests/metrics/test_metrics.py | 15 ++++--- 21 files changed, 162 insertions(+), 122 deletions(-) diff --git a/synapse/handlers/appservice.py b/synapse/handlers/appservice.py index db37d46103..50e9fa02a0 100644 --- a/synapse/handlers/appservice.py +++ b/synapse/handlers/appservice.py @@ -207,10 +207,13 @@ async def handle_room_events(events: Iterable[EventBase]) -> None: await self.store.set_appservice_last_pos(upper_bound) - synapse.metrics.event_processing_positions.labels( - name="appservice_sender", - **{SERVER_NAME_LABEL: self.server_name}, - ).set(upper_bound) + synapse.metrics.event_processing_positions.set( + upper_bound, + { + "name": "appservice_sender", + SERVER_NAME_LABEL: self.server_name, + }, + ) events_processed_counter.add( len(events), {SERVER_NAME_LABEL: self.server_name} @@ -237,14 +240,20 @@ async def handle_room_events(events: Iterable[EventBase]) -> None: ts = event_to_received_ts[events[-1].event_id] assert ts is not None - synapse.metrics.event_processing_lag.labels( - name="appservice_sender", - **{SERVER_NAME_LABEL: self.server_name}, - ).set(now - ts) - synapse.metrics.event_processing_last_ts.labels( - name="appservice_sender", - **{SERVER_NAME_LABEL: self.server_name}, - ).set(ts) + synapse.metrics.event_processing_lag.set( + now - ts, + { + "name": "appservice_sender", + SERVER_NAME_LABEL: self.server_name, + }, + ) + synapse.metrics.event_processing_last_ts.set( + ts, + { + "name": "appservice_sender", + SERVER_NAME_LABEL: self.server_name, + }, + ) finally: self.is_processing = False diff --git a/synapse/handlers/delayed_events.py b/synapse/handlers/delayed_events.py index 79dd3e8416..72c39aac2f 100644 --- a/synapse/handlers/delayed_events.py +++ b/synapse/handlers/delayed_events.py @@ -170,9 +170,10 @@ async def _unsafe_process_new_event(self) -> None: await self._store.update_delayed_events_stream_pos(room_max_stream_ordering) - event_processing_positions.labels( - name="delayed_events", **{SERVER_NAME_LABEL: self.server_name} - ).set(room_max_stream_ordering) + event_processing_positions.set( + room_max_stream_ordering, + {"name": "delayed_events", SERVER_NAME_LABEL: self.server_name}, + ) return @@ -220,9 +221,10 @@ async def _unsafe_process_new_event(self) -> None: self._event_pos = max_pos # Expose current event processing position to prometheus - event_processing_positions.labels( - name="delayed_events", **{SERVER_NAME_LABEL: self.server_name} - ).set(max_pos) + event_processing_positions.set( + max_pos, + {"name": "delayed_events", SERVER_NAME_LABEL: self.server_name}, + ) await self._store.update_delayed_events_stream_pos(max_pos) diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index 17197634da..e1b5d4488e 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -527,9 +527,10 @@ async def try_backfill(domains: StrCollection) -> None: # backfill points regardless of `current_depth`. if processing_start_time is not None: processing_end_time = self.clock.time_msec() - backfill_processing_before_timer.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).observe((processing_end_time - processing_start_time) / 1000) + backfill_processing_before_timer.record( + (processing_end_time - processing_start_time) / 1000, + {SERVER_NAME_LABEL: self.server_name}, + ) # TODO: we could also try servers which were previously in the room, but # are no longer. diff --git a/synapse/handlers/presence.py b/synapse/handlers/presence.py index cc7cc83253..0dad80a598 100644 --- a/synapse/handlers/presence.py +++ b/synapse/handlers/presence.py @@ -1556,9 +1556,9 @@ async def _unsafe_process(self) -> None: self._event_pos = max_pos # Expose current event processing position to prometheus - synapse.metrics.event_processing_positions.labels( - name="presence", **{SERVER_NAME_LABEL: self.server_name} - ).set(max_pos) + synapse.metrics.event_processing_positions.set( + max_pos, {"name": "presence", SERVER_NAME_LABEL: self.server_name} + ) async def _handle_state_delta(self, room_id: str, deltas: List[StateDelta]) -> None: """Process current state deltas for the room to find new joins that need diff --git a/synapse/handlers/room_member.py b/synapse/handlers/room_member.py index 2ab9b70f8c..6575056056 100644 --- a/synapse/handlers/room_member.py +++ b/synapse/handlers/room_member.py @@ -2264,9 +2264,9 @@ async def _unsafe_process(self) -> None: self.pos = max_pos # Expose current event processing position to prometheus - event_processing_positions.labels( - name="room_forgetter", **{SERVER_NAME_LABEL: self.server_name} - ).set(max_pos) + event_processing_positions.set( + max_pos, {"name": "room_forgetter", SERVER_NAME_LABEL: self.server_name} + ) await self._store.update_room_forgetter_stream_pos(max_pos) diff --git a/synapse/handlers/sliding_sync/__init__.py b/synapse/handlers/sliding_sync/__init__.py index 9188371453..448c96fae0 100644 --- a/synapse/handlers/sliding_sync/__init__.py +++ b/synapse/handlers/sliding_sync/__init__.py @@ -376,9 +376,10 @@ async def handle_room(room_id: str) -> None: set_tag(SynapseTags.FUNC_ARG_PREFIX + "sync_config.user", user_id) end_time_s = self.clock.time() - sync_processing_time.labels( - initial=from_token is not None, **{SERVER_NAME_LABEL: self.server_name} - ).observe(end_time_s - start_time_s) + sync_processing_time.record( + end_time_s - start_time_s, + {"initial": from_token is not None, SERVER_NAME_LABEL: self.server_name}, + ) return sliding_sync_result diff --git a/synapse/handlers/stats.py b/synapse/handlers/stats.py index 91444b5d03..240b034081 100644 --- a/synapse/handlers/stats.py +++ b/synapse/handlers/stats.py @@ -154,9 +154,9 @@ async def _unsafe_process(self) -> None: logger.debug("Handled room stats to %s -> %s", self.pos, max_pos) - event_processing_positions.labels( - name="stats", **{SERVER_NAME_LABEL: self.server_name} - ).set(max_pos) + event_processing_positions.set( + max_pos, {"name": "stats", SERVER_NAME_LABEL: self.server_name} + ) self.pos = max_pos @@ -166,12 +166,12 @@ async def _unsafe_process(self) -> None: ) = await self.store.get_room_stats() # Update room count metrics - known_rooms_gauge.labels(**{SERVER_NAME_LABEL: self.server_name}).set( - known_room_count + known_rooms_gauge.set( + known_room_count, {SERVER_NAME_LABEL: self.server_name} + ) + locally_joined_rooms_gauge.set( + locally_joined_room_count, {SERVER_NAME_LABEL: self.server_name} ) - locally_joined_rooms_gauge.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).set(locally_joined_room_count) async def _handle_deltas( self, deltas: Iterable[StateDelta] diff --git a/synapse/handlers/user_directory.py b/synapse/handlers/user_directory.py index 28961f5925..6a6ed274ce 100644 --- a/synapse/handlers/user_directory.py +++ b/synapse/handlers/user_directory.py @@ -264,9 +264,9 @@ async def _unsafe_process(self) -> None: self.pos = max_pos # Expose current event processing position to prometheus - synapse.metrics.event_processing_positions.labels( - name="user_dir", **{SERVER_NAME_LABEL: self.server_name} - ).set(max_pos) + synapse.metrics.event_processing_positions.set( + max_pos, {"name": "user_dir", SERVER_NAME_LABEL: self.server_name} + ) await self.store.update_user_directory_stream_pos(max_pos) diff --git a/synapse/http/request_metrics.py b/synapse/http/request_metrics.py index 9450d6be62..a676e168f6 100644 --- a/synapse/http/request_metrics.py +++ b/synapse/http/request_metrics.py @@ -216,10 +216,13 @@ def stop(self, time_sec: float, response_code: int, sent_bytes: int) -> None: response_count.add(1, response_base_labels) - response_timer.labels( - code=response_code_str, - **response_base_labels, - ).observe(time_sec - self.start_ts) + response_timer.record( + time_sec - self.start_ts, + { + "code": response_code_str, + **response_base_labels, + }, + ) resource_usage = context.get_resource_usage() diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index d65e637981..838feed735 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -657,11 +657,16 @@ def collect(self) -> Iterable[Metric]: build_info = meter.create_gauge( # type: ignore[missing-server-name-label] "synapse_build_info", description="Build information" ) -build_info.labels( - " ".join([platform.python_implementation(), platform.python_version()]), - SYNAPSE_VERSION, - " ".join([platform.system(), platform.release()]), -).set(1) +build_info.set( + 1, + { + "pythonversion": " ".join( + [platform.python_implementation(), platform.python_version()] + ), + "version": SYNAPSE_VERSION, + "osversion": " ".join([platform.system(), platform.release()]), + }, +) # Loaded modules info module_instances_info = meter.create_gauge( @@ -721,19 +726,19 @@ def register_threadpool(*, name: str, server_name: str, threadpool: ThreadPool) threadpool: The threadpool to register metrics for. """ - threadpool_total_min_threads.labels( - name=name, **{SERVER_NAME_LABEL: server_name} - ).set(threadpool.min) - threadpool_total_max_threads.labels( - name=name, **{SERVER_NAME_LABEL: server_name} - ).set(threadpool.max) - - threadpool_total_threads.labels( - name=name, **{SERVER_NAME_LABEL: server_name} - ).set_function(lambda: len(threadpool.threads)) - threadpool_total_working_threads.labels( - name=name, **{SERVER_NAME_LABEL: server_name} - ).set_function(lambda: len(threadpool.working)) + threadpool_total_min_threads.set( + threadpool.min, {"name": name, SERVER_NAME_LABEL: server_name} + ) + threadpool_total_max_threads.set( + threadpool.max, {"name": name, SERVER_NAME_LABEL: server_name} + ) + + threadpool_total_threads.set( + len(threadpool.threads), {"name": name, SERVER_NAME_LABEL: server_name} + ) + threadpool_total_working_threads.set( + len(threadpool.working), {"name": name, SERVER_NAME_LABEL: server_name} + ) class MetricsResource(Resource): diff --git a/synapse/metrics/_gc.py b/synapse/metrics/_gc.py index d0ddae0258..5a61a8040d 100644 --- a/synapse/metrics/_gc.py +++ b/synapse/metrics/_gc.py @@ -136,8 +136,8 @@ def _maybe_gc() -> None: _last_gc[i] = end - gc_time.labels(i).observe(end - start) - gc_unreachable.labels(i).set(unreachable) + gc_time.record(end - start, {"gen": i}) + gc_unreachable.set(unreachable, {"gen": i}) # We can ignore the lint here since this looping call does not hold a `HomeServer` # reference so can be cleaned up by other means on shutdown. diff --git a/synapse/push/pusherpool.py b/synapse/push/pusherpool.py index c0f87e71c9..a8c5c1b31c 100644 --- a/synapse/push/pusherpool.py +++ b/synapse/push/pusherpool.py @@ -44,7 +44,7 @@ logger = logging.getLogger(__name__) -synapse_pushers = meter.create_gauge( +synapse_pushers = meter.create_up_down_counter( "synapse_pushers", description="Number of active synapse pushers", ) @@ -421,18 +421,24 @@ async def _start_pusher(self, pusher_config: PusherConfig) -> Optional[Pusher]: previous_pusher = byuser[appid_pushkey] previous_pusher.on_stop() - synapse_pushers.labels( - kind=type(previous_pusher).__name__, - app_id=previous_pusher.app_id, - **{SERVER_NAME_LABEL: self.server_name}, - ).dec() + synapse_pushers.add( + -1, + { + "kind": type(previous_pusher).__name__, + "app_id": previous_pusher.app_id, + SERVER_NAME_LABEL: self.server_name, + }, + ) byuser[appid_pushkey] = pusher - synapse_pushers.labels( - kind=type(pusher).__name__, - app_id=pusher.app_id, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + synapse_pushers.add( + 1, + { + "kind": type(pusher).__name__, + "app_id": pusher.app_id, + SERVER_NAME_LABEL: self.server_name, + }, + ) logger.info("Starting pusher %s / %s", pusher.user_id, appid_pushkey) @@ -491,8 +497,11 @@ def maybe_stop_pusher(self, app_id: str, pushkey: str, user_id: str) -> None: pusher = byuser.pop(appid_pushkey) pusher.on_stop() - synapse_pushers.labels( - kind=type(pusher).__name__, - app_id=pusher.app_id, - **{SERVER_NAME_LABEL: self.server_name}, - ).dec() + synapse_pushers.add( + -1, + { + "kind": type(pusher).__name__, + "app_id": pusher.app_id, + SERVER_NAME_LABEL: self.server_name, + }, + ) diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py index 7a93d3f0d2..3eb4cf00d7 100644 --- a/synapse/replication/http/_base.py +++ b/synapse/replication/http/_base.py @@ -47,7 +47,7 @@ logger = logging.getLogger(__name__) -_pending_outgoing_requests = meter.create_gauge( +_pending_outgoing_requests = meter.create_up_down_counter( "synapse_pending_outgoing_replication_requests", description="Number of active outgoing replication requests, by replication method name", ) diff --git a/synapse/replication/tcp/external_cache.py b/synapse/replication/tcp/external_cache.py index 7a4087df5c..ed29d083f1 100644 --- a/synapse/replication/tcp/external_cache.py +++ b/synapse/replication/tcp/external_cache.py @@ -20,6 +20,7 @@ # import logging +import time from typing import TYPE_CHECKING, Any, Optional from synapse.logging import opentracing @@ -126,12 +127,14 @@ async def get(self, cache_name: str, key: str) -> Optional[Any]: "ExternalCache.get", tags={opentracing.SynapseTags.CACHE_NAME: cache_name}, ): - with response_timer.labels( - method="get", **{SERVER_NAME_LABEL: self.server_name} - ).time(): - result = await make_deferred_yieldable( - self._redis_connection.get(self._get_redis_key(cache_name, key)) - ) + start = time.perf_counter() + result = await make_deferred_yieldable( + self._redis_connection.get(self._get_redis_key(cache_name, key)) + ) + response_timer.record( + time.perf_counter() - start, + {"method": "get", SERVER_NAME_LABEL: self.server_name}, + ) logger.debug("Got cache result %s %s: %r", cache_name, key, result) diff --git a/synapse/rest/client/room.py b/synapse/rest/client/room.py index 4d57908017..b8862a494e 100644 --- a/synapse/rest/client/room.py +++ b/synapse/rest/client/room.py @@ -847,10 +847,13 @@ async def on_GET( processing_end_time = self.clock.time_msec() room_member_count = await make_deferred_yieldable(room_member_count_deferred) - messsages_response_timer.labels( - room_size=_RoomSize.from_member_count(room_member_count), - **{SERVER_NAME_LABEL: self.server_name}, - ).observe((processing_end_time - processing_start_time) / 1000) + messsages_response_timer.record( + (processing_end_time - processing_start_time) / 1000, + { + "room_size": str(_RoomSize.from_member_count(room_member_count)), + SERVER_NAME_LABEL: self.server_name, + }, + ) return 200, msgs diff --git a/synapse/storage/database.py b/synapse/storage/database.py index 438bb975c5..1f21cb50e7 100644 --- a/synapse/storage/database.py +++ b/synapse/storage/database.py @@ -521,9 +521,9 @@ def _do_execute( finally: secs = time.time() - start sql_logger.debug("[SQL time] {%s} %f sec", self.name, secs) - sql_query_timer.labels( - verb=sql.split()[0], **{SERVER_NAME_LABEL: self.server_name} - ).observe(secs) + sql_query_timer.record( + secs, {"verb": sql.split()[0], SERVER_NAME_LABEL: self.server_name} + ) def close(self) -> None: self.txn.close() @@ -1043,9 +1043,9 @@ def inner_func(conn: _PoolConnection, *args: P.args, **kwargs: P.kwargs) -> R: operation_name="db.connection", ): sched_duration_sec = monotonic_time() - start_time - sql_scheduling_timer.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).observe(sched_duration_sec) + sql_scheduling_timer.record( + sched_duration_sec, {SERVER_NAME_LABEL: self.server_name} + ) context.add_database_scheduled(sched_duration_sec) if self._txn_limit > 0: diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py index bbb3ec6f29..c4967cd96a 100644 --- a/synapse/storage/databases/main/event_federation.py +++ b/synapse/storage/databases/main/event_federation.py @@ -2304,12 +2304,10 @@ def _get_stats_for_federation_staging_txn( "_get_stats_for_federation_staging", _get_stats_for_federation_staging_txn ) - number_pdus_in_federation_queue.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).set(count) - oldest_pdu_in_federation_staging.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).set(age) + number_pdus_in_federation_queue.set( + count, {SERVER_NAME_LABEL: self.server_name} + ) + oldest_pdu_in_federation_staging.set(age, {SERVER_NAME_LABEL: self.server_name}) async def clean_room_for_join(self, room_id: str) -> None: await self.db_pool.runInteraction( diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index 6d9c799e6d..8c0f1a03b6 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -366,9 +366,9 @@ async def _persist_events_and_state_updates( if not use_negative_stream_ordering: # we don't want to set the event_persisted_position to a negative # stream_ordering. - synapse.metrics.event_persisted_position.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).set(stream) + synapse.metrics.event_persisted_position.set( + stream, {SERVER_NAME_LABEL: self.server_name} + ) for event, context in events_and_contexts: if context.app_service: diff --git a/synapse/util/batching_queue.py b/synapse/util/batching_queue.py index 01596f13ea..481b427772 100644 --- a/synapse/util/batching_queue.py +++ b/synapse/util/batching_queue.py @@ -116,13 +116,15 @@ def __init__( # The function to call with batches of values. self._process_batch_callback = process_batch_callback - number_queued.labels( - name=self._name, **{SERVER_NAME_LABEL: self.server_name} - ).set_function(lambda: sum(len(q) for q in self._next_values.values())) + number_queued.set( + sum(len(q) for q in self._next_values.values()), + {"name": self._name, SERVER_NAME_LABEL: self.server_name}, + ) - number_of_keys.labels( - name=self._name, **{SERVER_NAME_LABEL: self.server_name} - ).set_function(lambda: len(self._next_values)) + number_of_keys.set( + len(self._next_values), + {"name": self._name, SERVER_NAME_LABEL: self.server_name}, + ) self._number_in_flight_metric: Gauge = number_in_flight.labels( name=self._name, **{SERVER_NAME_LABEL: self.server_name} diff --git a/synapse/util/caches/deferred_cache.py b/synapse/util/caches/deferred_cache.py index a11b30876f..a267355196 100644 --- a/synapse/util/caches/deferred_cache.py +++ b/synapse/util/caches/deferred_cache.py @@ -112,9 +112,10 @@ def __init__( ] = cache_type() def metrics_cb() -> None: - cache_pending_metric.labels( - name=name, **{SERVER_NAME_LABEL: server_name} - ).set(len(self._pending_deferred_cache)) + cache_pending_metric.set( + len(self._pending_deferred_cache), + {"name": name, SERVER_NAME_LABEL: server_name}, + ) # cache is used for completed results and maps to the result itself, rather than # a Deferred. diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 36932aea81..13e0fb2ad7 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -420,12 +420,15 @@ def test_module_instances_info_metric(self) -> None: test_module_version = "1.2.3" # Set the metric values - module_instances_info.labels( - module_name=test_module_name, - package_name=test_package_name, - module_version=test_module_version, - server_name=test_server_name, - ).set(1) + module_instances_info.set( + 1, + { + "module_name": test_module_name, + "package_name": test_package_name, + "module_version": test_module_version, + "server_name": test_server_name, + }, + ) metrics_output = generate_latest(REGISTRY) From 5c8e161b10579a3a3a119487fcd266691af3d32f Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Thu, 11 Dec 2025 15:08:10 +0100 Subject: [PATCH 09/18] fix circular import --- synapse/api/auth/__init__.py | 2 +- synapse/metrics/__init__.py | 4 +++- synapse/util/ratelimitutils.py | 1 - 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/synapse/api/auth/__init__.py b/synapse/api/auth/__init__.py index 0b4beb08bc..e544694cae 100644 --- a/synapse/api/auth/__init__.py +++ b/synapse/api/auth/__init__.py @@ -24,7 +24,7 @@ from synapse.appservice import ApplicationService from synapse.http.site import SynapseRequest -from synapse.metrics import SERVER_NAME_LABEL, meter +from synapse.metrics import meter from synapse.types import Requester if TYPE_CHECKING: diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 838feed735..d19b925ad0 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -63,7 +63,6 @@ # This module is imported for its side effects; flake8 needn't warn that it's unused. import synapse.metrics._reactor_metrics # noqa: F401 -from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager from synapse.metrics._types import Collector from synapse.types import StrSequence from synapse.util import SYNAPSE_VERSION @@ -141,6 +140,9 @@ def _set_prometheus_client_use_created_metrics(new_value: bool) -> None: # Global meter for registering otel metrics meter = get_meter_provider().get_meter("synapse") +# Import _gc after meter is defined to avoid circular import +from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager # noqa: E402 + class _RegistryProxy: @staticmethod diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index 7c8b2dbd43..67438c5fb2 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -28,7 +28,6 @@ from typing import ( Any, Callable, - ContextManager, DefaultDict, Dict, Iterator, From 1f8a66d89c041412646d0baa07103bcf2952b282 Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Thu, 11 Dec 2025 15:40:10 +0100 Subject: [PATCH 10/18] synapse can run again --- synapse/metrics/__init__.py | 4 +- synapse/metrics/_gc.py | 4 +- synapse/metrics/background_process_metrics.py | 14 +- synapse/replication/http/_base.py | 271 +++++++++--------- synapse/util/batching_queue.py | 20 +- tests/util/test_batching_queue.py | 15 +- 6 files changed, 169 insertions(+), 159 deletions(-) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index d19b925ad0..fb4e395d43 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -656,9 +656,7 @@ def collect(self) -> Iterable[Metric]: # This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. We # consider this process-level because all Synapse homeservers running in the process # will use the same Synapse version. -build_info = meter.create_gauge( # type: ignore[missing-server-name-label] - "synapse_build_info", description="Build information" -) +build_info = meter.create_gauge("synapse_build_info", description="Build information") build_info.set( 1, { diff --git a/synapse/metrics/_gc.py b/synapse/metrics/_gc.py index 5a61a8040d..a6c04df87a 100644 --- a/synapse/metrics/_gc.py +++ b/synapse/metrics/_gc.py @@ -57,8 +57,8 @@ # These are process-level metrics, so they do not have the `SERVER_NAME_LABEL`. gc_unreachable = meter.create_gauge( "python_gc_unreachable_total", description="Unreachable GC objects" -) # type: ignore[missing-server-name-label] -gc_time = meter.create_histogram( # type: ignore[missing-server-name-label] +) +gc_time = meter.create_histogram( "python_gc_time", description="Time taken to GC (sec)", explicit_bucket_boundaries_advisory=[ diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py index ca73a9514f..e1ac76a1e4 100644 --- a/synapse/metrics/background_process_metrics.py +++ b/synapse/metrics/background_process_metrics.py @@ -82,7 +82,7 @@ description="Number of background processes started", ) -_background_process_in_flight_count = meter.create_gauge( +_background_process_in_flight_count = meter.create_up_down_counter( "synapse_background_process_in_flight_count", description="Number of background processes in flight", ) @@ -264,9 +264,9 @@ async def run() -> Optional[R]: _background_process_start_count.add( 1, {"name": desc, SERVER_NAME_LABEL: server_name} ) - _background_process_in_flight_count.labels( - name=desc, **{SERVER_NAME_LABEL: server_name} - ).inc() + _background_process_in_flight_count.add( + 1, {"name": desc, SERVER_NAME_LABEL: server_name} + ) with BackgroundProcessLoggingContext( name=desc, server_name=server_name, instance_id=count @@ -373,9 +373,9 @@ def combined_context_manager() -> Generator[None, None, None]: ) return None finally: - _background_process_in_flight_count.labels( - name=desc, **{SERVER_NAME_LABEL: server_name} - ).dec() + _background_process_in_flight_count.add( + -1, {"name": desc, SERVER_NAME_LABEL: server_name} + ) # To explain how the log contexts work here: # - When `run_as_background_process` is called, the current context is stored diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py index 3eb4cf00d7..fc21760a4f 100644 --- a/synapse/replication/http/_base.py +++ b/synapse/replication/http/_base.py @@ -209,9 +209,9 @@ def make_client(cls, hs: "HomeServer") -> Callable: instance_map = hs.config.worker.instance_map - outgoing_gauge = _pending_outgoing_requests.labels( - name=cls.NAME, - **{SERVER_NAME_LABEL: server_name}, + outgoing_gauge = meter.create_up_down_counter( + "synapse_pending_outgoing_replication_requests", + description="Number of active outgoing replication requests, by replication method name", ) replication_secret = None @@ -228,154 +228,157 @@ async def send_request( streams = hs.get_replication_command_handler().get_streams_to_replicate() replication = hs.get_replication_data_handler() - with outgoing_gauge.track_inprogress(): - if instance_name == local_instance_name: - raise Exception("Trying to send HTTP request to self") - if instance_name not in instance_map: - raise Exception( - "Instance %r not in 'instance_map' config" % (instance_name,) - ) - - data = await cls._serialize_payload(**kwargs) + outgoing_gauge.add( + 1, + {"name": cls.NAME, SERVER_NAME_LABEL: server_name}, + ) + if instance_name == local_instance_name: + raise Exception("Trying to send HTTP request to self") + if instance_name not in instance_map: + raise Exception( + "Instance %r not in 'instance_map' config" % (instance_name,) + ) - if cls.METHOD != "GET" and cls.WAIT_FOR_STREAMS: - # Include the current stream positions that we write to. We - # don't do this for GETs as they don't have a body, and we - # generally assume that a GET won't rely on data we have - # written. - if _STREAM_POSITION_KEY in data: - raise Exception( - "data to send contains %r key", _STREAM_POSITION_KEY - ) + data = await cls._serialize_payload(**kwargs) - data[_STREAM_POSITION_KEY] = { - "streams": { - stream.NAME: stream.minimal_local_current_token() - for stream in streams - }, - "instance_name": local_instance_name, - } - - url_args = [ - urllib.parse.quote(kwargs[name], safe="") for name in cls.PATH_ARGS - ] - - if cls.CACHE: - txn_id = random_string(10) - url_args.append(txn_id) - - if cls.METHOD == "POST": - request_func: Callable[..., Awaitable[Any]] = ( - client.post_json_get_json - ) - elif cls.METHOD == "PUT": - request_func = client.put_json - elif cls.METHOD == "GET": - request_func = client.get_json - else: - # We have already asserted in the constructor that a - # compatible was picked, but lets be paranoid. + if cls.METHOD != "GET" and cls.WAIT_FOR_STREAMS: + # Include the current stream positions that we write to. We + # don't do this for GETs as they don't have a body, and we + # generally assume that a GET won't rely on data we have + # written. + if _STREAM_POSITION_KEY in data: raise Exception( - "Unknown METHOD on %s replication endpoint" % (cls.NAME,) + "data to send contains %r key", _STREAM_POSITION_KEY ) - # Hard code a special scheme to show this only used for replication. The - # instance_name will be passed into the ReplicationEndpointFactory to - # determine connection details from the instance_map. - uri = "synapse-replication://%s/_synapse/replication/%s/%s" % ( - instance_name, - cls.NAME, - "/".join(url_args), + data[_STREAM_POSITION_KEY] = { + "streams": { + stream.NAME: stream.minimal_local_current_token() + for stream in streams + }, + "instance_name": local_instance_name, + } + + url_args = [ + urllib.parse.quote(kwargs[name], safe="") for name in cls.PATH_ARGS + ] + + if cls.CACHE: + txn_id = random_string(10) + url_args.append(txn_id) + + if cls.METHOD == "POST": + request_func: Callable[..., Awaitable[Any]] = client.post_json_get_json + elif cls.METHOD == "PUT": + request_func = client.put_json + elif cls.METHOD == "GET": + request_func = client.get_json + else: + # We have already asserted in the constructor that a + # compatible was picked, but lets be paranoid. + raise Exception( + "Unknown METHOD on %s replication endpoint" % (cls.NAME,) ) - headers: Dict[bytes, List[bytes]] = {} - # Add an authorization header, if configured. - if replication_secret: - headers[b"Authorization"] = [b"Bearer " + replication_secret] - opentracing.inject_header_dict(headers, check_destination=False) - - try: - # Keep track of attempts made so we can bail if we don't manage to - # connect to the target after N tries. - attempts = 0 - # We keep retrying the same request for timeouts. This is so that we - # have a good idea that the request has either succeeded or failed - # on the master, and so whether we should clean up or not. - while True: - try: - result = await request_func(uri, data, headers=headers) - break - except RequestTimedOutError: - if not cls.RETRY_ON_TIMEOUT: - raise - - logger.warning("%s request timed out; retrying", cls.NAME) - - # If we timed out we probably don't need to worry about backing - # off too much, but lets just wait a little anyway. - await clock.sleep(1) - except (ConnectError, DNSLookupError) as e: - if not cls.RETRY_ON_CONNECT_ERROR: - raise - if attempts > cls.RETRY_ON_CONNECT_ERROR_ATTEMPTS: - raise - - delay = 2**attempts - logger.warning( - "%s request connection failed; retrying in %ds: %r", - cls.NAME, - delay, - e, - ) - - await clock.sleep(delay) - attempts += 1 - except HttpResponseException as e: - # We convert to SynapseError as we know that it was a SynapseError - # on the main process that we should send to the client. (And - # importantly, not stack traces everywhere) - _outgoing_request_counter.add( - 1, - { - "name": cls.NAME, - "code": e.code, - SERVER_NAME_LABEL: server_name, - }, - ) - raise e.to_synapse_error() - except Exception as e: - _outgoing_request_counter.add( - 1, - { - "name": cls.NAME, - "code": "ERR", - SERVER_NAME_LABEL: server_name, - }, - ) - raise SynapseError( - 502, f"Failed to talk to {instance_name} process" - ) from e + # Hard code a special scheme to show this only used for replication. The + # instance_name will be passed into the ReplicationEndpointFactory to + # determine connection details from the instance_map. + uri = "synapse-replication://%s/_synapse/replication/%s/%s" % ( + instance_name, + cls.NAME, + "/".join(url_args), + ) + + headers: Dict[bytes, List[bytes]] = {} + # Add an authorization header, if configured. + if replication_secret: + headers[b"Authorization"] = [b"Bearer " + replication_secret] + opentracing.inject_header_dict(headers, check_destination=False) + + try: + # Keep track of attempts made so we can bail if we don't manage to + # connect to the target after N tries. + attempts = 0 + # We keep retrying the same request for timeouts. This is so that we + # have a good idea that the request has either succeeded or failed + # on the master, and so whether we should clean up or not. + while True: + try: + result = await request_func(uri, data, headers=headers) + break + except RequestTimedOutError: + if not cls.RETRY_ON_TIMEOUT: + raise + + logger.warning("%s request timed out; retrying", cls.NAME) + + # If we timed out we probably don't need to worry about backing + # off too much, but lets just wait a little anyway. + await clock.sleep(1) + except (ConnectError, DNSLookupError) as e: + if not cls.RETRY_ON_CONNECT_ERROR: + raise + if attempts > cls.RETRY_ON_CONNECT_ERROR_ATTEMPTS: + raise + + delay = 2**attempts + logger.warning( + "%s request connection failed; retrying in %ds: %r", + cls.NAME, + delay, + e, + ) + await clock.sleep(delay) + attempts += 1 + except HttpResponseException as e: + # We convert to SynapseError as we know that it was a SynapseError + # on the main process that we should send to the client. (And + # importantly, not stack traces everywhere) + _outgoing_request_counter.add( + 1, + { + "name": cls.NAME, + "code": e.code, + SERVER_NAME_LABEL: server_name, + }, + ) + raise e.to_synapse_error() + except Exception as e: _outgoing_request_counter.add( 1, { "name": cls.NAME, - "code": 200, + "code": "ERR", SERVER_NAME_LABEL: server_name, }, ) + raise SynapseError( + 502, f"Failed to talk to {instance_name} process" + ) from e + + _outgoing_request_counter.add( + 1, + { + "name": cls.NAME, + "code": 200, + SERVER_NAME_LABEL: server_name, + }, + ) - # Wait on any streams that the remote may have written to. - for stream_name, position in result.pop( - _STREAM_POSITION_KEY, {} - ).items(): - await replication.wait_for_stream_position( - instance_name=instance_name, - stream_name=stream_name, - position=position, - ) + # Wait on any streams that the remote may have written to. + for stream_name, position in result.pop(_STREAM_POSITION_KEY, {}).items(): + await replication.wait_for_stream_position( + instance_name=instance_name, + stream_name=stream_name, + position=position, + ) + outgoing_gauge.add( + -1, + {"name": cls.NAME, SERVER_NAME_LABEL: server_name}, + ) - return result + return result return send_request diff --git a/synapse/util/batching_queue.py b/synapse/util/batching_queue.py index 481b427772..8aed5ec50e 100644 --- a/synapse/util/batching_queue.py +++ b/synapse/util/batching_queue.py @@ -126,8 +126,9 @@ def __init__( {"name": self._name, SERVER_NAME_LABEL: self.server_name}, ) - self._number_in_flight_metric: Gauge = number_in_flight.labels( - name=self._name, **{SERVER_NAME_LABEL: self.server_name} + self._number_in_flight_metric = meter.create_up_down_counter( + "synapse_util_batching_queue_number_pending", + description="The number of items across all keys either being processed or waiting in a queue", ) def shutdown(self) -> None: @@ -135,8 +136,9 @@ def shutdown(self) -> None: Prepares the object for garbage collection by removing any handed out references. """ - number_queued.remove(self._name, self.server_name) - number_of_keys.remove(self._name, self.server_name) + # there doesn't seem to be an otel equivalent for those + # number_queued.remove(self._name, self.server_name) + # number_of_keys.remove(self._name, self.server_name) async def add_to_queue(self, value: V, key: Hashable = ()) -> R: """Adds the value to the queue with the given key, returning the result @@ -158,8 +160,14 @@ async def add_to_queue(self, value: V, key: Hashable = ()) -> R: if key not in self._processing_keys: self.hs.run_as_background_process(self._name, self._process_queue, key) - with self._number_in_flight_metric.track_inprogress(): - return await make_deferred_yieldable(d) + self._number_in_flight_metric.add( + 1, {"name": self._name, SERVER_NAME_LABEL: self.server_name} + ) + res = await make_deferred_yieldable(d) + self._number_in_flight_metric.add( + -1, {"name": self._name, SERVER_NAME_LABEL: self.server_name} + ) + return res async def _process_queue(self, key: Hashable) -> None: """A background task to repeatedly pull things off the queue for the diff --git a/tests/util/test_batching_queue.py b/tests/util/test_batching_queue.py index 60bfdf38aa..8920453874 100644 --- a/tests/util/test_batching_queue.py +++ b/tests/util/test_batching_queue.py @@ -20,7 +20,7 @@ # from typing import List, Tuple -from prometheus_client import Gauge +from opentelemetry.metrics._internal.instrument import Gauge from twisted.internet import defer @@ -40,12 +40,13 @@ def setUp(self) -> None: super().setUp() # We ensure that we remove any existing metrics for "test_queue". - try: - number_queued.remove("test_queue", "test_server") - number_of_keys.remove("test_queue", "test_server") - number_in_flight.remove("test_queue", "test_server") - except KeyError: - pass + # there doesn't seem to be an equivalent for otel + # try: + # number_queued.remove("test_queue", "test_server") + # number_of_keys.remove("test_queue", "test_server") + # number_in_flight.remove("test_queue", "test_server") + # except KeyError: + # pass self._pending_calls: List[Tuple[List[str], defer.Deferred]] = [] self.queue: BatchingQueue[str, str] = BatchingQueue( From 833b0aa3045b25b266eafa103445e0a549c07c46 Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Thu, 11 Dec 2025 16:13:43 +0100 Subject: [PATCH 11/18] more fixes --- synapse/handlers/federation_event.py | 34 ++++++++++++----------- synapse/replication/tcp/external_cache.py | 20 +++++++------ synapse/util/batching_queue.py | 2 -- 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index 028524386b..d1d40a672b 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -22,6 +22,7 @@ import collections import itertools import logging +import time from http import HTTPStatus from typing import ( TYPE_CHECKING, @@ -730,23 +731,24 @@ async def backfill( if not events: return - with backfill_processing_after_timer.labels( - **{SERVER_NAME_LABEL: self.server_name} - ).time(): - # if there are any events in the wrong room, the remote server is buggy and - # should not be trusted. - for ev in events: - if ev.room_id != room_id: - raise InvalidResponseError( - f"Remote server {dest} returned event {ev.event_id} which is in " - f"room {ev.room_id}, when we were backfilling in {room_id}" - ) + start = time.perf_counter() + # if there are any events in the wrong room, the remote server is buggy and + # should not be trusted. + for ev in events: + if ev.room_id != room_id: + raise InvalidResponseError( + f"Remote server {dest} returned event {ev.event_id} which is in " + f"room {ev.room_id}, when we were backfilling in {room_id}" + ) - await self._process_pulled_events( - dest, - events, - backfilled=True, - ) + await self._process_pulled_events( + dest, + events, + backfilled=True, + ) + backfill_processing_after_timer.record( + time.perf_counter() - start, {SERVER_NAME_LABEL: self.server_name} + ) @trace async def _get_missing_events_for_pdu( diff --git a/synapse/replication/tcp/external_cache.py b/synapse/replication/tcp/external_cache.py index ed29d083f1..83f8f98d38 100644 --- a/synapse/replication/tcp/external_cache.py +++ b/synapse/replication/tcp/external_cache.py @@ -106,16 +106,18 @@ async def set(self, cache_name: str, key: str, value: Any, expiry_ms: int) -> No "ExternalCache.set", tags={opentracing.SynapseTags.CACHE_NAME: cache_name}, ): - with response_timer.labels( - method="set", **{SERVER_NAME_LABEL: self.server_name} - ).time(): - return await make_deferred_yieldable( - self._redis_connection.set( - self._get_redis_key(cache_name, key), - encoded_value, - pexpire=expiry_ms, - ) + start = time.perf_counter() + await make_deferred_yieldable( + self._redis_connection.set( + self._get_redis_key(cache_name, key), + encoded_value, + pexpire=expiry_ms, ) + ) + response_timer.record( + time.perf_counter() - start, + {"method": "set", SERVER_NAME_LABEL: self.server_name}, + ) async def get(self, cache_name: str, key: str) -> Optional[Any]: """Look up a key/value in the named cache.""" diff --git a/synapse/util/batching_queue.py b/synapse/util/batching_queue.py index 8aed5ec50e..be60cbe31c 100644 --- a/synapse/util/batching_queue.py +++ b/synapse/util/batching_queue.py @@ -33,8 +33,6 @@ TypeVar, ) -from opentelemetry.metrics._internal.instrument import Gauge - from twisted.internet import defer from synapse.logging.context import PreserveLoggingContext, make_deferred_yieldable From b5e00eb1b571866d19c999cf4b39938332cd6411 Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Thu, 11 Dec 2025 17:09:41 +0100 Subject: [PATCH 12/18] more stuff --- synapse/metrics/__init__.py | 17 ++++++------- synapse/metrics/_reactor_metrics.py | 25 ++++++++++++++---- synapse/rest/client/account.py | 39 ++++++++++++++++++----------- synapse/rest/client/register.py | 26 +++++++++++-------- 4 files changed, 68 insertions(+), 39 deletions(-) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index fb4e395d43..5072f10acf 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -44,10 +44,10 @@ import attr from opentelemetry.metrics import get_meter_provider +from opentelemetry.metrics._internal import Meter from packaging.version import parse as parse_version from prometheus_client import ( CollectorRegistry, - Histogram, Metric, generate_latest, ) @@ -61,8 +61,6 @@ from twisted.web.resource import Resource from twisted.web.server import Request -# This module is imported for its side effects; flake8 needn't warn that it's unused. -import synapse.metrics._reactor_metrics # noqa: F401 from synapse.metrics._types import Collector from synapse.types import StrSequence from synapse.util import SYNAPSE_VERSION @@ -138,9 +136,11 @@ def _set_prometheus_client_use_created_metrics(new_value: bool) -> None: # Global meter for registering otel metrics -meter = get_meter_provider().get_meter("synapse") +meter: Meter = get_meter_provider().get_meter("synapse") -# Import _gc after meter is defined to avoid circular import +# Import after meter is defined to avoid circular import +# This module is imported for its side effects; flake8 needn't warn that it's unused. +import synapse.metrics._reactor_metrics # noqa: F401, E402 from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager # noqa: E402 @@ -675,13 +675,12 @@ def collect(self) -> Iterable[Metric]: ) # 3PID send info -threepid_send_requests = Histogram( +threepid_send_requests = meter.create_histogram( "synapse_threepid_send_requests_with_tries", - documentation="Number of requests for a 3pid token by try count. Note if" + description="Number of requests for a 3pid token by try count. Note if" " there is a request with try count of 4, then there would have been one" " each for 1, 2 and 3", - buckets=(1, 2, 3, 4, 5, 10), - labelnames=("type", "reason", SERVER_NAME_LABEL), + explicit_bucket_boundaries_advisory=[1, 2, 3, 4, 5, 10], ) threadpool_total_threads = meter.create_gauge( diff --git a/synapse/metrics/_reactor_metrics.py b/synapse/metrics/_reactor_metrics.py index 9852d0b932..e3a0c5dbd0 100644 --- a/synapse/metrics/_reactor_metrics.py +++ b/synapse/metrics/_reactor_metrics.py @@ -24,7 +24,7 @@ from selectors import SelectSelector, _PollLikeSelector # type: ignore[attr-defined] from typing import Any, Callable, Iterable -from prometheus_client import Histogram, Metric +from prometheus_client import Metric from prometheus_client.core import REGISTRY, GaugeMetricFamily from twisted.internet import reactor, selectreactor @@ -32,6 +32,8 @@ from synapse.metrics._types import Collector +from . import meter + try: from selectors import KqueueSelector # type: ignore[attr-defined] except ImportError: @@ -63,10 +65,23 @@ class PollReactor: # type: ignore[no-redef] # # This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. -tick_time = Histogram( # type: ignore[missing-server-name-label] +tick_time = meter.create_histogram( # type: ignore[missing-server-name-label] "python_twisted_reactor_tick_time", - "Tick time of the Twisted reactor (sec)", - buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2, 5], + description="Tick time of the Twisted reactor (sec)", + explicit_bucket_boundaries_advisory=[ + 0.001, + 0.002, + 0.005, + 0.01, + 0.025, + 0.05, + 0.1, + 0.2, + 0.5, + 1, + 2, + 5, + ], ) @@ -81,7 +96,7 @@ def __call__(self, *args, **kwargs) -> Any: # type: ignore[no-untyped-def] # record the time since this was last called. This gives a good proxy for # how long it takes to run everything in the reactor - ie, how long anything # waiting for the next tick will have to wait. - tick_time.observe(time.time() - self.last_polled) + tick_time.record(time.time() - self.last_polled) ret = self._wrapped(*args, **kwargs) diff --git a/synapse/rest/client/account.py b/synapse/rest/client/account.py index d9f0c169e8..054070d252 100644 --- a/synapse/rest/client/account.py +++ b/synapse/rest/client/account.py @@ -137,11 +137,14 @@ async def on_POST(self, request: SynapseRequest) -> Tuple[int, JsonDict]: self.mailer.send_password_reset_mail, body.next_link, ) - threepid_send_requests.labels( - type="email", - reason="password_reset", - **{SERVER_NAME_LABEL: self.server_name}, - ).observe(body.send_attempt) + threepid_send_requests.record( + body.send_attempt, + { + "type": "email", + "reason": "password_reset", + SERVER_NAME_LABEL: self.server_name, + }, + ) # Wrap the session id in a JSON object return 200, {"sid": sid} @@ -398,11 +401,14 @@ async def on_POST(self, request: SynapseRequest) -> Tuple[int, JsonDict]: body.next_link, ) - threepid_send_requests.labels( - type="email", - reason="add_threepid", - **{SERVER_NAME_LABEL: self.server_name}, - ).observe(body.send_attempt) + threepid_send_requests.record( + body.send_attempt, + { + "type": "email", + "reason": "add_threepid", + SERVER_NAME_LABEL: self.server_name, + }, + ) # Wrap the session id in a JSON object return 200, {"sid": sid} @@ -476,11 +482,14 @@ async def on_POST(self, request: SynapseRequest) -> Tuple[int, JsonDict]: body.next_link, ) - threepid_send_requests.labels( - type="msisdn", - reason="add_threepid", - **{SERVER_NAME_LABEL: self.server_name}, - ).observe(body.send_attempt) + threepid_send_requests.record( + body.send_attempt, + { + "type": "msisdn", + "reason": "add_threepid", + SERVER_NAME_LABEL: self.server_name, + }, + ) logger.info("MSISDN %s: got response from identity server: %s", msisdn, ret) return 200, ret diff --git a/synapse/rest/client/register.py b/synapse/rest/client/register.py index b42006e4ce..ed3bd508b1 100644 --- a/synapse/rest/client/register.py +++ b/synapse/rest/client/register.py @@ -164,11 +164,14 @@ async def on_POST(self, request: SynapseRequest) -> Tuple[int, JsonDict]: next_link, ) - threepid_send_requests.labels( - type="email", - reason="register", - **{SERVER_NAME_LABEL: self.server_name}, - ).observe(send_attempt) + threepid_send_requests.record( + send_attempt, + { + "type": "email", + "reason": "register", + SERVER_NAME_LABEL: self.server_name, + }, + ) # Wrap the session id in a JSON object return 200, {"sid": sid} @@ -244,11 +247,14 @@ async def on_POST(self, request: SynapseRequest) -> Tuple[int, JsonDict]: next_link, ) - threepid_send_requests.labels( - type="msisdn", - reason="register", - **{SERVER_NAME_LABEL: self.server_name}, - ).observe(send_attempt) + threepid_send_requests.record( + send_attempt, + { + "type": "msisdn", + "reason": "register", + SERVER_NAME_LABEL: self.server_name, + }, + ) return 200, ret From 94128ed33bb831b6e7c6655d0f0feaba9f894087 Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Thu, 11 Dec 2025 17:19:16 +0100 Subject: [PATCH 13/18] bye bye lint errors --- tests/util/test_batching_queue.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/util/test_batching_queue.py b/tests/util/test_batching_queue.py index 8920453874..e3b044177d 100644 --- a/tests/util/test_batching_queue.py +++ b/tests/util/test_batching_queue.py @@ -21,6 +21,7 @@ from typing import List, Tuple from opentelemetry.metrics._internal.instrument import Gauge +from prometheus_client.core import REGISTRY from twisted.internet import defer @@ -65,9 +66,10 @@ def _get_sample_with_name(self, metric: Gauge, name: str) -> float: """For a prometheus metric get the value of the sample that has a matching "name" label. """ - for sample in next(iter(metric.collect())).samples: - if sample.labels.get("name") == name: - return sample.value + for metric_family in REGISTRY.collect(): + for sample in metric_family.samples: + if sample.labels.get("name") == name: + return sample.value self.fail("Found no matching sample") From 43a67667e09d94fe615b7dd6fa78d918e828e001 Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Fri, 12 Dec 2025 12:21:53 +0100 Subject: [PATCH 14/18] expose all metrics to prometheus endpoint --- synapse/metrics/__init__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 5072f10acf..beb67670b7 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -43,8 +43,12 @@ ) import attr +from opentelemetry import metrics +from opentelemetry.exporter.prometheus import PrometheusMetricReader from opentelemetry.metrics import get_meter_provider from opentelemetry.metrics._internal import Meter +from opentelemetry.sdk.metrics._internal import MeterProvider +from opentelemetry.sdk.resources import SERVICE_NAME, Resource as OtelResource from packaging.version import parse as parse_version from prometheus_client import ( CollectorRegistry, @@ -136,6 +140,10 @@ def _set_prometheus_client_use_created_metrics(new_value: bool) -> None: # Global meter for registering otel metrics +resource = OtelResource(attributes={SERVICE_NAME: "synapse"}) +reader = PrometheusMetricReader() +provider = MeterProvider(resource=resource, metric_readers=[reader]) +metrics.set_meter_provider(provider) meter: Meter = get_meter_provider().get_meter("synapse") # Import after meter is defined to avoid circular import From f552419675a2ca9605a7462ffc1f806018fc5243 Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Fri, 12 Dec 2025 14:22:11 +0100 Subject: [PATCH 15/18] try to fix the tests --- tests/handlers/test_stats.py | 18 +++++++++++------- tests/util/test_batching_queue.py | 2 +- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/handlers/test_stats.py b/tests/handlers/test_stats.py index 7ab211a497..08324a39e9 100644 --- a/tests/handlers/test_stats.py +++ b/tests/handlers/test_stats.py @@ -20,7 +20,7 @@ from typing import Any, Dict, List, Optional, Tuple, cast -from prometheus_client import REGISTRY, Gauge +from prometheus_client import REGISTRY from twisted.internet.testing import MemoryReactor @@ -57,12 +57,16 @@ def _set_metrics_to_zero(self) -> None: This method resets the metrics to zero before each test to ensure that each test starts with a clean slate. """ - metrics = ["synapse_known_rooms_total", "synapse_locally_joined_rooms_total"] - for metric_name in metrics: - gauge = REGISTRY._names_to_collectors.get(metric_name) - if gauge is not None and isinstance(gauge, Gauge): - for labels in gauge._metrics: - gauge.labels(*labels).set(0) + from opentelemetry import metrics + from opentelemetry.sdk.metrics import MeterProvider + from opentelemetry.sdk.metrics.export import InMemoryMetricReader + + # Create a fresh reader and provider + self.reader = InMemoryMetricReader() + provider = MeterProvider(metric_readers=[self.reader]) + # Set the global provider + # Any new metric instruments created after this will use the clean state. + metrics.set_meter_provider(provider) def _add_background_updates(self) -> None: """ diff --git a/tests/util/test_batching_queue.py b/tests/util/test_batching_queue.py index e3b044177d..291aea79ca 100644 --- a/tests/util/test_batching_queue.py +++ b/tests/util/test_batching_queue.py @@ -68,7 +68,7 @@ def _get_sample_with_name(self, metric: Gauge, name: str) -> float: """ for metric_family in REGISTRY.collect(): for sample in metric_family.samples: - if sample.labels.get("name") == name: + if sample.labels.get("name") == name and sample.name == metric.name: return sample.value self.fail("Found no matching sample") From abca27955a82a4be4aa617055528dbe60122c473 Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Fri, 12 Dec 2025 15:17:38 +0100 Subject: [PATCH 16/18] use ObservableGauge when set_function was used before --- synapse/metrics/__init__.py | 37 +++++++++++++++---------- synapse/util/batching_queue.py | 45 ++++++++++++++++++++----------- tests/util/test_batching_queue.py | 13 +++++---- 3 files changed, 58 insertions(+), 37 deletions(-) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index beb67670b7..dc993c7757 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -47,6 +47,7 @@ from opentelemetry.exporter.prometheus import PrometheusMetricReader from opentelemetry.metrics import get_meter_provider from opentelemetry.metrics._internal import Meter +from opentelemetry.metrics._internal.observation import Observation from opentelemetry.sdk.metrics._internal import MeterProvider from opentelemetry.sdk.resources import SERVICE_NAME, Resource as OtelResource from packaging.version import parse as parse_version @@ -691,16 +692,6 @@ def collect(self) -> Iterable[Metric]: explicit_bucket_boundaries_advisory=[1, 2, 3, 4, 5, 10], ) -threadpool_total_threads = meter.create_gauge( - "synapse_threadpool_total_threads", - description="Total number of threads currently in the threadpool", -) - -threadpool_total_working_threads = meter.create_gauge( - "synapse_threadpool_working_threads", - description="Number of threads currently working in the threadpool", -) - threadpool_total_min_threads = meter.create_gauge( "synapse_threadpool_min_threads", description="Minimum number of threads configured in the threadpool", @@ -740,11 +731,29 @@ def register_threadpool(*, name: str, server_name: str, threadpool: ThreadPool) threadpool.max, {"name": name, SERVER_NAME_LABEL: server_name} ) - threadpool_total_threads.set( - len(threadpool.threads), {"name": name, SERVER_NAME_LABEL: server_name} + meter.create_observable_gauge( + "synapse_threadpool_total_threads", + description="Total number of threads currently in the threadpool", + callbacks=[ + lambda options: [ + Observation( + len(threadpool.threads), + {"name": name, SERVER_NAME_LABEL: server_name}, + ) + ] + ], ) - threadpool_total_working_threads.set( - len(threadpool.working), {"name": name, SERVER_NAME_LABEL: server_name} + meter.create_observable_gauge( + "synapse_threadpool_working_threads", + description="Number of threads currently working in the threadpool", + callbacks=[ + lambda options: [ + Observation( + len(threadpool.working), + {"name": name, SERVER_NAME_LABEL: server_name}, + ) + ] + ], ) diff --git a/synapse/util/batching_queue.py b/synapse/util/batching_queue.py index be60cbe31c..864f0e6871 100644 --- a/synapse/util/batching_queue.py +++ b/synapse/util/batching_queue.py @@ -33,6 +33,8 @@ TypeVar, ) +from opentelemetry.metrics._internal.observation import Observation + from twisted.internet import defer from synapse.logging.context import PreserveLoggingContext, make_deferred_yieldable @@ -48,21 +50,16 @@ V = TypeVar("V") R = TypeVar("R") -number_queued = meter.create_gauge( - "synapse_util_batching_queue_number_queued", - description="The number of items waiting in the queue across all keys", -) +# number_queued = meter.create_observable_gauge( +# "synapse_util_batching_queue_number_queued", +# description="The number of items waiting in the queue across all keys", +# ) -number_in_flight = meter.create_gauge( +number_in_flight = meter.create_observable_gauge( "synapse_util_batching_queue_number_pending", description="The number of items across all keys either being processed or waiting in a queue", ) -number_of_keys = meter.create_gauge( - "synapse_util_batching_queue_number_of_keys", - description="The number of distinct keys that have items queued", -) - class BatchingQueue(Generic[V, R]): """A queue that batches up work, calling the provided processing function @@ -114,14 +111,30 @@ def __init__( # The function to call with batches of values. self._process_batch_callback = process_batch_callback - number_queued.set( - sum(len(q) for q in self._next_values.values()), - {"name": self._name, SERVER_NAME_LABEL: self.server_name}, + self.number_queued = meter.create_observable_gauge( + "synapse_util_batching_queue_number_queued", + callbacks=[ + lambda options: [ + Observation( + sum(len(q) for q in self._next_values.values()), + {"name": self._name, SERVER_NAME_LABEL: self.server_name}, + ) + ] + ], + description="The number of items waiting in the queue across all keys", ) - number_of_keys.set( - len(self._next_values), - {"name": self._name, SERVER_NAME_LABEL: self.server_name}, + self.number_of_keys = meter.create_observable_gauge( + "synapse_util_batching_queue_number_of_keys", + description="The number of distinct keys that have items queued", + callbacks=[ + lambda options: [ + Observation( + len(self._next_values), + {"name": self._name, SERVER_NAME_LABEL: self.server_name}, + ) + ] + ], ) self._number_in_flight_metric = meter.create_up_down_counter( diff --git a/tests/util/test_batching_queue.py b/tests/util/test_batching_queue.py index 291aea79ca..b497f5f035 100644 --- a/tests/util/test_batching_queue.py +++ b/tests/util/test_batching_queue.py @@ -20,7 +20,7 @@ # from typing import List, Tuple -from opentelemetry.metrics._internal.instrument import Gauge +from opentelemetry.metrics._internal.instrument import ObservableGauge from prometheus_client.core import REGISTRY from twisted.internet import defer @@ -29,8 +29,6 @@ from synapse.util.batching_queue import ( BatchingQueue, number_in_flight, - number_of_keys, - number_queued, ) from tests.unittest import HomeserverTestCase @@ -62,13 +60,14 @@ async def _process_queue(self, values: List[str]) -> str: self._pending_calls.append((values, d)) return await make_deferred_yieldable(d) - def _get_sample_with_name(self, metric: Gauge, name: str) -> float: + def _get_sample_with_name(self, metric: ObservableGauge, name: str) -> float: """For a prometheus metric get the value of the sample that has a matching "name" label. """ + print(vars(metric)) for metric_family in REGISTRY.collect(): for sample in metric_family.samples: - if sample.labels.get("name") == name and sample.name == metric.name: + if sample.labels.get("name") == name: # and sample.name == metric.name: return sample.value self.fail("Found no matching sample") @@ -76,14 +75,14 @@ def _get_sample_with_name(self, metric: Gauge, name: str) -> float: def _assert_metrics(self, queued: int, keys: int, in_flight: int) -> None: """Assert that the metrics are correct""" - sample = self._get_sample_with_name(number_queued, self.queue._name) + sample = self._get_sample_with_name(self.queue.number_queued, self.queue._name) self.assertEqual( sample, queued, "number_queued", ) - sample = self._get_sample_with_name(number_of_keys, self.queue._name) + sample = self._get_sample_with_name(self.queue.number_of_keys, self.queue._name) self.assertEqual(sample, keys, "number_of_keys") sample = self._get_sample_with_name(number_in_flight, self.queue._name) From fd33875e08c3caa9a89e103fa95c9c16489c30f4 Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Fri, 12 Dec 2025 16:56:08 +0100 Subject: [PATCH 17/18] fix merge issues --- poetry.lock | 4 +- synapse/federation/federation_server.py | 8 +- synapse/replication/http/_base.py | 232 +++++++++--------------- synapse/util/ratelimitutils.py | 1 - 4 files changed, 87 insertions(+), 158 deletions(-) diff --git a/poetry.lock b/poetry.lock index 9401ee2420..c6558b1e95 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. [[package]] name = "annotated-types" @@ -3704,4 +3704,4 @@ url-preview = ["lxml"] [metadata] lock-version = "2.1" python-versions = "^3.10.0" -content-hash = "403b3be5269c4c6e9b509976307f40ab198d4db3c1637d8403b3f5f53d43ea36" +content-hash = "a4adc425ae84e5b175bca28f76ff57c6f051fe8d62c03efc4398c6ef9e207c40" diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py index aa266488d6..b3cf222d71 100644 --- a/synapse/federation/federation_server.py +++ b/synapse/federation/federation_server.py @@ -668,10 +668,10 @@ async def on_pdu_request( async def on_query_request( self, query_type: str, args: dict[str, str] ) -> tuple[int, dict[str, Any]]: - received_queries_counter.labels( - type=query_type, - **{SERVER_NAME_LABEL: self.server_name}, - ).inc() + received_queries_counter.add( + 1, + {"type": query_type, SERVER_NAME_LABEL: self.server_name}, + ) resp = await self.registry.on_query(query_type, args) return 200, resp diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py index 3b6f40daa0..505b450b40 100644 --- a/synapse/replication/http/_base.py +++ b/synapse/replication/http/_base.py @@ -25,6 +25,8 @@ from inspect import signature from typing import TYPE_CHECKING, Any, Awaitable, Callable, ClassVar +from prometheus_client import Gauge + from twisted.internet.error import ConnectError, DNSLookupError from twisted.web.server import Request @@ -47,9 +49,10 @@ logger = logging.getLogger(__name__) -_pending_outgoing_requests = meter.create_up_down_counter( +_pending_outgoing_requests = Gauge( "synapse_pending_outgoing_replication_requests", - description="Number of active outgoing replication requests, by replication method name", + "Number of active outgoing replication requests, by replication method name", + labelnames=["name", SERVER_NAME_LABEL], ) _outgoing_request_counter = meter.create_counter( @@ -209,9 +212,9 @@ def make_client(cls, hs: "HomeServer") -> Callable: instance_map = hs.config.worker.instance_map - outgoing_gauge = meter.create_up_down_counter( - "synapse_pending_outgoing_replication_requests", - description="Number of active outgoing replication requests, by replication method name", + outgoing_gauge = _pending_outgoing_requests.labels( + name=cls.NAME, + **{SERVER_NAME_LABEL: server_name}, ) replication_secret = None @@ -228,143 +231,65 @@ async def send_request( streams = hs.get_replication_command_handler().get_streams_to_replicate() replication = hs.get_replication_data_handler() - outgoing_gauge.add( - 1, - {"name": cls.NAME, SERVER_NAME_LABEL: server_name}, - ) - if instance_name == local_instance_name: - raise Exception("Trying to send HTTP request to self") - if instance_name not in instance_map: - raise Exception( - "Instance %r not in 'instance_map' config" % (instance_name,) - ) - - data = await cls._serialize_payload(**kwargs) - - if cls.METHOD != "GET" and cls.WAIT_FOR_STREAMS: - # Include the current stream positions that we write to. We - # don't do this for GETs as they don't have a body, and we - # generally assume that a GET won't rely on data we have - # written. - if _STREAM_POSITION_KEY in data: + with outgoing_gauge.track_inprogress(): + if instance_name == local_instance_name: + raise Exception("Trying to send HTTP request to self") + if instance_name not in instance_map: raise Exception( - "data to send contains %r key", _STREAM_POSITION_KEY + "Instance %r not in 'instance_map' config" % (instance_name,) ) - data[_STREAM_POSITION_KEY] = { - "streams": { - stream.NAME: stream.minimal_local_current_token() - for stream in streams - }, - "instance_name": local_instance_name, - } - - url_args = [ - urllib.parse.quote(kwargs[name], safe="") for name in cls.PATH_ARGS - ] - - if cls.CACHE: - txn_id = random_string(10) - url_args.append(txn_id) - - if cls.METHOD == "POST": - request_func: Callable[..., Awaitable[Any]] = client.post_json_get_json - elif cls.METHOD == "PUT": - request_func = client.put_json - elif cls.METHOD == "GET": - request_func = client.get_json - else: - # We have already asserted in the constructor that a - # compatible was picked, but lets be paranoid. - raise Exception( - "Unknown METHOD on %s replication endpoint" % (cls.NAME,) - ) - - # Hard code a special scheme to show this only used for replication. The - # instance_name will be passed into the ReplicationEndpointFactory to - # determine connection details from the instance_map. - uri = "synapse-replication://%s/_synapse/replication/%s/%s" % ( - instance_name, - cls.NAME, - "/".join(url_args), - ) + data = await cls._serialize_payload(**kwargs) - headers: Dict[bytes, List[bytes]] = {} - # Add an authorization header, if configured. - if replication_secret: - headers[b"Authorization"] = [b"Bearer " + replication_secret] - opentracing.inject_header_dict(headers, check_destination=False) - - try: - # Keep track of attempts made so we can bail if we don't manage to - # connect to the target after N tries. - attempts = 0 - # We keep retrying the same request for timeouts. This is so that we - # have a good idea that the request has either succeeded or failed - # on the master, and so whether we should clean up or not. - while True: - try: - result = await request_func(uri, data, headers=headers) - break - except RequestTimedOutError: - if not cls.RETRY_ON_TIMEOUT: - raise - - logger.warning("%s request timed out; retrying", cls.NAME) - - # If we timed out we probably don't need to worry about backing - # off too much, but lets just wait a little anyway. - await clock.sleep(1) - except (ConnectError, DNSLookupError) as e: - if not cls.RETRY_ON_CONNECT_ERROR: - raise - if attempts > cls.RETRY_ON_CONNECT_ERROR_ATTEMPTS: - raise - - delay = 2**attempts - logger.warning( - "%s request connection failed; retrying in %ds: %r", - cls.NAME, - delay, - e, + if cls.METHOD != "GET" and cls.WAIT_FOR_STREAMS: + # Include the current stream positions that we write to. We + # don't do this for GETs as they don't have a body, and we + # generally assume that a GET won't rely on data we have + # written. + if _STREAM_POSITION_KEY in data: + raise Exception( + "data to send contains %r key", _STREAM_POSITION_KEY ) - await clock.sleep(delay) - attempts += 1 - except HttpResponseException as e: - # We convert to SynapseError as we know that it was a SynapseError - # on the main process that we should send to the client. (And - # importantly, not stack traces everywhere) - _outgoing_request_counter.add( - 1, - { - "name": cls.NAME, - "code": e.code, - SERVER_NAME_LABEL: server_name, - }, - ) - raise e.to_synapse_error() - except Exception as e: - _outgoing_request_counter.add( - 1, - { - "name": cls.NAME, - "code": "ERR", - SERVER_NAME_LABEL: server_name, - }, + data[_STREAM_POSITION_KEY] = { + "streams": { + stream.NAME: stream.minimal_local_current_token() + for stream in streams + }, + "instance_name": local_instance_name, + } + + url_args = [ + urllib.parse.quote(kwargs[name], safe="") for name in cls.PATH_ARGS + ] + + if cls.CACHE: + txn_id = random_string(10) + url_args.append(txn_id) + + if cls.METHOD == "POST": + request_func: Callable[..., Awaitable[Any]] = ( + client.post_json_get_json + ) + elif cls.METHOD == "PUT": + request_func = client.put_json + elif cls.METHOD == "GET": + request_func = client.get_json + else: + # We have already asserted in the constructor that a + # compatible was picked, but lets be paranoid. + raise Exception( + "Unknown METHOD on %s replication endpoint" % (cls.NAME,) + ) + + # Hard code a special scheme to show this only used for replication. The + # instance_name will be passed into the ReplicationEndpointFactory to + # determine connection details from the instance_map. + uri = "synapse-replication://%s/_synapse/replication/%s/%s" % ( + instance_name, + cls.NAME, + "/".join(url_args), ) - raise SynapseError( - 502, f"Failed to talk to {instance_name} process" - ) from e - - _outgoing_request_counter.add( - 1, - { - "name": cls.NAME, - "code": 200, - SERVER_NAME_LABEL: server_name, - }, - ) headers: dict[bytes, list[bytes]] = {} # Add an authorization header, if configured. @@ -412,27 +337,32 @@ async def send_request( # We convert to SynapseError as we know that it was a SynapseError # on the main process that we should send to the client. (And # importantly, not stack traces everywhere) - _outgoing_request_counter.labels( - name=cls.NAME, - code=e.code, - **{SERVER_NAME_LABEL: server_name}, - ).inc() + _outgoing_request_counter.add( + 1, + { + "name": cls.NAME, + "code": e.code, + SERVER_NAME_LABEL: server_name, + }, + ) raise e.to_synapse_error() except Exception as e: - _outgoing_request_counter.labels( - name=cls.NAME, - code="ERR", - **{SERVER_NAME_LABEL: server_name}, - ).inc() + _outgoing_request_counter.add( + 1, + { + "name": cls.NAME, + "code": "ERR", + SERVER_NAME_LABEL: server_name, + }, + ) raise SynapseError( 502, f"Failed to talk to {instance_name} process" ) from e - _outgoing_request_counter.labels( - name=cls.NAME, - code=200, - **{SERVER_NAME_LABEL: server_name}, - ).inc() + _outgoing_request_counter.add( + 1, + {"name": cls.NAME, "code": 200, SERVER_NAME_LABEL: server_name}, + ) # Wait on any streams that the remote may have written to. for stream_name, position in result.pop( @@ -444,7 +374,7 @@ async def send_request( position=position, ) - return result + return result return send_request diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index 22a7243ac1..235b327611 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -28,7 +28,6 @@ from typing import ( Any, Callable, - ContextManager, Iterator, Mapping, MutableSet, From 112e27979d2f2330be1c27f99615d9d4851d8726 Mon Sep 17 00:00:00 2001 From: FrenchgGithubUser Date: Mon, 15 Dec 2025 13:34:16 +0100 Subject: [PATCH 18/18] attempt at fixing tests (done with LLM) --- synapse/handlers/stats.py | 4 + synapse/util/batching_queue.py | 112 +++++++++++++--------- tests/handlers/test_stats.py | 151 ++++++++++++++++++------------ tests/util/test_batching_queue.py | 20 ++-- 4 files changed, 176 insertions(+), 111 deletions(-) diff --git a/synapse/handlers/stats.py b/synapse/handlers/stats.py index ffe4a9ec80..064c1aba71 100644 --- a/synapse/handlers/stats.py +++ b/synapse/handlers/stats.py @@ -72,6 +72,10 @@ def __init__(self, hs: "HomeServer"): # Guard to ensure we only process deltas one at a time self._is_processing = False + # Initialize room count metrics to 0 + known_rooms_gauge.set(0, {SERVER_NAME_LABEL: self.server_name}) + locally_joined_rooms_gauge.set(0, {SERVER_NAME_LABEL: self.server_name}) + if self.stats_enabled and hs.config.worker.run_background_tasks: self.notifier.add_replication_callback(self.notify_new_event) diff --git a/synapse/util/batching_queue.py b/synapse/util/batching_queue.py index f417496511..a530cec5f4 100644 --- a/synapse/util/batching_queue.py +++ b/synapse/util/batching_queue.py @@ -20,6 +20,7 @@ # import logging +import weakref from typing import ( TYPE_CHECKING, Awaitable, @@ -46,13 +47,66 @@ V = TypeVar("V") R = TypeVar("R") -# number_queued = meter.create_observable_gauge( -# "synapse_util_batching_queue_number_queued", -# description="The number of items waiting in the queue across all keys", -# ) +# Global registry to track all BatchingQueue instances. +# We use a WeakSet so that queues can be garbage collected when no longer referenced. +_batching_queue_registry: "weakref.WeakSet[BatchingQueue]" = weakref.WeakSet() + + +def _collect_number_queued(options: object) -> list[Observation]: + """Callback to collect number_queued metrics from all BatchingQueue instances.""" + observations = [] + for queue in _batching_queue_registry: + observations.append( + Observation( + sum(len(q) for q in queue._next_values.values()), + {"name": queue._name, SERVER_NAME_LABEL: queue.server_name}, + ) + ) + return observations + + +def _collect_number_of_keys(options: object) -> list[Observation]: + """Callback to collect number_of_keys metrics from all BatchingQueue instances.""" + observations = [] + for queue in _batching_queue_registry: + observations.append( + Observation( + len(queue._next_values), + {"name": queue._name, SERVER_NAME_LABEL: queue.server_name}, + ) + ) + return observations + + +def _collect_number_in_flight(options: object) -> list[Observation]: + """Callback to collect number_in_flight metrics from all BatchingQueue instances.""" + observations = [] + for queue in _batching_queue_registry: + observations.append( + Observation( + queue._number_in_flight, + {"name": queue._name, SERVER_NAME_LABEL: queue.server_name}, + ) + ) + return observations + + +# Global observable gauges that collect from all BatchingQueue instances +number_queued = meter.create_observable_gauge( + "synapse_util_batching_queue_number_queued", + callbacks=[_collect_number_queued], + description="The number of items waiting in the queue across all keys", +) + +number_of_keys = meter.create_observable_gauge( + "synapse_util_batching_queue_number_of_keys", + callbacks=[_collect_number_of_keys], + description="The number of distinct keys that have items queued", +) number_in_flight = meter.create_observable_gauge( "synapse_util_batching_queue_number_pending", + callbacks=[_collect_number_in_flight], description="The number of items across all keys either being processed or waiting in a queue", ) @@ -107,45 +161,19 @@ def __init__( # The function to call with batches of values. self._process_batch_callback = process_batch_callback - self.number_queued = meter.create_observable_gauge( - "synapse_util_batching_queue_number_queued", - callbacks=[ - lambda options: [ - Observation( - sum(len(q) for q in self._next_values.values()), - {"name": self._name, SERVER_NAME_LABEL: self.server_name}, - ) - ] - ], - description="The number of items waiting in the queue across all keys", - ) + # Counter for number of items in flight (being processed or waiting). + self._number_in_flight: int = 0 - self.number_of_keys = meter.create_observable_gauge( - "synapse_util_batching_queue_number_of_keys", - description="The number of distinct keys that have items queued", - callbacks=[ - lambda options: [ - Observation( - len(self._next_values), - {"name": self._name, SERVER_NAME_LABEL: self.server_name}, - ) - ] - ], - ) - - self._number_in_flight_metric = meter.create_up_down_counter( - "synapse_util_batching_queue_number_pending", - description="The number of items across all keys either being processed or waiting in a queue", - ) + # Register this instance with the global registry so metrics can be collected. + _batching_queue_registry.add(self) def shutdown(self) -> None: """ Prepares the object for garbage collection by removing any handed out references. """ - # there doesn't seem to be an otel equivalent for those - # number_queued.remove(self._name, self.server_name) - # number_of_keys.remove(self._name, self.server_name) + # The global registry uses WeakSet, so instances are automatically + # removed when garbage collected. No explicit cleanup needed. async def add_to_queue(self, value: V, key: Hashable = ()) -> R: """Adds the value to the queue with the given key, returning the result @@ -167,13 +195,11 @@ async def add_to_queue(self, value: V, key: Hashable = ()) -> R: if key not in self._processing_keys: self.hs.run_as_background_process(self._name, self._process_queue, key) - self._number_in_flight_metric.add( - 1, {"name": self._name, SERVER_NAME_LABEL: self.server_name} - ) - res = await make_deferred_yieldable(d) - self._number_in_flight_metric.add( - -1, {"name": self._name, SERVER_NAME_LABEL: self.server_name} - ) + self._number_in_flight += 1 + try: + res = await make_deferred_yieldable(d) + finally: + self._number_in_flight -= 1 return res async def _process_queue(self, key: Hashable) -> None: diff --git a/tests/handlers/test_stats.py b/tests/handlers/test_stats.py index fa01ef479c..6825ca008b 100644 --- a/tests/handlers/test_stats.py +++ b/tests/handlers/test_stats.py @@ -57,16 +57,50 @@ def _set_metrics_to_zero(self) -> None: This method resets the metrics to zero before each test to ensure that each test starts with a clean slate. """ - from opentelemetry import metrics - from opentelemetry.sdk.metrics import MeterProvider - from opentelemetry.sdk.metrics.export import InMemoryMetricReader - - # Create a fresh reader and provider - self.reader = InMemoryMetricReader() - provider = MeterProvider(metric_readers=[self.reader]) - # Set the global provider - # Any new metric instruments created after this will use the clean state. - metrics.set_meter_provider(provider) + from synapse.metrics import ( + SERVER_NAME_LABEL, + known_rooms_gauge, + locally_joined_rooms_gauge, + ) + + # Reset the gauge values to 0 for this server + known_rooms_gauge.set(0, {SERVER_NAME_LABEL: self.hs.hostname}) + locally_joined_rooms_gauge.set(0, {SERVER_NAME_LABEL: self.hs.hostname}) + + def _get_gauge_values( + self, metrics: list[tuple[str, dict[str, str]]] + ) -> list[Optional[float]]: + """ + Get multiple gauge values from the Prometheus registry in a single call. + + The standard REGISTRY.get_sample_value() doesn't work for OpenTelemetry + metrics because the OTel exporter doesn't register its metric names. + Additionally, the OTel collector only returns data on the first collect() + call, so we must collect all data once and then look up all values. + + Args: + metrics: List of (metric_name, labels) tuples to look up. + + Returns: + List of values in the same order as the input metrics. + """ + # Collect all data from all collectors into a lookup dict + all_samples: dict[tuple[str, tuple[tuple[str, str], ...]], float] = {} + for collector in REGISTRY._collector_to_names.keys(): + try: + for metric_family in collector.collect(): + for sample in metric_family.samples: + key = (metric_family.name, tuple(sorted(sample.labels.items()))) + all_samples[key] = sample.value + except Exception: + continue + + # Look up each requested metric + results: list[Optional[float]] = [] + for metric_name, labels in metrics: + key = (metric_name, tuple(sorted(labels.items()))) + results.append(all_samples.get(key)) + return results def _add_background_updates(self) -> None: """ @@ -184,19 +218,18 @@ def test_create_room(self) -> None: When we create a room, it should have statistics already ready. """ self._perform_background_initial_update() - self.assertEqual( - REGISTRY.get_sample_value( - "synapse_known_rooms_total", labels={"server_name": self.hs.hostname} - ), - 0.0, - ) - self.assertEqual( - REGISTRY.get_sample_value( - "synapse_locally_joined_rooms_total", - labels={"server_name": self.hs.hostname}, - ), - 0.0, - ) + known_rooms, locally_joined = self._get_gauge_values( + [ + ("synapse_known_rooms_total", {"server_name": self.hs.hostname}), + ( + "synapse_locally_joined_rooms_total", + {"server_name": self.hs.hostname}, + ), + ] + ) + self.assertEqual(known_rooms, 0.0) + self.assertEqual(locally_joined, 0.0) + u1 = self.register_user("u1", "pass") u1token = self.login("u1", "pass") r1 = self.helper.create_room_as(u1, tok=u1token) @@ -223,19 +256,17 @@ def test_create_room(self) -> None: self.assertEqual(r2stats["banned_members"], 0) # There are 2 rooms created. Check the room metrics were udpated. - self.assertEqual( - REGISTRY.get_sample_value( - "synapse_known_rooms_total", labels={"server_name": self.hs.hostname} - ), - 2, - ) - self.assertEqual( - REGISTRY.get_sample_value( - "synapse_locally_joined_rooms_total", - labels={"server_name": self.hs.hostname}, - ), - 2, - ) + known_rooms, locally_joined = self._get_gauge_values( + [ + ("synapse_known_rooms_total", {"server_name": self.hs.hostname}), + ( + "synapse_locally_joined_rooms_total", + {"server_name": self.hs.hostname}, + ), + ] + ) + self.assertEqual(known_rooms, 2) + self.assertEqual(locally_joined, 2) def test_updating_profile_information_does_not_increase_joined_members_count( self, @@ -647,19 +678,17 @@ def test_room_metrics(self) -> None: """ self._perform_background_initial_update() - self.assertEqual( - REGISTRY.get_sample_value( - "synapse_known_rooms_total", labels={"server_name": self.hs.hostname} - ), - 0.0, - ) - self.assertEqual( - REGISTRY.get_sample_value( - "synapse_locally_joined_rooms_total", - labels={"server_name": self.hs.hostname}, - ), - 0.0, - ) + known_rooms, locally_joined = self._get_gauge_values( + [ + ("synapse_known_rooms_total", {"server_name": self.hs.hostname}), + ( + "synapse_locally_joined_rooms_total", + {"server_name": self.hs.hostname}, + ), + ] + ) + self.assertEqual(known_rooms, 0.0) + self.assertEqual(locally_joined, 0.0) u1 = self.register_user("u1", "pass") u1token = self.login("u1", "pass") @@ -670,19 +699,17 @@ def test_room_metrics(self) -> None: self.helper.leave(r2, u1, tok=u1token) # Check the locally joined rooms metric after creating rooms - self.assertEqual( - REGISTRY.get_sample_value( - "synapse_locally_joined_rooms_total", - labels={"server_name": self.hs.hostname}, - ), - 1, - ) - self.assertEqual( - REGISTRY.get_sample_value( - "synapse_known_rooms_total", labels={"server_name": self.hs.hostname} - ), - 2, - ) + known_rooms, locally_joined = self._get_gauge_values( + [ + ("synapse_known_rooms_total", {"server_name": self.hs.hostname}), + ( + "synapse_locally_joined_rooms_total", + {"server_name": self.hs.hostname}, + ), + ] + ) + self.assertEqual(locally_joined, 1) + self.assertEqual(known_rooms, 2) # Check the stats for both rooms r1stats = self._get_current_stats("room", r1) diff --git a/tests/util/test_batching_queue.py b/tests/util/test_batching_queue.py index 90666f7186..9a0b0d4272 100644 --- a/tests/util/test_batching_queue.py +++ b/tests/util/test_batching_queue.py @@ -28,6 +28,8 @@ from synapse.util.batching_queue import ( BatchingQueue, number_in_flight, + number_of_keys, + number_queued, ) from tests.unittest import HomeserverTestCase @@ -61,27 +63,33 @@ async def _process_queue(self, values: list[str]) -> str: def _get_sample_with_name(self, metric: ObservableGauge, name: str) -> float: """For a prometheus metric get the value of the sample that has a - matching "name" label. + matching "name" label and matching metric name. """ - print(vars(metric)) + # The metric.name attribute gives us the OTel instrument name + metric_name = metric.name + for metric_family in REGISTRY.collect(): + # Check if this metric family corresponds to our metric + # (the family name should match or contain the metric name) + if metric_family.name != metric_name: + continue for sample in metric_family.samples: - if sample.labels.get("name") == name: # and sample.name == metric.name: + if sample.labels.get("name") == name: return sample.value - self.fail("Found no matching sample") + self.fail(f"Found no matching sample for metric={metric_name}, name={name}") def _assert_metrics(self, queued: int, keys: int, in_flight: int) -> None: """Assert that the metrics are correct""" - sample = self._get_sample_with_name(self.queue.number_queued, self.queue._name) + sample = self._get_sample_with_name(number_queued, self.queue._name) self.assertEqual( sample, queued, "number_queued", ) - sample = self._get_sample_with_name(self.queue.number_of_keys, self.queue._name) + sample = self._get_sample_with_name(number_of_keys, self.queue._name) self.assertEqual(sample, keys, "number_of_keys") sample = self._get_sample_with_name(number_in_flight, self.queue._name)