指标 | 指标含义 | 指标标签(维度) | 指标分类 | 指标类型 | 单位 | 指标周期(单位s) |
instance_cpu_count | 服务实例CPU数量 | instance,resource_type | CPU | Gauge | count | 60 |
instance_gpu_count | 服务实例GPU数量 | instance,resource_type | GPU | Gauge | count | 60 |
instance_cpu_usage | 服务实例CPU使用量 | instance | CPU | Gauge | core | 60 |
instance_user_cpu_usage | 服务实例用户进程CPU使用量 | instance | CPU | Gauge | core | 60 |
instance_system_cpu_usage | 服务实例系统进程CPU使用量 | instance | CPU | Gauge | core | 60 |
instance_cpu_util | 服务实例CPU使用率 | instance | CPU | Gauge | % | 60 |
instance_memory_rss_usage | 服务实例内存使用量 | instance | Memory | Gauge | byte | 60 |
instance_memory_cache_usage | 服务实例内存缓存使用量 | instance | Memory | Gauge | byte | 60 |
instance_memory_total | 服务实例内存总量 | instance | Memory | Gauge | byte | 60 |
instance_memory_util | 服务实例内存使用率 | instance | Memory | Gauge | % | 60 |
instance_response | 服务实例请求数 | instance | Request | Counter | count | 60 |
instance_gpu_util | 服务实例GPU使用率 | instance | GPU | Gauge | % | 60 |
instance_gpu_memory_usage | 服务实例显存使用量 | instance | GPU | Gauge | MiB | 60 |
instance_gpu_memory_total | 服务实例显存总量 | instance | GPU | Gauge | MiB | 60 |
instance_gpu_memory_util | 服务实例显存使用率 | instance | GPU | Gauge | % | 60 |
instance_gpu_memory_bandwidth_limit | 服务实例GPU显存带宽限制 | instance | GPU | Gauge | bytes/second | 60 |
instance_gpu_temperature | 服务实例GPU温度 | instance | GPU | Gauge | °C | 60 |
instance_gpu_slow_temperature | 服务实例GPU降频温度 | instance | GPU | Gauge | °C | 60 |
instance_gpu_shut_temperature | 服务实例GPU关机温度 | instance | GPU | Gauge | °C | 60 |
instance_gpu_nvswitch_error | 服务实例NVSwitch致命错误信息 | instance | GPU | Gauge | count | 60 |
instance_gpu_nvswitch_non_fatal_error | 服务实例NVSwitch非致命错误信息 | instance | GPU | Gauge | count | 60 |
instance_gpu_ecc_total_vol_sbe | 服务实例单比特易失性ECC错误总数 | instance | GPU | Counter | count | 60 |
instance_gpu_ecc_total_vol_dbe | 服务实例双比特易失性ECC错误总数 | instance | GPU | Counter | count | 60 |
instance_gpu_ecc_total_agg_sbe | 服务实例单比特聚合(持久性)ECC错误总数 | instance | GPU | Counter | count | 60 |
instance_gpu_ecc_total_agg_dbe | 服务实例双比特聚合(持久性)ECC错误总数 | instance | GPU | Counter | count | 60 |
instance_gpu_remap_fail | 服务实例行重映射失败次数 | instance | GPU | Gauge | count | 60 |
instance_gpu_remap_pending | 服务实例行重映射待处理次数 | instance | GPU | Gauge | count | 60 |
instance_gpu_pcie_replay_counter | 服务实例PCIe重传计数器 | instance | GPU | Gauge | count | 60 |
instance_gpu_pcie_transmit_measure_by_dcgm | 服务实例通过DCGM测量的PCIe传输速率 | instance | GPU | Gauge | bytes/second | 60 |
instance_gpu_pcie_receive_measure_by_dcgm | 服务实例通过DCGM测量的PCIe接收速率 | instance | GPU | Gauge | bytes/second | 60 |
instance_gpu_graphics_engine_util | 服务实例图形引擎利用率 | instance | GPU | Gauge | ratio (0~1) | 60 |
instance_gpu_sm_util | 服务实例SM(流式多处理器)利用率 | instance | GPU | Gauge | ratio (0~1) | 60 |
instance_gpu_dram_active | 服务实例设备内存接口活跃发送或接收数据的比率 | instance | GPU | Gauge | ratio (0~1) | 60 |
instance_gpu_tensortflops_used | 服务实例Tensor管道使用的Tflops | instance | GPU | Gauge | count | 60 |
instance_gpu_memory_bandwidth_used | 服务实例内存带宽使用量 | instance | GPU | Gauge | bytes/second | 60 |
instance_gpu_sm_clock | 服务实例SM时钟频率 | instance | GPU | Gauge | MHz | 60 |
instance_gpu_sm_occupancy | 服务实例SM上驻留的Warp数量比例 | instance | GPU | Gauge | ratio (0~1) | 60 |
instance_gpu_fp32tflops_used | 服务实例FP32管道使用的Tflops | instance | GPU | Gauge | count | 60 |
instance_gpu_fp16tflops_used | 服务实例FP16管道使用的Tflops | instance | GPU | Gauge | count | 60 |
instance_gpu_pipe_fp32_active | 服务实例FP32管道活跃周期比例 | instance | GPU | Gauge | ratio (0~1) | 60 |
instance_gpu_pipe_fp16_active | 服务实例FP16管道活跃周期比例 | instance | GPU | Gauge | ratio (0~1) | 60 |
instance_gpu_pipe_tensor_active | 服务实例Tensor管道活跃周期比例 | instance | GPU | Gauge | ratio (0~1) | 60 |
instance_gpu_power_usage | 服务实例GPU功耗 | instance | GPU | Gauge | watts | 60 |
instance_accelerator_power_usage | 服务实例加速器功耗 | instance | GPU | Gauge | milliwatts | 60 |
instance_gpu_mem_copy_util | 服务实例内存复制利用率 | instance | GPU | Gauge | % | 60 |
instance_gpu_health_count | 服务实例GPU健康状态计数的总和 | instance | GPU | Gauge | count | 60 |
instance_gpu_lost_card_num | 服务实例VM中丢失显卡数量 | instance | GPU | Gauge | count | 60 |
instance_gpu_driver_hang | 服务实例驱动挂起次数 | instance | GPU | Gauge | count | 60 |
instance_gpu_profile_status | 服务实例Amperf性能分析状态 | instance | GPU | Gauge | count | 60 |
instance_gpu_uncorrectable_ecc | 服务实例无法纠正的ECC错误数量 | instance | GPU | Gauge | count | 60 |
instance_gpu_xid_cnt | 服务实例Xid错误数 | instance | GPU | Gauge | count | 60 |
instance_gpu_fatal_xid_error | 服务实例致命Xid错误数 | instance | GPU | Gauge | count | 60 |
instance_gpu_kernel_err_cnt | 服务实例来自内核日志的非Xid错误数 | instance | GPU | Gauge | count | 60 |
instance_qps | 服务实例每秒请求数 | instance | Request | Gauge | count | 60 |
instance_traffic | 服务实例流量 | instance | Request | Gauge | bps | 60 |
instance_avg_latency | 服务实例平均请求响应时间 | instance | Request | Gauge | ms | 60 |
instance_tpxx_latency | 服务实例TOPXX请求响应时间 | instance | Request | Gauge | ms | 60 |
instance_traffic_in | 服务实例入流量 | instance | Request | Gauge | bps | 60 |
instance_traffic_out | 服务实例出流量 | instance | Request | Gauge | bps | 60 |
instance_tcp_connections | 服务实例TCP连接数 | instance | Request | Gauge | count | 60 |
service_replicas | 服务实例数 | service | Meta | Gauge | count | 60 |
service_pending_replicas | 待执行的服务实例数 | service | Meta | Gauge | count | 60 |
service_available_replicas | 运行中的服务实例数 | service | Meta | Gauge | count | 60 |
service_replicas_with_resource_type | 服务实例数(带有资源类型标签) | service | Meta | Gauge | count | 60 |
service_cpu_count | 服务占用CPU总数 | service | CPU | Gauge | core | 60 |
service_cpu_count_with_resource_type | 服务CPU总数(带有资源类型标签) | service | CPU | Gauge | core | 60 |
service_gpu_count_with_resource_type | 服务GPU总数(带有资源类型标签) | service | GPU | Gauge | count | 60 |
service_rps_status_2xx | 服务2XX响应请求数 | service | Request | Gauge | count | 60 |
service_rps_status_4xx | 服务4XX响应请求数 | service | Request | Gauge | count | 60 |
service_rps_status_5xx | 服务5XX响应请求数 | service | Request | Gauge | count | 60 |
service_rps_status_2xx_ratio | 服务2XX响应请求数占比 | service | Request | Gauge | % | 60 |
service_rps_status_4xx_ratio | 服务4XX响应请求数占比 | service | Request | Gauge | % | 60 |
service_rps_status_5xx_ratio | 服务5XX响应请求数占比 | service | Request | Gauge | % | 60 |
service_qps | 服务每秒请求数 | service | Request | Gauge | count | 60 |
service_avg_latency | 服务平均请求响应时间 | service | Request | Gauge | ms | 60 |
service_tpxx_latency | 服务TOPXX请求响应时间 | service | Request | Gauge | ms | 60 |
service_tp100_latency | 服务TOP100请求响应时间 | service | Request | Gauge | ms | 60 |
service_traffic_in | 服务入流量 | service | Network | Gauge | bps | 60 |
service_traffic_out | 服务出流量 | service | Network | Gauge | bps | 60 |
service_cpu_usage | 服务CPU使用量 | service | CPU | Gauge | core | 60 |
service_user_cpu_usage | 服务用户进程CPU使用量 | service | CPU | Gauge | core | 60 |
service_system_cpu_usage | 服务系统进程CPU使用量 | service | CPU | Gauge | core | 60 |
service_cpu_util | 服务CPU使用率 | service | CPU | Gauge | % | 60 |
service_memory_rss_usage | 服务内存使用量 | service | Memory | Gauge | byte | 60 |
service_memory_cache_usage | 服务内存缓存使用量 | service | Memory | Gauge | byte | 60 |
service_memory_total | 服务内存总量 | service | Memory | Gauge | byte | 60 |
service_memory_util | 服务内存使用率 | service | Memory | Gauge | % | 60 |
service_gpu_util | 服务GPU使用率 | service | GPU | Gauge | % | 60 |
service_gpu_memory_usage | 服务显存使用量 | service | GPU | Gauge | MiB | 60 |
service_gpu_memory_total | 服务显存总量 | service | GPU | Gauge | MiB | 60 |
service_gpu_memory_util | 服务显存使用率 | service | GPU | Gauge | % | 60 |
service_gpu_memory_bandwidth_limit | 服务GPU显存带宽限制 | service | GPU | Gauge | bytes/second | 60 |
service_gpu_temperature | 服务GPU温度 | service | GPU | Gauge | °C | 60 |
service_gpu_slow_temperature | 服务GPU降频温度 | service | GPU | Gauge | °C | 60 |
service_gpu_shut_temperature | 服务GPU关机温度 | service | GPU | Gauge | °C | 60 |
service_gpu_nvswitch_error | 服务NVSwitch致命错误信息 | service | GPU | Gauge | count | 60 |
service_gpu_nvswitch_non_fatal_error | 服务NVSwitch非致命错误信息 | service | GPU | Gauge | count | 60 |
service_gpu_ecc_total_vol_sbe | 服务单比特易失性ECC错误总数 | service | GPU | Counter | count | 60 |
service_gpu_ecc_total_vol_dbe | 服务双比特易失性ECC错误总数 | service | GPU | Counter | count | 60 |
service_gpu_ecc_total_agg_sbe | 服务单比特聚合(持久性)ECC错误总数 | service | GPU | Counter | count | 60 |
service_gpu_ecc_total_agg_dbe | 服务双比特聚合(持久性)ECC错误总数 | service | GPU | Counter | count | 60 |
service_gpu_remap_fail | 服务行重映射失败次数 | service | GPU | Gauge | count | 60 |
service_gpu_remap_pending | 服务行重映射待处理次数 | service | GPU | Gauge | count | 60 |
service_gpu_pcie_replay_counter | 服务PCIe重传计数器 | service | GPU | Gauge | count | 60 |
service_gpu_pcie_transmit_measure_by_dcgm | 服务通过DCGM测量的PCIe传输速率 | service | GPU | Gauge | bytes/second | 60 |
service_gpu_pcie_receive_measure_by_dcgm | 服务通过DCGM测量的PCIe接收速率 | service | GPU | Gauge | bytes/second | 60 |
service_gpu_graphics_engine_util | 服务图形引擎利用率 | service | GPU | Gauge | ratio (0~1) | 60 |
service_gpu_sm_util | 服务SM(流式多处理器)利用率 | service | GPU | Gauge | ratio (0~1) | 60 |
service_gpu_dram_active | 服务设备内存接口活跃发送或接收数据的比率 | service | GPU | Gauge | ratio (0~1) | 60 |
service_gpu_tensortflops_used | 服务Tensor管道使用的Tflops | service | GPU | Gauge | count | 60 |
service_gpu_memory_bandwidth_used | 服务内存带宽使用量 | service | GPU | Gauge | bytes/second | 60 |
service_gpu_sm_clock | 服务SM时钟频率 | service | GPU | Gauge | MHz | 60 |
service_gpu_sm_occupancy | 服务SM上驻留的Warp数量比例 | service | GPU | Gauge | ratio (0~1) | 60 |
service_gpu_fp32tflops_used | 服务FP32管道使用的Tflops | service | GPU | Gauge | count | 60 |
service_gpu_fp16tflops_used | 服务FP16管道使用的Tflops | service | GPU | Gauge | count | 60 |
service_gpu_pipe_fp32_active | 服务FP32管道活跃周期比例 | service | GPU | Gauge | ratio (0~1) | 60 |
service_gpu_pipe_fp16_active | 服务FP16管道活跃周期比例 | service | GPU | Gauge | ratio (0~1) | 60 |
service_gpu_pipe_tensor_active | 服务Tensor管道活跃周期比例 | service | GPU | Gauge | ratio (0~1) | 60 |
service_gpu_power_usage | 服务GPU功耗 | service | GPU | Gauge | watts | 60 |
service_accelerator_power_usage | 服务加速器功耗 | service | GPU | Gauge | milliwatts | 60 |
service_gpu_mem_copy_util | 服务内存复制利用率 | service | GPU | Gauge | % | 60 |
service_gpu_health_count | 服务GPU健康状态计数的总和 | service | GPU | Gauge | count | 60 |
service_gpu_lost_card_num | 服务VM中丢失显卡数量 | service | GPU | Gauge | count | 60 |
service_gpu_driver_hang | 服务驱动挂起次数 | service | GPU | Gauge | count | 60 |
service_gpu_profile_status | 服务Amperf性能分析状态 | service | GPU | Gauge | count | 60 |
service_gpu_uncorrectable_ecc | 服务无法纠正的ECC错误数量 | service | GPU | Gauge | count | 60 |
service_gpu_xid_cnt | 服务Xid错误数 | service | GPU | Gauge | count | 60 |
service_gpu_fatal_xid_error | 服务致命Xid错误数 | service | GPU | Gauge | count | 60 |
service_gpu_kernel_err_cnt | 服务来自内核日志的非Xid错误数 | service | GPU | Gauge | count | 60 |
service_tcp_connections | 服务TCP连接数 | service | Network | Gauge | count | 60 |
service_gateway_requests | llm-gateway:gateway当前接收到的请求数 | service | Request | Gauge | count | 60 |
service_gateway_pending_requests | llm-gateway:当前缓存在gateway中的请求数 | service | Request | Gauge | count | 60 |
service_llm_ttft_max | llm-gateway: llm流式请求的首包延时的最大值 | service | Request | Gauge | time | 60 |
service_llm_ttft_min | llm-gateway: llm流式请求的首包延时的最小值 | service | Request | Gauge | time | 60 |
service_llm_ttft_mean | llm-gateway: llm流式请求的首包延时的平均值 | service | Request | Gauge | time | 60 |
service_llm_ttft_percent | llm-gateway: llm流式请求的首包延时的分位值 | service | Request | Gauge | time | 60 |
service_llm_tpot_max | llm-gateway: llm流式请求的每包延时的最大值 | service | Request | Gauge | time | 60 |
service_llm_tpot_min | llm-gateway: llm流式请求的每包延时的最小值 | service | Request | Gauge | time | 60 |
service_llm_tpot_mean | llm-gateway: llm流式请求的每包延时的平均值 | service | Request | Gauge | time | 60 |
service_llm_tpot_percent | llm-gateway: llm流式请求的每包延时的分位值 | service | Request | Gauge | time | 60 |
service_endpoint_llm_waiting_requests | llm-gateway: llm推理引擎内部正在排队等待的请求数 | service | Request | Gauge | count | 60 |
service_endpoint_llm_running_requests | llm-gateway: llm推理引擎内部正在运行处理的请求数 | service | Request | Gauge | count | 60 |
service_endpoint_llm_gpu_cache_usage | llm-gateway: llm推理引擎gpu kv-cache的使用率 | service | Request | Gauge | count | 60 |
service_endpoint_llm_tps_in | llm-gateway: llm引擎每秒输入的token数 | service | Request | Gauge | count | 60 |
service_endpoint_llm_tps_out | llm-gateway: llm引擎每秒输出的token数 | service | Request | Gauge | count | 60 |
resource_instance_cpu_util | 资源组实例CPU使用率 | instance_id | Resource Instance | Gauge | % | 60 |
resource_instance_memory_total | 资源组实例内存总量 | instance_id | Resource Instance | Gauge | byte | 60 |
resource_instance_memory_used | 资源组实例内存使用量 | instance_id | Resource Instance | Gauge | byte | 60 |
resource_instance_memory_util | 资源组实例内存使用率 | instance_id | Resource Instance | Gauge | % | 60 |
resource_instance_memory_cache | 资源组实例内存缓存使用量 | instance_id | Resource Instance | Gauge | byte | 60 |
resource_instance_memory_free | 资源组实例内存空余量 | instance_id | Resource Instance | Gauge | byte | 60 |
resource_instance_traffic_in | 资源组实例入流量 | instance_id | Resource Instance | Gauge | bytes/second | 60 |
resource_instance_traffic_out | 资源组实例出流量 | instance_id | Resource Instance | Gauge | bytes/second | 60 |
resource_instance_disk_used | 资源组实例硬盘使用量 | instance_id | Resource Instance | Gauge | byte | 60 |
resource_instance_disk_total | 资源组实例硬盘总量 | instance_id | Resource Instance | Gauge | byte | 60 |
resource_instance_disk_util | 资源组实例硬盘使用率 | instance_id | Resource Instance | Gauge | % | 60 |
resource_instance_tcp_established | 资源组实例TCP稳定链接数 | instance_id | Resource Instance | Gauge | count | 60 |
resource_instance_tcp_time_wait | 资源组实例TCP等待链接数 | instance_id | Resource Instance | Gauge | count | 60 |
resource_instance_gpu_util | 资源组实例GPU使用率 | instance_id | Resource Instance | Gauge | % | 60 |
resource_instance_gpu_memory_usage | 资源组实例显存使用量 | instance_id | Resource Instance | Gauge | MiB | 60 |
resource_instance_gpu_memory_total | 资源组实例显存总量 | instance_id | Resource Instance | Gauge | MiB | 60 |
resource_instance_gpu_memory_util | 资源组实例显存使用率 | instance_id | Resource Instance | Gauge | % | 60 |
resource_cpu_util | 资源组CPU使用率 | resource | Resource | Gauge | % | 60 |
resource_memory_total | 资源组内存总量 | resource | Resource | Gauge | byte | 60 |
resource_memory_used | 资源组内存使用量 | resource | Resource | Gauge | byte | 60 |
resource_memory_util | 资源组内存使用率 | resource | Resource | Gauge | % | 60 |
resource_memory_cache | 资源组内存缓存使用量 | resource | Resource | Gauge | byte | 60 |
resource_memory_free | 资源组内存空余量 | resource | Resource | Gauge | byte | 60 |
resource_traffic_in | 资源组入流量 | resource | Resource | Gauge | bytes/second | 60 |
resource_traffic_out | 资源组出流量 | resource | Resource | Gauge | bytes/second | 60 |
resource_disk_used | 资源组硬盘使用量 | resource | Resource | Gauge | byte | 60 |
resource_disk_total | 资源组硬盘总量 | resource | Resource | Gauge | byte | 60 |
resource_disk_util | 资源组硬盘使用率 | resource | Resource | Gauge | % | 60 |
resource_tcp_established | 资源组TCP稳定链接数 | resource | Resource | Gauge | count | 60 |
resource_tcp_time_wait | 资源组TCP等待链接数 | resource | Resource | Gauge | count | 60 |
resource_gpu_util | 资源组GPU使用率 | resource | Resource | Gauge | % | 60 |
resource_gpu_memory_usage | 资源组显存使用量 | resource | Resource | Gauge | MiB | 60 |
resource_gpu_memory_total | 资源组显存总量 | resource | Resource | Gauge | MiB | 60 |
resource_gpu_memory_util | 资源组显存使用率 | resource | Resource | Gauge | % | 60 |