当前位置：首页 > news >正文

AI 推理服务弹性调度与 GPU 资源管理实践

news 2026/6/13 18:11:00

AI 推理服务弹性调度与 GPU 资源管理实践

一、场景痛点：GPU 资源稀缺与弹性需求

随着大语言模型在各行业的广泛应用，GPU 资源的管理和调度成为影响 AI 应用性能和成本的核心问题。与传统 CPU 计算不同，GPU 有以下独特挑战：

资源稀缺：高端 GPU（如 H100、A100）供应紧张，价格昂贵
异构资源：不同型号 GPU 性能差异大，需要智能调度
弹性需求波动：AI 推理请求量波动剧烈，需要快速扩缩容
多任务共享：同一 GPU 上可能运行多个推理任务，需要合理分配

传统的资源管理方式（固定分配、手动扩缩容）已经无法满足 AI 时代的挑战，需要更智能的弹性调度方案。

二、底层机制与原理深度剖析

2.1 GPU 资源调度架构

flowchart TD subgraph 调度层 A[API Gateway] --> B[调度器] B --> C{调度决策} end subgraph GPU 资源层 D[GPU Pool Manager] E[节点1: A100 x4] F[节点2: A100 x4] G[节点3: H100 x2] end subgraph AI 推理服务 H[vLLM Instance 1] I[vLLM Instance 2] J[TensorRT-LLM Instance] end C -->|资源分配| D D --> E D --> F D --> G E --> H E --> I G --> J K[Prometheus] --> B K --> D K -->监控指标 style B fill:#b8d4ff style D fill:#FFE4B5

调度器的核心职责：

资源感知：了解 GPU 的类型、数量、显存、温度等状态
请求路由：将推理请求路由到合适的 GPU 实例
弹性扩缩：根据负载自动调整实例数量
公平分配：在多个租户之间公平分配 GPU 资源

2.2 GPU 调度算法分类

flowchart LR A[调度算法] --> B[基于规则] A --> C[基于队列] A --> D[基于预测] A --> E[基于强化学习] B --> B1[轮询] B --> B2[最少连接] B --> B3[亲和性] C --> C1[优先级队列] C --> C2[公平调度] C --> C3[资源预留] D --> D1[流量预测] D --> D2[容量规划] E --> E1[DeepRM] E --> E2[Decima]

三、生产级代码实现与最佳实践

3.1 GPU 资源管理器

# ==================== GPU 资源管理器 ==================== """ 生产级 GPU 资源管理系统 支持多节点、多 GPU 的资源调度 """ import asyncio import time from dataclasses import dataclass, field from typing import Dict, List, Optional, Set from enum import Enum import threading import psutil import subprocess from collections import defaultdict import logging logger = logging.getLogger(__name__) class GPUState(Enum): IDLE = "idle" # 空闲 ALLOCATED = "allocated" # 已分配 RESERVED = "reserved" # 预留 FAULTY = "faulty" # 故障 @dataclass class GPUInfo: """GPU 信息""" index: int name: str memory_total: int # bytes memory_free: int # bytes utilization: float # 0-1 temperature: float # celsius state: GPUState = GPUState.IDLE allocated_instances: Set[str] = field(default_factory=set) @property def memory_used(self) -> int: return self.memory_total - self.memory_free @property def memory_utilization(self) -> float: return self.memory_used / self.memory_total if self.memory_total > 0 else 0 @property def can_allocate(self) -> bool: return self.state == GPUState.IDLE and len(self.allocated_instances) == 0 class GPUResourceManager: """ GPU 资源管理器 核心功能： 1. GPU 状态监控 2. 资源分配与回收 3. 负载均衡 4. 故障检测 """ def __init__(self, config: 'GPUManagerConfig'): self.config = config self.nodes: Dict[str, List[GPUInfo]] = {} self.instance_to_gpu: Dict[str, tuple] = {} # instance_id -> (node_id, gpu_index) self.gpu_to_instance: Dict[tuple, str] = {} # (node_id, gpu_index) -> instance_id self._lock = threading.RLock() self._monitoring = False self._monitor_task: Optional[asyncio.Task] = None async def start(self): """启动资源管理器""" await self._discover_gpus() self._monitoring = True self._monitor_task = asyncio.create_task(self._monitor_loop()) logger.info("GPU Resource Manager started") async def stop(self): """停止资源管理器""" self._monitoring = False if self._monitor_task: self._monitor_task.cancel() logger.info("GPU Resource Manager stopped") async def _discover_gpus(self): """发现集群中的 GPU 资源""" # 通过 nvidia-smi 或 NVML 发现 GPU try: result = subprocess.run( ['nvidia-smi', '--query-gpu=index,name,memory.total,memory.free,utilization.gpu,temperature.gpu', '--format=csv,noheader,nounits'], capture_output=True, text=True, check=True ) node_id = self._get_node_id() for line in result.stdout.strip().split('\n'): if not line: continue parts = [p.strip() for p in line.split(',')] if len(parts) != 6: continue gpu_info = GPUInfo( index=int(parts[0]), name=parts[1], memory_total=int(parts[2]) * 1024 * 1024, # MB to bytes memory_free=int(parts[3]) * 1024 * 1024, utilization=float(parts[4]) / 100, temperature=float(parts[5]) ) if node_id not in self.nodes: self.nodes[node_id] = [] self.nodes[node_id].append(gpu_info) logger.info(f"Discovered {len(self.nodes.get(node_id, []))} GPUs on node {node_id}") except Exception as e: logger.warning(f"Failed to discover GPUs: {e}") def _get_node_id(self) -> str: """获取节点 ID""" return subprocess.run(['hostname'], capture_output=True, text=True).stdout.strip() async def _monitor_loop(self): """监控循环""" while self._monitoring: try: await self._update_gpu_status() await self._check_gpu_health() await asyncio.sleep(self.config.monitoring_interval) except Exception as e: logger.error(f"Monitoring error: {e}") async def _update_gpu_status(self): """更新 GPU 状态""" try: result = subprocess.run( ['nvidia-smi', '--query-gpu=index,memory.free,utilization.gpu,temperature.gpu', '--format=csv,noheader,nounits'], capture_output=True, text=True, check=True ) node_id = self._get_node_id() gpus = self.nodes.get(node_id, []) for line, gpu in zip(result.stdout.strip().split('\n'), gpus): if not line: continue parts = [p.strip() for p in line.split(',')] if len(parts) != 4: continue with self._lock: gpu.memory_free = int(parts[1]) * 1024 * 1024 gpu.utilization = float(parts[2]) / 100 gpu.temperature = float(parts[3]) except Exception as e: logger.warning(f"Failed to update GPU status: {e}") async def _check_gpu_health(self): """检查 GPU 健康状态""" node_id = self._get_node_id() gpus = self.nodes.get(node_id, []) for gpu in gpus: with self._lock: # 温度过高的 GPU 标记为故障 if gpu.temperature > self.config.max_temperature: if gpu.state != GPUState.FAULTY: logger.warning(f"GPU {gpu.index} temperature too high: {gpu.temperature}°C") gpu.state = GPUState.FAULTY # 显存不足的 GPU 标记为 Reserve if gpu.memory_free < self.config.min_free_memory: if gpu.state != GPUState.RESERVED: logger.warning(f"GPU {gpu.index} low memory: {gpu.memory_free / 1024**2:.0f}MB free") gpu.state = GPUState.RESERVED def allocate_gpu( self, instance_id: str, memory_required: int, preference: Optional[Dict] = None ) -> Optional[tuple]: """ 分配 GPU 资源 返回 (node_id, gpu_index) 或 None """ with self._lock: for node_id, gpus in self.nodes.items(): for gpu in sorted(gpus, key=lambda g: g.memory_free, reverse=True): # 检查是否可用 if gpu.state == GPUState.FAULTY: continue if gpu.memory_free < memory_required: continue if gpu.state == GPUState.RESERVED and ( preference is None or not preference.get('allow_reserved', False) ): continue # 分配 GPU gpu.state = GPUState.ALLOCATED gpu.allocated_instances.add(instance_id) self.instance_to_gpu[instance_id] = (node_id, gpu.index) self.gpu_to_instance[(node_id, gpu.index)] = instance_id logger.info(f"Allocated GPU {node_id}:{gpu.index} to instance {instance_id}") return (node_id, gpu.index) return None def release_gpu(self, instance_id: str) -> bool: """释放 GPU 资源""" with self._lock: if instance_id not in self.instance_to_gpu: logger.warning(f"Instance {instance_id} not allocated") return False node_id, gpu_index = self.instance_to_gpu[instance_id] if node_id in self.nodes: for gpu in self.nodes[node_id]: if gpu.index == gpu_index: gpu.allocated_instances.discard(instance_id) if len(gpu.allocated_instances) == 0: gpu.state = GPUState.IDLE logger.info(f"Released GPU {node_id}:{gpu.index} from instance {instance_id}") break del self.instance_to_gpu[instance_id] del self.gpu_to_instance[(node_id, gpu_index)] return True def get_allocation_summary(self) -> Dict: """获取资源分配摘要""" with self._lock: total_gpus = sum(len(gpus) for gpus in self.nodes.values()) allocated_gpus = sum( 1 for gpus in self.nodes.values() for gpu in gpus if gpu.state == GPUState.ALLOCATED ) return { 'total_gpus': total_gpus, 'allocated_gpus': allocated_gpus, 'idle_gpus': total_gpus - allocated_gpus, 'utilization': allocated_gpus / total_gpus if total_gpus > 0 else 0, 'by_node': { node_id: { 'total': len(gpus), 'allocated': sum(1 for g in gpus if g.state == GPUState.ALLOCATED), 'idle': sum(1 for g in gpus if g.state == GPUState.IDLE), } for node_id, gpus in self.nodes.items() } } @dataclass class GPUManagerConfig: """资源配置""" monitoring_interval: int = 5 # 秒 max_temperature: float = 85.0 # celsius min_free_memory: int = 2 * 1024 * 1024 * 1024 # 2GB

3.2 弹性调度器

# ==================== AI 推理弹性调度器 ==================== """ 基于预测的弹性调度器 支持： 1. 主动扩缩容 2. 流量预测 3. 蓝绿部署 4. 金丝雀发布 """ import asyncio import time from dataclasses import dataclass, field from typing import Dict, List, Optional, Set from collections import deque import logging from datetime import datetime, timedelta logger = logging.getLogger(__name__) @dataclass class ScalingConfig: """扩缩容配置""" min_instances: int = 1 max_instances: int = 10 scale_up_threshold: float = 0.8 # 80% 利用率触发扩容 scale_down_threshold: float = 0.3 # 30% 利用率触发缩容 scale_up_cooldown: int = 60 # 扩容冷却时间（秒） scale_down_cooldown: int = 300 # 缩容冷却时间（秒） target_utilization: float = 0.7 # 目标利用率 @dataclass class InstanceInfo: """推理实例信息""" instance_id: str gpu_node: str gpu_index: int model_name: str memory_required: int current_load: float = 0.0 request_count: int = 0 created_at: datetime = field(default_factory=datetime.now) status: str = "starting" # starting, ready, draining, stopped class ElasticScheduler: """ 弹性调度器 核心功能： 1. 基于利用率的自动扩缩容 2. 基于时间序列的预测性扩容 3. 流量分配与负载均衡 """ def __init__( self, resource_manager: 'GPUResourceManager', scaling_config: ScalingConfig ): self.resource_manager = resource_manager self.scaling_config = scaling_config self.instances: Dict[str, InstanceInfo] = {} self.instances_by_model: Dict[str, Set[str]] = {} # 流量历史（用于预测） self.request_history: deque = deque(maxlen=1000) # 指标收集 self.metrics_history: deque = deque(maxlen=100) # 扩缩容状态 self.last_scale_up_time: datetime = datetime.min self.last_scale_down_time: datetime = datetime.min self._scheduler_task: Optional[asyncio.Task] = None self._running = False async def start(self): """启动调度器""" self._running = True self._scheduler_task = asyncio.create_task(self._scheduler_loop()) logger.info("Elastic Scheduler started") async def stop(self): """停止调度器""" self._running = False if self._scheduler_task: self._scheduler_task.cancel() logger.info("Elastic Scheduler stopped") async def _scheduler_loop(self): """调度循环""" while self._running: try: # 收集指标 await self._collect_metrics() # 预测流量 predicted_load = self._predict_load() # 执行扩缩容决策 await self._execute_scaling_decision(predicted_load) # 负载均衡 await self._rebalance_load() await asyncio.sleep(10) # 每 10 秒检查一次 except Exception as e: logger.error(f"Scheduler error: {e}") async def _collect_metrics(self): """收集实例指标""" total_load = 0.0 total_instances = 0 for instance in self.instances.values(): if instance.status == "ready": # 模拟指标收集（实际应从实例获取） instance.current_load = min(1.0, instance.request_count / 100) total_load += instance.current_load total_instances += 1 if total_instances > 0: avg_utilization = total_load / total_instances self.metrics_history.append({ 'timestamp': datetime.now(), 'utilization': avg_utilization, 'instances': total_instances }) def _predict_load(self) -> float: """基于时间序列预测未来负载""" if len(self.request_history) < 10: return 0.5 # 默认 50% # 简单移动平均预测 recent = list(self.request_history)[-30:] if not recent: return 0.5 avg_load = sum(r['load'] for r in recent) / len(recent) # 检测趋势 if len(recent) >= 60: older = sum(r['load'] for r in recent[-60:-30]) / 30 newer = sum(r['load'] for r in recent[-30:]) / 30 trend = (newer - older) / (older + 1e-6) else: trend = 0 # 预测 predicted = avg_load * (1 + trend * 0.5) return max(0, min(1, predicted)) async def _execute_scaling_decision(self, predicted_load: float): """执行扩缩容决策""" now = datetime.now() current_instances = sum(1 for i in self.instances.values() if i.status == "ready") # 计算目标实例数 target_instances = current_instances # 获取当前平均利用率 current_utilization = 0 if self.metrics_history: current_utilization = self.metrics_history[-1]['utilization'] # 扩容决策 if current_utilization > self.scaling_config.scale_up_threshold: if (now - self.last_scale_up_time).total_seconds() > self.scaling_config.scale_up_cooldown: if current_instances < self.scaling_config.max_instances: target_instances = min( self.scaling_config.max_instances, current_instances + 1 ) self.last_scale_up_time = now logger.info(f"Scaling up: {current_instances} -> {target_instances}") # 缩容决策 elif current_utilization < self.scaling_config.scale_down_threshold: if (now - self.last_scale_down_time).total_seconds() > self.scaling_config.scale_down_cooldown: if current_instances > self.scaling_config.min_instances: target_instances = max( self.scaling_config.min_instances, current_instances - 1 ) self.last_scale_down_time = now logger.info(f"Scaling down: {current_instances} -> {target_instances}") # 执行扩缩容 if target_instances != current_instances: if target_instances > current_instances: await self._scale_up(target_instances - current_instances) else: await self._scale_down(current_instances - target_instances) async def _scale_up(self, count: int): """扩容""" for _ in range(count): instance_id = f"inst_{int(time.time() * 1000)}" # 分配 GPU gpu_allocation = self.resource_manager.allocate_gpu( instance_id=instance_id, memory_required=8 * 1024**3 # 8GB ) if gpu_allocation is None: logger.warning(f"Failed to allocate GPU for new instance") break node_id, gpu_index = gpu_allocation instance = InstanceInfo( instance_id=instance_id, gpu_node=node_id, gpu_index=gpu_index, model_name="default", memory_required=8 * 1024**3 ) self.instances[instance_id] = instance if instance.model_name not in self.instances_by_model: self.instances_by_model[instance.model_name] = set() self.instances_by_model[instance.model_name].add(instance_id) # 异步启动实例 asyncio.create_task(self._start_instance(instance)) async def _scale_down(self, count: int): """缩容""" # 选择最空闲的实例 sorted_instances = sorted( [i for i in self.instances.values() if i.status == "ready"], key=lambda x: x.current_load ) for instance in sorted_instances[:count]: await self._stop_instance(instance) async def _start_instance(self, instance: InstanceInfo): """启动推理实例""" instance.status = "starting" # 模拟启动过程（实际应启动 vLLM 等） await asyncio.sleep(5) instance.status = "ready" logger.info(f"Instance {instance.instance_id} started on {instance.gpu_node}:{instance.gpu_index}") async def _stop_instance(self, instance: InstanceInfo): """停止推理实例""" instance.status = "draining" # 等待现有请求处理完成 await asyncio.sleep(10) # 释放 GPU self.resource_manager.release_gpu(instance.instance_id) # 移除实例 self.instances_by_model[instance.model_name].discard(instance.instance_id) del self.instances[instance.instance_id] logger.info(f"Instance {instance.instance_id} stopped") async def _rebalance_load(self): """负载均衡""" if len(self.request_history) == 0: return # 获取最新请求的模型 latest_request = self.request_history[-1] model_name = latest_request.get('model', 'default') # 选择负载最低的实例 ready_instances = [ i for i in self.instances.values() if i.status == "ready" and i.model_name == model_name ] if not ready_instances: return # 选择最空闲的实例 selected = min(ready_instances, key=lambda x: x.current_load) logger.debug(f"Selected instance {selected.instance_id} with load {selected.current_load}") async def route_request( self, model_name: str, request_data: dict ) -> Optional[str]: """路由请求到合适实例""" self.request_history.append({ 'timestamp': datetime.now(), 'model': model_name, 'load': 0.5 # 简化 }) ready_instances = [ i for i in self.instances.values() if i.status == "ready" and i.model_name == model_name ] if not ready_instances: return None # 简单轮询 return ready_instances[0].instance_id def get_status(self) -> dict: """获取调度器状态""" return { 'total_instances': len(self.instances), 'ready_instances': sum(1 for i in self.instances.values() if i.status == "ready"), 'metrics': { 'avg_utilization': ( self.metrics_history[-1]['utilization'] if self.metrics_history else 0 ), 'request_count': len(self.request_history), }, 'scaling': { 'last_scale_up': self.last_scale_up_time.isoformat(), 'last_scale_down': self.last_scale_down_time.isoformat(), } }

四、边界分析与架构权衡

4.1 GPU 调度策略对比

策略	优点	缺点	适用场景
FIFO	简单	可能导致长等待	批处理
公平调度	公平性好	可能导致资源浪费	多租户
负载均衡	资源利用率高	可能导致请求延迟	在线推理
预测调度	提前扩容	预测不准确时浪费	流量可预测

4.2 弹性调度注意事项

风险	缓解措施
扩容不及时	预测性扩容 + 资源预留
缩容过快	设置最小实例数 + 冷却时间
GPU 碎片化	资源池化 + 动态绑定
故障传播	熔断 + 自动恢复