Linux sch_fq公平队列FQ流分类与credit机制
Fair Queue(FQ)qdisc位于net/sched/sch_fq.c,核心目标是每个流(flow)一个FIFO队列,按轮询(DRR, Deficit Round Robin)方式调度,保证各流间的公平性,同时支持 pacing 和 per-flow 限速。
FQ的流分类基于sk_buff的hash值,默认使用内核计算的skb->hash。fq_classify函数将报文分配到对应流:
static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct fq_flow *f;
u32 hash;
if (skb->protocol == htons(ETH_P_IP) ||
skb->protocol == htons(ETH_P_IPV6)) {
hash = skb_get_hash_perturb(skb, q->perturbation);
} else {
hash = skb->hash;
if (!hash)
hash = (u32)(unsigned long)skb_dst(skb) ^
skb->sk_hash;
}
hash = reciprocal_scale(hash, FQ_HASH_SIZE);
f = &q->flows[hash];
if (f->flowchain.prev == NULL) {
struct fq_flow *new_flow;
new_flow = fq_find_fitting_flow(q, skb, hash);
if (new_flow)
return new_flow;
if (f->qlen > 1 || f->stats.stoll > q->flow_refill)
return &q->internal;
f->fq_tin = skb_find_txq(skb, q->flow_max_rate ?
&q->rate_limiting_struct : NULL);
}
return f;
}
每个flow通过定时器进行pacing控制。fq_dequeue是核心调度函数:
static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct fq_flow *f;
struct sk_buff *skb;
u32 now = ktime_get_ns();
s64 credit;
f = list_first_entry_or_null(&q->new_flows, struct fq_flow,
flowchain);
if (!f) {
f = list_first_entry_or_null(&q->old_flows, struct fq_flow,
flowchain);
if (!f)
return NULL;
}
if (f->time_next_packet > now) {
if (!q->timer_active) {
q->timer_active = true;
hrtimer_start(&q->fq_timer,
ns_to_ktime(f->time_next_packet - now),
HRTIMER_MODE_REL_PINNED);
}
return NULL;
}
credit = f->credit;
skb = f->head;
if (skb) {
credit -= skb->len;
if (credit < 0 && !q->rate_enable) {
return NULL;
}
__skb_unlink(skb, &f->queue);
sch->q.qlen--;
f->credit = credit;
if (f->credit <= 0) {
f->credit += q->quantum;
list_move_tail(&f->flowchain, &q->old_flows);
}
}
if (f->credit > 0 && f->qlen > 0)
list_move_tail(&f->flowchain, &q->new_flows);
else if (f->qlen == 0)
list_del_init(&f->flowchain);
if (q->rate_enable) {
f->time_next_packet = now +
max_t(u64, q->flow_max_rate ?
div64_u64(skb->len * 1000ULL * NSEC_PER_USEC,
q->flow_max_rate) : 0,
skb->len * q->pacing_divider);
}
return skb;
}
credit机制是DRR的核心。每个flow拥有一个credit计数器,初始值为quantum。每次从该flow出队一个报文,减去其长度。当credit变为负数(或零以下)时,该flow被移到old_flows链表,并补充一个quantum的credit。调度器优先服务new_flows链表中的flow,只有new_flows为空时才处理old_flows。这种设计确保新产生的active flow不会被old_flows中的flow饿死。
quantum参数的默认值在fq_change函数中设定:
static int fq_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct tc_fq_qopt *ctl = nla_data(opt);
if (ctl->quantum)
q->quantum = ctl->quantum;
else
q->quantum = 2 * psched_mtu(qdisc_dev(sch));
if (ctl->initial_quantum)
q->initial_quantum = ctl->initial_quantum;
else
q->initial_quantum = 0;
if (ctl->maxrate) {
q->flow_max_rate = ctl->maxrate;
q->rate_enable = 1;
}
if (ctl->pacing) {
q->pacing_divisor = ctl->pacing;
}
}
在credit计算中,fq_dequeue对每个flow出队后重新计算credit:
static void fq_flow_add_to_list(struct fq_sched_data *q,
struct fq_flow *f)
{
if (f->credit <= 0) {
f->credit += q->quantum;
list_add_tail(&f->flowchain, &q->old_flows);
} else {
list_add_tail(&f->flowchain, &q->new_flows);
}
}
flow从new_flows转移到old_flows的条件是credit耗尽,转移后补充quantum credit。但如果补充后credit仍为正,则留在new_flows继续参与调度。这种设计保证credit用尽的flow暂时让出调度机会,实现按字节严格公平。
per-flow的pacing依靠fq->time_next_packet字段,hrtimer到期后才会调用fq_dequeue,控制报文发送速率。timer回调函数为fq_timer:
static enum hrtimer_restart fq_timer(struct hrtimer *timer)
{
struct fq_sched_data *q = container_of(timer, struct fq_sched_data,
fq_timer);
struct Qdisc *sch = q->sch;
struct net_device *dev = qdisc_dev(sch);
q->timer_active = false;
__netif_schedule(sch);
return HRTIMER_NORESTART;
}
timer仅触发一次调度,fq_dequeue在必要时重新启新timer。这种defer机制将pacing粒度控制在纳秒级,适合高速网卡下精确流量整形。