Diffstat (limited to 'net/sched')
-rw-r--r-- | net/sched/Kconfig        |  22
-rw-r--r-- | net/sched/Makefile       |   3
-rw-r--r-- | net/sched/cls_route.c    |   2
-rw-r--r-- | net/sched/cls_u32.c      |  12
-rw-r--r-- | net/sched/em_meta.c      |   4
-rw-r--r-- | net/sched/sch_api.c      |  36
-rw-r--r-- | net/sched/sch_cbq.c      |   9
-rw-r--r-- | net/sched/sch_choke.c    | 688
-rw-r--r-- | net/sched/sch_drr.c      |   2
-rw-r--r-- | net/sched/sch_dsmark.c   |   2
-rw-r--r-- | net/sched/sch_fifo.c     |  48
-rw-r--r-- | net/sched/sch_generic.c  |  26
-rw-r--r-- | net/sched/sch_hfsc.c     |   4
-rw-r--r-- | net/sched/sch_htb.c      |  14
-rw-r--r-- | net/sched/sch_mq.c       |   1
-rw-r--r-- | net/sched/sch_mqprio.c   |  23
-rw-r--r-- | net/sched/sch_multiq.c   |   2
-rw-r--r-- | net/sched/sch_netem.c    | 406
-rw-r--r-- | net/sched/sch_prio.c     |   2
-rw-r--r-- | net/sched/sch_red.c      |  11
-rw-r--r-- | net/sched/sch_sfb.c      | 709
-rw-r--r-- | net/sched/sch_sfq.c      |  54
-rw-r--r-- | net/sched/sch_tbf.c      |   4
-rw-r--r-- | net/sched/sch_teql.c     |   3
24 files changed, 1943 insertions, 144 deletions
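
This series adds two new AQM schedulers (CHOKe in sch_choke.c and Stochastic Fair Blue in sch_sfb.c) plus correlated loss models for netem. For orientation before the patch body, here is a minimal user-space sketch (not part of the patch) of the CHOKe decision that sch_choke.c implements in the kernel: once the average queue size exceeds the minimum RED threshold, a packet is drawn at random from the queue and compared with the arriving packet, and if both belong to the same flow, both are dropped. The flow-id representation, the fixed-size array, and all names here are simplified assumptions.

/* Illustrative user-space model of the CHOKe enqueue decision; all names
 * and the flow-id representation are simplified assumptions.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define QLEN	64

static unsigned int queue[QLEN];	/* flow id of each queued packet */
static unsigned int qlen;

/* Returns true if the packet was admitted, false if it was dropped. */
static bool choke_admit(unsigned int flow, unsigned int qavg,
			unsigned int qth_min)
{
	if (qavg > qth_min && qlen > 0) {
		unsigned int victim = rand() % qlen;

		if (queue[victim] == flow) {
			/* Same flow: drop the random victim and the new packet.
			 * (The kernel leaves a hole instead of compacting.)
			 */
			queue[victim] = queue[--qlen];
			return false;
		}
		/* Different flow: fall through to normal RED mark/drop. */
	}
	if (qlen >= QLEN)
		return false;	/* hard queue limit */
	queue[qlen++] = flow;
	return true;
}

int main(void)
{
	unsigned int i;

	/* One unresponsive flow (id 1) mixed with occasional other flows. */
	for (i = 0; i < 1000; i++)
		choke_admit(i % 4 ? 1 : 100 + i, qlen, 8);

	printf("packets still queued: %u (limit %u)\n", qlen, QLEN);
	return 0;
}

The kernel implementation in sch_choke.c below additionally keeps dropped slots as holes in a power-of-two ring, computes the average queue size with the shared RED estimator (red_calc_qavg), and can ECN-mark instead of dropping when the ECN flag is set.
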
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index e318f45..a7a5583 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -126,6 +126,17 @@ config NET_SCH_RED To compile this code as a module, choose M here: the module will be called sch_red. +config NET_SCH_SFB + tristate "Stochastic Fair Blue (SFB)" + ---help--- + Say Y here if you want to use the Stochastic Fair Blue (SFB) + packet scheduling algorithm. + + See the top of <file:net/sched/sch_sfb.c> for more details. + + To compile this code as a module, choose M here: the + module will be called sch_sfb. + config NET_SCH_SFQ tristate "Stochastic Fairness Queueing (SFQ)" ---help--- @@ -217,6 +228,17 @@ config NET_SCH_MQPRIO If unsure, say N. +config NET_SCH_CHOKE + tristate "CHOose and Keep responsive flow scheduler (CHOKE)" + help + Say Y here if you want to use the CHOKe packet scheduler (CHOose + and Keep for responsive flows, CHOose and Kill for unresponsive + flows). This is a variation of RED which trys to penalize flows + that monopolize the queue. + + To compile this code as a module, choose M here: the + module will be called sch_choke. + config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT diff --git a/net/sched/Makefile b/net/sched/Makefile index 26ce681..2e77b8d 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_NET_SCH_RED) += sch_red.o obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o +obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o @@ -33,6 +34,8 @@ obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o +obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o + obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o obj-$(CONFIG_NET_CLS_FW) += cls_fw.o diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index d580cdf..a9079053 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -143,7 +143,7 @@ static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp, if (head == NULL) goto old_method; - iif = ((struct rtable *)dst)->fl.iif; + iif = ((struct rtable *)dst)->rt_iif; h = route4_fastmap_hash(id, iif); if (id == head->fastmap[h].id && diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 966920c..3b93fc0 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -134,12 +134,12 @@ next_knode: for (i = n->sel.nkeys; i > 0; i--, key++) { int toff = off + key->off + (off2 & key->offmask); - __be32 *data, _data; + __be32 *data, hdata; if (skb_headroom(skb) + toff > INT_MAX) goto out; - data = skb_header_pointer(skb, toff, 4, &_data); + data = skb_header_pointer(skb, toff, 4, &hdata); if (!data) goto out; if ((*data ^ key->val) & key->mask) { @@ -187,10 +187,10 @@ check_terminal: ht = n->ht_down; sel = 0; if (ht->divisor) { - __be32 *data, _data; + __be32 *data, hdata; data = skb_header_pointer(skb, off + n->sel.hoff, 4, - &_data); + &hdata); if (!data) goto out; sel = ht->divisor & u32_hash_fold(*data, &n->sel, @@ -202,11 +202,11 @@ check_terminal: if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) { off2 = n->sel.off + 3; if (n->sel.flags & TC_U32_VAROFFSET) { - __be16 *data, _data; + __be16 *data, hdata; data = skb_header_pointer(skb, off + n->sel.offoff, - 2, &_data); + 2, &hdata); if (!data) goto out; off2 
+= ntohs(n->sel.offmask & *data) >> diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index a889d09..a4de67e 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -264,7 +264,7 @@ META_COLLECTOR(int_rtiif) if (unlikely(skb_rtable(skb) == NULL)) *err = -1; else - dst->value = skb_rtable(skb)->fl.iif; + dst->value = skb_rtable(skb)->rt_iif; } /************************************************************************** @@ -401,7 +401,7 @@ META_COLLECTOR(int_sk_sndbuf) META_COLLECTOR(int_sk_alloc) { SKIP_NONLOCAL(skb); - dst->value = skb->sk->sk_allocation; + dst->value = (__force int) skb->sk->sk_allocation; } META_COLLECTOR(int_sk_route_caps) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 36ac0ec..7490f3f 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -398,6 +398,11 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt) return stab; } +static void stab_kfree_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct qdisc_size_table, rcu)); +} + void qdisc_put_stab(struct qdisc_size_table *tab) { if (!tab) @@ -407,7 +412,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab) if (--tab->refcnt == 0) { list_del(&tab->list); - kfree(tab); + call_rcu_bh(&tab->rcu, stab_kfree_rcu); } spin_unlock(&qdisc_stab_lock); @@ -430,7 +435,7 @@ nla_put_failure: return -1; } -void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab) +void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab) { int pkt_len, slot; @@ -456,7 +461,7 @@ out: pkt_len = 1; qdisc_skb_cb(skb)->pkt_len = pkt_len; } -EXPORT_SYMBOL(qdisc_calculate_pkt_len); +EXPORT_SYMBOL(__qdisc_calculate_pkt_len); void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc) { @@ -473,7 +478,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, timer); - wd->qdisc->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(wd->qdisc); __netif_schedule(qdisc_root(wd->qdisc)); return HRTIMER_NORESTART; @@ -495,7 +500,7 @@ void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) &qdisc_root_sleeping(wd->qdisc)->state)) return; - wd->qdisc->flags |= TCQ_F_THROTTLED; + qdisc_throttled(wd->qdisc); time = ktime_set(0, 0); time = ktime_add_ns(time, PSCHED_TICKS2NS(expires)); hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS); @@ -505,7 +510,7 @@ EXPORT_SYMBOL(qdisc_watchdog_schedule); void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) { hrtimer_cancel(&wd->timer); - wd->qdisc->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(wd->qdisc); } EXPORT_SYMBOL(qdisc_watchdog_cancel); @@ -835,7 +840,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, err = PTR_ERR(stab); goto err_out4; } - sch->stab = stab; + rcu_assign_pointer(sch->stab, stab); } if (tca[TCA_RATE]) { spinlock_t *root_lock; @@ -875,7 +880,7 @@ err_out4: * Any broken qdiscs that would require a ops->reset() here? * The qdisc was never in action so it shouldn't be necessary. 
*/ - qdisc_put_stab(sch->stab); + qdisc_put_stab(rtnl_dereference(sch->stab)); if (ops->destroy) ops->destroy(sch); goto err_out3; @@ -883,7 +888,7 @@ err_out4: static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) { - struct qdisc_size_table *stab = NULL; + struct qdisc_size_table *ostab, *stab = NULL; int err = 0; if (tca[TCA_OPTIONS]) { @@ -900,8 +905,9 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) return PTR_ERR(stab); } - qdisc_put_stab(sch->stab); - sch->stab = stab; + ostab = rtnl_dereference(sch->stab); + rcu_assign_pointer(sch->stab, stab); + qdisc_put_stab(ostab); if (tca[TCA_RATE]) { /* NB: ignores errors from replace_estimator @@ -1180,6 +1186,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; + struct qdisc_size_table *stab; nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); tcm = NLMSG_DATA(nlh); @@ -1195,7 +1202,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, goto nla_put_failure; q->qstats.qlen = q->q.qlen; - if (q->stab && qdisc_dump_stab(skb, q->stab) < 0) + stab = rtnl_dereference(q->stab); + if (stab && qdisc_dump_stab(skb, stab) < 0) goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, @@ -1664,12 +1672,12 @@ int tc_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) { int err = 0; - __be16 protocol; #ifdef CONFIG_NET_CLS_ACT + __be16 protocol; struct tcf_proto *otp = tp; reclassify: -#endif protocol = skb->protocol; +#endif err = tc_classify_compat(skb, tp, res); #ifdef CONFIG_NET_CLS_ACT diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 4aaf44c..24d94c0 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -351,7 +351,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) { int toplevel = q->toplevel; - if (toplevel > cl->level && !(cl->q->flags & TCQ_F_THROTTLED)) { + if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) { psched_time_t now; psched_tdiff_t incr; @@ -391,7 +391,6 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) ret = qdisc_enqueue(skb, cl->q); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; - qdisc_bstats_update(sch, skb); cbq_mark_toplevel(q, cl); if (!cl->next_alive) cbq_activate_class(cl); @@ -625,7 +624,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS); } - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(sch); __netif_schedule(qdisc_root(sch)); return HRTIMER_NORESTART; } @@ -650,7 +649,6 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) ret = qdisc_enqueue(skb, cl->q); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; - qdisc_bstats_update(sch, skb); if (!cl->next_alive) cbq_activate_class(cl); return 0; @@ -973,8 +971,9 @@ cbq_dequeue(struct Qdisc *sch) skb = cbq_dequeue_1(sch); if (skb) { + qdisc_bstats_update(sch, skb); sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(sch); return skb; } diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c new file mode 100644 index 0000000..06afbae --- /dev/null +++ b/net/sched/sch_choke.c @@ -0,0 +1,688 @@ +/* + * net/sched/sch_choke.c CHOKE scheduler + * + * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com> + * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public 
License + * version 2 as published by the Free Software Foundation. + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/reciprocal_div.h> +#include <linux/vmalloc.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> +#include <net/red.h> +#include <linux/ip.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> + +/* + CHOKe stateless AQM for fair bandwidth allocation + ================================================= + + CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for + unresponsive flows) is a variant of RED that penalizes misbehaving flows but + maintains no flow state. The difference from RED is an additional step + during the enqueuing process. If average queue size is over the + low threshold (qmin), a packet is chosen at random from the queue. + If both the new and chosen packet are from the same flow, both + are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it + needs to access packets in queue randomly. It has a minimal class + interface to allow overriding the builtin flow classifier with + filters. + + Source: + R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless + Active Queue Management Scheme for Approximating Fair Bandwidth Allocation", + IEEE INFOCOM, 2000. + + A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial + Characteristics", IEEE/ACM Transactions on Networking, 2004 + + */ + +/* Upper bound on size of sk_buff table (packets) */ +#define CHOKE_MAX_QUEUE (128*1024 - 1) + +struct choke_sched_data { +/* Parameters */ + u32 limit; + unsigned char flags; + + struct red_parms parms; + +/* Variables */ + struct tcf_proto *filter_list; + struct { + u32 prob_drop; /* Early probability drops */ + u32 prob_mark; /* Early probability marks */ + u32 forced_drop; /* Forced drops, qavg > max_thresh */ + u32 forced_mark; /* Forced marks, qavg > max_thresh */ + u32 pdrop; /* Drops due to queue limits */ + u32 other; /* Drops due to drop() calls */ + u32 matched; /* Drops to flow match */ + } stats; + + unsigned int head; + unsigned int tail; + + unsigned int tab_mask; /* size - 1 */ + + struct sk_buff **tab; +}; + +/* deliver a random number between 0 and N - 1 */ +static u32 random_N(unsigned int N) +{ + return reciprocal_divide(random32(), N); +} + +/* number of elements in queue including holes */ +static unsigned int choke_len(const struct choke_sched_data *q) +{ + return (q->tail - q->head) & q->tab_mask; +} + +/* Is ECN parameter configured */ +static int use_ecn(const struct choke_sched_data *q) +{ + return q->flags & TC_RED_ECN; +} + +/* Should packets over max just be dropped (versus marked) */ +static int use_harddrop(const struct choke_sched_data *q) +{ + return q->flags & TC_RED_HARDDROP; +} + +/* Move head pointer forward to skip over holes */ +static void choke_zap_head_holes(struct choke_sched_data *q) +{ + do { + q->head = (q->head + 1) & q->tab_mask; + if (q->head == q->tail) + break; + } while (q->tab[q->head] == NULL); +} + +/* Move tail pointer backwards to reuse holes */ +static void choke_zap_tail_holes(struct choke_sched_data *q) +{ + do { + q->tail = (q->tail - 1) & q->tab_mask; + if (q->head == q->tail) + break; + } while (q->tab[q->tail] == NULL); +} + +/* Drop packet from queue array by creating a "hole" */ +static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = q->tab[idx]; + + q->tab[idx] = NULL; + + if 
(idx == q->head) + choke_zap_head_holes(q); + if (idx == q->tail) + choke_zap_tail_holes(q); + + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_drop(skb, sch); + qdisc_tree_decrease_qlen(sch, 1); + --sch->q.qlen; +} + +/* + * Compare flow of two packets + * Returns true only if source and destination address and port match. + * false for special cases + */ +static bool choke_match_flow(struct sk_buff *skb1, + struct sk_buff *skb2) +{ + int off1, off2, poff; + const u32 *ports1, *ports2; + u8 ip_proto; + __u32 hash1; + + if (skb1->protocol != skb2->protocol) + return false; + + /* Use hash value as quick check + * Assumes that __skb_get_rxhash makes IP header and ports linear + */ + hash1 = skb_get_rxhash(skb1); + if (!hash1 || hash1 != skb_get_rxhash(skb2)) + return false; + + /* Probably match, but be sure to avoid hash collisions */ + off1 = skb_network_offset(skb1); + off2 = skb_network_offset(skb2); + + switch (skb1->protocol) { + case __constant_htons(ETH_P_IP): { + const struct iphdr *ip1, *ip2; + + ip1 = (const struct iphdr *) (skb1->data + off1); + ip2 = (const struct iphdr *) (skb2->data + off2); + + ip_proto = ip1->protocol; + if (ip_proto != ip2->protocol || + ip1->saddr != ip2->saddr || ip1->daddr != ip2->daddr) + return false; + + if ((ip1->frag_off | ip2->frag_off) & htons(IP_MF | IP_OFFSET)) + ip_proto = 0; + off1 += ip1->ihl * 4; + off2 += ip2->ihl * 4; + break; + } + + case __constant_htons(ETH_P_IPV6): { + const struct ipv6hdr *ip1, *ip2; + + ip1 = (const struct ipv6hdr *) (skb1->data + off1); + ip2 = (const struct ipv6hdr *) (skb2->data + off2); + + ip_proto = ip1->nexthdr; + if (ip_proto != ip2->nexthdr || + ipv6_addr_cmp(&ip1->saddr, &ip2->saddr) || + ipv6_addr_cmp(&ip1->daddr, &ip2->daddr)) + return false; + off1 += 40; + off2 += 40; + } + + default: /* Maybe compare MAC header here? */ + return false; + } + + poff = proto_ports_offset(ip_proto); + if (poff < 0) + return true; + + off1 += poff; + off2 += poff; + + ports1 = (__force u32 *)(skb1->data + off1); + ports2 = (__force u32 *)(skb2->data + off2); + return *ports1 == *ports2; +} + +struct choke_skb_cb { + u16 classid; +}; + +static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(skb->cb) < + sizeof(struct qdisc_skb_cb) + sizeof(struct choke_skb_cb)); + return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static inline void choke_set_classid(struct sk_buff *skb, u16 classid) +{ + choke_skb_cb(skb)->classid = classid; +} + +static u16 choke_get_classid(const struct sk_buff *skb) +{ + return choke_skb_cb(skb)->classid; +} + +/* + * Classify flow using either: + * 1. pre-existing classification result in skb + * 2. fast internal classification + * 3. 
use TC filter based classification + */ +static bool choke_classify(struct sk_buff *skb, + struct Qdisc *sch, int *qerr) + +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct tcf_result res; + int result; + + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return false; + } +#endif + choke_set_classid(skb, TC_H_MIN(res.classid)); + return true; + } + + return false; +} + +/* + * Select a packet at random from queue + * HACK: since queue can have holes from previous deletion; retry several + * times to find a random skb but then just give up and return the head + * Will return NULL if queue is empty (q->head == q->tail) + */ +static struct sk_buff *choke_peek_random(const struct choke_sched_data *q, + unsigned int *pidx) +{ + struct sk_buff *skb; + int retrys = 3; + + do { + *pidx = (q->head + random_N(choke_len(q))) & q->tab_mask; + skb = q->tab[*pidx]; + if (skb) + return skb; + } while (--retrys > 0); + + return q->tab[*pidx = q->head]; +} + +/* + * Compare new packet with random packet in queue + * returns true if matched and sets *pidx + */ +static bool choke_match_random(const struct choke_sched_data *q, + struct sk_buff *nskb, + unsigned int *pidx) +{ + struct sk_buff *oskb; + + if (q->head == q->tail) + return false; + + oskb = choke_peek_random(q, pidx); + if (q->filter_list) + return choke_get_classid(nskb) == choke_get_classid(oskb); + + return choke_match_flow(oskb, nskb); +} + +static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct red_parms *p = &q->parms; + int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + + if (q->filter_list) { + /* If using external classifiers, get result and record it. */ + if (!choke_classify(skb, sch, &ret)) + goto other_drop; /* Packet was eaten by filter */ + } + + /* Compute average queue usage (see RED) */ + p->qavg = red_calc_qavg(p, sch->q.qlen); + if (red_is_idling(p)) + red_end_of_idle_period(p); + + /* Is queue small? 
*/ + if (p->qavg <= p->qth_min) + p->qcount = -1; + else { + unsigned int idx; + + /* Draw a packet at random from queue and compare flow */ + if (choke_match_random(q, skb, &idx)) { + q->stats.matched++; + choke_drop_by_idx(sch, idx); + goto congestion_drop; + } + + /* Queue is large, always mark/drop */ + if (p->qavg > p->qth_max) { + p->qcount = -1; + + sch->qstats.overlimits++; + if (use_harddrop(q) || !use_ecn(q) || + !INET_ECN_set_ce(skb)) { + q->stats.forced_drop++; + goto congestion_drop; + } + + q->stats.forced_mark++; + } else if (++p->qcount) { + if (red_mark_probability(p, p->qavg)) { + p->qcount = 0; + p->qR = red_random(p); + + sch->qstats.overlimits++; + if (!use_ecn(q) || !INET_ECN_set_ce(skb)) { + q->stats.prob_drop++; + goto congestion_drop; + } + + q->stats.prob_mark++; + } + } else + p->qR = red_random(p); + } + + /* Admit new packet */ + if (sch->q.qlen < q->limit) { + q->tab[q->tail] = skb; + q->tail = (q->tail + 1) & q->tab_mask; + ++sch->q.qlen; + sch->qstats.backlog += qdisc_pkt_len(skb); + return NET_XMIT_SUCCESS; + } + + q->stats.pdrop++; + sch->qstats.drops++; + kfree_skb(skb); + return NET_XMIT_DROP; + + congestion_drop: + qdisc_drop(skb, sch); + return NET_XMIT_CN; + + other_drop: + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; +} + +static struct sk_buff *choke_dequeue(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + if (q->head == q->tail) { + if (!red_is_idling(&q->parms)) + red_start_of_idle_period(&q->parms); + return NULL; + } + + skb = q->tab[q->head]; + q->tab[q->head] = NULL; + choke_zap_head_holes(q); + --sch->q.qlen; + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_bstats_update(sch, skb); + + return skb; +} + +static unsigned int choke_drop(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + unsigned int len; + + len = qdisc_queue_drop(sch); + if (len > 0) + q->stats.other++; + else { + if (!red_is_idling(&q->parms)) + red_start_of_idle_period(&q->parms); + } + + return len; +} + +static void choke_reset(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + + red_restart(&q->parms); +} + +static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = { + [TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) }, + [TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE }, +}; + + +static void choke_free(void *addr) +{ + if (addr) { + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); + } +} + +static int choke_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_CHOKE_MAX + 1]; + const struct tc_red_qopt *ctl; + int err; + struct sk_buff **old = NULL; + unsigned int mask; + + if (opt == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy); + if (err < 0) + return err; + + if (tb[TCA_CHOKE_PARMS] == NULL || + tb[TCA_CHOKE_STAB] == NULL) + return -EINVAL; + + ctl = nla_data(tb[TCA_CHOKE_PARMS]); + + if (ctl->limit > CHOKE_MAX_QUEUE) + return -EINVAL; + + mask = roundup_pow_of_two(ctl->limit + 1) - 1; + if (mask != q->tab_mask) { + struct sk_buff **ntab; + + ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL); + if (!ntab) + ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *)); + if (!ntab) + return -ENOMEM; + + sch_tree_lock(sch); + old = q->tab; + if (old) { + unsigned int oqlen = sch->q.qlen, tail = 0; + + while (q->head != q->tail) { + struct sk_buff *skb = q->tab[q->head]; + + q->head = (q->head + 1) & 
q->tab_mask; + if (!skb) + continue; + if (tail < mask) { + ntab[tail++] = skb; + continue; + } + sch->qstats.backlog -= qdisc_pkt_len(skb); + --sch->q.qlen; + qdisc_drop(skb, sch); + } + qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen); + q->head = 0; + q->tail = tail; + } + + q->tab_mask = mask; + q->tab = ntab; + } else + sch_tree_lock(sch); + + q->flags = ctl->flags; + q->limit = ctl->limit; + + red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, + ctl->Plog, ctl->Scell_log, + nla_data(tb[TCA_CHOKE_STAB])); + + if (q->head == q->tail) + red_end_of_idle_period(&q->parms); + + sch_tree_unlock(sch); + choke_free(old); + return 0; +} + +static int choke_init(struct Qdisc *sch, struct nlattr *opt) +{ + return choke_change(sch, opt); +} + +static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct nlattr *opts = NULL; + struct tc_red_qopt opt = { + .limit = q->limit, + .flags = q->flags, + .qth_min = q->parms.qth_min >> q->parms.Wlog, + .qth_max = q->parms.qth_max >> q->parms.Wlog, + .Wlog = q->parms.Wlog, + .Plog = q->parms.Plog, + .Scell_log = q->parms.Scell_log, + }; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt); + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct tc_choke_xstats st = { + .early = q->stats.prob_drop + q->stats.forced_drop, + .marked = q->stats.prob_mark + q->stats.forced_mark, + .pdrop = q->stats.pdrop, + .other = q->stats.other, + .matched = q->stats.matched, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static void choke_destroy(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + choke_free(q->tab); +} + +static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long choke_get(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static void choke_put(struct Qdisc *q, unsigned long cl) +{ +} + +static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return 0; +} + +static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct choke_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static int choke_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + if (!arg->stop) { + if (arg->fn(sch, 1, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops choke_class_ops = { + .leaf = choke_leaf, + .get = choke_get, + .put = choke_put, + .tcf_chain = choke_find_tcf, + .bind_tcf = choke_bind, + .unbind_tcf = choke_put, + .dump = choke_dump_class, + .walk = choke_walk, +}; + +static struct sk_buff *choke_peek_head(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + + return (q->head != q->tail) ? 
q->tab[q->head] : NULL; +} + +static struct Qdisc_ops choke_qdisc_ops __read_mostly = { + .id = "choke", + .priv_size = sizeof(struct choke_sched_data), + + .enqueue = choke_enqueue, + .dequeue = choke_dequeue, + .peek = choke_peek_head, + .drop = choke_drop, + .init = choke_init, + .destroy = choke_destroy, + .reset = choke_reset, + .change = choke_change, + .dump = choke_dump, + .dump_stats = choke_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init choke_module_init(void) +{ + return register_qdisc(&choke_qdisc_ops); +} + +static void __exit choke_module_exit(void) +{ + unregister_qdisc(&choke_qdisc_ops); +} + +module_init(choke_module_init) +module_exit(choke_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index de55e64..6b7fe4a 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -376,7 +376,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) } bstats_update(&cl->bstats, skb); - qdisc_bstats_update(sch, skb); sch->q.qlen++; return err; @@ -403,6 +402,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch) skb = qdisc_dequeue_peeked(cl->qdisc); if (cl->qdisc->q.qlen == 0) list_del(&cl->alist); + qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; } diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 4970d56..2c79020 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -260,7 +260,6 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) return err; } - qdisc_bstats_update(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -283,6 +282,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) if (skb == NULL) return NULL; + qdisc_bstats_update(sch, skb); sch->q.qlen--; index = skb->tc_index & (p->indices - 1); diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index b3075f8..66effe2 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -19,15 +19,9 @@ /* 1 band FIFO pseudo-"scheduler" */ -struct fifo_sched_data { - u32 limit; -}; - static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct fifo_sched_data *q = qdisc_priv(sch); - - if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= q->limit)) + if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit)) return qdisc_enqueue_tail(skb, sch); return qdisc_reshape_fail(skb, sch); @@ -35,9 +29,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct fifo_sched_data *q = qdisc_priv(sch); - - if (likely(skb_queue_len(&sch->q) < q->limit)) + if (likely(skb_queue_len(&sch->q) < sch->limit)) return qdisc_enqueue_tail(skb, sch); return qdisc_reshape_fail(skb, sch); @@ -45,17 +37,12 @@ static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct sk_buff *skb_head; - struct fifo_sched_data *q = qdisc_priv(sch); - - if (likely(skb_queue_len(&sch->q) < q->limit)) + if (likely(skb_queue_len(&sch->q) < sch->limit)) return qdisc_enqueue_tail(skb, sch); /* queue full, remove one skb to fulfill the limit */ - skb_head = qdisc_dequeue_head(sch); + __qdisc_queue_drop_head(sch, &sch->q); sch->qstats.drops++; - kfree_skb(skb_head); - qdisc_enqueue_tail(skb, sch); return NET_XMIT_CN; @@ -63,31 +50,40 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch) static int fifo_init(struct Qdisc *sch, struct nlattr *opt) { - struct fifo_sched_data *q = qdisc_priv(sch); + bool bypass; + bool is_bfifo = 
sch->ops == &bfifo_qdisc_ops; if (opt == NULL) { u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1; - if (sch->ops == &bfifo_qdisc_ops) + if (is_bfifo) limit *= psched_mtu(qdisc_dev(sch)); - q->limit = limit; + sch->limit = limit; } else { struct tc_fifo_qopt *ctl = nla_data(opt); if (nla_len(opt) < sizeof(*ctl)) return -EINVAL; - q->limit = ctl->limit; + sch->limit = ctl->limit; } + if (is_bfifo) + bypass = sch->limit >= psched_mtu(qdisc_dev(sch)); + else + bypass = sch->limit >= 1; + + if (bypass) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) { - struct fifo_sched_data *q = qdisc_priv(sch); - struct tc_fifo_qopt opt = { .limit = q->limit }; + struct tc_fifo_qopt opt = { .limit = sch->limit }; NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); return skb->len; @@ -98,7 +94,7 @@ nla_put_failure: struct Qdisc_ops pfifo_qdisc_ops __read_mostly = { .id = "pfifo", - .priv_size = sizeof(struct fifo_sched_data), + .priv_size = 0, .enqueue = pfifo_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, @@ -113,7 +109,7 @@ EXPORT_SYMBOL(pfifo_qdisc_ops); struct Qdisc_ops bfifo_qdisc_ops __read_mostly = { .id = "bfifo", - .priv_size = sizeof(struct fifo_sched_data), + .priv_size = 0, .enqueue = bfifo_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, @@ -128,7 +124,7 @@ EXPORT_SYMBOL(bfifo_qdisc_ops); struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = { .id = "pfifo_head_drop", - .priv_size = sizeof(struct fifo_sched_data), + .priv_size = 0, .enqueue = pfifo_tail_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 2f1cb62..c84b659 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -527,6 +527,8 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) skb_queue_head_init(band2list(priv, prio)); + /* Can by-pass the queue discipline */ + qdisc->flags |= TCQ_F_CAN_BYPASS; return 0; } @@ -548,21 +550,25 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, { void *p; struct Qdisc *sch; - unsigned int size; + unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size; int err = -ENOBUFS; - /* ensure that the Qdisc and the private data are 64-byte aligned */ - size = QDISC_ALIGN(sizeof(*sch)); - size += ops->priv_size + (QDISC_ALIGNTO - 1); - p = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue)); if (!p) goto errout; sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); - sch->padded = (char *) sch - (char *) p; - + /* if we got non aligned memory, ask more and do alignment ourself */ + if (sch != p) { + kfree(p); + p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL, + netdev_queue_numa_node_read(dev_queue)); + if (!p) + goto errout; + sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); + sch->padded = (char *) sch - (char *) p; + } INIT_LIST_HEAD(&sch->list); skb_queue_head_init(&sch->q); spin_lock_init(&sch->busylock); @@ -632,7 +638,7 @@ void qdisc_destroy(struct Qdisc *qdisc) #ifdef CONFIG_NET_SCHED qdisc_list_del(qdisc); - qdisc_put_stab(qdisc->stab); + qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); if (ops->reset) @@ -691,9 +697,6 @@ static void attach_one_default_qdisc(struct net_device *dev, netdev_info(dev, "activation failed\n"); return; } - - /* Can by-pass the queue discipline for default qdisc */ - 
qdisc->flags |= TCQ_F_CAN_BYPASS; } dev_queue->qdisc_sleeping = qdisc; } @@ -841,6 +844,7 @@ void dev_deactivate(struct net_device *dev) list_add(&dev->unreg_list, &single); dev_deactivate_many(&single); + list_del(&single); } EXPORT_SYMBOL(dev_deactivate); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index dea4009..6488e64 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1598,7 +1598,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) set_active(cl, qdisc_pkt_len(skb)); bstats_update(&cl->bstats, skb); - qdisc_bstats_update(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -1664,7 +1663,8 @@ hfsc_dequeue(struct Qdisc *sch) set_passive(cl); } - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 3e86fd3..e1429a8 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -581,7 +581,6 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) } sch->q.qlen++; - qdisc_bstats_update(sch, skb); return NET_XMIT_SUCCESS; } @@ -856,7 +855,7 @@ next: static struct sk_buff *htb_dequeue(struct Qdisc *sch) { - struct sk_buff *skb = NULL; + struct sk_buff *skb; struct htb_sched *q = qdisc_priv(sch); int level; psched_time_t next_event; @@ -865,7 +864,9 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) /* try to dequeue direct packets as high prio (!) to minimize cpu work */ skb = __skb_dequeue(&q->direct_queue); if (skb != NULL) { - sch->flags &= ~TCQ_F_THROTTLED; +ok: + qdisc_bstats_update(sch, skb); + qdisc_unthrottled(sch); sch->q.qlen--; return skb; } @@ -899,11 +900,8 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) m |= 1 << prio; skb = htb_dequeue_tree(q, prio, level); - if (likely(skb != NULL)) { - sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; - goto fin; - } + if (likely(skb != NULL)) + goto ok; } } sch->qstats.overlimits++; diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index ecc302f..ec5cbc8 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -61,7 +61,6 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) TC_H_MIN(ntx + 1))); if (qdisc == NULL) goto err; - qdisc->flags |= TCQ_F_CAN_BYPASS; priv->qdiscs[ntx] = qdisc; } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 8620c65..ea17cbe 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -29,18 +29,18 @@ static void mqprio_destroy(struct Qdisc *sch) struct mqprio_sched *priv = qdisc_priv(sch); unsigned int ntx; - if (!priv->qdiscs) - return; - - for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) - qdisc_destroy(priv->qdiscs[ntx]); + if (priv->qdiscs) { + for (ntx = 0; + ntx < dev->num_tx_queues && priv->qdiscs[ntx]; + ntx++) + qdisc_destroy(priv->qdiscs[ntx]); + kfree(priv->qdiscs); + } if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc) dev->netdev_ops->ndo_setup_tc(dev, 0); else netdev_set_num_tc(dev, 0); - - kfree(priv->qdiscs); } static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) @@ -130,7 +130,6 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) err = -ENOMEM; goto err; } - qdisc->flags |= TCQ_F_CAN_BYPASS; priv->qdiscs[i] = qdisc; } @@ -216,7 +215,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); - struct tc_mqprio_qopt opt; + struct tc_mqprio_qopt opt = { 0 }; struct Qdisc *qdisc; 
unsigned int i; @@ -312,7 +311,9 @@ static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl, } static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, - struct gnet_dump *d) + struct gnet_dump *d) + __releases(d->lock) + __acquires(d->lock) { struct net_device *dev = qdisc_dev(sch); @@ -390,7 +391,7 @@ static const struct Qdisc_class_ops mqprio_class_ops = { .dump_stats = mqprio_dump_class_stats, }; -struct Qdisc_ops mqprio_qdisc_ops __read_mostly = { +static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = { .cl_ops = &mqprio_class_ops, .id = "mqprio", .priv_size = sizeof(struct mqprio_sched), diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 820f2a7..edc1950 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -83,7 +83,6 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) ret = qdisc_enqueue(skb, qdisc); if (ret == NET_XMIT_SUCCESS) { - qdisc_bstats_update(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; } @@ -112,6 +111,7 @@ static struct sk_buff *multiq_dequeue(struct Qdisc *sch) qdisc = q->queues[q->curband]; skb = qdisc->dequeue(qdisc); if (skb) { + qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index c2bbbe6..edbbf7a 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -19,12 +19,13 @@ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/skbuff.h> +#include <linux/vmalloc.h> #include <linux/rtnetlink.h> #include <net/netlink.h> #include <net/pkt_sched.h> -#define VERSION "1.2" +#define VERSION "1.3" /* Network Emulation Queuing algorithm. ==================================== @@ -47,6 +48,20 @@ layering other disciplines. It does not need to do bandwidth control either since that can be handled by using token bucket or other rate control. + + Correlated Loss Generator models + + Added generation of correlated loss according to the + "Gilbert-Elliot" model, a 4-state markov model. + + References: + [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG + [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general + and intuitive loss model for packet networks and its implementation + in the Netem module in the Linux kernel", available in [1] + + Authors: Stefano Salsano <stefano.salsano at uniroma2.it + Fabio Ludovici <fabio.ludovici at yahoo.it> */ struct netem_sched_data { @@ -73,6 +88,26 @@ struct netem_sched_data { u32 size; s16 table[0]; } *delay_dist; + + enum { + CLG_RANDOM, + CLG_4_STATES, + CLG_GILB_ELL, + } loss_model; + + /* Correlated Loss Generation models */ + struct clgstate { + /* state of the Markov chain */ + u8 state; + + /* 4-states and Gilbert-Elliot models */ + u32 a1; /* p13 for 4-states or p for GE */ + u32 a2; /* p31 for 4-states or r for GE */ + u32 a3; /* p32 for 4-states or h for GE */ + u32 a4; /* p14 for 4-states or 1-k for GE */ + u32 a5; /* p23 used only in 4-states */ + } clg; + }; /* Time stamp put into socket buffer control block */ @@ -115,6 +150,122 @@ static u32 get_crandom(struct crndstate *state) return answer; } +/* loss_4state - 4-state model loss generator + * Generates losses according to the 4-state Markov chain adopted in + * the GI (General and Intuitive) loss model. 
+ */ +static bool loss_4state(struct netem_sched_data *q) +{ + struct clgstate *clg = &q->clg; + u32 rnd = net_random(); + + /* + * Makes a comparision between rnd and the transition + * probabilities outgoing from the current state, then decides the + * next state and if the next packet has to be transmitted or lost. + * The four states correspond to: + * 1 => successfully transmitted packets within a gap period + * 4 => isolated losses within a gap period + * 3 => lost packets within a burst period + * 2 => successfully transmitted packets within a burst period + */ + switch (clg->state) { + case 1: + if (rnd < clg->a4) { + clg->state = 4; + return true; + } else if (clg->a4 < rnd && rnd < clg->a1) { + clg->state = 3; + return true; + } else if (clg->a1 < rnd) + clg->state = 1; + + break; + case 2: + if (rnd < clg->a5) { + clg->state = 3; + return true; + } else + clg->state = 2; + + break; + case 3: + if (rnd < clg->a3) + clg->state = 2; + else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { + clg->state = 1; + return true; + } else if (clg->a2 + clg->a3 < rnd) { + clg->state = 3; + return true; + } + break; + case 4: + clg->state = 1; + break; + } + + return false; +} + +/* loss_gilb_ell - Gilbert-Elliot model loss generator + * Generates losses according to the Gilbert-Elliot loss model or + * its special cases (Gilbert or Simple Gilbert) + * + * Makes a comparision between random number and the transition + * probabilities outgoing from the current state, then decides the + * next state. A second random number is extracted and the comparision + * with the loss probability of the current state decides if the next + * packet will be transmitted or lost. + */ +static bool loss_gilb_ell(struct netem_sched_data *q) +{ + struct clgstate *clg = &q->clg; + + switch (clg->state) { + case 1: + if (net_random() < clg->a1) + clg->state = 2; + if (net_random() < clg->a4) + return true; + case 2: + if (net_random() < clg->a2) + clg->state = 1; + if (clg->a3 > net_random()) + return true; + } + + return false; +} + +static bool loss_event(struct netem_sched_data *q) +{ + switch (q->loss_model) { + case CLG_RANDOM: + /* Random packet drop 0 => none, ~0 => all */ + return q->loss && q->loss >= get_crandom(&q->loss_cor); + + case CLG_4_STATES: + /* 4state loss model algorithm (used also for GI model) + * Extracts a value from the markov 4 state loss generator, + * if it is 1 drops a packet and if needed writes the event in + * the kernel logs + */ + return loss_4state(q); + + case CLG_GILB_ELL: + /* Gilbert-Elliot loss model algorithm + * Extracts a value from the Gilbert-Elliot loss generator, + * if it is 1 drops a packet and if needed writes the event in + * the kernel logs + */ + return loss_gilb_ell(q); + } + + return false; /* not reached */ +} + + /* tabledist - return a pseudo-randomly distributed value with mean mu and * std deviation sigma. Uses table lookup to approximate the desired * distribution, and a uniformly-distributed pseudo-random source. @@ -161,14 +312,12 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) int ret; int count = 1; - pr_debug("netem_enqueue skb=%p\n", skb); - /* Random duplication */ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) ++count; - /* Random packet drop 0 => none, ~0 => all */ - if (q->loss && q->loss >= get_crandom(&q->loss_cor)) + /* Drop packet? 
*/ + if (loss_event(q)) --count; if (count == 0) { @@ -238,15 +387,15 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) ret = NET_XMIT_SUCCESS; } - if (likely(ret == NET_XMIT_SUCCESS)) { - sch->q.qlen++; - qdisc_bstats_update(sch, skb); - } else if (net_xmit_drop_count(ret)) { - sch->qstats.drops++; + if (ret != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) { + sch->qstats.drops++; + return ret; + } } - pr_debug("netem: enqueue ret %d\n", ret); - return ret; + sch->q.qlen++; + return NET_XMIT_SUCCESS; } static unsigned int netem_drop(struct Qdisc *sch) @@ -266,7 +415,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; - if (sch->flags & TCQ_F_THROTTLED) + if (qdisc_is_throttled(sch)) return NULL; skb = q->qdisc->ops->peek(q->qdisc); @@ -288,8 +437,10 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) if (G_TC_FROM(skb->tc_verd) & AT_INGRESS) skb->tstamp.tv64 = 0; #endif - pr_debug("netem_dequeue: return skb=%p\n", skb); + sch->q.qlen--; + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); return skb; } @@ -308,6 +459,16 @@ static void netem_reset(struct Qdisc *sch) qdisc_watchdog_cancel(&q->watchdog); } +static void dist_free(struct disttable *d) +{ + if (d) { + if (is_vmalloc_addr(d)) + vfree(d); + else + kfree(d); + } +} + /* * Distribution data is a variable size payload containing * signed 16 bit values. @@ -315,16 +476,20 @@ static void netem_reset(struct Qdisc *sch) static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) { struct netem_sched_data *q = qdisc_priv(sch); - unsigned long n = nla_len(attr)/sizeof(__s16); + size_t n = nla_len(attr)/sizeof(__s16); const __s16 *data = nla_data(attr); spinlock_t *root_lock; struct disttable *d; int i; + size_t s; - if (n > 65536) + if (n > NETEM_DIST_MAX) return -EINVAL; - d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL); + s = sizeof(struct disttable) + n * sizeof(s16); + d = kmalloc(s, GFP_KERNEL); + if (!d) + d = vmalloc(s); if (!d) return -ENOMEM; @@ -335,7 +500,7 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) root_lock = qdisc_root_sleeping_lock(sch); spin_lock_bh(root_lock); - kfree(q->delay_dist); + dist_free(q->delay_dist); q->delay_dist = d; spin_unlock_bh(root_lock); return 0; @@ -369,10 +534,66 @@ static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr) init_crandom(&q->corrupt_cor, r->correlation); } +static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + const struct nlattr *la; + int rem; + + nla_for_each_nested(la, attr, rem) { + u16 type = nla_type(la); + + switch(type) { + case NETEM_LOSS_GI: { + const struct tc_netem_gimodel *gi = nla_data(la); + + if (nla_len(la) != sizeof(struct tc_netem_gimodel)) { + pr_info("netem: incorrect gi model size\n"); + return -EINVAL; + } + + q->loss_model = CLG_4_STATES; + + q->clg.state = 1; + q->clg.a1 = gi->p13; + q->clg.a2 = gi->p31; + q->clg.a3 = gi->p32; + q->clg.a4 = gi->p14; + q->clg.a5 = gi->p23; + break; + } + + case NETEM_LOSS_GE: { + const struct tc_netem_gemodel *ge = nla_data(la); + + if (nla_len(la) != sizeof(struct tc_netem_gemodel)) { + pr_info("netem: incorrect gi model size\n"); + return -EINVAL; + } + + q->loss_model = CLG_GILB_ELL; + q->clg.state = 1; + q->clg.a1 = ge->p; + q->clg.a2 = ge->r; + q->clg.a3 = ge->h; + q->clg.a4 = ge->k1; + break; + } + + default: + pr_info("netem: unknown loss type %u\n", type); + return -EINVAL; 
+ } + } + + return 0; +} + static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) }, [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) }, [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) }, + [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, }; static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, @@ -380,11 +601,15 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, { int nested_len = nla_len(nla) - NLA_ALIGN(len); - if (nested_len < 0) + if (nested_len < 0) { + pr_info("netem: invalid attributes len %d\n", nested_len); return -EINVAL; + } + if (nested_len >= nla_attr_size(0)) return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len), nested_len, policy); + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); return 0; } @@ -407,7 +632,7 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) ret = fifo_set_limit(q->qdisc, qopt->limit); if (ret) { - pr_debug("netem: can't set fifo limit\n"); + pr_info("netem: can't set fifo limit\n"); return ret; } @@ -440,7 +665,11 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_NETEM_CORRUPT]) get_corrupt(sch, tb[TCA_NETEM_CORRUPT]); - return 0; + q->loss_model = CLG_RANDOM; + if (tb[TCA_NETEM_LOSS]) + ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]); + + return ret; } /* @@ -476,7 +705,6 @@ static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) __skb_queue_after(list, skb, nskb); sch->qstats.backlog += qdisc_pkt_len(nskb); - qdisc_bstats_update(sch, nskb); return NET_XMIT_SUCCESS; } @@ -536,16 +764,17 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt) qdisc_watchdog_init(&q->watchdog, sch); + q->loss_model = CLG_RANDOM; q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops, TC_H_MAKE(sch->handle, 1)); if (!q->qdisc) { - pr_debug("netem: qdisc create failed\n"); + pr_notice("netem: qdisc create tfifo qdisc failed\n"); return -ENOMEM; } ret = netem_change(sch, opt); if (ret) { - pr_debug("netem: change failed\n"); + pr_info("netem: change failed\n"); qdisc_destroy(q->qdisc); } return ret; @@ -557,14 +786,61 @@ static void netem_destroy(struct Qdisc *sch) qdisc_watchdog_cancel(&q->watchdog); qdisc_destroy(q->qdisc); - kfree(q->delay_dist); + dist_free(q->delay_dist); +} + +static int dump_loss_model(const struct netem_sched_data *q, + struct sk_buff *skb) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_NETEM_LOSS); + if (nest == NULL) + goto nla_put_failure; + + switch (q->loss_model) { + case CLG_RANDOM: + /* legacy loss model */ + nla_nest_cancel(skb, nest); + return 0; /* no data */ + + case CLG_4_STATES: { + struct tc_netem_gimodel gi = { + .p13 = q->clg.a1, + .p31 = q->clg.a2, + .p32 = q->clg.a3, + .p14 = q->clg.a4, + .p23 = q->clg.a5, + }; + + NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi); + break; + } + case CLG_GILB_ELL: { + struct tc_netem_gemodel ge = { + .p = q->clg.a1, + .r = q->clg.a2, + .h = q->clg.a3, + .k1 = q->clg.a4, + }; + + NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge); + break; + } + } + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; } static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) { const struct netem_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb_tail_pointer(skb); - struct nlattr *nla = (struct nlattr *) b; + struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb); struct tc_netem_qopt qopt; struct tc_netem_corr cor; struct 
tc_netem_reorder reorder; @@ -591,17 +867,87 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) corrupt.correlation = q->corrupt_cor.rho; NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); - nla->nla_len = skb_tail_pointer(skb) - b; + if (dump_loss_model(q, skb) != 0) + goto nla_put_failure; - return skb->len; + return nla_nest_end(skb, nla); nla_put_failure: - nlmsg_trim(skb, b); + nlmsg_trim(skb, nla); return -1; } +static int netem_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + if (cl != 1) /* only one class */ + return -ENOENT; + + tcm->tcm_handle |= TC_H_MIN(1); + tcm->tcm_info = q->qdisc->handle; + + return 0; +} + +static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->qdisc; + q->qdisc = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct netem_sched_data *q = qdisc_priv(sch); + return q->qdisc; +} + +static unsigned long netem_get(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void netem_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + walker->count++; + } +} + +static const struct Qdisc_class_ops netem_class_ops = { + .graft = netem_graft, + .leaf = netem_leaf, + .get = netem_get, + .put = netem_put, + .walk = netem_walk, + .dump = netem_dump_class, +}; + static struct Qdisc_ops netem_qdisc_ops __read_mostly = { .id = "netem", + .cl_ops = &netem_class_ops, .priv_size = sizeof(struct netem_sched_data), .enqueue = netem_enqueue, .dequeue = netem_dequeue, diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 3bea31e..2a318f2 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -83,7 +83,6 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) ret = qdisc_enqueue(skb, qdisc); if (ret == NET_XMIT_SUCCESS) { - qdisc_bstats_update(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; } @@ -115,6 +114,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc *sch) struct Qdisc *qdisc = q->queues[prio]; struct sk_buff *skb = qdisc->dequeue(qdisc); if (skb) { + qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 6891575..6649463 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -93,7 +93,6 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) ret = qdisc_enqueue(skb, child); if (likely(ret == NET_XMIT_SUCCESS)) { - qdisc_bstats_update(sch, skb); sch->q.qlen++; } else if (net_xmit_drop_count(ret)) { q->stats.pdrop++; @@ -113,11 +112,13 @@ static struct sk_buff *red_dequeue(struct Qdisc *sch) struct Qdisc *child = q->qdisc; skb = child->dequeue(child); - if (skb) + if (skb) { + qdisc_bstats_update(sch, skb); sch->q.qlen--; - else if (!red_is_idling(&q->parms)) - red_start_of_idle_period(&q->parms); - + } else { + if (!red_is_idling(&q->parms)) + red_start_of_idle_period(&q->parms); + } return skb; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c new file mode 100644 index 0000000..0a833d0 --- /dev/null +++ 
b/net/sched/sch_sfb.c @@ -0,0 +1,709 @@ +/* + * net/sched/sch_sfb.c Stochastic Fair Blue + * + * Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr> + * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * W. Feng, D. Kandlur, D. Saha, K. Shin. Blue: + * A New Class of Active Queue Management Algorithms. + * U. Michigan CSE-TR-387-99, April 1999. + * + * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <net/ip.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> + +/* + * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level) + * This implementation uses L = 8 and N = 16 + * This permits us to split one 32bit hash (provided per packet by rxhash or + * external classifier) into 8 subhashes of 4 bits. + */ +#define SFB_BUCKET_SHIFT 4 +#define SFB_NUMBUCKETS (1 << SFB_BUCKET_SHIFT) /* N bins per Level */ +#define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1) +#define SFB_LEVELS (32 / SFB_BUCKET_SHIFT) /* L */ + +/* SFB algo uses a virtual queue, named "bin" */ +struct sfb_bucket { + u16 qlen; /* length of virtual queue */ + u16 p_mark; /* marking probability */ +}; + +/* We use a double buffering right before hash change + * (Section 4.4 of SFB reference : moving hash functions) + */ +struct sfb_bins { + u32 perturbation; /* jhash perturbation */ + struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS]; +}; + +struct sfb_sched_data { + struct Qdisc *qdisc; + struct tcf_proto *filter_list; + unsigned long rehash_interval; + unsigned long warmup_time; /* double buffering warmup time in jiffies */ + u32 max; + u32 bin_size; /* maximum queue length per bin */ + u32 increment; /* d1 */ + u32 decrement; /* d2 */ + u32 limit; /* HARD maximal queue length */ + u32 penalty_rate; + u32 penalty_burst; + u32 tokens_avail; + unsigned long rehash_time; + unsigned long token_time; + + u8 slot; /* current active bins (0 or 1) */ + bool double_buffering; + struct sfb_bins bins[2]; + + struct { + u32 earlydrop; + u32 penaltydrop; + u32 bucketdrop; + u32 queuedrop; + u32 childdrop; /* drops in child qdisc */ + u32 marked; /* ECN mark */ + } stats; +}; + +/* + * Each queued skb might be hashed on one or two bins + * We store in skb_cb the two hash values. + * (A zero value means double buffering was not used) + */ +struct sfb_skb_cb { + u32 hashes[2]; +}; + +static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(skb->cb) < + sizeof(struct qdisc_skb_cb) + sizeof(struct sfb_skb_cb)); + return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data; +} + +/* + * If using 'internal' SFB flow classifier, hash comes from skb rxhash + * If using external classifier, hash comes from the classid. + */ +static u32 sfb_hash(const struct sk_buff *skb, u32 slot) +{ + return sfb_skb_cb(skb)->hashes[slot]; +} + +/* Probabilities are coded as Q0.16 fixed-point values, + * with 0xFFFF representing 65535/65536 (almost 1.0) + * Addition and subtraction are saturating in [0, 65535] + */ +static u32 prob_plus(u32 p1, u32 p2) +{ + u32 res = p1 + p2; + + return min_t(u32, res, SFB_MAX_PROB); +} + +static u32 prob_minus(u32 p1, u32 p2) +{ + return p1 > p2 ? 
p1 - p2 : 0; +} + +static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) +{ + int i; + struct sfb_bucket *b = &q->bins[slot].bins[0][0]; + + for (i = 0; i < SFB_LEVELS; i++) { + u32 hash = sfbhash & SFB_BUCKET_MASK; + + sfbhash >>= SFB_BUCKET_SHIFT; + if (b[hash].qlen < 0xFFFF) + b[hash].qlen++; + b += SFB_NUMBUCKETS; /* next level */ + } +} + +static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) +{ + u32 sfbhash; + + sfbhash = sfb_hash(skb, 0); + if (sfbhash) + increment_one_qlen(sfbhash, 0, q); + + sfbhash = sfb_hash(skb, 1); + if (sfbhash) + increment_one_qlen(sfbhash, 1, q); +} + +static void decrement_one_qlen(u32 sfbhash, u32 slot, + struct sfb_sched_data *q) +{ + int i; + struct sfb_bucket *b = &q->bins[slot].bins[0][0]; + + for (i = 0; i < SFB_LEVELS; i++) { + u32 hash = sfbhash & SFB_BUCKET_MASK; + + sfbhash >>= SFB_BUCKET_SHIFT; + if (b[hash].qlen > 0) + b[hash].qlen--; + b += SFB_NUMBUCKETS; /* next level */ + } +} + +static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) +{ + u32 sfbhash; + + sfbhash = sfb_hash(skb, 0); + if (sfbhash) + decrement_one_qlen(sfbhash, 0, q); + + sfbhash = sfb_hash(skb, 1); + if (sfbhash) + decrement_one_qlen(sfbhash, 1, q); +} + +static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q) +{ + b->p_mark = prob_minus(b->p_mark, q->decrement); +} + +static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q) +{ + b->p_mark = prob_plus(b->p_mark, q->increment); +} + +static void sfb_zero_all_buckets(struct sfb_sched_data *q) +{ + memset(&q->bins, 0, sizeof(q->bins)); +} + +/* + * compute max qlen, max p_mark, and avg p_mark + */ +static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q) +{ + int i; + u32 qlen = 0, prob = 0, totalpm = 0; + const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0]; + + for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) { + if (qlen < b->qlen) + qlen = b->qlen; + totalpm += b->p_mark; + if (prob < b->p_mark) + prob = b->p_mark; + b++; + } + *prob_r = prob; + *avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS); + return qlen; +} + + +static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q) +{ + q->bins[slot].perturbation = net_random(); +} + +static void sfb_swap_slot(struct sfb_sched_data *q) +{ + sfb_init_perturbation(q->slot, q); + q->slot ^= 1; + q->double_buffering = false; +} + +/* Non elastic flows are allowed to use part of the bandwidth, expressed + * in "penalty_rate" packets per second, with "penalty_burst" burst + */ +static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q) +{ + if (q->penalty_rate == 0 || q->penalty_burst == 0) + return true; + + if (q->tokens_avail < 1) { + unsigned long age = min(10UL * HZ, jiffies - q->token_time); + + q->tokens_avail = (age * q->penalty_rate) / HZ; + if (q->tokens_avail > q->penalty_burst) + q->tokens_avail = q->penalty_burst; + q->token_time = jiffies; + if (q->tokens_avail < 1) + return true; + } + + q->tokens_avail--; + return false; +} + +static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q, + int *qerr, u32 *salt) +{ + struct tcf_result res; + int result; + + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return false; + } +#endif + *salt = TC_H_MIN(res.classid); + return true; + } + return false; +} + 
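
Editorial illustration, not part of the patch: the defines and helpers above split one 32-bit per-packet hash into SFB_LEVELS subhashes of SFB_BUCKET_SHIFT bits (one bin per level) and keep marking probabilities as saturating Q0.16 values. The stand-alone user-space sketch below mirrors that arithmetic; the SKETCH_* constants, the example hash value, and the printf output are local stand-ins (SKETCH_MAX_PROB stands in for the kernel's SFB_MAX_PROB), not kernel symbols.

/* Stand-alone sketch of the per-level bin selection and the Q0.16
 * saturating probability arithmetic used by SFB. Illustration only.
 */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_BUCKET_SHIFT 4
#define SKETCH_NUMBUCKETS  (1 << SKETCH_BUCKET_SHIFT)
#define SKETCH_BUCKET_MASK (SKETCH_NUMBUCKETS - 1)
#define SKETCH_LEVELS      (32 / SKETCH_BUCKET_SHIFT)
#define SKETCH_MAX_PROB    0xFFFF   /* Q0.16: 65535/65536, almost 1.0 */

static uint32_t prob_plus(uint32_t p1, uint32_t p2)
{
        uint32_t res = p1 + p2;

        return res > SKETCH_MAX_PROB ? SKETCH_MAX_PROB : res;  /* saturate high */
}

static uint32_t prob_minus(uint32_t p1, uint32_t p2)
{
        return p1 > p2 ? p1 - p2 : 0;                           /* saturate low */
}

int main(void)
{
        uint32_t sfbhash = 0x9e3779b9;  /* example per-packet hash */
        int i;

        /* Walk one bin per level, as increment_one_qlen() does */
        for (i = 0; i < SKETCH_LEVELS; i++) {
                printf("level %d -> bin %u\n", i,
                       (unsigned)(sfbhash & SKETCH_BUCKET_MASK));
                sfbhash >>= SKETCH_BUCKET_SHIFT;
        }

        printf("%u\n", (unsigned)prob_plus(0xFFF0, 0x100));    /* clamps to 0xFFFF */
        printf("%u\n", (unsigned)prob_minus(0x0010, 0x100));   /* clamps to 0 */
        return 0;
}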
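
Also illustrative and not part of the patch: sfb_rate_limit() above implements a small token bucket that lets inelastic flows through at penalty_rate packets per second with a penalty_burst allowance. The sketch below reproduces that refill-and-consume logic in user space, with the time source passed in explicitly instead of reading jiffies; struct penalty_bucket and penalty_limit() are names invented for the example.

/* Stand-alone sketch of the penalty token bucket for non-elastic flows.
 * Tokens refill at 'rate' packets per second, capped at 'burst', and one
 * token is consumed per accepted packet. Illustration only.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct penalty_bucket {
        uint32_t rate;          /* packets per second */
        uint32_t burst;         /* maximum accumulated tokens */
        uint32_t tokens;
        uint64_t last_ms;       /* last refill time, milliseconds */
};

/* Returns true when the packet must be dropped (no token available). */
static bool penalty_limit(struct penalty_bucket *b, uint64_t now_ms)
{
        if (b->rate == 0 || b->burst == 0)
                return true;

        if (b->tokens < 1) {
                uint64_t age_ms = now_ms - b->last_ms;

                if (age_ms > 10ULL * 1000)      /* clamp long idle periods, as the kernel code does */
                        age_ms = 10ULL * 1000;
                b->tokens = (uint32_t)(age_ms * b->rate / 1000);
                if (b->tokens > b->burst)
                        b->tokens = b->burst;
                b->last_ms = now_ms;
                if (b->tokens < 1)
                        return true;
        }

        b->tokens--;
        return false;
}

int main(void)
{
        struct penalty_bucket b = { .rate = 10, .burst = 20, .tokens = 20, .last_ms = 0 };
        uint64_t t;

        for (t = 0; t < 5; t++)
                printf("t=%llums drop=%d\n",
                       (unsigned long long)t, penalty_limit(&b, t));
        return 0;
}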
+static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + + struct sfb_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + int i; + u32 p_min = ~0; + u32 minqlen = ~0; + u32 r, slot, salt, sfbhash; + int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + + if (q->rehash_interval > 0) { + unsigned long limit = q->rehash_time + q->rehash_interval; + + if (unlikely(time_after(jiffies, limit))) { + sfb_swap_slot(q); + q->rehash_time = jiffies; + } else if (unlikely(!q->double_buffering && q->warmup_time > 0 && + time_after(jiffies, limit - q->warmup_time))) { + q->double_buffering = true; + } + } + + if (q->filter_list) { + /* If using external classifiers, get result and record it. */ + if (!sfb_classify(skb, q, &ret, &salt)) + goto other_drop; + } else { + salt = skb_get_rxhash(skb); + } + + slot = q->slot; + + sfbhash = jhash_1word(salt, q->bins[slot].perturbation); + if (!sfbhash) + sfbhash = 1; + sfb_skb_cb(skb)->hashes[slot] = sfbhash; + + for (i = 0; i < SFB_LEVELS; i++) { + u32 hash = sfbhash & SFB_BUCKET_MASK; + struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; + + sfbhash >>= SFB_BUCKET_SHIFT; + if (b->qlen == 0) + decrement_prob(b, q); + else if (b->qlen >= q->bin_size) + increment_prob(b, q); + if (minqlen > b->qlen) + minqlen = b->qlen; + if (p_min > b->p_mark) + p_min = b->p_mark; + } + + slot ^= 1; + sfb_skb_cb(skb)->hashes[slot] = 0; + + if (unlikely(minqlen >= q->max || sch->q.qlen >= q->limit)) { + sch->qstats.overlimits++; + if (minqlen >= q->max) + q->stats.bucketdrop++; + else + q->stats.queuedrop++; + goto drop; + } + + if (unlikely(p_min >= SFB_MAX_PROB)) { + /* Inelastic flow */ + if (q->double_buffering) { + sfbhash = jhash_1word(salt, q->bins[slot].perturbation); + if (!sfbhash) + sfbhash = 1; + sfb_skb_cb(skb)->hashes[slot] = sfbhash; + + for (i = 0; i < SFB_LEVELS; i++) { + u32 hash = sfbhash & SFB_BUCKET_MASK; + struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; + + sfbhash >>= SFB_BUCKET_SHIFT; + if (b->qlen == 0) + decrement_prob(b, q); + else if (b->qlen >= q->bin_size) + increment_prob(b, q); + } + } + if (sfb_rate_limit(skb, q)) { + sch->qstats.overlimits++; + q->stats.penaltydrop++; + goto drop; + } + goto enqueue; + } + + r = net_random() & SFB_MAX_PROB; + + if (unlikely(r < p_min)) { + if (unlikely(p_min > SFB_MAX_PROB / 2)) { + /* If we're marking that many packets, then either + * this flow is unresponsive, or we're badly congested. + * In either case, we want to start dropping packets. 
+ */ + if (r < (p_min - SFB_MAX_PROB / 2) * 2) { + q->stats.earlydrop++; + goto drop; + } + } + if (INET_ECN_set_ce(skb)) { + q->stats.marked++; + } else { + q->stats.earlydrop++; + goto drop; + } + } + +enqueue: + ret = qdisc_enqueue(skb, child); + if (likely(ret == NET_XMIT_SUCCESS)) { + sch->q.qlen++; + increment_qlen(skb, q); + } else if (net_xmit_drop_count(ret)) { + q->stats.childdrop++; + sch->qstats.drops++; + } + return ret; + +drop: + qdisc_drop(skb, sch); + return NET_XMIT_CN; +other_drop: + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; +} + +static struct sk_buff *sfb_dequeue(struct Qdisc *sch) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + struct sk_buff *skb; + + skb = child->dequeue(q->qdisc); + + if (skb) { + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + decrement_qlen(skb, q); + } + + return skb; +} + +static struct sk_buff *sfb_peek(struct Qdisc *sch) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + + return child->ops->peek(child); +} + +/* No sfb_drop -- impossible since the child doesn't return the dropped skb. */ + +static void sfb_reset(struct Qdisc *sch) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + qdisc_reset(q->qdisc); + sch->q.qlen = 0; + q->slot = 0; + q->double_buffering = false; + sfb_zero_all_buckets(q); + sfb_init_perturbation(0, q); +} + +static void sfb_destroy(struct Qdisc *sch) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + qdisc_destroy(q->qdisc); +} + +static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = { + [TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) }, +}; + +static const struct tc_sfb_qopt sfb_default_ops = { + .rehash_interval = 600 * MSEC_PER_SEC, + .warmup_time = 60 * MSEC_PER_SEC, + .limit = 0, + .max = 25, + .bin_size = 20, + .increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */ + .decrement = (SFB_MAX_PROB + 3000) / 6000, + .penalty_rate = 10, + .penalty_burst = 20, +}; + +static int sfb_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct Qdisc *child; + struct nlattr *tb[TCA_SFB_MAX + 1]; + const struct tc_sfb_qopt *ctl = &sfb_default_ops; + u32 limit; + int err; + + if (opt) { + err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy); + if (err < 0) + return -EINVAL; + + if (tb[TCA_SFB_PARMS] == NULL) + return -EINVAL; + + ctl = nla_data(tb[TCA_SFB_PARMS]); + } + + limit = ctl->limit; + if (limit == 0) + limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1); + + child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit); + if (IS_ERR(child)) + return PTR_ERR(child); + + sch_tree_lock(sch); + + qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_destroy(q->qdisc); + q->qdisc = child; + + q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval); + q->warmup_time = msecs_to_jiffies(ctl->warmup_time); + q->rehash_time = jiffies; + q->limit = limit; + q->increment = ctl->increment; + q->decrement = ctl->decrement; + q->max = ctl->max; + q->bin_size = ctl->bin_size; + q->penalty_rate = ctl->penalty_rate; + q->penalty_burst = ctl->penalty_burst; + q->tokens_avail = ctl->penalty_burst; + q->token_time = jiffies; + + q->slot = 0; + q->double_buffering = false; + sfb_zero_all_buckets(q); + sfb_init_perturbation(0, q); + sfb_init_perturbation(1, q); + + sch_tree_unlock(sch); + + return 0; +} + +static int sfb_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + 
q->qdisc = &noop_qdisc; + return sfb_change(sch, opt); +} + +static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + struct tc_sfb_qopt opt = { + .rehash_interval = jiffies_to_msecs(q->rehash_interval), + .warmup_time = jiffies_to_msecs(q->warmup_time), + .limit = q->limit, + .max = q->max, + .bin_size = q->bin_size, + .increment = q->increment, + .decrement = q->decrement, + .penalty_rate = q->penalty_rate, + .penalty_burst = q->penalty_burst, + }; + + sch->qstats.backlog = q->qdisc->qstats.backlog; + opts = nla_nest_start(skb, TCA_OPTIONS); + NLA_PUT(skb, TCA_SFB_PARMS, sizeof(opt), &opt); + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct tc_sfb_xstats st = { + .earlydrop = q->stats.earlydrop, + .penaltydrop = q->stats.penaltydrop, + .bucketdrop = q->stats.bucketdrop, + .queuedrop = q->stats.queuedrop, + .childdrop = q->stats.childdrop, + .marked = q->stats.marked, + }; + + st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q); + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static int sfb_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + return -ENOSYS; +} + +static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->qdisc; + q->qdisc = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + return q->qdisc; +} + +static unsigned long sfb_get(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void sfb_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg) +{ + return -ENOSYS; +} + +static int sfb_delete(struct Qdisc *sch, unsigned long cl) +{ + return -ENOSYS; +} + +static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + walker->count++; + } +} + +static struct tcf_proto **sfb_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return 0; +} + + +static const struct Qdisc_class_ops sfb_class_ops = { + .graft = sfb_graft, + .leaf = sfb_leaf, + .get = sfb_get, + .put = sfb_put, + .change = sfb_change_class, + .delete = sfb_delete, + .walk = sfb_walk, + .tcf_chain = sfb_find_tcf, + .bind_tcf = sfb_bind, + .unbind_tcf = sfb_put, + .dump = sfb_dump_class, +}; + +static struct Qdisc_ops sfb_qdisc_ops __read_mostly = { + .id = "sfb", + .priv_size = sizeof(struct sfb_sched_data), + .cl_ops = &sfb_class_ops, + .enqueue = sfb_enqueue, + .dequeue = sfb_dequeue, + .peek = sfb_peek, + .init = sfb_init, + .reset = sfb_reset, + .destroy = sfb_destroy, + .change = sfb_change, + .dump = sfb_dump, + .dump_stats = sfb_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init 
sfb_module_init(void) +{ + return register_qdisc(&sfb_qdisc_ops); +} + +static void __exit sfb_module_exit(void) +{ + unregister_qdisc(&sfb_qdisc_ops); +} + +module_init(sfb_module_init) +module_exit(sfb_module_exit) + +MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline"); +MODULE_AUTHOR("Juliusz Chroboczek"); +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 54a36f4..c2e628d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -21,6 +21,7 @@ #include <linux/skbuff.h> #include <linux/jhash.h> #include <linux/slab.h> +#include <linux/vmalloc.h> #include <net/ip.h> #include <net/netlink.h> #include <net/pkt_sched.h> @@ -76,7 +77,8 @@ #define SFQ_DEPTH 128 /* max number of packets per flow */ #define SFQ_SLOTS 128 /* max number of flows */ #define SFQ_EMPTY_SLOT 255 -#define SFQ_HASH_DIVISOR 1024 +#define SFQ_DEFAULT_HASH_DIVISOR 1024 + /* We use 16 bits to store allot, and want to handle packets up to 64K * Scale allot by 8 (1<<3) so that no overflow occurs. */ @@ -112,7 +114,7 @@ struct sfq_sched_data { int perturb_period; unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ int limit; - + unsigned int divisor; /* number of slots in hash table */ /* Variables */ struct tcf_proto *filter_list; struct timer_list perturb_timer; @@ -120,7 +122,7 @@ struct sfq_sched_data { sfq_index cur_depth; /* depth of longest slot */ unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ struct sfq_slot *tail; /* current slot in round */ - sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */ + sfq_index *ht; /* Hash table (divisor slots) */ struct sfq_slot slots[SFQ_SLOTS]; struct sfq_head dep[SFQ_DEPTH]; /* Linked list of slots, indexed by depth */ }; @@ -137,7 +139,7 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index static unsigned int sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1) { - return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1); + return jhash_2words(h, h1, q->perturbation) & (q->divisor - 1); } static unsigned int sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) @@ -201,7 +203,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, if (TC_H_MAJ(skb->priority) == sch->handle && TC_H_MIN(skb->priority) > 0 && - TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR) + TC_H_MIN(skb->priority) <= q->divisor) return TC_H_MIN(skb->priority); if (!q->filter_list) @@ -219,7 +221,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, return 0; } #endif - if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR) + if (TC_H_MIN(res.classid) <= q->divisor) return TC_H_MIN(res.classid); } return 0; @@ -400,10 +402,8 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->tail = slot; slot->allot = q->scaled_quantum; } - if (++sch->q.qlen <= q->limit) { - qdisc_bstats_update(sch, skb); + if (++sch->q.qlen <= q->limit) return NET_XMIT_SUCCESS; - } sfq_drop(sch); return NET_XMIT_CN; @@ -443,6 +443,7 @@ next_slot: } skb = slot_dequeue_head(slot); sfq_dec(q, a); + qdisc_bstats_update(sch, skb); sch->q.qlen--; sch->qstats.backlog -= qdisc_pkt_len(skb); @@ -490,13 +491,18 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) if (opt->nla_len < nla_attr_size(sizeof(*ctl))) return -EINVAL; + if (ctl->divisor && + (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) + return -EINVAL; + sch_tree_lock(sch); q->quantum = ctl->quantum ? 
: psched_mtu(qdisc_dev(sch)); q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); q->perturb_period = ctl->perturb_period * HZ; if (ctl->limit) q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1); - + if (ctl->divisor) + q->divisor = ctl->divisor; qlen = sch->q.qlen; while (sch->q.qlen > q->limit) sfq_drop(sch); @@ -514,15 +520,13 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) static int sfq_init(struct Qdisc *sch, struct nlattr *opt) { struct sfq_sched_data *q = qdisc_priv(sch); + size_t sz; int i; q->perturb_timer.function = sfq_perturbation; q->perturb_timer.data = (unsigned long)sch; init_timer_deferrable(&q->perturb_timer); - for (i = 0; i < SFQ_HASH_DIVISOR; i++) - q->ht[i] = SFQ_EMPTY_SLOT; - for (i = 0; i < SFQ_DEPTH; i++) { q->dep[i].next = i + SFQ_SLOTS; q->dep[i].prev = i + SFQ_SLOTS; @@ -531,6 +535,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) q->limit = SFQ_DEPTH - 1; q->cur_depth = 0; q->tail = NULL; + q->divisor = SFQ_DEFAULT_HASH_DIVISOR; if (opt == NULL) { q->quantum = psched_mtu(qdisc_dev(sch)); q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); @@ -542,10 +547,23 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) return err; } + sz = sizeof(q->ht[0]) * q->divisor; + q->ht = kmalloc(sz, GFP_KERNEL); + if (!q->ht && sz > PAGE_SIZE) + q->ht = vmalloc(sz); + if (!q->ht) + return -ENOMEM; + for (i = 0; i < q->divisor; i++) + q->ht[i] = SFQ_EMPTY_SLOT; + for (i = 0; i < SFQ_SLOTS; i++) { slot_queue_init(&q->slots[i]); sfq_link(q, i); } + if (q->limit >= 1) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } @@ -556,6 +574,10 @@ static void sfq_destroy(struct Qdisc *sch) tcf_destroy_chain(&q->filter_list); q->perturb_period = 0; del_timer_sync(&q->perturb_timer); + if (is_vmalloc_addr(q->ht)) + vfree(q->ht); + else + kfree(q->ht); } static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -568,7 +590,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) opt.perturb_period = q->perturb_period / HZ; opt.limit = q->limit; - opt.divisor = SFQ_HASH_DIVISOR; + opt.divisor = q->divisor; opt.flows = q->limit; NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); @@ -593,6 +615,8 @@ static unsigned long sfq_get(struct Qdisc *sch, u32 classid) static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { + /* we cannot bypass queue discipline anymore */ + sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } @@ -646,7 +670,7 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) if (arg->stop) return; - for (i = 0; i < SFQ_HASH_DIVISOR; i++) { + for (i = 0; i < q->divisor; i++) { if (q->ht[i] == SFQ_EMPTY_SLOT || arg->count < arg->skip) { arg->count++; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 475edfb..1dcfb52 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -133,7 +133,6 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) } sch->q.qlen++; - qdisc_bstats_update(sch, skb); return NET_XMIT_SUCCESS; } @@ -185,7 +184,8 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) q->tokens = toks; q->ptokens = ptoks; sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); return skb; } diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 64c071d..45cd300 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -85,7 +85,6 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (q->q.qlen < dev->tx_queue_len) { __skb_queue_tail(&q->q, skb); - qdisc_bstats_update(sch, skb); 
 		return NET_XMIT_SUCCESS;
 	}
 
@@ -109,6 +108,8 @@ teql_dequeue(struct Qdisc *sch)
 			dat->m->slaves = sch;
 			netif_wake_queue(m);
 		}
+	} else {
+		qdisc_bstats_update(sch, skb);
 	}
 	sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen;
 	return skb;
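
One more editorial illustration, not part of the patch, covering the sch_sfq.c change above: the hash divisor is now configurable, but it must be a power of two no larger than 65536 because sfq_fold_hash() selects a slot with hash & (divisor - 1) rather than a modulo. The user-space sketch below shows the accept/reject check and the masking; is_power_of_2() here is a local reimplementation of the kernel helper, and sfq_slot() is an invented name.

/* Stand-alone sketch of the SFQ divisor validation and slot selection.
 * Illustration only.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool is_power_of_2(uint32_t n)
{
        return n != 0 && (n & (n - 1)) == 0;
}

static uint32_t sfq_slot(uint32_t hash, uint32_t divisor)
{
        return hash & (divisor - 1);    /* valid only for power-of-two divisors */
}

int main(void)
{
        uint32_t divisors[] = { 1024, 4096, 65536, 1000 };
        uint32_t hash = 0xdeadbeef;
        unsigned int i;

        for (i = 0; i < sizeof(divisors) / sizeof(divisors[0]); i++) {
                uint32_t d = divisors[i];

                if (!is_power_of_2(d) || d > 65536) {
                        printf("divisor %u rejected (-EINVAL)\n", (unsigned)d);
                        continue;
                }
                printf("divisor %u -> slot %u\n",
                       (unsigned)d, (unsigned)sfq_slot(hash, d));
        }
        return 0;
}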