/*
 * net/sched/sch_red.c	Random Early Detection queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Changes:
 * J Hadi Salim <hadi@nortel.com> 980914:	computation fixes
 * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
 * J Hadi Salim <hadi@nortelnetworks.com> 980816:  ECN support
 */
17 #include <linux/config.h>
18 #include <linux/module.h>
19 #include <asm/uaccess.h>
20 #include <asm/system.h>
21 #include <asm/bitops.h>
22 #include <linux/types.h>
23 #include <linux/kernel.h>
24 #include <linux/sched.h>
25 #include <linux/string.h>
27 #include <linux/socket.h>
28 #include <linux/sockios.h>
30 #include <linux/errno.h>
31 #include <linux/interrupt.h>
32 #include <linux/if_ether.h>
33 #include <linux/inet.h>
34 #include <linux/netdevice.h>
35 #include <linux/etherdevice.h>
36 #include <linux/notifier.h>
38 #include <net/route.h>
39 #include <linux/skbuff.h>
41 #include <net/pkt_sched.h>
42 #include <net/inet_ecn.h>
/*	Random Early Detection (RED) algorithm.
	=======================================

	Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
	for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.

	This file codes a "divisionless" version of the RED algorithm
	as written down in Fig. 17 of the paper.

	Short description.
	------------------

	When a new packet arrives we calculate the average queue length:

	avg = (1-W)*avg + W*current_queue_len,

	W is the filter time constant (chosen as 2^(-Wlog)), it controls
	the inertia of the algorithm. To allow larger bursts, W should be
	decreased.

	if (avg > th_max) -> packet marked (dropped).
	if (avg < th_min) -> packet passes.
	if (th_min < avg < th_max) we calculate probability:

	Pb = max_P * (avg - th_min)/(th_max-th_min)

	and mark (drop) packet with this probability.
	Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
	max_P should be small (not 1), usually 0.01..0.02 is a good value.

	max_P is chosen as a number, so that max_P/(th_max-th_min)
	is a negative power of two in order for the arithmetic to contain
	only shifts.


	Parameters, settable by user:
	-----------------------------

	limit		- bytes (must be > qth_max + burst)

	Hard limit on queue length, should be chosen > qth_max
	to allow packet bursts. This parameter does not
	affect the algorithm's behaviour and can be chosen
	arbitrarily high (well, less than ram size).
	Really, this limit will never be reached
	if RED works correctly.

	qth_min		- bytes (should be < qth_max/2)
	qth_max		- bytes (should be at least 2*qth_min and less than limit)
	Wlog		- bits (<32) log(1/W).
	Plog		- bits (<32)

	Plog is related to max_P by formula:

	max_P = (qth_max-qth_min)/2^Plog;

	F.e. if qth_max=128K and qth_min=32K, then Plog=22
	corresponds to max_P=0.02

	Scell_log, Stab	- Lookup table for log((1-W)^(t/t_ave)).
	(Used to age the average queue length over idle periods.)

	If you want to allow bursts of L packets of size S,
	you should choose W so that:

	L + 1 - th_min/S < (1-(1-W)^L)/W

	(The paper tabulates suitable Wlog values for
	th_min/S = 32 and th_min/S = 4.)
 */
135 struct red_sched_data
138 u32 limit; /* HARD maximal queue length */
139 u32 qth_min; /* Min average length threshold: A scaled */
140 u32 qth_max; /* Max average length threshold: A scaled */
144 char Wlog; /* log(W) */
145 char Plog; /* random number bits */
150 unsigned long qave; /* Average queue length: A scaled */
151 int qcount; /* Packets since last random number generation */
152 u32 qR; /* Cached random number */
154 psched_time_t qidlestart; /* Start of idle period */
155 struct tc_red_xstats st;
158 static int red_ecn_mark(struct sk_buff *skb)
160 if (skb->nh.raw + 20 > skb->tail)
163 switch (skb->protocol) {
164 case __constant_htons(ETH_P_IP):
165 if (!INET_ECN_is_capable(skb->nh.iph->tos))
167 if (INET_ECN_is_not_ce(skb->nh.iph->tos))
168 IP_ECN_set_ce(skb->nh.iph);
170 case __constant_htons(ETH_P_IPV6):
171 if (!INET_ECN_is_capable(ip6_get_dsfield(skb->nh.ipv6h)))
173 IP6_ECN_set_ce(skb->nh.ipv6h);
181 red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
183 struct red_sched_data *q = qdisc_priv(sch);
187 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
191 PSCHED_GET_TIME(now);
192 us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
193 PSCHED_SET_PASTPERFECT(q->qidlestart);
196 The problem: ideally, average length queue recalcultion should
197 be done over constant clock intervals. This is too expensive, so that
198 the calculation is driven by outgoing packets.
199 When the queue is idle we have to model this clock by hand.
201 SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
202 dummy packets as a burst after idle time, i.e.
206 This is an apparently overcomplicated solution (f.e. we have to precompute
207 a table to make this calculation in reasonable time)
208 I believe that a simpler model may be used here,
209 but it is field for experiments.
211 shift = q->Stab[us_idle>>q->Scell_log];
216 /* Approximate initial part of exponent
217 with linear function:
218 (1-W)^m ~= 1-mW + ...
220 Seems, it is the best solution to
221 problem of too coarce exponent tabulation.
224 us_idle = (q->qave * us_idle)>>q->Scell_log;
225 if (us_idle < q->qave/2)
231 q->qave += sch->stats.backlog - (q->qave >> q->Wlog);
233 q->qave is fixed point number with point at Wlog.
234 The formulae above is equvalent to floating point
237 qave = qave*(1-W) + sch->stats.backlog*W;
242 if (q->qave < q->qth_min) {
245 if (sch->stats.backlog + skb->len <= q->limit) {
246 __skb_queue_tail(&sch->q, skb);
247 sch->stats.backlog += skb->len;
248 sch->stats.bytes += skb->len;
249 sch->stats.packets++;
250 return NET_XMIT_SUCCESS;
256 return NET_XMIT_DROP;
258 if (q->qave >= q->qth_max) {
260 sch->stats.overlimits++;
262 if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
271 /* The formula used below causes questions.
273 OK. qR is random number in the interval 0..Rmask
274 i.e. 0..(2^Plog). If we used floating point
275 arithmetics, it would be: (2^Plog)*rnd_num,
276 where rnd_num is less 1.
278 Taking into account, that qave have fixed
279 point at Wlog, and Plog is related to max_P by
280 max_P = (qth_max-qth_min)/2^Plog; two lines
281 below have the following floating point equivalent:
283 max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
285 Any questions? --ANK (980924)
287 if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
290 q->qR = net_random()&q->Rmask;
291 sch->stats.overlimits++;
294 q->qR = net_random()&q->Rmask;
304 red_requeue(struct sk_buff *skb, struct Qdisc* sch)
306 struct red_sched_data *q = qdisc_priv(sch);
308 PSCHED_SET_PASTPERFECT(q->qidlestart);
310 __skb_queue_head(&sch->q, skb);
311 sch->stats.backlog += skb->len;
315 static struct sk_buff *
316 red_dequeue(struct Qdisc* sch)
319 struct red_sched_data *q = qdisc_priv(sch);
321 skb = __skb_dequeue(&sch->q);
323 sch->stats.backlog -= skb->len;
326 PSCHED_GET_TIME(q->qidlestart);
330 static unsigned int red_drop(struct Qdisc* sch)
333 struct red_sched_data *q = qdisc_priv(sch);
335 skb = __skb_dequeue_tail(&sch->q);
337 unsigned int len = skb->len;
338 sch->stats.backlog -= len;
344 PSCHED_GET_TIME(q->qidlestart);
348 static void red_reset(struct Qdisc* sch)
350 struct red_sched_data *q = qdisc_priv(sch);
352 __skb_queue_purge(&sch->q);
353 sch->stats.backlog = 0;
354 PSCHED_SET_PASTPERFECT(q->qidlestart);
359 static int red_change(struct Qdisc *sch, struct rtattr *opt)
361 struct red_sched_data *q = qdisc_priv(sch);
362 struct rtattr *tb[TCA_RED_STAB];
363 struct tc_red_qopt *ctl;
366 rtattr_parse(tb, TCA_RED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
367 tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 ||
368 RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
369 RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)
372 ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);
375 q->flags = ctl->flags;
378 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
379 q->Scell_log = ctl->Scell_log;
380 q->Scell_max = (255<<q->Scell_log);
381 q->qth_min = ctl->qth_min<<ctl->Wlog;
382 q->qth_max = ctl->qth_max<<ctl->Wlog;
383 q->limit = ctl->limit;
384 memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
387 if (skb_queue_len(&sch->q) == 0)
388 PSCHED_SET_PASTPERFECT(q->qidlestart);
389 sch_tree_unlock(sch);
393 static int red_init(struct Qdisc* sch, struct rtattr *opt)
395 return red_change(sch, opt);
399 int red_copy_xstats(struct sk_buff *skb, struct tc_red_xstats *st)
401 RTA_PUT(skb, TCA_XSTATS, sizeof(*st), st);
408 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
410 struct red_sched_data *q = qdisc_priv(sch);
411 unsigned char *b = skb->tail;
413 struct tc_red_qopt opt;
415 rta = (struct rtattr*)b;
416 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
417 opt.limit = q->limit;
418 opt.qth_min = q->qth_min>>q->Wlog;
419 opt.qth_max = q->qth_max>>q->Wlog;
422 opt.Scell_log = q->Scell_log;
423 opt.flags = q->flags;
424 RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
425 rta->rta_len = skb->tail - b;
427 if (red_copy_xstats(skb, &q->st))
433 skb_trim(skb, b - skb->data);
437 static struct Qdisc_ops red_qdisc_ops = {
441 .priv_size = sizeof(struct red_sched_data),
442 .enqueue = red_enqueue,
443 .dequeue = red_dequeue,
444 .requeue = red_requeue,
448 .change = red_change,
450 .owner = THIS_MODULE,
453 static int __init red_module_init(void)
455 return register_qdisc(&red_qdisc_ops);
457 static void __exit red_module_exit(void)
459 unregister_qdisc(&red_qdisc_ops);
461 module_init(red_module_init)
462 module_exit(red_module_exit)
463 MODULE_LICENSE("GPL");