/* net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
/*	Simple Token Bucket Filter.
	=======================================

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	the sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and let N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in the queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.

	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, while the other,
	with rate P (the peak rate) and depth M (equal to the link MTU),
	limits bursts on a smaller time scale.

	It is easy to see that P>R and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as
	(with L the length of the backlog in bytes):

	lat = max ((L-B)/R, (L-M)/P)

	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for some previous packet,
	TBF can stop its activity for 1/HZ.

	This means that with depth B the maximal sustainable rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak-rate TBF is much more demanding: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use an Alpha with HZ=1000 :-)
*/
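
/* A minimal sketch of the double token bucket check described above, kept
 * under "#if 0" because it is illustration only and not used by the qdisc.
 * Time is an abstract "tick" here, and rate2time() stands in for the
 * rate-table lookups done by L2T()/L2T_P() further down; all names in this
 * block (toy_tbf, rate2time, toy_tbf_admit) are invented for the example.
 * With the peak bucket effectively unlimited, this reduces to the single
 * TBF of the definition above.
 */
#if 0
struct toy_tbf {
	long buffer;		/* B/R: bucket depth as transmission time */
	long mtu;		/* M/P: peak bucket depth as transmission time */
	long tokens;		/* current tokens of the R bucket, in ticks */
	long ptokens;		/* current tokens of the P bucket, in ticks */
	long t_c;		/* last check-point, in ticks */
};

/* Transmission time of "len" bytes at "rate" bytes per tick (rounded up). */
static long rate2time(long len, long rate)
{
	return (len + rate - 1) / rate;
}

/* Return 1 and charge both buckets if a packet of "len" bytes may be sent
 * at time "now"; return 0 and leave the state untouched otherwise. */
static int toy_tbf_admit(struct toy_tbf *q, long now, long len,
			 long rate, long peakrate)
{
	long toks, ptoks;

	/* Tokens earned since t_c, capped at the bucket depths. */
	toks = now - q->t_c;
	ptoks = toks + q->ptokens;
	if (ptoks > q->mtu)
		ptoks = q->mtu;
	ptoks -= rate2time(len, peakrate);

	toks += q->tokens;
	if (toks > q->buffer)
		toks = q->buffer;
	toks -= rate2time(len, rate);

	if ((toks | ptoks) >= 0) {	/* both buckets stay non-negative */
		q->t_c = now;
		q->tokens = toks;
		q->ptokens = ptoks;
		return 1;
	}
	return 0;
}
#endif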
struct tbf_sched_data
{
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	u32		mtu;
	u32		max_size;
	struct qdisc_rate_table	*R_tab;
	struct qdisc_rate_table	*P_tab;

/* Variables */
	long	tokens;			/* Current number of B tokens */
	long	ptokens;		/* Current number of P tokens */
	psched_time_t	t_c;		/* Time check-point */
	struct timer_list wd_timer;	/* Watchdog timer */
};
/* Map a packet of length L to its transmission time via the rate table,
   quantized by cell_log (one table cell per 2^cell_log bytes). */
#define L2T(q,L)   ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log])
#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log])
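
/* Illustrative sketch (not part of this file's logic): one way such a
 * 256-entry rate table could be filled, assuming entries are transmission
 * times in microseconds.  The real table is built by tc(8) in user space
 * and may use different units and rounding; fill_rtab() is a hypothetical
 * helper, not a kernel or tc function.  The point of the table is that the
 * fast path above does a shift and a lookup instead of a division.
 */
#if 0
static void fill_rtab(unsigned int data[256], unsigned int bytes_per_sec,
		      int cell_log)
{
	int i;

	for (i = 0; i < 256; i++) {
		/* Upper bound of the packet lengths that fall into cell i. */
		unsigned long long len = (unsigned long long)(i + 1) << cell_log;

		data[i] = (unsigned int)(len * 1000000 / bytes_per_sec);
	}
}
#endif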
static int
tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	if (skb->len > q->max_size)
		goto drop;
	__skb_queue_tail(&sch->q, skb);
	if ((sch->stats.backlog += skb->len) <= q->limit) {
		sch->stats.bytes += skb->len;
		sch->stats.packets++;
		return 0;
	}

	/* Drop action: undo the things that we just did,
	 * i.e. make tail drop */
	__skb_unlink(skb, &sch->q);
	sch->stats.backlog -= skb->len;

drop:
	sch->stats.drops++;
#ifdef CONFIG_NET_CLS_POLICE
	if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch))
#endif
		kfree_skb(skb);
	return NET_XMIT_DROP;
}
static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	__skb_queue_head(&sch->q, skb);
	sch->stats.backlog += skb->len;
	return 0;
}
static int tbf_drop(struct Qdisc* sch)
{
	struct sk_buff *skb = __skb_dequeue_tail(&sch->q);

	if (skb == NULL)
		return 0;
	sch->stats.backlog -= skb->len;
	sch->stats.drops++;
	kfree_skb(skb);
	return 1;
}
static void tbf_watchdog(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc*)arg;

	sch->flags &= ~TCQ_F_THROTTLED;
	netif_schedule(sch->dev);
}
static struct sk_buff *
tbf_dequeue(struct Qdisc* sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
	struct sk_buff *skb;

	skb = __skb_dequeue(&sch->q);

	if (skb) {
		psched_time_t now;
		long toks;
		long ptoks = 0;

		PSCHED_GET_TIME(now);

		/* Time elapsed since the last check-point, capped at "buffer". */
		toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer, 0);

		if (q->P_tab) {
			ptoks = toks + q->ptokens;
			if (ptoks > (long)q->mtu)
				ptoks = q->mtu;
			ptoks -= L2T_P(q, skb->len);
		}
		toks += q->tokens;
		if (toks > (long)q->buffer)
			toks = q->buffer;
		toks -= L2T(q, skb->len);

		if ((toks|ptoks) >= 0) {
			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->stats.backlog -= skb->len;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		if (!netif_queue_stopped(sch->dev)) {
			long delay = PSCHED_US2JIFFIE(max_t(long, -toks, -ptoks));

			if (delay == 0)
				delay = 1;
			mod_timer(&q->wd_timer, jiffies+delay);
		}

		/* Maybe there is a shorter packet in the queue that could be
		   sent now. It sounds tempting, but it is wrong in principle:
		   we MUST NOT reorder packets under these circumstances.

		   If we split the flow into independent subflows, it would be
		   a very good solution. That is the main idea of all FQ
		   algorithms (cf. CSZ, HPFQ, HFSC). */
		__skb_queue_head(&sch->q, skb);

		sch->flags |= TCQ_F_THROTTLED;
		sch->stats.overlimits++;
	}
	return NULL;
}
static void
tbf_reset(struct Qdisc* sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	skb_queue_purge(&sch->q);
	sch->stats.backlog = 0;
	PSCHED_GET_TIME(q->t_c);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	sch->flags &= ~TCQ_F_THROTTLED;
	del_timer(&q->wd_timer);
}
static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
{
	int err = -EINVAL;
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
	struct rtattr *tb[TCA_TBF_PTAB];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	int max_size, n;

	if (rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
	    tb[TCA_TBF_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
		goto done;

	qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
		if (ptab == NULL)
			goto done;
	}

	/* Find the largest packet size whose transmission time still fits
	   into the bucket (and into the peak bucket, if one is configured). */
	for (n = 0; n < 256; n++)
		if (rtab->data[n] > qopt->buffer) break;
	max_size = (n << qopt->rate.cell_log) - 1;
	if (ptab) {
		int size;
		for (n = 0; n < 256; n++)
			if (ptab->data[n] > qopt->mtu) break;
		size = (n << qopt->peakrate.cell_log) - 1;
		if (size < max_size) max_size = size;
	}
	if (max_size < 0)
		goto done;

	sch_tree_lock(sch);
	q->limit = qopt->limit;
	q->mtu = qopt->mtu;
	q->max_size = max_size;
	q->buffer = qopt->buffer;
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	rtab = xchg(&q->R_tab, rtab);
	ptab = xchg(&q->P_tab, ptab);
	sch_tree_unlock(sch);
	err = 0;
done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}
static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
{
	int err;
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	if (opt == NULL)
		return -EINVAL;

	MOD_INC_USE_COUNT;
	PSCHED_GET_TIME(q->t_c);
	init_timer(&q->wd_timer);
	q->wd_timer.function = tbf_watchdog;
	q->wd_timer.data = (unsigned long)sch;
	if ((err = tbf_change(sch, opt)) != 0) {
		MOD_DEC_USE_COUNT;
	}
	return err;
}
static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	del_timer(&q->wd_timer);
	if (q->P_tab)
		qdisc_put_rtab(q->P_tab);
	if (q->R_tab)
		qdisc_put_rtab(q->R_tab);
	MOD_DEC_USE_COUNT;
}
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
	unsigned char *b = skb->tail;
	struct rtattr *rta;
	struct tc_tbf_qopt opt;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

	opt.limit = q->limit;
	opt.rate = q->R_tab->rate;
	if (q->P_tab)
		opt.peakrate = q->P_tab->rate;
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = q->mtu;
	opt.buffer = q->buffer;
	RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
struct Qdisc_ops tbf_qdisc_ops =
{
	NULL,
	NULL,
	"tbf",
	sizeof(struct tbf_sched_data),

	tbf_enqueue,
	tbf_dequeue,
	tbf_requeue,
	tbf_drop,

	tbf_init,
	tbf_reset,
	tbf_destroy,
	tbf_change,

	tbf_dump,
};
#ifdef MODULE
int init_module(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

void cleanup_module(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
#endif
MODULE_LICENSE("GPL");