/* net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
/*	Simple Token Bucket Filter.
	=======================================

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	the sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and let N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in the queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.

	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, while the other,
	with rate P (the peak rate) and depth M (equal to the link MTU),
	limits bursts on a smaller time scale.

	It is easy to see that P>R and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as
	(with L the length of the backlog in bytes):

	lat = max ((L-B)/R, (L-M)/P)

	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for some previous packet,
	TBF can stop its activity for 1/HZ.

	This means that with depth B the maximal sustainable rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak-rate TBF is much more demanding: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use an Alpha with HZ=1000 :-)
*/
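
/* A minimal sketch of the double token bucket check described above, kept
 * under "#if 0" because it is illustration only and not used by the qdisc.
 * Time is an abstract "tick" here, and rate2time() stands in for the
 * rate-table lookups done by L2T()/L2T_P() further down; all names in this
 * block (toy_tbf, rate2time, toy_tbf_admit) are invented for the example.
 * With the peak bucket effectively unlimited, this reduces to the single
 * TBF of the definition above.
 */
#if 0
struct toy_tbf {
	long buffer;		/* B/R: bucket depth as transmission time */
	long mtu;		/* M/P: peak bucket depth as transmission time */
	long tokens;		/* current tokens of the R bucket, in ticks */
	long ptokens;		/* current tokens of the P bucket, in ticks */
	long t_c;		/* last check-point, in ticks */
};

/* Transmission time of "len" bytes at "rate" bytes per tick (rounded up). */
static long rate2time(long len, long rate)
{
	return (len + rate - 1) / rate;
}

/* Return 1 and charge both buckets if a packet of "len" bytes may be sent
 * at time "now"; return 0 and leave the state untouched otherwise. */
static int toy_tbf_admit(struct toy_tbf *q, long now, long len,
			 long rate, long peakrate)
{
	long toks, ptoks;

	/* Tokens earned since t_c, capped at the bucket depths. */
	toks = now - q->t_c;
	ptoks = toks + q->ptokens;
	if (ptoks > q->mtu)
		ptoks = q->mtu;
	ptoks -= rate2time(len, peakrate);

	toks += q->tokens;
	if (toks > q->buffer)
		toks = q->buffer;
	toks -= rate2time(len, rate);

	if ((toks | ptoks) >= 0) {	/* both buckets stay non-negative */
		q->t_c = now;
		q->tokens = toks;
		q->ptokens = ptoks;
		return 1;
	}
	return 0;
}
#endif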
struct tbf_sched_data
{
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	u32		mtu;
	u32		max_size;
	struct qdisc_rate_table	*R_tab;
	struct qdisc_rate_table	*P_tab;

/* Variables */
	long	tokens;			/* Current number of B tokens */
	long	ptokens;		/* Current number of P tokens */
	psched_time_t	t_c;		/* Time check-point */
	struct timer_list wd_timer;	/* Watchdog timer */
};
/* Map a packet of length L to its transmission time via the rate table,
   quantized by cell_log (one table cell per 2^cell_log bytes). */
#define L2T(q,L)   ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log])
#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log])
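
/* Illustrative sketch (not part of this file's logic): one way such a
 * 256-entry rate table could be filled, assuming entries are transmission
 * times in microseconds.  The real table is built by tc(8) in user space
 * and may use different units and rounding; fill_rtab() is a hypothetical
 * helper, not a kernel or tc function.  The point of the table is that the
 * fast path above does a shift and a lookup instead of a division.
 */
#if 0
static void fill_rtab(unsigned int data[256], unsigned int bytes_per_sec,
		      int cell_log)
{
	int i;

	for (i = 0; i < 256; i++) {
		/* Upper bound of the packet lengths that fall into cell i. */
		unsigned long long len = (unsigned long long)(i + 1) << cell_log;

		data[i] = (unsigned int)(len * 1000000 / bytes_per_sec);
	}
}
#endif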
static int
tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	if (skb->len > q->max_size)
		goto drop;
	__skb_queue_tail(&sch->q, skb);
	if ((sch->stats.backlog += skb->len) <= q->limit) {
		sch->stats.bytes += skb->len;
		sch->stats.packets++;
		return 0;
	}

	/* Drop action: undo the things that we just did,
	 * i.e. make tail drop */
	__skb_unlink(skb, &sch->q);
	sch->stats.backlog -= skb->len;

drop:
	sch->stats.drops++;
#ifdef CONFIG_NET_CLS_POLICE
	if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch))
#endif
		kfree_skb(skb);
	return NET_XMIT_DROP;
}
static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	__skb_queue_head(&sch->q, skb);
	sch->stats.backlog += skb->len;
	return 0;
}
static int tbf_drop(struct Qdisc* sch)
{
	struct sk_buff *skb = __skb_dequeue_tail(&sch->q);

	if (skb == NULL)
		return 0;
	sch->stats.backlog -= skb->len;
	sch->stats.drops++;
	kfree_skb(skb);
	return 1;
}
static void tbf_watchdog(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc*)arg;

	sch->flags &= ~TCQ_F_THROTTLED;
	netif_schedule(sch->dev);
}
static struct sk_buff *
tbf_dequeue(struct Qdisc* sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
	struct sk_buff *skb;

	skb = __skb_dequeue(&sch->q);

	if (skb) {
		psched_time_t now;
		long toks;
		long ptoks = 0;

		PSCHED_GET_TIME(now);

		/* Time elapsed since the last check-point, capped at "buffer". */
		toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer, 0);

		if (q->P_tab) {
			ptoks = toks + q->ptokens;
			if (ptoks > (long)q->mtu)
				ptoks = q->mtu;
			ptoks -= L2T_P(q, skb->len);
		}
		toks += q->tokens;
		if (toks > (long)q->buffer)
			toks = q->buffer;
		toks -= L2T(q, skb->len);

		if ((toks|ptoks) >= 0) {
			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->stats.backlog -= skb->len;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		if (!netif_queue_stopped(sch->dev)) {
			long delay = PSCHED_US2JIFFIE(max_t(long, -toks, -ptoks));

			if (delay == 0)
				delay = 1;
			mod_timer(&q->wd_timer, jiffies+delay);
		}

		/* Maybe there is a shorter packet in the queue that could be
		   sent now. It sounds tempting, but it is wrong in principle:
		   we MUST NOT reorder packets under these circumstances.

		   If we split the flow into independent subflows, it would be
		   a very good solution. That is the main idea of all FQ
		   algorithms (cf. CSZ, HPFQ, HFSC). */
		__skb_queue_head(&sch->q, skb);

		sch->flags |= TCQ_F_THROTTLED;
		sch->stats.overlimits++;
	}
	return NULL;
}
static void
tbf_reset(struct Qdisc* sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	skb_queue_purge(&sch->q);
	sch->stats.backlog = 0;
	PSCHED_GET_TIME(q->t_c);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	sch->flags &= ~TCQ_F_THROTTLED;
	del_timer(&q->wd_timer);
}
static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
{
	int err = -EINVAL;
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
	struct rtattr *tb[TCA_TBF_PTAB];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	int max_size, n;

	if (rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
	    tb[TCA_TBF_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
		goto done;

	qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
		if (ptab == NULL)
			goto done;
	}

	/* Find the largest packet size whose transmission time still fits
	   into the bucket (and into the peak bucket, if one is configured). */
	for (n = 0; n < 256; n++)
		if (rtab->data[n] > qopt->buffer) break;
	max_size = (n << qopt->rate.cell_log) - 1;
	if (ptab) {
		int size;
		for (n = 0; n < 256; n++)
			if (ptab->data[n] > qopt->mtu) break;
		size = (n << qopt->peakrate.cell_log) - 1;
		if (size < max_size) max_size = size;
	}
	if (max_size < 0)
		goto done;

	sch_tree_lock(sch);
	q->limit = qopt->limit;
	q->mtu = qopt->mtu;
	q->max_size = max_size;
	q->buffer = qopt->buffer;
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	rtab = xchg(&q->R_tab, rtab);
	ptab = xchg(&q->P_tab, ptab);
	sch_tree_unlock(sch);
	err = 0;
done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}
static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
{
	int err;
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	if (opt == NULL)
		return -EINVAL;

	MOD_INC_USE_COUNT;
	PSCHED_GET_TIME(q->t_c);
	init_timer(&q->wd_timer);
	q->wd_timer.function = tbf_watchdog;
	q->wd_timer.data = (unsigned long)sch;
	if ((err = tbf_change(sch, opt)) != 0) {
		MOD_DEC_USE_COUNT;
	}
	return err;
}
static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;

	del_timer(&q->wd_timer);
	if (q->P_tab)
		qdisc_put_rtab(q->P_tab);
	if (q->R_tab)
		qdisc_put_rtab(q->R_tab);
	MOD_DEC_USE_COUNT;
}
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
	unsigned char *b = skb->tail;
	struct rtattr *rta;
	struct tc_tbf_qopt opt;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

	opt.limit = q->limit;
	opt.rate = q->R_tab->rate;
	if (q->P_tab)
		opt.peakrate = q->P_tab->rate;
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = q->mtu;
	opt.buffer = q->buffer;
	RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
struct Qdisc_ops tbf_qdisc_ops =
{
	NULL,
	NULL,
	"tbf",
	sizeof(struct tbf_sched_data),

	tbf_enqueue,
	tbf_dequeue,
	tbf_requeue,
	tbf_drop,

	tbf_init,
	tbf_reset,
	tbf_destroy,
	tbf_change,

	tbf_dump,
};
#ifdef MODULE
int init_module(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

void cleanup_module(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
#endif
MODULE_LICENSE("GPL");