net/sched1/sch_red.c

   1 /*
   2  * net/sched/sch_red.c  Random Early Detection queue.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  *
  11  * Changes:
  12  * J Hadi Salim <hadi@nortel.com> 980914:       computation fixes
  13  * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
  14  * J Hadi Salim <hadi@nortelnetworks.com> 980816:  ECN support
  15  */
  16
  17 #include <linux/config.h>
  18 #include <linux/module.h>
  19 #include <asm/uaccess.h>
  20 #include <asm/system.h>
  21 #include <asm/bitops.h>
  22 #include <linux/types.h>
  23 #include <linux/kernel.h>
  24 #include <linux/sched.h>
  25 #include <linux/string.h>
  26 #include <linux/mm.h>
  27 #include <linux/socket.h>
  28 #include <linux/sockios.h>
  29 #include <linux/in.h>
  30 #include <linux/errno.h>
  31 #include <linux/interrupt.h>
  32 #include <linux/if_ether.h>
  33 #include <linux/inet.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/etherdevice.h>
  36 #include <linux/notifier.h>
  37 #include <net/ip.h>
  38 #include <net/route.h>
  39 #include <linux/skbuff.h>
  40 #include <net/sock.h>
  41 #include <net/pkt_sched.h>
  42 #include <net/inet_ecn.h>
  43
  44 #define RED_ECN_ECT  0x02
  45 #define RED_ECN_CE   0x01
  46
  47
  48 /*      Random Early Detection (RED) algorithm.
  49         =======================================
  50
  51         Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
  52         for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
  53
  54         This file codes a "divisionless" version of RED algorithm
  55         as written down in Fig.17 of the paper.
  56
  57 Short description.
  58 ------------------
  59
  60         When a new packet arrives we calculate the average queue length:
  61
  62         avg = (1-W)*avg + W*current_queue_len,
  63
  64         W is the filter time constant (choosen as 2^(-Wlog)), it controls
  65         the inertia of the algorithm. To allow larger bursts, W should be
  66         decreased.
  67
  68         if (avg > th_max) -> packet marked (dropped).
  69         if (avg < th_min) -> packet passes.
  70         if (th_min < avg < th_max) we calculate probability:
  71
  72         Pb = max_P * (avg - th_min)/(th_max-th_min)
  73
  74         and mark (drop) packet with this probability.
  75         Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
  76         max_P should be small (not 1), usually 0.01..0.02 is good value.
  77
  78         max_P is chosen as a number, so that max_P/(th_max-th_min)
  79         is a negative power of two in order arithmetics to contain
  80         only shifts.
  81
  82
  83         Parameters, settable by user:
  84         -----------------------------
  85
  86         limit           - bytes (must be > qth_max + burst)
  87
  88         Hard limit on queue length, should be chosen >qth_max
  89         to allow packet bursts. This parameter does not
  90         affect the algorithms behaviour and can be chosen
  91         arbitrarily high (well, less than ram size)
  92         Really, this limit will never be reached
  93         if RED works correctly.
  94
  95         qth_min         - bytes (should be < qth_max/2)
  96         qth_max         - bytes (should be at least 2*qth_min and less limit)
  97         Wlog            - bits (<32) log(1/W).
  98         Plog            - bits (<32)
  99
 100         Plog is related to max_P by formula:
 101
 102         max_P = (qth_max-qth_min)/2^Plog;
 103
 104         F.e. if qth_max=128K and qth_min=32K, then Plog=22
 105         corresponds to max_P=0.02
 106
 107         Scell_log
 108         Stab
 109
 110         Lookup table for log((1-W)^(t/t_ave).
 111
 112
 113 NOTES:
 114
 115 Upper bound on W.
 116 -----------------
 117
 118         If you want to allow bursts of L packets of size S,
 119         you should choose W:
 120
 121         L + 1 - th_min/S < (1-(1-W)^L)/W
 122
 123         th_min/S = 32         th_min/S = 4
 124
 125         log(W)  L
 126         -1      33
 127         -2      35
 128         -3      39
 129         -4      46
 130         -5      57
 131         -6      75
 132         -7      101
 133         -8      135
 134         -9      190
 135         etc.
 136  */
 137
 138 struct red_sched_data
 139 {
 140 /* Parameters */
 141         u32             limit;          /* HARD maximal queue length    */
 142         u32             qth_min;        /* Min average length threshold: A scaled */
 143         u32             qth_max;        /* Max average length threshold: A scaled */
 144         u32             Rmask;
 145         u32             Scell_max;
 146         unsigned char   flags;
 147         char            Wlog;           /* log(W)               */
 148         char            Plog;           /* random number bits   */
 149         char            Scell_log;
 150         u8              Stab[256];
 151
 152 /* Variables */
 153         unsigned long   qave;           /* Average queue length: A scaled */
 154         int             qcount;         /* Packets since last random number generation */
 155         u32             qR;             /* Cached random number */
 156
 157         psched_time_t   qidlestart;     /* Start of idle period         */
 158         struct tc_red_xstats st;
 159 };
 160
 161 static int red_ecn_mark(struct sk_buff *skb)
 162 {
 163         if (skb->nh.raw + 20 > skb->tail)
 164                 return 0;
 165
 166         switch (skb->protocol) {
 167         case __constant_htons(ETH_P_IP):
 168         {
 169                 u8 tos = skb->nh.iph->tos;
 170
 171                 if (!(tos & RED_ECN_ECT))
 172                         return 0;
 173
 174                 if (!(tos & RED_ECN_CE))
 175                         IP_ECN_set_ce(skb->nh.iph);
 176
 177                 return 1;
 178         }
 179
 180         case __constant_htons(ETH_P_IPV6):
 181         {
 182                 u32 label = *(u32*)skb->nh.raw;
 183
 184                 if (!(label & __constant_htonl(RED_ECN_ECT<<20)))
 185                         return 0;
 186                 label |= __constant_htonl(RED_ECN_CE<<20);
 187                 return 1;
 188         }
 189
 190         default:
 191                 return 0;
 192         }
 193 }
 194
 195 static int
 196 red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 197 {
 198         struct red_sched_data *q = (struct red_sched_data *)sch->data;
 199
 200         psched_time_t now;
 201
 202         if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
 203                 long us_idle;
 204                 int  shift;
 205
 206                 PSCHED_GET_TIME(now);
 207                 us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max, 0);
 208                 PSCHED_SET_PASTPERFECT(q->qidlestart);
 209
 210 /*
 211    The problem: ideally, average length queue recalcultion should
 212    be done over constant clock intervals. This is too expensive, so that
 213    the calculation is driven by outgoing packets.
 214    When the queue is idle we have to model this clock by hand.
 215
 216    SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
 217    dummy packets as a burst after idle time, i.e.
 218
 219           q->qave *= (1-W)^m
 220
 221    This is an apparently overcomplicated solution (f.e. we have to precompute
 222    a table to make this calculation in reasonable time)
 223    I believe that a simpler model may be used here,
 224    but it is field for experiments.
 225 */
 226                 shift = q->Stab[us_idle>>q->Scell_log];
 227
 228                 if (shift) {
 229                         q->qave >>= shift;
 230                 } else {
 231                         /* Approximate initial part of exponent
 232                            with linear function:
 233                            (1-W)^m ~= 1-mW + ...
 234
 235                            Seems, it is the best solution to
 236                            problem of too coarce exponent tabulation.
 237                          */
 238
 239                         us_idle = (q->qave * us_idle)>>q->Scell_log;
 240                         if (us_idle < q->qave/2)
 241                                 q->qave -= us_idle;
 242                         else
 243                                 q->qave >>= 1;
 244                 }
 245         } else {
 246                 q->qave += sch->stats.backlog - (q->qave >> q->Wlog);
 247                 /* NOTE:
 248                    q->qave is fixed point number with point at Wlog.
 249                    The formulae above is equvalent to floating point
 250                    version:
 251
 252                    qave = qave*(1-W) + sch->stats.backlog*W;
 253                                                            --ANK (980924)
 254                  */
 255         }
 256
 257         if (q->qave < q->qth_min) {
 258                 q->qcount = -1;
 259 enqueue:
 260                 if (sch->stats.backlog <= q->limit) {
 261                         __skb_queue_tail(&sch->q, skb);
 262                         sch->stats.backlog += skb->len;
 263                         sch->stats.bytes += skb->len;
 264                         sch->stats.packets++;
 265                         return NET_XMIT_SUCCESS;
 266                 } else {
 267                         q->st.pdrop++;
 268                 }
 269                 kfree_skb(skb);
 270                 sch->stats.drops++;
 271                 return NET_XMIT_DROP;
 272         }
 273         if (q->qave >= q->qth_max) {
 274                 q->qcount = -1;
 275                 sch->stats.overlimits++;
 276 mark:
 277                 if  (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
 278                         q->st.early++;
 279                         goto drop;
 280                 }
 281                 q->st.marked++;
 282                 goto enqueue;
 283         }
 284
 285         if (++q->qcount) {
 286                 /* The formula used below causes questions.
 287
 288                    OK. qR is random number in the interval 0..Rmask
 289                    i.e. 0..(2^Plog). If we used floating point
 290                    arithmetics, it would be: (2^Plog)*rnd_num,
 291                    where rnd_num is less 1.
 292
 293                    Taking into account, that qave have fixed
 294                    point at Wlog, and Plog is related to max_P by
 295                    max_P = (qth_max-qth_min)/2^Plog; two lines
 296                    below have the following floating point equivalent:
 297
 298                    max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
 299
 300                    Any questions? --ANK (980924)
 301                  */
 302                 if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
 303                         goto enqueue;
 304                 q->qcount = 0;
 305                 q->qR = net_random()&q->Rmask;
 306                 sch->stats.overlimits++;
 307                 goto mark;
 308         }
 309         q->qR = net_random()&q->Rmask;
 310         goto enqueue;
 311
 312 drop:
 313         kfree_skb(skb);
 314         sch->stats.drops++;
 315         return NET_XMIT_CN;
 316 }
 317
 318 static int
 319 red_requeue(struct sk_buff *skb, struct Qdisc* sch)
 320 {
 321         struct red_sched_data *q = (struct red_sched_data *)sch->data;
 322
 323         PSCHED_SET_PASTPERFECT(q->qidlestart);
 324
 325         __skb_queue_head(&sch->q, skb);
 326         sch->stats.backlog += skb->len;
 327         return 0;
 328 }
 329
 330 static struct sk_buff *
 331 red_dequeue(struct Qdisc* sch)
 332 {
 333         struct sk_buff *skb;
 334         struct red_sched_data *q = (struct red_sched_data *)sch->data;
 335
 336         skb = __skb_dequeue(&sch->q);
 337         if (skb) {
 338                 sch->stats.backlog -= skb->len;
 339                 return skb;
 340         }
 341         PSCHED_GET_TIME(q->qidlestart);
 342         return NULL;
 343 }
 344
 345 static int
 346 red_drop(struct Qdisc* sch)
 347 {
 348         struct sk_buff *skb;
 349         struct red_sched_data *q = (struct red_sched_data *)sch->data;
 350
 351         skb = __skb_dequeue_tail(&sch->q);
 352         if (skb) {
 353                 sch->stats.backlog -= skb->len;
 354                 sch->stats.drops++;
 355                 q->st.other++;
 356                 kfree_skb(skb);
 357                 return 1;
 358         }
 359         PSCHED_GET_TIME(q->qidlestart);
 360         return 0;
 361 }
 362
 363 static void red_reset(struct Qdisc* sch)
 364 {
 365         struct red_sched_data *q = (struct red_sched_data *)sch->data;
 366
 367         __skb_queue_purge(&sch->q);
 368         sch->stats.backlog = 0;
 369         PSCHED_SET_PASTPERFECT(q->qidlestart);
 370         q->qave = 0;
 371         q->qcount = -1;
 372 }
 373
 374 static int red_change(struct Qdisc *sch, struct rtattr *opt)
 375 {
 376         struct red_sched_data *q = (struct red_sched_data *)sch->data;
 377         struct rtattr *tb[TCA_RED_STAB];
 378         struct tc_red_qopt *ctl;
 379
 380         if (opt == NULL ||
 381             rtattr_parse(tb, TCA_RED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
 382             tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 ||
 383             RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
 384             RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)
 385                 return -EINVAL;
 386
 387         ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);
 388
 389         sch_tree_lock(sch);
 390         q->flags = ctl->flags;
 391         q->Wlog = ctl->Wlog;
 392         q->Plog = ctl->Plog;
 393         q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
 394         q->Scell_log = ctl->Scell_log;
 395         q->Scell_max = (255<<q->Scell_log);
 396         q->qth_min = ctl->qth_min<<ctl->Wlog;
 397         q->qth_max = ctl->qth_max<<ctl->Wlog;
 398         q->limit = ctl->limit;
 399         memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
 400
 401         q->qcount = -1;
 402         if (skb_queue_len(&sch->q) == 0)
 403                 PSCHED_SET_PASTPERFECT(q->qidlestart);
 404         sch_tree_unlock(sch);
 405         return 0;
 406 }
 407
 408 static int red_init(struct Qdisc* sch, struct rtattr *opt)
 409 {
 410         int err;
 411
 412         MOD_INC_USE_COUNT;
 413
 414         if ((err = red_change(sch, opt)) != 0) {
 415                 MOD_DEC_USE_COUNT;
 416         }
 417         return err;
 418 }
 419
 420
 421 int red_copy_xstats(struct sk_buff *skb, struct tc_red_xstats *st)
 422 {
 423         RTA_PUT(skb, TCA_XSTATS, sizeof(*st), st);
 424         return 0;
 425
 426 rtattr_failure:
 427         return 1;
 428 }
 429
 430 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 431 {
 432         struct red_sched_data *q = (struct red_sched_data *)sch->data;
 433         unsigned char    *b = skb->tail;
 434         struct rtattr *rta;
 435         struct tc_red_qopt opt;
 436
 437         rta = (struct rtattr*)b;
 438         RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
 439         opt.limit = q->limit;
 440         opt.qth_min = q->qth_min>>q->Wlog;
 441         opt.qth_max = q->qth_max>>q->Wlog;
 442         opt.Wlog = q->Wlog;
 443         opt.Plog = q->Plog;
 444         opt.Scell_log = q->Scell_log;
 445         opt.flags = q->flags;
 446         RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
 447         rta->rta_len = skb->tail - b;
 448
 449         if (red_copy_xstats(skb, &q->st))
 450                 goto rtattr_failure;
 451
 452         return skb->len;
 453
 454 rtattr_failure:
 455         skb_trim(skb, b - skb->data);
 456         return -1;
 457 }
 458
 459 static void red_destroy(struct Qdisc *sch)
 460 {
 461         MOD_DEC_USE_COUNT;
 462 }
 463
 464 struct Qdisc_ops red_qdisc_ops =
 465 {
 466         NULL,
 467         NULL,
 468         "red",
 469         sizeof(struct red_sched_data),
 470
 471         red_enqueue,
 472         red_dequeue,
 473         red_requeue,
 474         red_drop,
 475
 476         red_init,
 477         red_reset,
 478         red_destroy,
 479         red_change,
 480
 481         red_dump,
 482 };
 483
 484
 485 #ifdef MODULE
 486 int init_module(void)
 487 {
 488         return register_qdisc(&red_qdisc_ops);
 489 }
 490
 491 void cleanup_module(void)
 492 {
 493         unregister_qdisc(&red_qdisc_ops);
 494 }
 495 #endif
 496 MODULE_LICENSE("GPL");