/*
 * net/sched/sch_csz.c  Clark-Shenker-Zhang scheduler.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>


/*      Clark-Shenker-Zhang algorithm.
        =======================================

        SOURCE.

        David D. Clark, Scott Shenker and Lixia Zhang
        "Supporting Real-Time Applications in an Integrated Services Packet
        Network: Architecture and Mechanism".

        CBQ presents a flexible universal algorithm for packet scheduling,
        but it has pretty poor delay characteristics.
        Round-robin scheduling and link-sharing goals
        apparently contradict minimization of network delay and jitter.
        Moreover, correct handling of predictive flows seems to be
        impossible in CBQ.

        CSZ presents a more precise but less flexible and less efficient
        approach. As I understand it, the main idea is to create
        WFQ flows for each guaranteed service and to allocate
        the rest of the bandwidth to dummy flow-0. Flow-0 comprises
        the predictive services and the best effort traffic;
        it is handled by a priority scheduler with the highest
        priority band allocated for predictive services and the rest
        for the best effort packets.

        Note that in CSZ flows are NOT limited to their bandwidth.  It
        is supposed that the flow passed admission control at the edge
        of the QoS network and it doesn't need further shaping. Any
        attempt to improve the flow or to shape it to a token bucket
        at intermediate hops will introduce undesired delays and raise
        jitter.

        At the moment CSZ is the only scheduler that provides
        true guaranteed service. Other schemes (including CBQ)
        do not provide guaranteed delay and they randomize jitter.
        There is a proof (Sally Floyd) that delay
        can be estimated by an IntServ-compliant formula.
        This result is true formally, but it is wrong in principle.
        It takes into account only round-robin delays,
        ignoring delays introduced by link sharing, i.e. by overlimiting.
        Note that temporary overlimits are inevitable because
        real links are not ideal, and the real algorithm must take this
        into account.

        ALGORITHM.

        --- Notations.

        $B$ is link bandwidth (bits/sec).

        $I$ is the set of all flows, including flow $0$.
        Every flow $a \in I$ has an associated bandwidth slice $r_a < 1$ and
        $\sum_{a \in I} r_a = 1$.

        --- Flow model.

        Let $m_a$ be the number of backlogged bits in flow $a$.
        The flow is {\em active} if $m_a > 0$.
        This number is a discontinuous function of time;
        when a packet $i$ arrives:
        \[
        m_a(t_i+0) - m_a(t_i-0) = L^i,
        \]
        where $L^i$ is the length of the arrived packet.
        The flow queue is drained continuously until $m_a == 0$:
        \[
        {d m_a \over dt} = - { B r_a \over \sum_{b \in A} r_b}.
        \]
        I.e. flow rates are their allocated rates proportionally
        scaled to take all available link bandwidth. Clearly,
        this is not the only possible policy. E.g. CBQ classes
        without borrowing would be modelled by:
        \[
        {d m_a \over dt} = - B r_a .
        \]
        More complicated hierarchical bandwidth allocation
        policies are possible, but unfortunately, the basic
        flow equations have a simple solution only for proportional
        scaling.

        --- Departure times.

        We calculate the time until the last bit of the packet is sent:
        \[
        E_a^i(t) = { m_a(t_i) - \delta_a(t) \over r_a },
        \]
        where $\delta_a(t)$ is the number of bits drained since $t_i$.
        We have to evaluate $E_a^i$ for all queued packets,
        then find the packet with minimal $E_a^i$ and send it.

        This sounds good, but direct implementation of the algorithm
        is absolutely infeasible. Luckily, if flow rates
        are scaled proportionally, the equations have a simple solution.

        The differential equation for $E_a^i$ is
        \[
        {d E_a^i (t) \over dt } = - { d \delta_a(t) \over dt} { 1 \over r_a} =
        - { B \over \sum_{b \in A} r_b}
        \]
        with initial condition
        \[
        E_a^i (t_i) = { m_a(t_i) \over r_a } .
        \]

        Let's introduce an auxiliary function $R(t)$:

        --- Round number.

        Consider the following model: we rotate over active flows,
        sending $r_a B$ bits from every flow, so that we send
        $B \sum_{a \in A} r_a$ bits per round, which takes
        $\sum_{a \in A} r_a$ seconds.

        Hence, $R(t)$ (round number) is a monotonically increasing
        linear function of time when $A$ is not changed
        \[
        { d R(t) \over dt } = { 1 \over \sum_{a \in A} r_a }
        \]
        and it is continuous when $A$ changes.

        The central observation is that the quantity
        $F_a^i = R(t) + E_a^i(t)/B$ does not depend on time at all!
        (Indeed, its derivative is
        ${1 \over \sum_{b \in A} r_b} - {1 \over B}{B \over \sum_{b \in A} r_b} = 0$.)
        $R(t)$ does not depend on the flow, so that $F_a^i$ can be
        calculated only once on packet arrival, and we need not
        recalculate the $E$ numbers or re-sort the queues.
        The number $F_a^i$ is called the finish number of the packet.
        It is just the value of $R(t)$ when the last bit of the packet
        is sent out.

        The maximal finish number in a flow is called the finish number
        of the flow, and the minimal one is the "start number of the flow".
        Clearly, a flow is active if and only if $F_a \geq R$.

        When a packet of length $L_i$ bits arrives on flow $a$ at time $t_i$,
        we calculate $F_a^i$ as:

        If the flow was inactive ($F_a < R$):
        $F_a^i = R(t) + {L_i \over B r_a}$
        otherwise
        $F_a^i = F_a + {L_i \over B r_a}$

        These equations complete the algorithm specification;
        a plain user-space sketch of this bookkeeping follows this comment.

        It looks pretty hairy, but there is a simple
        procedure for solving these equations.
        See procedure csz_update(), which is a generalization of
        the algorithm from S. Keshav's thesis Chapter 3
        "Efficient Implementation of Fair Queueing".

        NOTES.

        * We implement only the simplest variant of CSZ,
        where flow-0 is an explicit 4-band priority fifo.
        This is bad, but we need a "peek" operation in addition
        to "dequeue" to implement complete CSZ.
        I do not want to do that, unless it is absolutely
        necessary.

        * Primitive support for token bucket filtering
        presents itself too. It directly contradicts CSZ, but
        even though the Internet spans the globe ... :-)
        "the edges of the network" really do exist.

        BUGS.

        * Fixed point arithmetic is overcomplicated, suboptimal and even
        wrong. Check it later.  */
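
/* Illustrative, non-compiled sketch of the finish-number bookkeeping
 * described above, written as plain user-space C with floating point
 * for clarity.  The names (toy_flow, toy_enqueue) are hypothetical and
 * are not used anywhere else; the real qdisc does the same bookkeeping
 * in fixed point, see csz_update() and csz_enqueue() below.
 */
#if 0
struct toy_flow {
        double  r;              /* allocated rate share, 0 < r < 1 */
        double  F;              /* finish number of the flow       */
};

/* A packet of L bits arrives on flow "a" while the round number is R
 * and the link bandwidth is B (bits/sec):
 *      inactive flow (F < R):  F = R + L/(B*r)
 *      active flow (F >= R):   F = F + L/(B*r)
 * The returned value is the finish number assigned to this packet.
 */
static double toy_enqueue(struct toy_flow *a, double R, double B, double L)
{
        double base = (a->F < R) ? R : a->F;

        a->F = base + L / (B * a->r);
        return a->F;
}
#endif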


/* This number is arbitrary */

#define CSZ_GUARANTEED          16
#define CSZ_FLOWS               (CSZ_GUARANTEED+4)

struct csz_head
{
        struct csz_head         *snext;
        struct csz_head         *sprev;
        struct csz_head         *fnext;
        struct csz_head         *fprev;
};

struct csz_flow
{
        struct csz_head         *snext;
        struct csz_head         *sprev;
        struct csz_head         *fnext;
        struct csz_head         *fprev;

/* Parameters */
        struct tc_ratespec      rate;
        struct tc_ratespec      slice;
        u32                     *L_tab; /* Lookup table for L/(B*r_a) values */
        unsigned long           limit;  /* Maximal length of queue */
#ifdef CSZ_PLUS_TBF
        struct tc_ratespec      peakrate;
        __u32                   buffer; /* Depth of token bucket, normalized
                                           as L/(B*r_a) */
        __u32                   mtu;
#endif

/* Variables */
#ifdef CSZ_PLUS_TBF
        unsigned long           tokens; /* Number of tokens, in usecs */
        psched_time_t           t_tbf;
        unsigned long           R_tbf;
        int                     throttled;
#endif
        unsigned                peeked;
        unsigned long           start;  /* Finish number of the first skb */
        unsigned long           finish; /* Finish number of the flow */

        struct sk_buff_head     q;      /* FIFO queue */
};

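/* L2R: convert a packet length L into its finish number increment
 * L/(B*r_a), using the per-flow lookup table indexed by L >> cell_log.
 */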
#define L2R(f,L) ((f)->L_tab[(L)>>(f)->slice.cell_log])

struct csz_sched_data
{
/* Parameters */
        unsigned char   rate_log;       /* fixed point position for rate;
                                         * we do not really need it */
        unsigned char   R_log;          /* fixed point position for round number */
        unsigned char   delta_log;      /* 1<<delta_log is maximal timeout in usecs;
                                         * 21 <-> 2.1sec is the MAXIMAL value */

/* Variables */
        struct tcf_proto *filter_list;
        u8      prio2band[TC_PRIO_MAX+1];
#ifdef CSZ_PLUS_TBF
        struct timer_list wd_timer;
        long            wd_expires;
#endif
        psched_time_t   t_c;            /* Time check-point */
        unsigned long   R_c;            /* R-number check-point */
        unsigned long   rate;           /* Current sum of rates of active flows */
        struct csz_head s;              /* Flows sorted by "start" */
        struct csz_head f;              /* Flows sorted by "finish" */

        struct sk_buff_head     other[4];/* Predictive (0) and the best effort
                                            classes (1,2,3) */
        struct csz_flow flow[CSZ_GUARANTEED]; /* Array of flows */
};

/* These routines (csz_insert_finish and csz_insert_start) are
   the most time consuming part of the whole algorithm.

   We insert into a sorted list, so that the time
   is linear with respect to the number of active flows in the worst case.
   Note that the number of guaranteed flows is not very large,
   so that logarithmic algorithms (heap etc.) are useless:
   they are slower than the linear one when the length of the list is <= 32.

   A heap would make sense if we used WFQ for best effort
   flows, but SFQ is a better choice in this case.
 */


/* Insert flow "this" into the list "b" before the flow
   with a greater finish number.
 */

#if 0
/* Scan forward */
extern __inline__ void csz_insert_finish(struct csz_head *b,
                                         struct csz_flow *this)
{
        struct csz_head *f = b->fnext;
        unsigned long finish = this->finish;

        while (f != b) {
                if (((struct csz_flow*)f)->finish - finish > 0)
                        break;
                f = f->fnext;
        }
        this->fnext = f;
        this->fprev = f->fprev;
        this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this;
}
#else
/* Scan backward */
extern __inline__ void csz_insert_finish(struct csz_head *b,
                                         struct csz_flow *this)
{
        struct csz_head *f = b->fprev;
        unsigned long finish = this->finish;

        while (f != b) {
                if (((struct csz_flow*)f)->finish - finish <= 0)
                        break;
                f = f->fprev;
        }
        this->fnext = f->fnext;
        this->fprev = f;
        this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this;
}
#endif

/* Insert flow "this" into the list "b" before the flow
   with a greater start number.
 */

extern __inline__ void csz_insert_start(struct csz_head *b,
                                        struct csz_flow *this)
{
        struct csz_head *f = b->snext;
        unsigned long start = this->start;

        while (f != b) {
                if (((struct csz_flow*)f)->start - start > 0)
                        break;
                f = f->snext;
        }
        this->snext = f;
        this->sprev = f->sprev;
        this->snext->sprev = this->sprev->snext = (struct csz_head*)this;
}


/* Calculate and return the current round number.
   This is another time consuming part, but
   it is impossible to avoid it.

   It costs O(N), which makes the whole algorithm useful only
   for playing with a model that is closest to the ideal fluid one.

   There exist less academic, but more practical modifications,
   which might have even better characteristics (WF2Q+, HPFQ, HFSC).
 */

static unsigned long csz_update(struct Qdisc *sch)
{
        struct csz_sched_data   *q = (struct csz_sched_data*)sch->data;
        struct csz_flow         *a;
        unsigned long           F;
        unsigned long           tmp;
        psched_time_t           now;
        unsigned long           delay;
        unsigned long           R_c;

        PSCHED_GET_TIME(now);
        delay = PSCHED_TDIFF_SAFE(now, q->t_c, 0, goto do_reset);

        if (delay>>q->delta_log) {
do_reset:
                /* Delta is too large.
                   It is possible if MTU/BW > 1<<q->delta_log
                   (i.e. configuration error) or because of hardware
                   fault. We have no choice...
                 */
                qdisc_reset(sch);
                return 0;
        }

        q->t_c = now;

        for (;;) {
                a = (struct csz_flow*)q->f.fnext;

                /* No more active flows. Reset R and exit. */
                if (a == (struct csz_flow*)&q->f) {
#ifdef CSZ_DEBUG
                        if (q->rate) {
                                printk("csz_update: rate!=0 on inactive csz\n");
                                q->rate = 0;
                        }
#endif
                        q->R_c = 0;
                        return 0;
                }

                F = a->finish;

#ifdef CSZ_DEBUG
                if (q->rate == 0) {
                        printk("csz_update: rate=0 on active csz\n");
                        goto do_reset;
                }
#endif

                /*
                 *           tmp = (t - q->t_c)/q->rate;
                 */

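                /* Fixed point version of the formula above: the delay is
                 * pre-scaled to keep precision in the division by the
                 * current rate sum, and the quotient is then shifted back
                 * into round-number units.  As the BUGS note at the top of
                 * this file says, this arithmetic is suspect.
                 */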
                tmp = ((delay<<(31-q->delta_log))/q->rate)>>(31-q->delta_log+q->R_log);

                tmp += q->R_c;

                /* OK, this flow (and all flows with greater
                   finish numbers) is still active */
                if (F - tmp > 0)
                        break;

                /* It is no longer active */

                a->fprev->fnext = a->fnext;
                a->fnext->fprev = a->fprev;

                /*
                 * q->t_c += (F - q->R_c)*q->rate
                 */

                tmp = ((F-q->R_c)*q->rate)<<q->R_log;
                R_c = F;
                q->rate -= a->slice.rate;

                if ((long)(delay - tmp) >= 0) {
                        delay -= tmp;
                        continue;
                }
                delay = 0;
        }

        q->R_c = tmp;
        return tmp;
}

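/* Classify a packet into one of the CSZ_FLOWS classes.  Classification
 * is not really implemented yet: every packet is mapped to
 * CSZ_GUARANTEED, i.e. to dummy flow 0, priority band 0.  Presumably a
 * real classifier would consult q->filter_list here.
 */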
unsigned csz_classify(struct sk_buff *skb, struct csz_sched_data *q)
{
        return CSZ_GUARANTEED;
}

static int
csz_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
        struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
        unsigned flow_id = csz_classify(skb, q);
        unsigned long R;
        int prio = 0;
        struct csz_flow *this;

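        /* Class numbers >= CSZ_GUARANTEED select one of the four
         * priority bands of dummy flow 0 (predictive and best effort
         * traffic).
         */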
        if (flow_id >= CSZ_GUARANTEED) {
                prio = flow_id - CSZ_GUARANTEED;
                flow_id = 0;
        }

        this = &q->flow[flow_id];
        if (this->q.qlen >= this->limit || this->L_tab == NULL) {
                sch->stats.drops++;
                kfree_skb(skb);
                return NET_XMIT_DROP;
        }

        R = csz_update(sch);

        if ((long)(this->finish - R) >= 0) {
                /* It was active */
                this->finish += L2R(this,skb->len);
        } else {
                /* It is inactive; activate it */
                this->finish = R + L2R(this,skb->len);
                q->rate += this->slice.rate;
                csz_insert_finish(&q->f, this);
        }

        /* If this flow was empty, remember start number
           and insert it into start queue */
        if (this->q.qlen == 0) {
                this->start = this->finish;
                csz_insert_start(&q->s, this);
        }
        if (flow_id)
                skb_queue_tail(&this->q, skb);
        else
                skb_queue_tail(&q->other[prio], skb);
        sch->q.qlen++;
        sch->stats.bytes += skb->len;
        sch->stats.packets++;
        return 0;
}

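/* Dequeue (or peek below) from dummy flow 0: scan the four priority
 * bands in order, the predictive band first, then the best effort
 * bands.
 */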
static __inline__ struct sk_buff *
skb_dequeue_best(struct csz_sched_data * q)
{
        int i;
        struct sk_buff *skb;

        for (i=0; i<4; i++) {
                skb = skb_dequeue(&q->other[i]);
                if (skb) {
                        q->flow[0].q.qlen--;
                        return skb;
                }
        }
        return NULL;
}

static __inline__ struct sk_buff *
skb_peek_best(struct csz_sched_data * q)
{
        int i;
        struct sk_buff *skb;

        for (i=0; i<4; i++) {
                skb = skb_peek(&q->other[i]);
                if (skb)
                        return skb;
        }
        return NULL;
}

#ifdef CSZ_PLUS_TBF

static void csz_watchdog(unsigned long arg)
{
        struct Qdisc *sch = (struct Qdisc*)arg;

        qdisc_wakeup(sch->dev);
}

static __inline__ void
csz_move_queue(struct csz_flow *this, long delta)
{
        this->fprev->fnext = this->fnext;
        this->fnext->fprev = this->fprev;

        this->start += delta;
        this->finish += delta;

        csz_insert_finish(this);
}

static __inline__ int csz_enough_tokens(struct csz_sched_data *q,
                                        struct csz_flow *this,
                                        struct sk_buff *skb)
{
        long toks;
        long shift;
        psched_time_t now;

        PSCHED_GET_TIME(now);

        toks = PSCHED_TDIFF(now, t_tbf) + this->tokens - L2R(q,this,skb->len);

        shift = 0;
        if (this->throttled) {
                /* Remember the a posteriori delay */

                unsigned long R = csz_update(q);
                shift = R - this->R_tbf;
                this->R_tbf = R;
        }

        if (toks >= 0) {
                /* Now we have enough tokens to proceed */

                this->tokens = toks <= this->depth ? toks : this->depth;
                this->t_tbf = now;

                if (!this->throttled)
                        return 1;

                /* Flow was throttled. Update its start & finish numbers
                   with the delay calculated a posteriori.
                 */

                this->throttled = 0;
                if (shift > 0)
                        csz_move_queue(this, shift);
                return 1;
        }

        if (!this->throttled) {
                /* Flow has just been throttled; remember the
                   current round number to calculate the a posteriori delay
                 */
                this->throttled = 1;
                this->R_tbf = csz_update(q);
        }

        /* Move the whole queue to the time when it will be allowed to send.
           We should translate time to a round number, but that is impossible,
           so we make the most conservative estimate, i.e. we suppose
           that only this flow is active and, hence, R = t.
           Really toks <= R <= toks/r_a.

           This a priori shift in R will be adjusted later to reflect
           the real delay. We cannot avoid it because:
           - a throttled flow continues to be active from the viewpoint
             of CSZ, so that it would acquire the highest priority
             if the start numbers were not adjusted.
           - Eventually, the finish number would become less than the round
             number and the flow would be declared inactive.
         */

        toks = -toks;

        /* Remember that we should start the watchdog */
        if (toks < q->wd_expires)
                q->wd_expires = toks;

        toks >>= q->R_log;
        shift += toks;
        if (shift > 0) {
                this->R_tbf += toks;
                csz_move_queue(this, shift);
        }
        csz_insert_start(this);
        return 0;
}
#endif


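/* Serve the flow with the smallest start number.  Guaranteed flows hand
 * out the head of their own FIFO; dummy flow 0 is served through the
 * 4-band priority scheduler above.
 */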
static struct sk_buff *
csz_dequeue(struct Qdisc* sch)
{
        struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
        struct sk_buff *skb;
        struct csz_flow *this;

#ifdef CSZ_PLUS_TBF
        q->wd_expires = 0;
#endif
        this = (struct csz_flow*)q->s.snext;

        while (this != (struct csz_flow*)&q->s) {

                /* First of all: unlink from start list */
                this->sprev->snext = this->snext;
                this->snext->sprev = this->sprev;

                if (this != &q->flow[0]) {      /* Guaranteed flow */
                        skb = __skb_dequeue(&this->q);
                        if (skb) {
#ifdef CSZ_PLUS_TBF
                                if (this->depth) {
                                        if (!csz_enough_tokens(q, this, skb))
                                                continue;
                                }
#endif
                                if (this->q.qlen) {
                                        struct sk_buff *nskb = skb_peek(&this->q);
                                        this->start += L2R(this,nskb->len);
                                        csz_insert_start(&q->s, this);
                                }
                                sch->q.qlen--;
                                return skb;
                        }
                } else {        /* Predictive or best effort flow */
                        skb = skb_dequeue_best(q);
                        if (skb) {
                                unsigned peeked = this->peeked;
                                this->peeked = 0;

                                if (--this->q.qlen) {
                                        struct sk_buff *nskb;
                                        unsigned dequeued = L2R(this,skb->len);

                                        /* We dequeued something other than
                                           what we peeked earlier; adjust
                                           the start number
                                           */
                                        if (peeked != dequeued && peeked)
                                                this->start += dequeued - peeked;

                                        nskb = skb_peek_best(q);
                                        peeked = L2R(this,nskb->len);
                                        this->start += peeked;
                                        this->peeked = peeked;
                                        csz_insert_start(&q->s, this);
                                }
                                sch->q.qlen--;
                                return skb;
                        }
                }
        }
#ifdef CSZ_PLUS_TBF
        /* We are about to return no skb.
           Schedule the watchdog timer if this happened because of shaping.
         */
        if (q->wd_expires) {
                unsigned long delay = PSCHED_US2JIFFIE(q->wd_expires);
                if (delay == 0)
                        delay = 1;
                mod_timer(&q->wd_timer, jiffies + delay);
                sch->stats.overlimits++;
        }
#endif
        return NULL;
}

static void
csz_reset(struct Qdisc* sch)
{
        struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
        int    i;

        for (i=0; i<4; i++)
                skb_queue_purge(&q->other[i]);

        for (i=0; i<CSZ_GUARANTEED; i++) {
                struct csz_flow *this = q->flow + i;
                skb_queue_purge(&this->q);
                this->snext = this->sprev =
                this->fnext = this->fprev = (struct csz_head*)this;
                this->start = this->finish = 0;
        }
        q->s.snext = q->s.sprev = &q->s;
        q->f.fnext = q->f.fprev = &q->f;
        q->R_c = 0;
#ifdef CSZ_PLUS_TBF
        PSCHED_GET_TIME(&q->t_tbf);
        q->tokens = q->depth;
        del_timer(&q->wd_timer);
#endif
        sch->q.qlen = 0;
}

static void
csz_destroy(struct Qdisc* sch)
{
        struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
        struct tcf_proto *tp;

        while ((tp = q->filter_list) != NULL) {
                q->filter_list = tp->next;
                tcf_destroy(tp);
        }

        MOD_DEC_USE_COUNT;
}

static int csz_init(struct Qdisc *sch, struct rtattr *opt)
{
        struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
        struct rtattr *tb[TCA_CSZ_PTAB];
        struct tc_csz_qopt *qopt;
        int    i;

        rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt));
        if (tb[TCA_CSZ_PARMS-1] == NULL ||
            RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*qopt))
                return -EINVAL;
        qopt = RTA_DATA(tb[TCA_CSZ_PARMS-1]);

        q->R_log = qopt->R_log;
        q->delta_log = qopt->delta_log;
        for (i=0; i<=TC_PRIO_MAX; i++) {
                if (qopt->priomap[i] >= CSZ_FLOWS)
                        return -EINVAL;
                q->prio2band[i] = qopt->priomap[i];
        }

        for (i=0; i<4; i++)
                skb_queue_head_init(&q->other[i]);

        for (i=0; i<CSZ_GUARANTEED; i++) {
                struct csz_flow *this = q->flow + i;
                skb_queue_head_init(&this->q);
                this->snext = this->sprev =
                this->fnext = this->fprev = (struct csz_head*)this;
                this->start = this->finish = 0;
        }
        q->s.snext = q->s.sprev = &q->s;
        q->f.fnext = q->f.fprev = &q->f;
        q->R_c = 0;
#ifdef CSZ_PLUS_TBF
        init_timer(&q->wd_timer);
        q->wd_timer.data = (unsigned long)sch;
        q->wd_timer.function = csz_watchdog;
#endif
        MOD_INC_USE_COUNT;
        return 0;
}

static int csz_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
        unsigned char    *b = skb->tail;
        struct rtattr *rta;
        struct tc_csz_qopt opt;

        rta = (struct rtattr*)b;
        RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

        opt.flows = CSZ_FLOWS;
        memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
        RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt);
        rta->rta_len = skb->tail - b;

        return skb->len;

rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -1;
}

static int csz_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
                     struct Qdisc **old)
{
        return -EINVAL;
}

static struct Qdisc * csz_leaf(struct Qdisc *sch, unsigned long cl)
{
        return NULL;
}


static unsigned long csz_get(struct Qdisc *sch, u32 classid)
{
        struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
        unsigned long band = TC_H_MIN(classid) - 1;

        if (band >= CSZ_FLOWS)
                return 0;

        if (band < CSZ_GUARANTEED && q->flow[band].L_tab == NULL)
                return 0;

        return band+1;
}

static unsigned long csz_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
{
        return csz_get(sch, classid);
}


static void csz_put(struct Qdisc *sch, unsigned long cl)
{
        return;
}

static int csz_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg)
{
        unsigned long cl = *arg;
        struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
        struct rtattr *opt = tca[TCA_OPTIONS-1];
        struct rtattr *tb[TCA_CSZ_PTAB];
        struct tc_csz_copt *copt;

        rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt));
        if (tb[TCA_CSZ_PARMS-1] == NULL ||
            RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*copt))
                return -EINVAL;
        copt = RTA_DATA(tb[TCA_CSZ_PARMS-1]);

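        /* The rate table attribute (TCA_CSZ_RTAB), when present, is
         * expected to carry at least 1024 bytes, i.e. 256 u32 entries
         * of precomputed L/(B*r_a) values indexed via L2R().
         */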
        if (tb[TCA_CSZ_RTAB-1] &&
            RTA_PAYLOAD(tb[TCA_CSZ_RTAB-1]) < 1024)
                return -EINVAL;

        if (cl) {
                struct csz_flow *a;
                cl--;
                if (cl >= CSZ_FLOWS)
                        return -ENOENT;
                if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL)
                        return -EINVAL;

                a = &q->flow[cl];

                spin_lock_bh(&sch->dev->queue_lock);
#if 0
                a->rate_log = copt->rate_log;
#endif
#ifdef CSZ_PLUS_TBF
                a->limit = copt->limit;
                a->rate = copt->rate;
                a->buffer = copt->buffer;
                a->mtu = copt->mtu;
#endif

                if (tb[TCA_CSZ_RTAB-1])
                        memcpy(a->L_tab, RTA_DATA(tb[TCA_CSZ_RTAB-1]), 1024);

                spin_unlock_bh(&sch->dev->queue_lock);
                return 0;
        }
        /* NI */
        return 0;
}

static int csz_delete(struct Qdisc *sch, unsigned long cl)
{
        struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
        struct csz_flow *a;

        cl--;

        if (cl >= CSZ_FLOWS)
                return -ENOENT;
        if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL)
                return -EINVAL;

        a = &q->flow[cl];

        spin_lock_bh(&sch->dev->queue_lock);
        a->fprev->fnext = a->fnext;
        a->fnext->fprev = a->fprev;
        a->sprev->snext = a->snext;
        a->snext->sprev = a->sprev;
        a->start = a->finish = 0;
        kfree(xchg(&q->flow[cl].L_tab, NULL));
        spin_unlock_bh(&sch->dev->queue_lock);

        return 0;
}

static int csz_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm)
{
        struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
        unsigned char    *b = skb->tail;
        struct rtattr *rta;
        struct tc_csz_copt opt;

        tcm->tcm_handle = sch->handle|cl;

        cl--;

        if (cl > CSZ_FLOWS)
                goto rtattr_failure;

        if (cl < CSZ_GUARANTEED) {
                struct csz_flow *f = &q->flow[cl];

                if (f->L_tab == NULL)
                        goto rtattr_failure;

                rta = (struct rtattr*)b;
                RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

                opt.limit = f->limit;
                opt.rate = f->rate;
                opt.slice = f->slice;
                memset(&opt.peakrate, 0, sizeof(opt.peakrate));
#ifdef CSZ_PLUS_TBF
                opt.buffer = f->buffer;
                opt.mtu = f->mtu;
#else
                opt.buffer = 0;
                opt.mtu = 0;
#endif

                RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt);
                rta->rta_len = skb->tail - b;
        }

        return skb->len;

rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -1;
}

static void csz_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
        int prio = 0;

        if (arg->stop)
                return;

        for (prio = 0; prio < CSZ_FLOWS; prio++) {
                if (arg->count < arg->skip) {
                        arg->count++;
                        continue;
                }
                if (prio < CSZ_GUARANTEED && q->flow[prio].L_tab == NULL) {
                        arg->count++;
                        continue;
                }
                if (arg->fn(sch, prio+1, arg) < 0) {
                        arg->stop = 1;
                        break;
                }
                arg->count++;
        }
}

static struct tcf_proto ** csz_find_tcf(struct Qdisc *sch, unsigned long cl)
{
        struct csz_sched_data *q = (struct csz_sched_data *)sch->data;

        if (cl)
                return NULL;

        return &q->filter_list;
}

struct Qdisc_class_ops csz_class_ops =
{
        csz_graft,
        csz_leaf,

        csz_get,
        csz_put,
        csz_change,
        csz_delete,
        csz_walk,

        csz_find_tcf,
        csz_bind,
        csz_put,

        csz_dump_class,
};

struct Qdisc_ops csz_qdisc_ops =
{
        NULL,
        &csz_class_ops,
        "csz",
        sizeof(struct csz_sched_data),

        csz_enqueue,
        csz_dequeue,
        NULL,
        NULL,

        csz_init,
        csz_reset,
        csz_destroy,
        NULL /* csz_change */,

        csz_dump,
};


#ifdef MODULE
int init_module(void)
{
        return register_qdisc(&csz_qdisc_ops);
}

void cleanup_module(void)
{
        unregister_qdisc(&csz_qdisc_ops);
}
#endif
MODULE_LICENSE("GPL");