/*
 * net/sched/sch_generic.c      Generic packet scheduler routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/config.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock.

   However, modifications to data participating in scheduling
   must additionally be protected by the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock(qdisc_tree_lock),
     and this lock is used only in process context.
   - updates to the tree are made only under the rtnl semaphore,
     hence this lock may be taken without disabling local BHs.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
rwlock_t qdisc_tree_lock = RW_LOCK_UNLOCKED;

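/* Illustrative sketch, not part of the upstream file: one way a read-only
   walk of a device's qdisc list could honour the ordering rule above
   (qdisc_tree_lock first, then dev->queue_lock).  The helper name is
   hypothetical and the block is compiled out on purpose. */
#if 0
static void example_walk_qdisc_list(struct net_device *dev)
{
        struct list_head *p;

        read_lock(&qdisc_tree_lock);            /* protects the tree shape */
        spin_lock_bh(&dev->queue_lock);         /* protects scheduling data */
        list_for_each(p, &dev->qdisc_list) {
                struct Qdisc *q = list_entry(p, struct Qdisc, list);
                /* inspect q->stats, q->ops->id, ... here */
        }
        spin_unlock_bh(&dev->queue_lock);
        read_unlock(&qdisc_tree_lock);
}
#endif
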
/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   dev->xmit_lock serializes accesses to the device driver.

   dev->queue_lock and dev->xmit_lock are mutually exclusive:
   if one is grabbed, the other must be free.
 */


/* Kick device.
   Note that this procedure can be called by a watchdog timer, so
   we do not check the dev->tbusy flag here.

   Returns:  0  - queue is empty.
            >0  - queue is not empty, but throttled.
            <0  - queue is not empty. Device is throttled if dev->tbusy != 0.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

int qdisc_restart(struct net_device *dev)
{
        struct Qdisc *q = dev->qdisc;
        struct sk_buff *skb;

        /* Dequeue packet */
        if ((skb = q->dequeue(q)) != NULL) {
                if (spin_trylock(&dev->xmit_lock)) {
                        /* Remember that the driver is grabbed by us. */
                        dev->xmit_lock_owner = smp_processor_id();

                        /* And release queue */
                        spin_unlock(&dev->queue_lock);

                        if (!netif_queue_stopped(dev)) {
                                if (netdev_nit)
                                        dev_queue_xmit_nit(skb, dev);

                                if (dev->hard_start_xmit(skb, dev) == 0) {
                                        dev->xmit_lock_owner = -1;
                                        spin_unlock(&dev->xmit_lock);

                                        spin_lock(&dev->queue_lock);
                                        return -1;
                                }
                        }

                        /* Release the driver */
                        dev->xmit_lock_owner = -1;
                        spin_unlock(&dev->xmit_lock);
                        spin_lock(&dev->queue_lock);
                        q = dev->qdisc;
                } else {
                        /* So, someone grabbed the driver. */

                        /* It may be a transient configuration error,
                           when hard_start_xmit() recurses. We detect
                           it by checking the xmit owner and drop the
                           packet when a dead loop is detected.
                         */
                        if (dev->xmit_lock_owner == smp_processor_id()) {
                                kfree_skb(skb);
                                if (net_ratelimit())
                                        printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
                                return -1;
                        }
                        netdev_rx_stat[smp_processor_id()].cpu_collision++;
                }

                /* Device kicked us out :(
                   This is possible in the following cases:

                   0. driver is locked
                   1. fastroute is enabled
                   2. device cannot determine busy state
                      before start of transmission (e.g. dialout)
                   3. device is buggy (ppp)
                 */

                q->ops->requeue(skb, q);
                netif_schedule(dev);
                return 1;
        }
        return q->q.qlen;
}
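
/* A sketch of the kind of caller loop the return values above are meant
   for, modelled on the qdisc_run() helper in include/net/pkt_sched.h:
   keep restarting while the queue is non-empty and the device has not
   asked us to stop.  Shown only as an illustration and compiled out. */
#if 0
static inline void example_qdisc_run(struct net_device *dev)
{
        while (!netif_queue_stopped(dev) &&
               qdisc_restart(dev) < 0)
                /* NOTHING */;
}
#endif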

static void dev_watchdog(unsigned long arg)
{
        struct net_device *dev = (struct net_device *)arg;

        spin_lock(&dev->xmit_lock);
        if (dev->qdisc != &noop_qdisc) {
                if (netif_device_present(dev) &&
                    netif_running(dev) &&
                    netif_carrier_ok(dev)) {
                        if (netif_queue_stopped(dev) &&
                            (jiffies - dev->trans_start) > dev->watchdog_timeo) {
                                printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name);
                                dev->tx_timeout(dev);
                        }
                        if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                                dev_hold(dev);
                }
        }
        spin_unlock(&dev->xmit_lock);

        dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
        init_timer(&dev->watchdog_timer);
        dev->watchdog_timer.data = (unsigned long)dev;
        dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
        if (dev->tx_timeout) {
                if (dev->watchdog_timeo <= 0)
                        dev->watchdog_timeo = 5*HZ;
                if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                        dev_hold(dev);
        }
}
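
/* Illustrative sketch, not part of the upstream file: what a driver is
   assumed to set up so that the watchdog above is armed.  The helper and
   callback names are hypothetical; a watchdog_timeo of 0 would default to
   5*HZ in __netdev_watchdog_up() above.  Compiled out on purpose. */
#if 0
static void mydrv_tx_timeout(struct net_device *dev)
{
        /* reset the hardware transmitter here, then wake the queue */
        netif_wake_queue(dev);
}

static void mydrv_setup(struct net_device *dev)
{
        dev->tx_timeout = mydrv_tx_timeout;     /* called by dev_watchdog() */
        dev->watchdog_timeo = 2*HZ;             /* stall time before tx_timeout fires */
}
#endif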

static void dev_watchdog_up(struct net_device *dev)
{
        spin_lock_bh(&dev->xmit_lock);
        __netdev_watchdog_up(dev);
        spin_unlock_bh(&dev->xmit_lock);
}

static void dev_watchdog_down(struct net_device *dev)
{
        spin_lock_bh(&dev->xmit_lock);
        if (del_timer(&dev->watchdog_timer))
                __dev_put(dev);
        spin_unlock_bh(&dev->xmit_lock);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int
noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
        kfree_skb(skb);
        return NET_XMIT_CN;
}

static struct sk_buff *
noop_dequeue(struct Qdisc * qdisc)
{
        return NULL;
}

static int
noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        if (net_ratelimit())
                printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name);
        kfree_skb(skb);
        return NET_XMIT_CN;
}

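/* Reading aid added here, not in the upstream file: the positional
   initializers below are assumed to follow the struct Qdisc_ops field order
   from include/net/pkt_sched.h, i.e. next, cl_ops, id, priv_size, enqueue,
   dequeue, requeue, drop, init, reset, destroy, change, dump. */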
struct Qdisc_ops noop_qdisc_ops =
{
        NULL,
        NULL,
        "noop",
        0,

        noop_enqueue,
        noop_dequeue,
        noop_requeue,
};

struct Qdisc noop_qdisc =
{
        noop_enqueue,
        noop_dequeue,
        TCQ_F_BUILTIN,
        &noop_qdisc_ops,
};


struct Qdisc_ops noqueue_qdisc_ops =
{
        NULL,
        NULL,
        "noqueue",
        0,

        noop_enqueue,
        noop_dequeue,
        noop_requeue,

};

struct Qdisc noqueue_qdisc =
{
        NULL,
        noop_dequeue,
        TCQ_F_BUILTIN,
        &noqueue_qdisc_ops,
};


static const u8 prio2band[TC_PRIO_MAX+1] =
{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */
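
/* Reading aid, not part of the upstream file: with the TC_PRIO_* values
   from <linux/pkt_sched.h> (an assumption of this note), the prio2band
   table above sends TC_PRIO_INTERACTIVE and TC_PRIO_CONTROL traffic to
   band 0, TC_PRIO_BESTEFFORT to band 1, and TC_PRIO_BULK to band 2.
   The band is chosen by prio2band[skb->priority & TC_PRIO_MAX], and
   band 0 is always dequeued first. */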

static int
pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        struct sk_buff_head *list;

        list = ((struct sk_buff_head*)qdisc->data) +
                prio2band[skb->priority&TC_PRIO_MAX];

        if (list->qlen < qdisc->dev->tx_queue_len) {
                __skb_queue_tail(list, skb);
                qdisc->q.qlen++;
                qdisc->stats.bytes += skb->len;
                qdisc->stats.packets++;
                return 0;
        }
        qdisc->stats.drops++;
        kfree_skb(skb);
        return NET_XMIT_DROP;
}

static struct sk_buff *
pfifo_fast_dequeue(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);
        struct sk_buff *skb;

        for (prio = 0; prio < 3; prio++, list++) {
                skb = __skb_dequeue(list);
                if (skb) {
                        qdisc->q.qlen--;
                        return skb;
                }
        }
        return NULL;
}

static int
pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        struct sk_buff_head *list;

        list = ((struct sk_buff_head*)qdisc->data) +
                prio2band[skb->priority&TC_PRIO_MAX];

        __skb_queue_head(list, skb);
        qdisc->q.qlen++;
        return 0;
}

static void
pfifo_fast_reset(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data);

        for (prio=0; prio < 3; prio++)
                skb_queue_purge(list+prio);
        qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
        unsigned char    *b = skb->tail;
        struct tc_prio_qopt opt;

        opt.bands = 3;
        memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
        RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
        return skb->len;

rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
        int i;
        struct sk_buff_head *list;

        list = ((struct sk_buff_head*)qdisc->data);

        for (i=0; i<3; i++)
                skb_queue_head_init(list+i);

        return 0;
}

static struct Qdisc_ops pfifo_fast_ops =
{
        NULL,
        NULL,
        "pfifo_fast",
        3 * sizeof(struct sk_buff_head),

        pfifo_fast_enqueue,
        pfifo_fast_dequeue,
        pfifo_fast_requeue,
        NULL,

        pfifo_fast_init,
        pfifo_fast_reset,
        NULL,
        NULL,
        pfifo_fast_dump,

};

struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
        struct Qdisc *sch;
        int size = sizeof(*sch) + ops->priv_size;

        sch = kmalloc(size, GFP_KERNEL);
        if (!sch)
                return NULL;
        memset(sch, 0, size);

        INIT_LIST_HEAD(&sch->list);
        skb_queue_head_init(&sch->q);
        sch->ops = ops;
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
        sch->stats.lock = &dev->queue_lock;
        atomic_set(&sch->refcnt, 1);
        if (!ops->init || ops->init(sch, NULL) == 0)
                return sch;

        kfree(sch);
        return NULL;
}

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (ops->reset)
                ops->reset(qdisc);
}
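
/* A minimal sketch of the calling convention documented above, not part of
   the upstream file: qdisc_reset() (like qdisc_destroy()) is expected to run
   under dev->queue_lock with BHs disabled, as dev_deactivate() below does.
   The helper name is hypothetical and the block is compiled out. */
#if 0
static void example_reset_current_qdisc(struct net_device *dev)
{
        spin_lock_bh(&dev->queue_lock);         /* required calling context */
        qdisc_reset(dev->qdisc);
        spin_unlock_bh(&dev->queue_lock);
}
#endif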

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (qdisc->flags&TCQ_F_BUILTIN ||
            !atomic_dec_and_test(&qdisc->refcnt))
                return;
        list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
        qdisc_kill_estimator(&qdisc->stats);
#endif
        if (ops->reset)
                ops->reset(qdisc);
        if (ops->destroy)
                ops->destroy(qdisc);
        kfree(qdisc);
}


void dev_activate(struct net_device *dev)
{
        /* No queueing discipline is attached to the device;
           create a default one, i.e. pfifo_fast for devices
           which need queueing and noqueue_qdisc for
           virtual interfaces.
         */

        if (dev->qdisc_sleeping == &noop_qdisc) {
                struct Qdisc *qdisc;
                if (dev->tx_queue_len) {
                        qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
                        if (qdisc == NULL) {
                                printk(KERN_INFO "%s: activation failed\n", dev->name);
                                return;
                        }
                        write_lock(&qdisc_tree_lock);
                        list_add_tail(&qdisc->list, &dev->qdisc_list);
                        write_unlock(&qdisc_tree_lock);

                } else {
                        qdisc = &noqueue_qdisc;
                }
                write_lock(&qdisc_tree_lock);
                dev->qdisc_sleeping = qdisc;
                write_unlock(&qdisc_tree_lock);
        }

        spin_lock_bh(&dev->queue_lock);
        if ((dev->qdisc = dev->qdisc_sleeping) != &noqueue_qdisc) {
                dev->trans_start = jiffies;
                dev_watchdog_up(dev);
        }
        spin_unlock_bh(&dev->queue_lock);
}

void dev_deactivate(struct net_device *dev)
{
        struct Qdisc *qdisc;

        spin_lock_bh(&dev->queue_lock);
        qdisc = dev->qdisc;
        dev->qdisc = &noop_qdisc;

        qdisc_reset(qdisc);

        spin_unlock_bh(&dev->queue_lock);

        dev_watchdog_down(dev);

        while (test_bit(__LINK_STATE_SCHED, &dev->state))
                yield();

        spin_unlock_wait(&dev->xmit_lock);
}

void dev_init_scheduler(struct net_device *dev)
{
        write_lock(&qdisc_tree_lock);
        spin_lock_bh(&dev->queue_lock);
        dev->qdisc = &noop_qdisc;
        spin_unlock_bh(&dev->queue_lock);
        dev->qdisc_sleeping = &noop_qdisc;
        INIT_LIST_HEAD(&dev->qdisc_list);
        write_unlock(&qdisc_tree_lock);

        dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
        struct Qdisc *qdisc;

        write_lock(&qdisc_tree_lock);
        spin_lock_bh(&dev->queue_lock);
        qdisc = dev->qdisc_sleeping;
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
        if ((qdisc = dev->qdisc_ingress) != NULL) {
                dev->qdisc_ingress = NULL;
                qdisc_destroy(qdisc);
        }
#endif
        BUG_TRAP(list_empty(&dev->qdisc_list));
        BUG_TRAP(!timer_pending(&dev->watchdog_timer));
        spin_unlock_bh(&dev->queue_lock);
        write_unlock(&qdisc_tree_lock);
}