2 * net/sched/sch_netem.c Network emulator
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Many of the algorithms and ideas for this came from
10 * NIST Net which is not copyrighted.
12 * Authors: Stephen Hemminger <shemminger@osdl.org>
13 * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
16 #include <linux/config.h>
17 #include <linux/module.h>
18 #include <asm/bitops.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/errno.h>
22 #include <linux/netdevice.h>
23 #include <linux/skbuff.h>
24 #include <linux/rtnetlink.h>
25 #include <linux/init.h>
27 #include <net/pkt_sched.h>
29 #define qdisc_priv(q) ((void *)(q->data))
31 /* Network Emulation Queuing algorithm.
32 ====================================
34 Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
35 Network Emulation Tool
36 [2] Luigi Rizzo, DummyNet for FreeBSD
38 ----------------------------------------------------------------
40 This started out as a simple way to delay outgoing packets to
41 test TCP but has grown to include most of the functionality
42 of a full blown network emulator like NISTnet. It can delay
43 packets and add random jitter (and correlation). The random
44 distribution can be loaded from a table as well to provide
45 normal, Pareto, or experimental curves. Packet loss,
46 duplication, and reordering can also be emulated.
48 This qdisc does not do classification that can be handled in
49 layering other disciplines. It does not need to do bandwidth
50 control either since that can be handled by using token
51 bucket or other rate control.
53 The simulator is limited by the Linux timer resolution
54 and will create packet bursts on the HZ boundary (1ms).
57 struct netem_sched_data {
59 struct sk_buff_head delayed;
60 struct timer_list timer;
73 } delay_cor, loss_cor, dup_cor;
81 /* Time stamp put into socket buffer control block */
83 psched_time_t time_to_send;
86 /* init_crandom - initialize correlated random number generator
87 * Use entropy source for initial seed.
89 static void init_crandom(struct crndstate *state, unsigned long rho)
92 state->last = net_random();
95 /* get_crandom - correlated random number generator
96 * Next number depends on last value.
97 * rho is scaled to avoid floating point.
99 static unsigned long get_crandom(struct crndstate *state)
102 unsigned long answer;
104 if (state->rho == 0) /* no correllation */
107 value = net_random();
108 rho = (u64)state->rho + 1;
109 answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
110 state->last = answer;
114 /* tabledist - return a pseudo-randomly distributed value with mean mu and
115 * std deviation sigma. Uses table lookup to approximate the desired
116 * distribution, and a uniformly-distributed pseudo-random source.
118 static long tabledist(unsigned long mu, long sigma,
119 struct crndstate *state, const struct disttable *dist)
127 rnd = get_crandom(state);
129 /* default uniform distribution */
131 return (rnd % (2*sigma)) - sigma + mu;
133 t = dist->table[rnd % dist->size];
134 x = (sigma % NETEM_DIST_SCALE) * t;
136 x += NETEM_DIST_SCALE/2;
138 x -= NETEM_DIST_SCALE/2;
140 return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
143 /* Put skb in the private delayed queue. */
144 static int delay_skb(struct Qdisc *sch, struct sk_buff *skb)
146 struct netem_sched_data *q = qdisc_priv(sch);
147 struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
151 PSCHED_GET_TIME(now);
152 td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist);
153 PSCHED_TADD2(now, td, cb->time_to_send);
155 /* Always queue at tail to keep packets in order */
156 if (likely(q->delayed.qlen < q->limit)) {
157 __skb_queue_tail(&q->delayed, skb);
158 sch->stats.bytes += skb->len;
159 sch->stats.packets++;
161 if (!timer_pending(&q->timer)) {
162 q->timer.expires = jiffies + PSCHED_US2JIFFIE(td);
163 add_timer(&q->timer);
165 return NET_XMIT_SUCCESS;
170 return NET_XMIT_DROP;
173 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
175 struct netem_sched_data *q = qdisc_priv(sch);
177 pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies);
179 /* Random packet drop 0 => none, ~0 => all */
180 if (q->loss && q->loss >= get_crandom(&q->loss_cor)) {
181 pr_debug("netem_enqueue: random loss\n");
184 return 0; /* lie about loss so TCP doesn't know */
187 /* Random duplication */
188 if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) {
189 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
191 pr_debug("netem_enqueue: dup %p\n", skb2);
193 delay_skb(sch, skb2);
196 /* If doing simple delay then gap == 0 so all packets
197 * go into the delayed holding queue
198 * otherwise if doing out of order only "1 out of gap"
199 * packets will be delayed.
201 if (q->counter < q->gap) {
205 ret = q->qdisc->enqueue(skb, q->qdisc);
206 if (likely(ret == NET_XMIT_SUCCESS)) {
208 sch->stats.bytes += skb->len;
209 sch->stats.packets++;
217 return delay_skb(sch, skb);
220 /* Requeue packets but don't change time stamp */
221 static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
223 struct netem_sched_data *q = qdisc_priv(sch);
226 if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0)
232 static unsigned int netem_drop(struct Qdisc* sch)
234 struct netem_sched_data *q = qdisc_priv(sch);
237 if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
245 * Move all packets that are ready to send from the delay holding
246 * list to the underlying qdisc, then just call dequeue
248 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
250 struct netem_sched_data *q = qdisc_priv(sch);
253 skb = q->qdisc->dequeue(q->qdisc);
259 static void netem_watchdog(unsigned long arg)
261 struct Qdisc *sch = (struct Qdisc *)arg;
262 struct netem_sched_data *q = qdisc_priv(sch);
263 struct net_device *dev = sch->dev;
267 pr_debug("netem_watchdog: fired @%lu\n", jiffies);
269 spin_lock_bh(&dev->queue_lock);
270 PSCHED_GET_TIME(now);
272 while ((skb = skb_peek(&q->delayed)) != NULL) {
273 const struct netem_skb_cb *cb
274 = (const struct netem_skb_cb *)skb->cb;
276 = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
277 pr_debug("netem_watchdog: skb %p@%lu %ld\n",
278 skb, jiffies, delay);
280 /* if more time remaining? */
282 mod_timer(&q->timer, jiffies + delay);
285 __skb_unlink(skb, &q->delayed);
287 if (q->qdisc->enqueue(skb, q->qdisc))
293 spin_unlock_bh(&dev->queue_lock);
296 static void netem_reset(struct Qdisc *sch)
298 struct netem_sched_data *q = qdisc_priv(sch);
300 qdisc_reset(q->qdisc);
301 skb_queue_purge(&q->delayed);
304 del_timer_sync(&q->timer);
307 static int set_fifo_limit(struct Qdisc *q, int limit)
312 rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
314 rta->rta_type = RTM_NEWQDISC;
315 rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
316 ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;
318 ret = q->ops->change(q, rta);
325 * Distribution data is a variable size payload containing
326 * signed 16 bit values.
328 static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
330 struct netem_sched_data *q = qdisc_priv(sch);
331 unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
332 const __s16 *data = RTA_DATA(attr);
339 d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
344 for (i = 0; i < n; i++)
345 d->table[i] = data[i];
347 spin_lock_bh(&sch->dev->queue_lock);
348 d = xchg(&q->delay_dist, d);
349 spin_unlock_bh(&sch->dev->queue_lock);
355 static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
357 struct netem_sched_data *q = qdisc_priv(sch);
358 const struct tc_netem_corr *c = RTA_DATA(attr);
360 if (RTA_PAYLOAD(attr) != sizeof(*c))
363 init_crandom(&q->delay_cor, c->delay_corr);
364 init_crandom(&q->loss_cor, c->loss_corr);
365 init_crandom(&q->dup_cor, c->dup_corr);
369 static int netem_change(struct Qdisc *sch, struct rtattr *opt)
371 struct netem_sched_data *q = qdisc_priv(sch);
372 struct tc_netem_qopt *qopt;
375 if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
378 qopt = RTA_DATA(opt);
379 ret = set_fifo_limit(q->qdisc, qopt->limit);
381 pr_debug("netem: can't set fifo limit\n");
385 q->latency = qopt->latency;
386 q->jitter = qopt->jitter;
387 q->limit = qopt->limit;
389 q->loss = qopt->loss;
390 q->duplicate = qopt->duplicate;
392 /* Handle nested options after initial queue options.
393 * Should have put all options in nested format but too late now.
395 if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
396 struct rtattr *tb[TCA_NETEM_MAX];
397 if (rtattr_parse(tb, TCA_NETEM_MAX,
398 RTA_DATA(opt) + sizeof(*qopt),
399 RTA_PAYLOAD(opt) - sizeof(*qopt)))
402 if (tb[TCA_NETEM_CORR-1]) {
403 ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
408 if (tb[TCA_NETEM_DELAY_DIST-1]) {
409 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
419 static int netem_init(struct Qdisc *sch, struct rtattr *opt)
421 struct netem_sched_data *q = qdisc_priv(sch);
428 skb_queue_head_init(&q->delayed);
429 init_timer(&q->timer);
430 q->timer.function = netem_watchdog;
431 q->timer.data = (unsigned long) sch;
434 q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
436 pr_debug("netem: qdisc create failed\n");
440 ret = netem_change(sch, opt);
442 pr_debug("netem: change failed\n");
443 qdisc_destroy(q->qdisc);
449 static void netem_destroy(struct Qdisc *sch)
451 struct netem_sched_data *q = qdisc_priv(sch);
453 del_timer_sync(&q->timer);
454 qdisc_destroy(q->qdisc);
455 kfree(q->delay_dist);
459 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
461 const struct netem_sched_data *q = qdisc_priv(sch);
462 unsigned char *b = skb->tail;
463 struct rtattr *rta = (struct rtattr *) b;
464 struct tc_netem_qopt qopt;
465 struct tc_netem_corr cor;
467 qopt.latency = q->latency;
468 qopt.jitter = q->jitter;
469 qopt.limit = q->limit;
472 qopt.duplicate = q->duplicate;
473 RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
475 cor.delay_corr = q->delay_cor.rho;
476 cor.loss_corr = q->loss_cor.rho;
477 cor.dup_corr = q->dup_cor.rho;
478 RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
479 rta->rta_len = skb->tail - b;
484 skb_trim(skb, b - skb->data);
488 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
489 struct sk_buff *skb, struct tcmsg *tcm)
491 struct netem_sched_data *q = qdisc_priv(sch);
493 if (cl != 1) /* only one class */
496 tcm->tcm_handle |= TC_H_MIN(1);
497 tcm->tcm_info = q->qdisc->handle;
502 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
505 struct netem_sched_data *q = qdisc_priv(sch);
511 *old = xchg(&q->qdisc, new);
514 sch_tree_unlock(sch);
519 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
521 struct netem_sched_data *q = qdisc_priv(sch);
525 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
530 static void netem_put(struct Qdisc *sch, unsigned long arg)
534 static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
535 struct rtattr **tca, unsigned long *arg)
540 static int netem_delete(struct Qdisc *sch, unsigned long arg)
545 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
548 if (walker->count >= walker->skip)
549 if (walker->fn(sch, 1, walker) < 0) {
557 static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
562 static struct Qdisc_class_ops netem_class_ops = {
563 .graft = netem_graft,
567 .change = netem_change_class,
568 .delete = netem_delete,
570 .tcf_chain = netem_find_tcf,
571 .dump = netem_dump_class,
574 static struct Qdisc_ops netem_qdisc_ops = {
576 .cl_ops = &netem_class_ops,
577 .priv_size = sizeof(struct netem_sched_data),
578 .enqueue = netem_enqueue,
579 .dequeue = netem_dequeue,
580 .requeue = netem_requeue,
583 .reset = netem_reset,
584 .destroy = netem_destroy,
585 .change = netem_change,
590 static int __init netem_module_init(void)
592 return register_qdisc(&netem_qdisc_ops);
594 static void __exit netem_module_exit(void)
596 unregister_qdisc(&netem_qdisc_ops);
598 module_init(netem_module_init)
599 module_exit(netem_module_exit)
600 MODULE_LICENSE("GPL");