/*
 * net/sched/cls_u32.c  Ugly (or Universal) 32bit key Packet Classifier.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *      The filters are packed to hash tables of key nodes
 *      with a set of 32bit key/mask pairs at every node.
 *      Nodes reference next level hash tables etc.
 *
 *      This scheme is the best universal classifier I managed to
 *      invent; it is not super-fast, but it is not slow (provided you
 *      program it correctly), and general enough.  And its relative
 *      speed grows as the number of rules becomes larger.
 *
 *      It seems that it represents the best middle point between
 *      speed and manageability both by human and by machine.
 *
 *      It is especially useful for link sharing combined with QoS;
 *      pure RSVP doesn't need such a general approach and can use
 *      much simpler (and faster) schemes, sort of cls_rsvp.c.
 */
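/*
 * Filter handles are split into a hash table id (upper 12 bits), a bucket
 * index (next 8 bits) and a node id (low 12 bits); see the TC_U32_HTID,
 * TC_U32_HASH and TC_U32_NODE macros used below.  From user space the
 * classifier is typically driven through the iproute2 "tc" front-end,
 * for instance (illustrative only, device and class ids are made up):
 *
 *      tc filter add dev eth0 parent 1:0 protocol ip prio 1 u32 \
 *              match ip dst 10.0.0.0/8 flowid 1:1
 */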

#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <linux/rtnetlink.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

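/*
 * One tc_u_knode holds a selector (key/mask pairs) plus the verdict for a
 * match; knodes hang off the buckets of a tc_u_hnode hash table, and a
 * knode may link down to a further hnode.  All hash tables created for
 * filters attached to the same qdisc share one tc_u_common.
 */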
struct tc_u_knode
{
        struct tc_u_knode       *next;
        u32                     handle;
        struct tc_u_hnode       *ht_up;
#ifdef CONFIG_NET_CLS_POLICE
        struct tcf_police       *police;
#endif
        struct tcf_result       res;
        struct tc_u_hnode       *ht_down;
        struct tc_u32_sel       sel;
};

struct tc_u_hnode
{
        struct tc_u_hnode       *next;
        u32                     handle;
        u32                     prio;
        struct tc_u_common      *tp_c;
        int                     refcnt;
        unsigned                divisor;
        struct tc_u_knode       *ht[1];
};

struct tc_u_common
{
        struct tc_u_common      *next;
        struct tc_u_hnode       *hlist;
        struct Qdisc            *q;
        int                     refcnt;
        u32                     hgenerator;
};

static struct tc_u_common *u32_list;

static __inline__ unsigned u32_hash_fold(u32 key, struct tc_u32_sel *sel)
{
        unsigned h = key & sel->hmask;

        h ^= h>>16;
        h ^= h>>8;
        return h;
}

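/*
 * Packet classification: starting from the root hash table, compare the
 * selector's key/mask pairs against 32bit words of the packet; on a full
 * match either return the stored result (TC_U32_TERMINAL) or descend into
 * the linked hash table, pushing the current node on a small stack so the
 * walk can resume at the parent when a lower level fails to match.  The
 * stack depth (and hence the nesting of tables) is bounded by
 * TC_U32_MAXDEPTH to guard against misconfigured loops.
 */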
static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res)
{
        struct {
                struct tc_u_knode *knode;
                u8                *ptr;
        } stack[TC_U32_MAXDEPTH];

        struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root;
        u8 *ptr = skb->nh.raw;
        struct tc_u_knode *n;
        int sdepth = 0;
        int off2 = 0;
        int sel = 0;
        int i;

next_ht:
        n = ht->ht[sel];

next_knode:
        if (n) {
                struct tc_u32_key *key = n->sel.keys;

                for (i = n->sel.nkeys; i>0; i--, key++) {
                        if ((*(u32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) {
                                n = n->next;
                                goto next_knode;
                        }
                }
                if (n->ht_down == NULL) {
check_terminal:
                        if (n->sel.flags&TC_U32_TERMINAL) {
                                *res = n->res;
#ifdef CONFIG_NET_CLS_POLICE
                                if (n->police) {
                                        int pol_res = tcf_police(skb, n->police);
                                        if (pol_res >= 0)
                                                return pol_res;
                                } else
#endif
                                        return 0;
                        }
                        n = n->next;
                        goto next_knode;
                }

                /* PUSH */
                if (sdepth >= TC_U32_MAXDEPTH)
                        goto deadloop;
                stack[sdepth].knode = n;
                stack[sdepth].ptr = ptr;
                sdepth++;

                ht = n->ht_down;
                sel = 0;
                if (ht->divisor)
                        sel = ht->divisor&u32_hash_fold(*(u32*)(ptr+n->sel.hoff), &n->sel);

                if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT)))
                        goto next_ht;

                if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) {
                        off2 = n->sel.off + 3;
                        if (n->sel.flags&TC_U32_VAROFFSET)
                                off2 += ntohs(n->sel.offmask & *(u16*)(ptr+n->sel.offoff)) >>n->sel.offshift;
                        off2 &= ~3;
                }
                if (n->sel.flags&TC_U32_EAT) {
                        ptr += off2;
                        off2 = 0;
                }

                if (ptr < skb->tail)
                        goto next_ht;
        }

        /* POP */
        if (sdepth--) {
                n = stack[sdepth].knode;
                ht = n->ht_up;
                ptr = stack[sdepth].ptr;
                goto check_terminal;
        }
        return -1;

deadloop:
        if (net_ratelimit())
                printk("cls_u32: dead loop\n");
        return -1;
}

static __inline__ struct tc_u_hnode *
u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
{
        struct tc_u_hnode *ht;

        for (ht = tp_c->hlist; ht; ht = ht->next)
                if (ht->handle == handle)
                        break;

        return ht;
}

static __inline__ struct tc_u_knode *
u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
{
        unsigned sel;
        struct tc_u_knode *n;

        sel = TC_U32_HASH(handle);
        if (sel > ht->divisor)
                return 0;

        for (n = ht->ht[sel]; n; n = n->next)
                if (n->handle == handle)
                        return n;

        return NULL;
}


static unsigned long u32_get(struct tcf_proto *tp, u32 handle)
{
        struct tc_u_hnode *ht;
        struct tc_u_common *tp_c = tp->data;

        if (TC_U32_HTID(handle) == TC_U32_ROOT)
                ht = tp->root;
        else
                ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle));

        if (!ht)
                return 0;

        if (TC_U32_KEY(handle) == 0)
                return (unsigned long)ht;

        return (unsigned long)u32_lookup_key(ht, handle);
}

static void u32_put(struct tcf_proto *tp, unsigned long f)
{
}

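/* Pick an unused hash table id for the upper 12 bits of a handle;
 * returns 0 if the id space is exhausted. */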
static u32 gen_new_htid(struct tc_u_common *tp_c)
{
        int i = 0x800;

        do {
                if (++tp_c->hgenerator == 0x7FF)
                        tp_c->hgenerator = 1;
        } while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));

        return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
}

static int u32_init(struct tcf_proto *tp)
{
        struct tc_u_hnode *root_ht;
        struct tc_u_common *tp_c;

        MOD_INC_USE_COUNT;

        for (tp_c = u32_list; tp_c; tp_c = tp_c->next)
                if (tp_c->q == tp->q)
                        break;

        root_ht = kmalloc(sizeof(*root_ht), GFP_KERNEL);
        if (root_ht == NULL) {
                MOD_DEC_USE_COUNT;
                return -ENOBUFS;
        }
        memset(root_ht, 0, sizeof(*root_ht));
        root_ht->divisor = 0;
        root_ht->refcnt++;
        root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
        root_ht->prio = tp->prio;

        if (tp_c == NULL) {
                tp_c = kmalloc(sizeof(*tp_c), GFP_KERNEL);
                if (tp_c == NULL) {
                        kfree(root_ht);
                        MOD_DEC_USE_COUNT;
                        return -ENOBUFS;
                }
                memset(tp_c, 0, sizeof(*tp_c));
                tp_c->q = tp->q;
                tp_c->next = u32_list;
                u32_list = tp_c;
        }

        tp_c->refcnt++;
        root_ht->next = tp_c->hlist;
        tp_c->hlist = root_ht;
        root_ht->tp_c = tp_c;

        tp->root = root_ht;
        tp->data = tp_c;
        return 0;
}

static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n)
{
        unsigned long cl;

        if ((cl = __cls_set_class(&n->res.class, 0)) != 0)
                tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
#ifdef CONFIG_NET_CLS_POLICE
        tcf_police_release(n->police);
#endif
        if (n->ht_down)
                n->ht_down->refcnt--;
        kfree(n);
        return 0;
}

static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key)
{
        struct tc_u_knode **kp;
        struct tc_u_hnode *ht = key->ht_up;

        if (ht) {
                for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) {
                        if (*kp == key) {
                                tcf_tree_lock(tp);
                                *kp = key->next;
                                tcf_tree_unlock(tp);

                                u32_destroy_key(tp, key);
                                return 0;
                        }
                }
        }
        BUG_TRAP(0);
        return 0;
}

static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
{
        struct tc_u_knode *n;
        unsigned h;

        for (h=0; h<=ht->divisor; h++) {
                while ((n = ht->ht[h]) != NULL) {
                        ht->ht[h] = n->next;

                        u32_destroy_key(tp, n);
                }
        }
}

static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
{
        struct tc_u_common *tp_c = tp->data;
        struct tc_u_hnode **hn;

        BUG_TRAP(!ht->refcnt);

        u32_clear_hnode(tp, ht);

        for (hn = &tp_c->hlist; *hn; hn = &(*hn)->next) {
                if (*hn == ht) {
                        *hn = ht->next;
                        kfree(ht);
                        return 0;
                }
        }

        BUG_TRAP(0);
        return -ENOENT;
}

static void u32_destroy(struct tcf_proto *tp)
{
        struct tc_u_common *tp_c = tp->data;
        struct tc_u_hnode *root_ht = xchg(&tp->root, NULL);

        BUG_TRAP(root_ht != NULL);

        if (root_ht && --root_ht->refcnt == 0)
                u32_destroy_hnode(tp, root_ht);

        if (--tp_c->refcnt == 0) {
                struct tc_u_hnode *ht;
                struct tc_u_common **tp_cp;

                for (tp_cp = &u32_list; *tp_cp; tp_cp = &(*tp_cp)->next) {
                        if (*tp_cp == tp_c) {
                                *tp_cp = tp_c->next;
                                break;
                        }
                }

                for (ht=tp_c->hlist; ht; ht = ht->next)
                        u32_clear_hnode(tp, ht);

                while ((ht = tp_c->hlist) != NULL) {
                        tp_c->hlist = ht->next;

                        BUG_TRAP(ht->refcnt == 0);

                        kfree(ht);
                };

                kfree(tp_c);
        }

        MOD_DEC_USE_COUNT;
        tp->data = NULL;
}

static int u32_delete(struct tcf_proto *tp, unsigned long arg)
{
        struct tc_u_hnode *ht = (struct tc_u_hnode*)arg;

        if (ht == NULL)
                return 0;

        if (TC_U32_KEY(ht->handle))
                return u32_delete_key(tp, (struct tc_u_knode*)ht);

        if (tp->root == ht)
                return -EINVAL;

        if (--ht->refcnt == 0)
                u32_destroy_hnode(tp, ht);

        return 0;
}

static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
{
        struct tc_u_knode *n;
        unsigned i = 0x7FF;

        for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next)
                if (i < TC_U32_NODE(n->handle))
                        i = TC_U32_NODE(n->handle);
        i++;

        return handle|(i>0xFFF ? 0xFFF : i);
}

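/*
 * Apply the netlink attributes shared by new and existing keys: an optional
 * link to a lower hash table (TCA_U32_LINK), a class binding
 * (TCA_U32_CLASSID) and, when configured, a policer (TCA_U32_POLICE).
 */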
static int u32_set_parms(struct Qdisc *q, unsigned long base,
                         struct tc_u_hnode *ht,
                         struct tc_u_knode *n, struct rtattr **tb,
                         struct rtattr *est)
{
        if (tb[TCA_U32_LINK-1]) {
                u32 handle = *(u32*)RTA_DATA(tb[TCA_U32_LINK-1]);
                struct tc_u_hnode *ht_down = NULL;

                if (TC_U32_KEY(handle))
                        return -EINVAL;

                if (handle) {
                        ht_down = u32_lookup_ht(ht->tp_c, handle);

                        if (ht_down == NULL)
                                return -EINVAL;
                        ht_down->refcnt++;
                }

                sch_tree_lock(q);
                ht_down = xchg(&n->ht_down, ht_down);
                sch_tree_unlock(q);

                if (ht_down)
                        ht_down->refcnt--;
        }
        if (tb[TCA_U32_CLASSID-1]) {
                unsigned long cl;

                n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]);
                sch_tree_lock(q);
                cl = __cls_set_class(&n->res.class, q->ops->cl_ops->bind_tcf(q, base, n->res.classid));
                sch_tree_unlock(q);
                if (cl)
                        q->ops->cl_ops->unbind_tcf(q, cl);
        }
#ifdef CONFIG_NET_CLS_POLICE
        if (tb[TCA_U32_POLICE-1]) {
                struct tcf_police *police = tcf_police_locate(tb[TCA_U32_POLICE-1], est);

                sch_tree_lock(q);
                police = xchg(&n->police, police);
                sch_tree_unlock(q);

                tcf_police_release(police);
        }
#endif
        return 0;
}

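/*
 * Create or update a filter element.  Three cases are handled: updating the
 * parameters of an existing key node, creating a new hash table when
 * TCA_U32_DIVISOR is supplied, and inserting a new key node (kept sorted by
 * node id) into the bucket selected by its handle.
 */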
static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
                      struct rtattr **tca,
                      unsigned long *arg)
{
        struct tc_u_common *tp_c = tp->data;
        struct tc_u_hnode *ht;
        struct tc_u_knode *n;
        struct tc_u32_sel *s;
        struct rtattr *opt = tca[TCA_OPTIONS-1];
        struct rtattr *tb[TCA_U32_MAX];
        u32 htid;
        int err;

        if (opt == NULL)
                return handle ? -EINVAL : 0;

        if (rtattr_parse(tb, TCA_U32_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0)
                return -EINVAL;

        if ((n = (struct tc_u_knode*)*arg) != NULL) {
                if (TC_U32_KEY(n->handle) == 0)
                        return -EINVAL;

                return u32_set_parms(tp->q, base, n->ht_up, n, tb, tca[TCA_RATE-1]);
        }

        if (tb[TCA_U32_DIVISOR-1]) {
                unsigned divisor = *(unsigned*)RTA_DATA(tb[TCA_U32_DIVISOR-1]);

                if (--divisor > 0x100)
                        return -EINVAL;
                if (TC_U32_KEY(handle))
                        return -EINVAL;
                if (handle == 0) {
                        handle = gen_new_htid(tp->data);
                        if (handle == 0)
                                return -ENOMEM;
                }
                ht = kmalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL);
                if (ht == NULL)
                        return -ENOBUFS;
                memset(ht, 0, sizeof(*ht) + divisor*sizeof(void*));
                ht->tp_c = tp_c;
                ht->refcnt = 0;
                ht->divisor = divisor;
                ht->handle = handle;
                ht->prio = tp->prio;
                ht->next = tp_c->hlist;
                tp_c->hlist = ht;
                *arg = (unsigned long)ht;
                return 0;
        }

        if (tb[TCA_U32_HASH-1]) {
                htid = *(unsigned*)RTA_DATA(tb[TCA_U32_HASH-1]);
                if (TC_U32_HTID(htid) == TC_U32_ROOT) {
                        ht = tp->root;
                        htid = ht->handle;
                } else {
                        ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid));
                        if (ht == NULL)
                                return -EINVAL;
                }
        } else {
                ht = tp->root;
                htid = ht->handle;
        }

        if (ht->divisor < TC_U32_HASH(htid))
                return -EINVAL;

        if (handle) {
                if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
                        return -EINVAL;
                handle = htid | TC_U32_NODE(handle);
        } else
                handle = gen_new_kid(ht, htid);

        if (tb[TCA_U32_SEL-1] == 0 ||
            RTA_PAYLOAD(tb[TCA_U32_SEL-1]) < sizeof(struct tc_u32_sel))
                return -EINVAL;

        s = RTA_DATA(tb[TCA_U32_SEL-1]);
        n = kmalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
        if (n == NULL)
                return -ENOBUFS;
        memset(n, 0, sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key));
        memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
        n->ht_up = ht;
        n->handle = handle;
        err = u32_set_parms(tp->q, base, ht, n, tb, tca[TCA_RATE-1]);
        if (err == 0) {
                struct tc_u_knode **ins;
                for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next)
                        if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle))
                                break;

                n->next = *ins;
                wmb();
                *ins = n;

                *arg = (unsigned long)n;
                return 0;
        }
        kfree(n);
        return err;
}

static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
        struct tc_u_common *tp_c = tp->data;
        struct tc_u_hnode *ht;
        struct tc_u_knode *n;
        unsigned h;

        if (arg->stop)
                return;

        for (ht = tp_c->hlist; ht; ht = ht->next) {
                if (ht->prio != tp->prio)
                        continue;
                if (arg->count >= arg->skip) {
                        if (arg->fn(tp, (unsigned long)ht, arg) < 0) {
                                arg->stop = 1;
                                return;
                        }
                }
                arg->count++;
                for (h = 0; h <= ht->divisor; h++) {
                        for (n = ht->ht[h]; n; n = n->next) {
                                if (arg->count < arg->skip) {
                                        arg->count++;
                                        continue;
                                }
                                if (arg->fn(tp, (unsigned long)n, arg) < 0) {
                                        arg->stop = 1;
                                        return;
                                }
                                arg->count++;
                        }
                }
        }
}

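/*
 * Dump one element back to user space: for a hash table only the divisor is
 * reported, for a key node the selector, parent hash table, class id,
 * downward link and (optionally) policer are appended as rtattrs.
 */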
static int u32_dump(struct tcf_proto *tp, unsigned long fh,
                     struct sk_buff *skb, struct tcmsg *t)
{
        struct tc_u_knode *n = (struct tc_u_knode*)fh;
        unsigned char    *b = skb->tail;
        struct rtattr *rta;

        if (n == NULL)
                return skb->len;

        t->tcm_handle = n->handle;

        rta = (struct rtattr*)b;
        RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

        if (TC_U32_KEY(n->handle) == 0) {
                struct tc_u_hnode *ht = (struct tc_u_hnode*)fh;
                u32 divisor = ht->divisor+1;
                RTA_PUT(skb, TCA_U32_DIVISOR, 4, &divisor);
        } else {
                RTA_PUT(skb, TCA_U32_SEL,
                        sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
                        &n->sel);
                if (n->ht_up) {
                        u32 htid = n->handle & 0xFFFFF000;
                        RTA_PUT(skb, TCA_U32_HASH, 4, &htid);
                }
                if (n->res.classid)
                        RTA_PUT(skb, TCA_U32_CLASSID, 4, &n->res.classid);
                if (n->ht_down)
                        RTA_PUT(skb, TCA_U32_LINK, 4, &n->ht_down->handle);
#ifdef CONFIG_NET_CLS_POLICE
                if (n->police) {
                        struct rtattr * p_rta = (struct rtattr*)skb->tail;

                        RTA_PUT(skb, TCA_U32_POLICE, 0, NULL);

                        if (tcf_police_dump(skb, n->police) < 0)
                                goto rtattr_failure;

                        p_rta->rta_len = skb->tail - (u8*)p_rta;
                }
#endif
        }

        rta->rta_len = skb->tail - b;
#ifdef CONFIG_NET_CLS_POLICE
        if (TC_U32_KEY(n->handle) && n->police) {
                if (qdisc_copy_stats(skb, &n->police->stats))
                        goto rtattr_failure;
        }
#endif
        return skb->len;

rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -1;
}

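/* Positional initializer for struct tcf_proto_ops: next, kind ("u32"),
 * then the classify/init/destroy and get/put/change/delete/walk/dump hooks. */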
struct tcf_proto_ops cls_u32_ops = {
        NULL,
        "u32",
        u32_classify,
        u32_init,
        u32_destroy,

        u32_get,
        u32_put,
        u32_change,
        u32_delete,
        u32_walk,
        u32_dump
};

#ifdef MODULE
int init_module(void)
{
        return register_tcf_proto_ops(&cls_u32_ops);
}

void cleanup_module(void)
{
        unregister_tcf_proto_ops(&cls_u32_ops);
}
#endif
MODULE_LICENSE("GPL");