/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 * Version:	$Id: fib_hash.c,v 1.1.1.1 2005/04/11 02:51:13 jack Exp $
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>

#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
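
/*
 * Debug trace hook used by the insert and delete paths below.  Its
 * original definition is not part of this fragment; the sketch here
 * assumes the tracing is compiled out (swap in the printk to enable it).
 */
#define FTprint(a...)
/*
	printk(KERN_DEBUG a)
 */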
static kmem_cache_t * fn_hash_kmem;
/*
 * These bizarre types are just to force strict type checking.
 * When I reversed the order of bytes and changed to natural mask lengths,
 * I forgot to make fixes in several places, and now I am too lazy to
 * change it back.
 */
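
/*
 * The wrapper types the comment above refers to.  Their definitions are
 * not part of this fragment; this minimal sketch simply matches the way
 * `.datum' is accessed throughout the rest of the file.
 */
typedef struct {
	u32	datum;
} fn_key_t;

typedef struct {
	u32	datum;
} fn_hash_idx_t;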
struct fib_node
{
	struct fib_node		*fn_next;
	struct fib_info		*fn_info;
#define FIB_INFO(f)	((f)->fn_info)
	fn_key_t		fn_key;
	u8			fn_tos;
	u8			fn_type;
	u8			fn_scope;
	u8			fn_state;
};

#define FN_S_ZOMBIE	1
#define FN_S_ACCESSED	2

static int fib_hash_zombies;
struct fn_zone
{
	struct fn_zone	*fz_next;	/* Next non-empty zone		*/
	struct fib_node	**fz_hash;	/* Hash table pointer		*/
	int		fz_nent;	/* Number of entries		*/

	int		fz_divisor;	/* Hash divisor			*/
	u32		fz_hashmask;	/* (fz_divisor - 1)		*/
#define FZ_HASHMASK(fz)	((fz)->fz_hashmask)

	int		fz_order;	/* Zone order (prefix length)	*/
	u32		fz_mask;	/* inet_make_mask(fz_order)	*/
#define FZ_MASK(fz)	((fz)->fz_mask)
};
/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask
 * can be cheaper than a memory lookup, which is why the FZ_* macros
 * are used.
 */

struct fn_hash
{
	struct fn_zone	*fn_zones[33];
	struct fn_zone	*fn_zone_list;
};
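
/*
 * There is one fn_zone per prefix length: a /24 route lives in
 * fn_zones[24] and its zone mask is inet_make_mask(24).  fn_zone_list
 * chains the non-empty zones from the most specific mask to the least
 * specific one, which is what lets the lookup below stop at the longest
 * matching prefix.
 */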
static __inline__ fn_hash_idx_t fn_hash(fn_key_t key, struct fn_zone *fz)
{
	u32 h = ntohl(key.datum)>>(32 - fz->fz_order);
	/* Fold the prefix bits down before masking into the table. */
	h ^= (h>>20);
	h ^= (h>>10);
	h ^= (h>>5);
	h &= FZ_HASHMASK(fz);
	return *(fn_hash_idx_t*)&h;
}
#define fz_key_0(key)		((key).datum = 0)
#define fz_prefix(key,fz)	((key).datum)
static __inline__ fn_key_t fz_key(u32 dst, struct fn_zone *fz)
{
	fn_key_t k;
	k.datum = dst & FZ_MASK(fz);
	return k;
}
static __inline__ struct fib_node ** fz_chain_p(fn_key_t key, struct fn_zone *fz)
{
	return &fz->fz_hash[fn_hash(key, fz).datum];
}

static __inline__ struct fib_node * fz_chain(fn_key_t key, struct fn_zone *fz)
{
	return fz->fz_hash[fn_hash(key, fz).datum];
}
extern __inline__ int fn_key_eq(fn_key_t a, fn_key_t b)
{
	return a.datum == b.datum;
}

extern __inline__ int fn_key_leq(fn_key_t a, fn_key_t b)
{
	return a.datum <= b.datum;
}
static rwlock_t fib_hash_lock = RW_LOCK_UNLOCKED;

#define FZ_MAX_DIVISOR 1024

#ifdef CONFIG_IP_ROUTE_LARGE_TABLES
/* The fib hash lock must be held when this is called. */
static __inline__ void fn_rebuild_zone(struct fn_zone *fz,
				       struct fib_node **old_ht,
				       int old_divisor)
{
	int i;
	struct fib_node *f, **fp, *next;

	for (i=0; i<old_divisor; i++) {
		for (f=old_ht[i]; f; f=next) {
			next = f->fn_next;
			/* Re-insert into the new table, keeping the chain
			   ordered by key. */
			for (fp = fz_chain_p(f->fn_key, fz);
			     *fp && fn_key_leq((*fp)->fn_key, f->fn_key);
			     fp = &(*fp)->fn_next)
				/* NONE */;
			f->fn_next = *fp;
			*fp = f;
		}
	}
}
static void fn_rehash_zone(struct fn_zone *fz)
{
	struct fib_node **ht, **old_ht;
	int old_divisor, new_divisor;
	u32 new_hashmask;

	old_divisor = fz->fz_divisor;

	switch (old_divisor) {
	case 16:
		new_divisor = 256;
		new_hashmask = 0xFF;
		break;
	case 256:
		new_divisor = 1024;
		new_hashmask = 0x3FF;
		break;
	default:
		printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
		return;
	}
#if RT_CACHE_DEBUG >= 2
	printk("fn_rehash_zone: hash for zone %d grows from %d\n", fz->fz_order, old_divisor);
#endif
	ht = kmalloc(new_divisor*sizeof(struct fib_node*), GFP_KERNEL);
	if (ht) {
		memset(ht, 0, new_divisor*sizeof(struct fib_node*));
		write_lock_bh(&fib_hash_lock);
		old_ht = fz->fz_hash;
		fz->fz_hash = ht;
		fz->fz_hashmask = new_hashmask;
		fz->fz_divisor = new_divisor;
		fn_rebuild_zone(fz, old_ht, old_divisor);
		write_unlock_bh(&fib_hash_lock);
		kfree(old_ht);
	}
}

#endif /* CONFIG_IP_ROUTE_LARGE_TABLES */
static void fn_free_node(struct fib_node * f)
{
	fib_release_info(FIB_INFO(f));
	kmem_cache_free(fn_hash_kmem, f);
}
static struct fn_zone *
fn_new_zone(struct fn_hash *table, int z)
{
	int i;
	struct fn_zone *fz = kmalloc(sizeof(struct fn_zone), GFP_KERNEL);
	if (!fz)
		return NULL;

	memset(fz, 0, sizeof(struct fn_zone));
	if (z) {
		fz->fz_divisor = 16;
		fz->fz_hashmask = 0xF;
	} else {
		fz->fz_divisor = 1;
		fz->fz_hashmask = 0;
	}
	fz->fz_hash = kmalloc(fz->fz_divisor*sizeof(struct fib_node*), GFP_KERNEL);
	if (!fz->fz_hash) {
		kfree(fz);
		return NULL;
	}
	memset(fz->fz_hash, 0, fz->fz_divisor*sizeof(struct fib_node*));
	fz->fz_order = z;
	fz->fz_mask = inet_make_mask(z);

	/* Find the first non-empty zone with a more specific mask */
	for (i=z+1; i<=32; i++)
		if (table->fn_zones[i])
			break;
	write_lock_bh(&fib_hash_lock);
	if (i>32) {
		/* No more specific masks, we are the first. */
		fz->fz_next = table->fn_zone_list;
		table->fn_zone_list = fz;
	} else {
		fz->fz_next = table->fn_zones[i]->fz_next;
		table->fn_zones[i]->fz_next = fz;
	}
	table->fn_zones[z] = fz;
	write_unlock_bh(&fib_hash_lock);
	return fz;
}
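
/*
 * Longest-prefix-match lookup: walk the non-empty zones from the most
 * specific mask to the least specific one and return the first entry
 * whose key, TOS and scope fit the flow described by *key.
 */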
static int
fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result *res)
{
	int err;
	struct fn_zone *fz;
	struct fn_hash *t = (struct fn_hash*)tb->tb_data;

	read_lock(&fib_hash_lock);
	for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
		struct fib_node *f;
		fn_key_t k = fz_key(key->dst, fz);

		for (f = fz_chain(k, fz); f; f = f->fn_next) {
			if (!fn_key_eq(k, f->fn_key)) {
				if (fn_key_leq(k, f->fn_key))
					break;
				else
					continue;
			}
#ifdef CONFIG_IP_ROUTE_TOS
			if (f->fn_tos && f->fn_tos != key->tos)
				continue;
#endif
			f->fn_state |= FN_S_ACCESSED;

			if (f->fn_state&FN_S_ZOMBIE)
				continue;
			if (f->fn_scope < key->scope)
				continue;

			err = fib_semantic_match(f->fn_type, FIB_INFO(f), key, res);
			if (err == 0) {
				res->type = f->fn_type;
				res->scope = f->fn_scope;
				res->prefixlen = fz->fz_order;
				goto out;
			}
			if (err < 0)
				goto out;
		}
	}
	err = 1;
out:
	read_unlock(&fib_hash_lock);
	return err;
}
static int fn_hash_last_dflt=-1;
static int fib_detect_death(struct fib_info *fi, int order,
			    struct fib_info **last_resort, int *last_idx)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
	if (state==NUD_REACHABLE)
		return 0;
	if ((state&NUD_VALID) && order != fn_hash_last_dflt)
		return 0;
	if ((state&NUD_VALID) ||
	    (*last_idx<0 && order > fn_hash_last_dflt)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}
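
/*
 * Choose among the default (zone 0) routes of equal priority: keep the
 * one currently in res->fi while its gateway looks reachable according
 * to fib_detect_death(), otherwise move on to the next candidate or the
 * remembered last resort.
 */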
static void
fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fib_result *res)
{
	int order, last_idx;
	struct fib_node *f;
	struct fib_info *fi = NULL;
	struct fib_info *last_resort;
	struct fn_hash *t = (struct fn_hash*)tb->tb_data;
	struct fn_zone *fz = t->fn_zones[0];

	read_lock(&fib_hash_lock);
	for (f = fz->fz_hash[0]; f; f = f->fn_next) {
		struct fib_info *next_fi = FIB_INFO(f);

		if ((f->fn_state&FN_S_ZOMBIE) ||
		    f->fn_scope != res->scope ||
		    f->fn_type != RTN_UNICAST)

		if (next_fi->fib_priority > res->fi->fib_priority)
		if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
		f->fn_state |= FN_S_ACCESSED;

			if (next_fi != res->fi)
		} else if (!fib_detect_death(fi, order, &last_resort, &last_idx)) {
			fib_info_put(res->fi);
			atomic_inc(&fi->fib_clntref);
			fn_hash_last_dflt = order;

	if (order<=0 || fi==NULL) {
		fn_hash_last_dflt = -1;

	if (!fib_detect_death(fi, order, &last_resort, &last_idx)) {
		fib_info_put(res->fi);
		atomic_inc(&fi->fib_clntref);
		fn_hash_last_dflt = order;

	fib_info_put(res->fi);
	res->fi = last_resort;
	atomic_inc(&last_resort->fib_clntref);

	fn_hash_last_dflt = last_idx;

	read_unlock(&fib_hash_lock);
#define FIB_SCAN(f, fp) \
for ( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next)

#define FIB_SCAN_KEY(f, fp, key) \
for ( ; ((f) = *(fp)) != NULL && fn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_next)

#ifndef CONFIG_IP_ROUTE_TOS
#define FIB_SCAN_TOS(f, fp, key, tos) FIB_SCAN_KEY(f, fp, key)
#else
#define FIB_SCAN_TOS(f, fp, key, tos) \
for ( ; ((f) = *(fp)) != NULL && fn_key_eq((f)->fn_key, (key)) && \
      (f)->fn_tos == (tos) ; (fp) = &(f)->fn_next)
#endif
static void rtmsg_fib(int, struct fib_node*, int, int,
		      struct nlmsghdr *,
		      struct netlink_skb_parms *);
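
/*
 * Insert one route.  Chains are kept ordered by key, then TOS, then
 * priority, so the scans below can stop as soon as a bigger key is seen;
 * the NLM_F_EXCL/REPLACE/APPEND/CREATE flags of the netlink request
 * decide what happens when a matching entry already exists.
 */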
static int
fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
	       struct nlmsghdr *n, struct netlink_skb_parms *req)
{
	struct fn_hash *table = (struct fn_hash*)tb->tb_data;
	struct fib_node *new_f, *f, **fp, **del_fp;
	struct fn_zone *fz;
	struct fib_info *fi;
	fn_key_t key;
	int err;

	int z = r->rtm_dst_len;
	int type = r->rtm_type;
#ifdef CONFIG_IP_ROUTE_TOS
	u8 tos = r->rtm_tos;
#endif

FTprint("tb(%d)_insert: %d %08x/%d %d %08x\n", tb->tb_id, r->rtm_type, rta->rta_dst ?
	*(u32*)rta->rta_dst : 0, z, rta->rta_oif ? *rta->rta_oif : -1,
	rta->rta_prefsrc ? *(u32*)rta->rta_prefsrc : 0);
	fz = table->fn_zones[z];
	if (!fz && !(fz = fn_new_zone(table, z)))
		return -ENOBUFS;

	fz_key_0(key);
	if (rta->rta_dst) {
		u32 dst;
		memcpy(&dst, rta->rta_dst, 4);
		if (dst & ~FZ_MASK(fz))
			return -EINVAL;
		key = fz_key(dst, fz);
	}

	if ((fi = fib_create_info(r, rta, n, &err)) == NULL)
		return err;

#ifdef CONFIG_IP_ROUTE_LARGE_TABLES
	if (fz->fz_nent > (fz->fz_divisor<<2) &&
	    fz->fz_divisor < FZ_MAX_DIVISOR &&
	    (z==32 || (1<<z) > fz->fz_divisor))
		fn_rehash_zone(fz);
#endif

	fp = fz_chain_p(key, fz);
	/*
	 * Scan list to find the first route with the same destination
	 */
		if (fn_key_leq(key,f->fn_key))

#ifdef CONFIG_IP_ROUTE_TOS
	/*
	 * Find route with the same destination and tos.
	 */
	FIB_SCAN_KEY(f, fp, key) {
		if (f->fn_tos <= tos)

	if (f && (f->fn_state&FN_S_ZOMBIE) &&
#ifdef CONFIG_IP_ROUTE_TOS
	    fn_key_eq(f->fn_key, key)) {

	FIB_SCAN_TOS(f, fp, key, tos) {
		if (fi->fib_priority <= FIB_INFO(f)->fib_priority)
	/* Now f == *fp points to the first node with the same
	   keys [prefix,tos,priority], if such a key already exists,
	   or to the node before which we will insert the new one.
	 */
#ifdef CONFIG_IP_ROUTE_TOS
	    fn_key_eq(f->fn_key, key) &&
	    fi->fib_priority == FIB_INFO(f)->fib_priority) {
		struct fib_node **ins_fp;

		if (n->nlmsg_flags&NLM_F_EXCL)

		if (n->nlmsg_flags&NLM_F_REPLACE) {

		FIB_SCAN_TOS(f, fp, key, tos) {
			if (fi->fib_priority != FIB_INFO(f)->fib_priority)
			if (f->fn_type == type && f->fn_scope == r->rtm_scope
			    && FIB_INFO(f) == fi)

		if (!(n->nlmsg_flags&NLM_F_APPEND)) {

	if (!(n->nlmsg_flags&NLM_F_CREATE))
	new_f = kmem_cache_alloc(fn_hash_kmem, SLAB_KERNEL);
	memset(new_f, 0, sizeof(struct fib_node));

	new_f->fn_key = key;
#ifdef CONFIG_IP_ROUTE_TOS
	new_f->fn_tos = tos;
#endif
	new_f->fn_type = type;
	new_f->fn_scope = r->rtm_scope;
	FIB_INFO(new_f) = fi;

	/*
	 * Insert new entry to the list.
	 */

	new_f->fn_next = f;
	write_lock_bh(&fib_hash_lock);
	*fp = new_f;
	write_unlock_bh(&fib_hash_lock);
	fz->fz_nent++;
		/* Unlink replaced node */
		write_lock_bh(&fib_hash_lock);
		*del_fp = f->fn_next;
		write_unlock_bh(&fib_hash_lock);

		if (!(f->fn_state&FN_S_ZOMBIE))
			rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req);
		if (f->fn_state&FN_S_ACCESSED)

	rtmsg_fib(RTM_NEWROUTE, new_f, z, tb->tb_id, n, req);

	fib_release_info(fi);
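
/*
 * Delete one route.  The entry is matched on key, TOS, type, scope and
 * protocol; depending on how it is referenced it is either unlinked and
 * freed right away or only marked FN_S_ZOMBIE and reaped later by the
 * flush path below.
 */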
static int
fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
	       struct nlmsghdr *n, struct netlink_skb_parms *req)
{
	struct fn_hash *table = (struct fn_hash*)tb->tb_data;
	struct fib_node **fp, **del_fp, *f;
	int z = r->rtm_dst_len;
	struct fn_zone *fz;
	fn_key_t key;
#ifdef CONFIG_IP_ROUTE_TOS
	u8 tos = r->rtm_tos;
#endif

FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ?
	*(u32*)rta->rta_dst : 0, z, rta->rta_oif ? *rta->rta_oif : -1);
	if ((fz = table->fn_zones[z]) == NULL)

		memcpy(&dst, rta->rta_dst, 4);
		if (dst & ~FZ_MASK(fz))
		key = fz_key(dst, fz);

	fp = fz_chain_p(key, fz);

		if (fn_key_eq(f->fn_key, key))
		if (fn_key_leq(key, f->fn_key)) {

#ifdef CONFIG_IP_ROUTE_TOS
	FIB_SCAN_KEY(f, fp, key) {
		if (f->fn_tos == tos)

	FIB_SCAN_TOS(f, fp, key, tos) {
		struct fib_info * fi = FIB_INFO(f);

		if (f->fn_state&FN_S_ZOMBIE) {
		if (del_fp == NULL &&
		    (!r->rtm_type || f->fn_type == r->rtm_type) &&
		    (r->rtm_scope == RT_SCOPE_NOWHERE || f->fn_scope == r->rtm_scope) &&
		    (!r->rtm_protocol || fi->fib_protocol == r->rtm_protocol) &&
		    fib_nh_match(r, n, rta, fi) == 0)
		rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req);

			write_lock_bh(&fib_hash_lock);
			*del_fp = f->fn_next;
			write_unlock_bh(&fib_hash_lock);

			if (f->fn_state&FN_S_ACCESSED)

			f->fn_state |= FN_S_ZOMBIE;
			if (f->fn_state&FN_S_ACCESSED) {
				f->fn_state &= ~FN_S_ACCESSED;

			if (++fib_hash_zombies > 128)
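
/*
 * Garbage collection: walk a chain and drop every entry that is either a
 * zombie left behind by fn_hash_delete() or whose fib_info has been
 * marked dead (RTNH_F_DEAD).
 */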
extern __inline__ int
fn_flush_list(struct fib_node ** fp, int z, struct fn_hash *table)
{
	struct fib_node *f;

	while ((f = *fp) != NULL) {
		struct fib_info *fi = FIB_INFO(f);

		if (fi && ((f->fn_state&FN_S_ZOMBIE) || (fi->fib_flags&RTNH_F_DEAD))) {
			write_lock_bh(&fib_hash_lock);
			*fp = f->fn_next;
			write_unlock_bh(&fib_hash_lock);
static int fn_hash_flush(struct fib_table *tb)
{
	struct fn_hash *table = (struct fn_hash*)tb->tb_data;
	struct fn_zone *fz;

	fib_hash_zombies = 0;
	for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
		int i, tmp = 0;

		for (i=fz->fz_divisor-1; i>=0; i--)
			tmp += fn_flush_list(&fz->fz_hash[i], fz->fz_order, table);
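
/*
 * /proc support: format the table as text, skipping the entries before
 * `first' and emitting at most `count' of them into `buffer'.
 */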
#ifdef CONFIG_PROC_FS

static int fn_hash_get_info(struct fib_table *tb, char *buffer, int first, int count)
{
	struct fn_hash *table = (struct fn_hash*)tb->tb_data;

	read_lock(&fib_hash_lock);
	for (fz=table->fn_zone_list; fz; fz = fz->fz_next) {
		int maxslot = fz->fz_divisor;
		struct fib_node **fp = fz->fz_hash;

		if (fz->fz_nent == 0)

		if (pos + fz->fz_nent <= first) {

		for (i=0; i < maxslot; i++, fp++) {
			for (f = *fp; f; f = f->fn_next) {

				fib_node_get_info(f->fn_type,
						  f->fn_state&FN_S_ZOMBIE,
						  fz_prefix(f->fn_key, fz),
						  FZ_MASK(fz), buffer);

	read_unlock(&fib_hash_lock);
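
/*
 * rtnetlink dump support: one helper per hash bucket and one per zone,
 * with cb->args[] recording how far the previous dump callback got so a
 * large table can be dumped across several recvmsg() calls.
 */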
static __inline__ int
fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
		    struct fib_table *tb,

	for (i=0; f; i++, f=f->fn_next) {
		if (i < s_i) continue;
		if (f->fn_state&FN_S_ZOMBIE) continue;
		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
				  tb->tb_id, (f->fn_state&FN_S_ZOMBIE) ? 0 : f->fn_type, f->fn_scope,
				  &f->fn_key, fz->fz_order, f->fn_tos,

static __inline__ int
fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
		  struct fib_table *tb,

	for (h=0; h < fz->fz_divisor; h++) {
		if (h < s_h) continue;
		memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0]));
		if (fz->fz_hash == NULL || fz->fz_hash[h] == NULL)
		if (fn_hash_dump_bucket(skb, cb, tb, fz, fz->fz_hash[h]) < 0) {
static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
{
	struct fn_hash *table = (struct fn_hash*)tb->tb_data;

	read_lock(&fib_hash_lock);
	for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
		if (m < s_m) continue;
		memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(cb->args[0]));
		if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
			read_unlock(&fib_hash_lock);

	read_unlock(&fib_hash_lock);
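
/*
 * Broadcast an RTM_NEWROUTE/RTM_DELROUTE notification to rtnetlink
 * listeners, and echo it back to the requesting socket when NLM_F_ECHO
 * was set.
 */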
static void rtmsg_fib(int event, struct fib_node* f, int z, int tb_id,
		      struct nlmsghdr *n, struct netlink_skb_parms *req)
{
	struct sk_buff *skb;
	u32 pid = req ? req->pid : 0;
	int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);

	skb = alloc_skb(size, GFP_KERNEL);
	if (!skb)
		return;

	if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
			  f->fn_type, f->fn_scope, &f->fn_key, z, f->fn_tos,
			  FIB_INFO(f)) < 0) {
		kfree_skb(skb);
		return;
	}
	NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE;
	if (n->nlmsg_flags&NLM_F_ECHO)
		atomic_inc(&skb->users);
	netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL);
	if (n->nlmsg_flags&NLM_F_ECHO)
		netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
}
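
/*
 * Allocate and set up one hash-based FIB table.  With
 * CONFIG_IP_MULTIPLE_TABLES this is called whenever a new table id is
 * referenced, so it cannot live in __init memory; otherwise it is only
 * used at boot time.
 */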
#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_table * fib_hash_init(int id)
#else
struct fib_table * __init fib_hash_init(int id)
#endif
{
	struct fib_table *tb;

	if (fn_hash_kmem == NULL)
		fn_hash_kmem = kmem_cache_create("ip_fib_hash",
						 sizeof(struct fib_node),
						 0, SLAB_HWCACHE_ALIGN,
						 NULL, NULL);

	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), GFP_KERNEL);
	if (tb == NULL)
		return NULL;

	tb->tb_id = id;
	tb->tb_lookup = fn_hash_lookup;
	tb->tb_insert = fn_hash_insert;
	tb->tb_delete = fn_hash_delete;
	tb->tb_flush = fn_hash_flush;
	tb->tb_select_default = fn_hash_select_default;
	tb->tb_dump = fn_hash_dump;
#ifdef CONFIG_PROC_FS
	tb->tb_get_info = fn_hash_get_info;
#endif
	memset(tb->tb_data, 0, sizeof(struct fn_hash));
	return tb;
}