/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.102.2.1 2002/01/12 07:43:57 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *		Miquel van Smoorenburg	:	BSD API fixes.
 *		Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *		Alexey Kuznetsov	:	Massive rework to support tree-based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov	:	End of old history. Split into fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *		Vitaly E. Lavrov	:	Transparent proxy revived after a year in a coma.
 *		Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *		Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *		Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *		Robert Olsson	:	Added rt_cache statistics
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/ip_fib.h>
#include <linux/sysctl.h>
#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300 * HZ)

int ip_rt_min_delay = 2 * HZ;
int ip_rt_max_delay = 10 * HZ;
int ip_rt_gc_timeout = RT_GC_TIMEOUT;
int ip_rt_gc_interval = 60 * HZ;
int ip_rt_gc_min_interval = HZ / 2;
int ip_rt_redirect_number = 9;
int ip_rt_redirect_load = HZ / 50;
int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
int ip_rt_error_cost = HZ;
int ip_rt_error_burst = 5 * HZ;
int ip_rt_gc_elasticity = 8;
int ip_rt_mtu_expires = 10 * 60 * HZ;
int ip_rt_min_pmtu = 512 + 20 + 20;
int ip_rt_min_advmss = 256;
int ip_rt_secret_interval = 10 * 60 * HZ;
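/* Worked example (illustrative, assuming HZ == 100): ip_rt_min_delay and
 * ip_rt_max_delay bound a pending cache flush between 2s and 10s;
 * ip_rt_redirect_load is 2 jiffies (20ms); ip_rt_redirect_silence is
 * (HZ/50) << 10 == 2048 jiffies, i.e. roughly 20s of quiet before the
 * redirect rate-limiting state is reset; and ip_rt_min_pmtu is 552 bytes
 * (512 of payload plus 20-byte IP and 20-byte TCP headers).
 */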
static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;
/*
 *	Interface to the generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
					  struct sk_buff *skb);
static void ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static int rt_garbage_collect(void);

struct dst_ops ipv4_dst_ops = {
	protocol:		__constant_htons(ETH_P_IP),
	gc:			rt_garbage_collect,
	check:			ipv4_dst_check,
	reroute:		ipv4_dst_reroute,
	destroy:		ipv4_dst_destroy,
	negative_advice:	ipv4_negative_advice,
	link_failure:		ipv4_link_failure,
	entry_size:		sizeof(struct rtable),
#define ECN_OR_COST(class)	TC_PRIO_##class

__u8 ip_tos2prio[16] = {
	ECN_OR_COST(BESTEFFORT),
	ECN_OR_COST(INTERACTIVE),
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
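/* The table above (several entries are elided here) maps the four-bit
 * IPTOS_TOS field to a packet-scheduler priority band. A sketch of the
 * usual lookup, assuming the rt_tos2priority() helper in
 * include/net/route.h:
 *
 *	prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *
 * ECN_OR_COST() exists because the low TOS bit doubles as the ECN bit,
 * so two adjacent slots resolve to the same traffic class.
 */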
/* The locking scheme is rather straightforward:
 *
 * 1) BH-protected rwlocks protect the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
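/* Canonical reader pattern under this scheme (a sketch; key_matches() is
 * a stand-in for the per-lookup comparison, see ip_route_input() below):
 *
 *	read_lock_bh(&rt_hash_table[hash].lock);
 *	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next)
 *		if (key_matches(rth)) {
 *			dst_hold(&rth->u.dst);	(atomic refcount increment)
 *			break;
 *		}
 *	read_unlock_bh(&rt_hash_table[hash].lock);
 *
 * The reference taken via dst_hold() keeps the entry alive after the
 * bucket lock is dropped.
 */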
struct rt_hash_bucket {
	struct rtable	*chain;
	rwlock_t	lock;
} __attribute__((__aligned__(8)));
static struct rt_hash_bucket *rt_hash_table;
static unsigned		rt_hash_mask;
static int		rt_hash_log;
static unsigned int	rt_hash_rnd;

struct rt_cache_stat rt_cache_stat[NR_CPUS];

static int rt_intern_hash(unsigned hash, struct rtable *rth,
			  struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
	return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
		& rt_hash_mask);
}
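/* rt_hash_code() folds (daddr, saddr, tos) through jhash keyed by
 * rt_hash_rnd, then masks the result down to a bucket index. Because
 * rt_hash_rnd is random and periodically regenerated (see
 * rt_secret_rebuild() below), remote hosts cannot predict which bucket a
 * flow lands in, and so cannot deliberately construct pathologically long
 * chains. Illustrative only: with rt_hash_mask == 1023, every 32-bit
 * jhash value maps onto one of 1024 buckets.
 */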
static int rt_cache_get_info(char *buffer, char **start, off_t offset,
	sprintf(buffer, "%-127s\n",
		"Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
		"Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
	for (i = rt_hash_mask; i >= 0; i--) {
		read_lock_bh(&rt_hash_table[i].lock);
		for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) {
			/*
			 * Spin through entries until we are ready
			 */
			sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
				"%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
				r->u.dst.dev ? r->u.dst.dev->name : "*",
				(unsigned long)r->rt_dst,
				(unsigned long)r->rt_gateway,
				atomic_read(&r->u.dst.__refcnt),
				(unsigned long)r->rt_src,
				(int) r->u.dst.advmss + 40 : 0),
				(int)((r->u.dst.rtt >> 3) + r->u.dst.rttvar),
				atomic_read(&r->u.dst.hh->hh_refcnt) :
				(r->u.dst.hh->hh_output ==
			sprintf(buffer + len, "%-127s\n", temp);
			if (pos >= offset + length) {
				read_unlock_bh(&rt_hash_table[i].lock);
		read_unlock_bh(&rt_hash_table[i].lock);

	*start = buffer + len - (pos - offset);
static int rt_cache_stat_get_info(char *buffer, char **start, off_t offset, int length)
	unsigned int dst_entries = atomic_read(&ipv4_dst_ops.entries);

	len += sprintf(buffer + len, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
	for (lcpu = 0; lcpu < smp_num_cpus; lcpu++) {
		i = cpu_logical_map(lcpu);

		len += sprintf(buffer + len, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
			       rt_cache_stat[i].in_hit,
			       rt_cache_stat[i].in_slow_tot,
			       rt_cache_stat[i].in_slow_mc,
			       rt_cache_stat[i].in_no_route,
			       rt_cache_stat[i].in_brd,
			       rt_cache_stat[i].in_martian_dst,
			       rt_cache_stat[i].in_martian_src,
			       rt_cache_stat[i].out_hit,
			       rt_cache_stat[i].out_slow_tot,
			       rt_cache_stat[i].out_slow_mc,
			       rt_cache_stat[i].gc_total,
			       rt_cache_stat[i].gc_ignored,
			       rt_cache_stat[i].gc_goal_miss,
			       rt_cache_stat[i].gc_dst_overflow,
			       rt_cache_stat[i].in_hlist_search,
			       rt_cache_stat[i].out_hlist_search

	*start = buffer + offset;
static __inline__ void rt_free(struct rtable *rt)
	dst_free(&rt->u.dst);

static __inline__ void rt_drop(struct rtable *rt)
	dst_free(&rt->u.dst);

static __inline__ int rt_fast_clean(struct rtable *rth)
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in the hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->key.iif && rth->u.rt_next;

static __inline__ int rt_valuable(struct rtable *rth)
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||

static __inline__ int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
	if (atomic_read(&rth->u.dst.__refcnt))

	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))

	age = jiffies - rth->u.dst.lastuse;

	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))

/* Bits of score are:
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3 << 30);

	    !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)))
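/* A worked reading of rt_score() (illustrative): the inverted age places
 * recently used routes at a higher score in bits 29..0. Judging by the
 * surviving fragments, the elided branches set the top bits for entries
 * that are "valuable" (rt_valuable(), e.g. holding redirect state) or
 * "not quite useless" (output or unicast forwarding routes, per the test
 * above). rt_intern_hash() below evicts the unreferenced chain entry with
 * the lowest score when a bucket grows too long.
 */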
/* This runs via a timer and thus is always in BH context. */
static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy)
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;

	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
	     t -= ip_rt_gc_timeout) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		write_lock(&rt_hash_table[i].lock);
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(now, rth->u.dst.expires)) {
					rthp = &rth->u.rt_next;
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				rthp = &rth->u.rt_next;

			/* Clean up aged-off entries. */
			*rthp = rth->u.rt_next;
		write_unlock(&rt_hash_table[i].lock);

		/* Fallback loop breaker. */
		if (time_after(jiffies, now))

	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);

SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task);
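/* The scan budget above amortizes a full sweep of the hash table across
 * successive timer runs: each run visits roughly
 * (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout buckets. With the
 * defaults (interval 60s, timeout 300s) a single run covers about one
 * fifth of the table, so every bucket is examined about once per
 * ip_rt_gc_timeout. Illustrative arithmetic derived from the loop header.
 */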
/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void SMP_TIMER_NAME(rt_run_flush)(unsigned long dummy)
	struct rtable *rth, *next;

	get_random_bytes(&rt_hash_rnd, 4);

	for (i = rt_hash_mask; i >= 0; i--) {
		write_lock_bh(&rt_hash_table[i].lock);
		rth = rt_hash_table[i].chain;
			rt_hash_table[i].chain = NULL;
		write_unlock_bh(&rt_hash_table[i].lock);

		for (; rth; rth = next) {
			next = rth->u.rt_next;

SMP_TIMER_DEFINE(rt_run_flush, rt_cache_flush_task);

static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
void rt_cache_flush(int delay)
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

		delay = ip_rt_min_delay;

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If the flush timer is already running
		   and the flush request is not immediate (delay > 0):

		   if the deadline is not yet reached, prolong the timer to
		   "delay", otherwise fire it at the deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay - ip_rt_min_delay)

		spin_unlock_bh(&rt_flush_lock);
		SMP_TIMER_NAME(rt_run_flush)(0);

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now + delay);
	spin_unlock_bh(&rt_flush_lock);
static void rt_secret_rebuild(unsigned long dummy)
	unsigned long now = jiffies;

	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
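/* rt_secret_rebuild() is the periodic re-keying hook: its elided body
 * appears to trigger a cache flush, and rt_run_flush() above draws a
 * fresh rt_hash_rnd with get_random_bytes() while emptying the table.
 * Re-keying the hash every ip_rt_secret_interval (10 minutes by default)
 * limits how long an attacker who has guessed the hash function could
 * keep one bucket overloaded.
 */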
/*
   Short description of GC goals.

   We want an algorithm that keeps the routing cache at some equilibrium
   point, where the number of aged-off entries stays approximately equal
   to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   "expire" is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */
static int rt_garbage_collect(void)
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;

	/*
	 * Garbage collection is pretty expensive,
	 * do not run it too frequently.
	 */

	rt_cache_stat[smp_processor_id()].gc_total++;

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		rt_cache_stat[smp_processor_id()].gc_ignored++;

	/* Calculate the number of entries we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
	       (ip_rt_gc_elasticity << rt_hash_log);
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;

		/* We are in a dangerous area. Try to reduce the cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;

	if (now - last_gc >= ip_rt_gc_min_interval)

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			write_lock_bh(&rt_hash_table[k].lock);
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					rthp = &rth->u.rt_next;
				*rthp = rth->u.rt_next;
			write_unlock_bh(&rt_hash_table[k].lock);

		/* The goal was not achieved. We stop the process if:

		   - expire was reduced to zero; otherwise expire is halved.
		   - the table is not full.
		   - we are called from interrupt context.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		rt_cache_stat[smp_processor_id()].gc_goal_miss++;

#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
		       atomic_read(&ipv4_dst_ops.entries), goal, i);

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
	printk(KERN_WARNING "dst cache overflow\n");
	rt_cache_stat[smp_processor_id()].gc_dst_overflow++;

	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
	       atomic_read(&ipv4_dst_ops.entries), goal, rover);
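/* How "expire" breathes (a sketch from the fragments above): every pass
 * that misses its goal halves expire, so the next pass evicts more
 * aggressively; every invocation that finishes below the limits adds
 * ip_rt_gc_min_interval back and clamps at ip_rt_gc_timeout. E.g. under
 * sustained pressure 300s -> 150s -> 75s -> ..., then drifting back up by
 * half a second per call once the cache is comfortably below gc_thresh.
 */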
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
	struct rtable *rth, **rthp;
	struct rtable *cand, **candp;
	int attempts = !in_softirq();

	rthp = &rt_hash_table[hash].chain;

	write_lock_bh(&rt_hash_table[hash].lock);
	while ((rth = *rthp) != NULL) {
		if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
			/* Put it first */
			*rthp = rth->u.rt_next;
			rth->u.rt_next = rt_hash_table[hash].chain;
			rt_hash_table[hash].chain = rth;

			dst_hold(&rth->u.dst);
			rth->u.dst.lastuse = now;
			write_unlock_bh(&rt_hash_table[hash].lock);

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {

		rthp = &rth->u.rt_next;

	/* ip_rt_gc_elasticity used to be the average chain length;
	 * when it is exceeded, gc becomes really aggressive.
	 *
	 * The second limit is less certain. At the moment it allows
	 * only 2 entries per bucket. We will see.
	 */
	if (chain_length > ip_rt_gc_elasticity) {
		*candp = cand->u.rt_next;

	/* Try to bind the route to an arp entry only if it is an output
	   route or on the unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
			write_unlock_bh(&rt_hash_table[hash].lock);

			if (err != -ENOBUFS) {

			/* The neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;

				printk(KERN_WARNING "Neighbour table overflow.\n");

	rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
	rt_hash_table[hash].chain = rt;
	write_unlock_bh(&rt_hash_table[hash].lock);
void rt_bind_peer(struct rtable *rt, int create)
	static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
	spin_unlock_bh(&rt_peer_lock);
/*
 * Peer allocation may fail only in serious out-of-memory conditions. However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
	static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
	static u32 ip_fallback_id;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst)
	struct rtable *rt = (struct rtable *) dst;

		if (rt->peer == NULL)

		/* If the peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
			iph->id = htons(inet_getid(rt->peer));

	printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));

	ip_select_fb_ident(iph);
static void rt_del(unsigned hash, struct rtable *rt)
	struct rtable **rthp;

	write_lock_bh(&rt_hash_table[hash].lock);
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.rt_next)
			*rthp = rt->u.rt_next;
	write_unlock_bh(&rt_hash_table[hash].lock);
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
		    u32 saddr, u8 tos, struct net_device *dev)
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	u32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };

	tos &= IPTOS_RT_MASK;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash_code(daddr,
						     skeys[i] ^ (ikeys[k] << 5),

			rthp = &rt_hash_table[hash].chain;

			read_lock(&rt_hash_table[hash].lock);
			while ((rth = *rthp) != NULL) {

				if (rth->key.dst != daddr ||
				    rth->key.src != skeys[i] ||
				    rth->key.tos != tos ||
				    rth->key.oif != ikeys[k] ||
					rthp = &rth->u.rt_next;

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)

				dst_hold(&rth->u.dst);
				read_unlock(&rt_hash_table[hash].lock);

				rt = dst_alloc(&ipv4_dst_ops);

				/* Copy all the information. */
				atomic_set(&rt->u.dst.__refcnt, 1);
				dev_hold(rt->u.dst.dev);
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.obsolete	= 0;

				rt->rt_flags |= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway = new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);

				if (!rt_intern_hash(hash, rt, &rt))

			read_unlock(&rt_hash_table[hash].lock);

#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
		       "%u.%u.%u.%u ignored.\n"
		       "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
		unsigned hash = rt_hash_code(rt->key.dst,
#if RT_CACHE_DEBUG >= 1
		printk(KERN_DEBUG "ip_rt_advice: redirect to "
		       "%u.%u.%u.%u/%02x dropped\n",
		       NIPQUAD(rt->rt_dst), rt->key.tos);
/*
 * 1. The first ip_rt_redirect_number redirects are sent
 *    with exponential backoff, then we stop sending them at all,
 *    assuming that the host ignores our redirects.
 * 2. If we did not see packets requiring redirects
 *    during ip_rt_redirect_silence, we assume that the host
 *    has forgotten the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
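/* Worked timing (illustrative, HZ == 100, defaults from the top of this
 * file): the n-th redirect is eligible only once jiffies exceeds
 * rate_last + (ip_rt_redirect_load << rate_tokens), i.e. backoffs of
 * 20ms, 40ms, 80ms, ... up to ~10s for tokens 0..9. After
 * ip_rt_redirect_number (9) redirects the host is assumed deaf; about 20s
 * of silence (ip_rt_redirect_silence) resets rate_tokens to zero.
 */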
void ip_rt_send_redirect(struct sk_buff *skb)
	struct rtable *rt = (struct rtable *)skb->dst;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!IN_DEV_TX_REDIRECTS(in_dev))

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
			       "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
			       NIPQUAD(rt->rt_src), rt->rt_iif,
			       NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
static int ip_error(struct sk_buff *skb)
	struct rtable *rt = (struct rtable *)skb->dst;

	switch (rt->u.dst.error) {
			code = ICMP_HOST_UNREACH;
			code = ICMP_NET_UNREACH;
			code = ICMP_PKT_FILTERED;

	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
	{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
	for (i = 0; i < sizeof(mtu_plateau) / sizeof(mtu_plateau[0]); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
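/* guess_mtu() implements the RFC 1191 plateau search: given a too-big MTU
 * it returns the next smaller "likely" link MTU. For example, an old_mtu
 * of 1500 yields 1492 (IEEE 802.3), and 1006 yields 576. It is only
 * consulted when the router's "fragmentation needed" ICMP did not carry a
 * plausible next-hop MTU (new_mtu < 68 or new_mtu >= old_mtu below).
 */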
unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
	unsigned short old_mtu = ntohs(iph->tot_len);
	u32 skeys[2] = { iph->saddr, 0, };
	u32 daddr = iph->daddr;
	u8 tos = iph->tos & IPTOS_RT_MASK;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash_code(daddr, skeys[i], tos);

		read_lock(&rt_hash_table[hash].lock);
		for (rth = rt_hash_table[hash].chain; rth;
		     rth = rth->u.rt_next) {
			if (rth->key.dst == daddr &&
			    rth->key.src == skeys[i] &&
			    rth->rt_dst == daddr &&
			    rth->rt_src == iph->saddr &&
			    rth->key.tos == tos &&
			    rth->key.iif == 0 &&
			    !(rth->u.dst.mxlock & (1 << RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					    old_mtu >= rth->u.dst.pmtu &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				if (mtu <= rth->u.dst.pmtu) {
					if (mtu < rth->u.dst.pmtu) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.mxlock |=
						rth->u.dst.pmtu = mtu;
						dst_set_expires(&rth->u.dst,
		read_unlock(&rt_hash_table[hash].lock);

	return est_mtu ? : new_mtu;
void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
	if (dst->pmtu > mtu && mtu >= 68 &&
	    !(dst->mxlock & (1 << RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->mxlock |= (1 << RTAX_MTU);
		dst_set_expires(dst, ip_rt_mtu_expires);
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)

static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
					  struct sk_buff *skb)

static void ipv4_dst_destroy(struct dst_entry *dst)
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

static void ipv4_link_failure(struct sk_buff *skb)
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
		dst_set_expires(&rt->u.dst, 0);

static int ip_rt_bug(struct sk_buff *skb)
	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
	       NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
	       skb->dev ? skb->dev->name : "?");
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned.
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
	struct fib_result res;

	if (rt->key.iif == 0)
	else if (fib_lookup(&rt->key, &res) == 0) {
#ifdef CONFIG_IP_ROUTE_NAT
		if (res.type == RTN_NAT)
			src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
			src = FIB_RES_PREFSRC(res);
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
	memcpy(addr, &src, 4);
#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
	struct fib_info *fi = res->fi;

		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(&rt->u.dst.mxlock, fi->fib_metrics,
		       sizeof(fi->fib_metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.pmtu = rt->u.dst.dev->mtu;
			if (rt->u.dst.mxlock & (1 << RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.pmtu > 576)
				rt->u.dst.pmtu = 576;
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
		rt->u.dst.pmtu = rt->u.dst.dev->mtu;

	if (rt->u.dst.pmtu > IP_MAX_MTU)
		rt->u.dst.pmtu = IP_MAX_MTU;
	if (rt->u.dst.advmss == 0)
		rt->u.dst.advmss = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
	if (rt->u.dst.advmss > 65535 - 40)
		rt->u.dst.advmss = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
	set_class_tag(rt, itag);
	rt->rt_type = res->type;
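/* Worked MSS/MTU arithmetic (illustrative): the "- 40" terms above account
 * for 20 bytes of IPv4 header plus 20 bytes of TCP header, so a 1500-byte
 * device MTU advertises an advmss of 1460, clamped below by
 * ip_rt_min_advmss and above by 65535 - 40. The 576-byte fallback for
 * locked MTUs on gatewayed routes is the classic minimum that RFC 1122
 * requires every host to accept.
 */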
static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
			     u8 tos, struct net_device *dev, int our)
	struct in_device *in_dev = in_dev_get(dev);

	/* Primary sanity checks. */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
	    skb->protocol != htons(ETH_P_IP))

	if (ZERONET(saddr)) {
		if (!LOCAL_MCAST(daddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
				       dev, &spec_dst, &itag) < 0)

	rth = dst_alloc(&ipv4_dst_ops);

	rth->u.dst.output = ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	rth->key.dst	= daddr;
	rth->rt_dst	= daddr;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->key.fwmark	= skb->nfmark;
	rth->key.src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map	= daddr;
	rth->rt_src_map	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
	rth->key.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_flags	= RTCF_MULTICAST;
		rth->u.dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;

#ifdef CONFIG_IP_MROUTE
	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
	rt_cache_stat[smp_processor_id()].in_slow_mc++;

	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
	return rt_intern_hash(hash, rth, (struct rtable **) &skb->dst);
/*
 * NOTE. We drop all packets that have a local source
 * address, because every properly looped-back packet
 * must have the correct destination already attached by the output routine.
 *
 * Such an approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 */

int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
			u8 tos, struct net_device *dev)
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct in_device *out_dev = NULL;
	struct rtable *rth;

	/* IP on this device is disabled. */

#ifdef CONFIG_IP_ROUTE_FWMARK
	key.fwmark	= skb->nfmark;
	key.iif		= dev->ifindex;
	key.scope	= RT_SCOPE_UNIVERSE;

	hash = rt_hash_code(daddr, saddr ^ (key.iif << 5), tos);

	/* Check for the weirdest martians, which cannot be detected
	 */
	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))

	/* Accept zero addresses only for limited broadcast;
	 * I do not even know whether to fix this or not. Waiting for complaints :-)
	 */
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	if ((err = fib_lookup(&key, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))

	rt_cache_stat[smp_processor_id()].in_slow_tot++;

#ifdef CONFIG_IP_ROUTE_NAT
	/* Policy is applied before mapping the destination,
	   but rerouting after the map should be done with the old source.
	 */

		u32 src_map = saddr;

		src_map = fib_rules_policy(saddr, &res, &flags);

		if (res.type == RTN_NAT) {
			key.dst = fib_rules_map_destination(daddr, &res);

			if (fib_lookup(&key, &res))

			if (res.type != RTN_UNICAST)

	if (res.type == RTN_BROADCAST)

	if (res.type == RTN_LOCAL) {
		result = fib_validate_source(saddr, daddr, tos,
					     loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
			goto martian_source;
			flags |= RTCF_DIRECTSRC;

	if (!IN_DEV_FORWARD(in_dev))
	if (res.type != RTN_UNICAST)
		goto martian_destination;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && key.oif == 0)
		fib_select_multipath(&key, &res);
	out_dev = in_dev_get(FIB_RES_DEV(res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
			       "Please, report\n");

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
		goto martian_source;
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
		flags |= RTCF_DOREDIRECT;
	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT))

	rth = dst_alloc(&ipv4_dst_ops);

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	rth->key.dst	= daddr;
	rth->rt_dst	= daddr;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->key.fwmark	= skb->nfmark;
	rth->key.src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_src_map	= key.src;
	rth->rt_dst_map	= key.dst;
	if (flags & RTCF_DNAT)
		rth->rt_gateway	= key.dst;
	rth->key.iif	= dev->ifindex;
	rth->u.dst.dev	= out_dev->dev;
	dev_hold(rth->u.dst.dev);
	rth->rt_spec_dst = spec_dst;

	rth->u.dst.input  = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, &res, itag);

	rth->rt_flags = flags;

#ifdef CONFIG_NET_FASTROUTE
	if (netdev_fastroute && !(flags & (RTCF_NAT | RTCF_MASQ | RTCF_DOREDIRECT))) {
		struct net_device *odev = rth->u.dst.dev;
		    dev->accept_fastpath &&
		    odev->mtu >= dev->mtu &&
		    dev->accept_fastpath(dev, &rth->u.dst) == 0)
			rth->rt_flags |= RTCF_FAST;

	err = rt_intern_hash(hash, rth, (struct rtable **) &skb->dst);

	in_dev_put(out_dev);
	if (skb->protocol != htons(ETH_P_IP))

	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
			goto martian_source;
			flags |= RTCF_DIRECTSRC;

	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	rt_cache_stat[smp_processor_id()].in_brd++;

	rth = dst_alloc(&ipv4_dst_ops);

	rth->u.dst.output = ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	rth->key.dst	= daddr;
	rth->rt_dst	= daddr;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->key.fwmark	= skb->nfmark;
	rth->key.src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map	= key.dst;
	rth->rt_src_map	= key.src;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
	rth->key.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->u.dst.input = ip_local_deliver;
	rth->rt_flags	= flags | RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input = ip_error;
		rth->u.dst.error = -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	rth->rt_type	= res.type;
	rt_cache_stat[smp_processor_id()].in_no_route++;
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC 1812).
	 */
martian_destination:
	rt_cache_stat[smp_processor_id()].in_martian_dst++;
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
		       "%u.%u.%u.%u, dev %s\n",
		       NIPQUAD(daddr), NIPQUAD(saddr), dev->name);

martian_source:
	rt_cache_stat[smp_processor_id()].in_martian_src++;
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC 1812 recommendation: if the source is martian,
		 *	the only hint we can give is the MAC header.
		 */
		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
		       "%u.%u.%u.%u, on dev %s\n",
		       NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len) {
			int i;
			unsigned char *p = skb->mac.raw;
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				if (i < (dev->hard_header_len - 1))
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
		   u8 tos, struct net_device *dev)
	struct rtable *rth;
	int iif = dev->ifindex;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);

	read_lock(&rt_hash_table[hash].lock);
	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
		if (rth->key.dst == daddr &&
		    rth->key.src == saddr &&
		    rth->key.iif == iif &&
		    rth->key.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->key.fwmark == skb->nfmark &&
		    rth->key.tos == tos) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rt_cache_stat[smp_processor_id()].in_hit++;
			read_unlock(&rt_hash_table[hash].lock);
			skb->dst = (struct dst_entry *)rth;
		rt_cache_stat[smp_processor_id()].in_hlist_search++;
	read_unlock(&rt_hash_table[hash].lock);
	/* Multicast recognition logic was moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, e.g. for
	   SDR messages from all over the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		struct in_device *in_dev;

		read_lock(&inetdev_lock);
		if ((in_dev = __in_dev_get(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr);
#ifdef CONFIG_IP_MROUTE
			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
				read_unlock(&inetdev_lock);
				return ip_route_input_mc(skb, daddr, saddr,
		read_unlock(&inetdev_lock);

	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
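/* Caller's view (a sketch; the real call site is the IP receive path):
 * ip_route_input() either attaches a cached dst to the skb and returns 0,
 * or falls through to ip_route_input_slow() to consult the FIB and insert
 * a fresh cache entry. Roughly:
 *
 *	if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
 *		goto drop;			(no route; free the skb)
 *	return skb->dst->input(skb);		(ip_forward/ip_local_deliver)
 */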
/*
 * Major route resolver routine.
 */

int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey)
	struct fib_result res;
	struct net_device *dev_out = NULL;

	tos	 = oldkey->tos & (IPTOS_RT_MASK | RTO_ONLINK);
	key.dst	 = oldkey->dst;
	key.src	 = oldkey->src;
	key.tos	 = tos & IPTOS_RT_MASK;
	key.iif	 = loopback_dev.ifindex;
	key.oif	 = oldkey->oif;
#ifdef CONFIG_IP_ROUTE_FWMARK
	key.fwmark = oldkey->fwmark;
	key.scope = (tos & RTO_ONLINK) ? RT_SCOPE_LINK :
#ifdef CONFIG_IP_MULTIPLE_TABLES

		if (MULTICAST(oldkey->src) ||
		    BADCLASS(oldkey->src) ||
		    ZERONET(oldkey->src))

		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(oldkey->src);
		if (dev_out == NULL)

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(saddr) can return a wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oldkey->oif == 0
		    && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) {
			/* Special hack: the user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (look,
			   the routing cache cannot know that ttl is zero, so
			   the packet will not leave this host and the route
			   is valid). Luckily, this hack is a good workaround.
			 */

			key.oif = dev_out->ifindex;

		dev_out = dev_get_by_index(oldkey->oif);

		if (dev_out == NULL)
		if (__in_dev_get(dev_out) == NULL) {
			goto out;	/* Wrong error code */

		if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) {
			key.src = inet_select_addr(dev_out, 0,

			if (MULTICAST(oldkey->dst))
				key.src = inet_select_addr(dev_out, 0,
			else if (!oldkey->dst)
				key.src = inet_select_addr(dev_out, 0,

		key.dst = key.src = htonl(INADDR_LOOPBACK);
		dev_out = &loopback_dev;
		key.oif = loopback_dev.ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
	if (fib_lookup(&key, &res)) {

			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   Why? Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch whether the destination is gatewayed,
			   rather than direct. Moreover, if MSG_DONTROUTE is
			   set, we send the packet, ignoring both the routing
			   tables and the ifaddr state. --ANK

			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			key.src = inet_select_addr(dev_out, 0,
			res.type = RTN_UNICAST;

	if (res.type == RTN_NAT)

	if (res.type == RTN_LOCAL) {
		dev_out = &loopback_dev;
		key.oif = dev_out->ifindex;
			fib_info_put(res.fi);
		flags |= RTCF_LOCAL;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && key.oif == 0)
		fib_select_multipath(&key, &res);
	if (!res.prefixlen && res.type == RTN_UNICAST && !key.oif)
		fib_select_default(&key, &res);

		key.src = FIB_RES_PREFSRC(res);

	dev_out = FIB_RES_DEV(res);
	key.oif = dev_out->ifindex;

	if (LOOPBACK(key.src) && !(dev_out->flags & IFF_LOOPBACK))

	if (key.dst == 0xFFFFFFFF)
		res.type = RTN_BROADCAST;
	else if (MULTICAST(key.dst))
		res.type = RTN_MULTICAST;
	else if (BADCLASS(key.dst) || ZERONET(key.dst))

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	if (res.type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
			fib_info_put(res.fi);
	} else if (res.type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		read_lock(&inetdev_lock);
		if (!__in_dev_get(dev_out) ||
		    !ip_check_mc(__in_dev_get(dev_out), oldkey->dst, oldkey->src))
			flags &= ~RTCF_LOCAL;
		read_unlock(&inetdev_lock);
		/* If the multicast route does not exist, use the
		   default one, but do not gateway in this case.
		 */
		if (res.fi && res.prefixlen < 4) {
			fib_info_put(res.fi);
	rth = dst_alloc(&ipv4_dst_ops);

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	rth->key.dst	= oldkey->dst;
	rth->key.src	= oldkey->src;
	rth->key.oif	= oldkey->oif;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->key.fwmark	= oldkey->fwmark;
	rth->rt_dst	= key.dst;
	rth->rt_src	= key.src;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map	= key.dst;
	rth->rt_src_map	= key.src;
	rth->rt_iif	= oldkey->oif ? : dev_out->ifindex;
	rth->u.dst.dev	= dev_out;
	rth->rt_gateway = key.dst;
	rth->rt_spec_dst = key.src;

	rth->u.dst.output = ip_output;

	rt_cache_stat[smp_processor_id()].out_slow_tot++;

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = key.dst;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = key.src;
		if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			rt_cache_stat[smp_processor_id()].out_slow_mc++;
#ifdef CONFIG_IP_MROUTE
		if (res.type == RTN_MULTICAST) {
			struct in_device *in_dev = in_dev_get(dev_out);
				if (IN_DEV_MFORWARD(in_dev) &&
				    !LOCAL_MCAST(oldkey->dst)) {
					rth->u.dst.input = ip_mr_input;
					rth->u.dst.output = ip_mc_output;

	rt_set_nexthop(rth, &res, 0);

	rth->rt_flags = flags;

	hash = rt_hash_code(oldkey->dst, oldkey->src ^ (oldkey->oif << 5), tos);
	err = rt_intern_hash(hash, rth, rp);
int ip_route_output_key(struct rtable **rp, const struct rt_key *key)
	hash = rt_hash_code(key->dst, key->src ^ (key->oif << 5), key->tos);

	read_lock_bh(&rt_hash_table[hash].lock);
	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
		if (rth->key.dst == key->dst &&
		    rth->key.src == key->src &&
		    rth->key.iif == 0 &&
		    rth->key.oif == key->oif &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->key.fwmark == key->fwmark &&
		    !((rth->key.tos ^ key->tos) &
		      (IPTOS_RT_MASK | RTO_ONLINK))) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rt_cache_stat[smp_processor_id()].out_hit++;
			read_unlock_bh(&rt_hash_table[hash].lock);
		rt_cache_stat[smp_processor_id()].out_hlist_search++;
	read_unlock_bh(&rt_hash_table[hash].lock);

	return ip_route_output_slow(rp, key);
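/* Usage sketch (illustrative; error handling elided): a caller resolving
 * an output route fills an rt_key and drops the reference when done.
 *
 *	struct rt_key key = { dst: daddr, tos: RT_TOS(tos) };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(&rt, &key) == 0) {
 *		...				(use rt->u.dst, e.g. as skb->dst)
 *		ip_rt_put(rt);			(release the dst_hold() reference)
 *	}
 */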
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
	struct rtable *rt = (struct rtable *)skb->dst;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;
	struct rta_cacheinfo ci;
#ifdef CONFIG_IP_MROUTE
	struct rtattr *eptr;
	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
	r = NLMSG_DATA(nlh);
	nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	 = 32;
	r->rtm_tos	 = rt->key.tos;
	r->rtm_table	 = RT_TABLE_MAIN;
	r->rtm_type	 = rt->rt_type;
	r->rtm_scope	 = RT_SCOPE_UNIVERSE;
	r->rtm_protocol	 = RTPROT_UNSPEC;
	r->rtm_flags	 = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
		r->rtm_src_len = 32;
		RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
	else if (rt->rt_src != rt->key.src)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
	if (rt->rt_dst != rt->rt_gateway)
		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
	if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
		goto rtattr_failure;
	ci.rta_lastuse	= jiffies - rt->u.dst.lastuse;
	ci.rta_used	= rt->u.dst.__use;
	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
	if (rt->u.dst.expires)
		ci.rta_expires = rt->u.dst.expires - jiffies;
	ci.rta_error	= rt->u.dst.error;
	ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
		ci.rta_id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ci.rta_ts = rt->peer->tcp_ts;
			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
#ifdef CONFIG_IP_MROUTE
	eptr = (struct rtattr *)skb->tail;
	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
#ifdef CONFIG_IP_MROUTE
		u32 dst = rt->rt_dst;

		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
		    ipv4_devconf.mc_forwarding) {
			int err = ipmr_get_route(skb, r, nowait);
				if (err == -EMSGSIZE)
				((struct rta_cacheinfo *)RTA_DATA(eptr))->rta_error = err;
		RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);

	nlh->nlmsg_len = skb->tail - b;

	skb_trim(skb, b - skb->data);
int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
	struct rtattr **rta = arg;
	struct rtmsg *rtm = NLMSG_DATA(nlh);
	struct rtable *rt = NULL;
	struct sk_buff *skb;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb->mac.raw = skb->nh.raw = skb->data;

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	skb->nh.iph->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	if (rta[RTA_SRC - 1])
		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
	if (rta[RTA_DST - 1])
		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
	if (rta[RTA_IIF - 1])
		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));

		struct net_device *dev = __dev_get_by_index(iif);

		skb->protocol = htons(ETH_P_IP);

		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);

		rt = (struct rtable *)skb->dst;
		if (!err && rt->u.dst.error)
			err = -rt->u.dst.error;
		if (rta[RTA_OIF - 1])
			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
		err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);

	skb->dst = &rt->u.dst;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,

	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
	s_idx = idx = cb->args[1];
	for (h = 0; h <= rt_hash_mask; h++) {
		if (h < s_h)
			continue;
		read_lock_bh(&rt_hash_table[h].lock);
		for (rt = rt_hash_table[h].chain, idx = 0; rt;
		     rt = rt->u.rt_next, idx++) {
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 RTM_NEWROUTE, 1) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				read_unlock_bh(&rt_hash_table[h].lock);
			dst_release(xchg(&skb->dst, NULL));
		read_unlock_bh(&rt_hash_table[h].lock);

void ip_rt_multicast_event(struct in_device *in_dev)
#ifdef CONFIG_SYSCTL
static int flush_delay;

static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
				     struct file *filp, void *buffer,
		proc_dointvec(ctl, write, filp, buffer, lenp);
		rt_cache_flush(flush_delay);

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name,
					      int nlen, void *oldval,
					      size_t *oldlenp, void *newval,
					      size_t newlen, void **context)
	if (newlen != sizeof(int))
	if (get_user(delay, (int *)newval))
	rt_cache_flush(delay);
ctl_table ipv4_route_table[] = {
	{
		ctl_name:	NET_IPV4_ROUTE_FLUSH,
		maxlen:		sizeof(int),
		proc_handler:	&ipv4_sysctl_rtcache_flush,
		strategy:	&ipv4_sysctl_rtcache_flush_strategy,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_MIN_DELAY,
		procname:	"min_delay",
		data:		&ip_rt_min_delay,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec_jiffies,
		strategy:	&sysctl_jiffies,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_MAX_DELAY,
		procname:	"max_delay",
		data:		&ip_rt_max_delay,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec_jiffies,
		strategy:	&sysctl_jiffies,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_GC_THRESH,
		procname:	"gc_thresh",
		data:		&ipv4_dst_ops.gc_thresh,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_MAX_SIZE,
		procname:	"max_size",
		data:		&ip_rt_max_size,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		procname:	"gc_min_interval",
		data:		&ip_rt_gc_min_interval,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec_jiffies,
		strategy:	&sysctl_jiffies,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_GC_TIMEOUT,
		procname:	"gc_timeout",
		data:		&ip_rt_gc_timeout,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec_jiffies,
		strategy:	&sysctl_jiffies,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_GC_INTERVAL,
		procname:	"gc_interval",
		data:		&ip_rt_gc_interval,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec_jiffies,
		strategy:	&sysctl_jiffies,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_REDIRECT_LOAD,
		procname:	"redirect_load",
		data:		&ip_rt_redirect_load,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_REDIRECT_NUMBER,
		procname:	"redirect_number",
		data:		&ip_rt_redirect_number,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_REDIRECT_SILENCE,
		procname:	"redirect_silence",
		data:		&ip_rt_redirect_silence,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_ERROR_COST,
		procname:	"error_cost",
		data:		&ip_rt_error_cost,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_ERROR_BURST,
		procname:	"error_burst",
		data:		&ip_rt_error_burst,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_GC_ELASTICITY,
		procname:	"gc_elasticity",
		data:		&ip_rt_gc_elasticity,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_MTU_EXPIRES,
		procname:	"mtu_expires",
		data:		&ip_rt_mtu_expires,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec_jiffies,
		strategy:	&sysctl_jiffies,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_MIN_PMTU,
		procname:	"min_pmtu",
		data:		&ip_rt_min_pmtu,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_MIN_ADVMSS,
		procname:	"min_adv_mss",
		data:		&ip_rt_min_advmss,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec,
	},
	{
		ctl_name:	NET_IPV4_ROUTE_SECRET_INTERVAL,
		procname:	"secret_interval",
		data:		&ip_rt_secret_interval,
		maxlen:		sizeof(int),
		proc_handler:	&proc_dointvec_jiffies,
		strategy:	&sysctl_jiffies,
	},
};
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks. But you should have seen it before! --RR */

/* IP route accounting ptr for this logical cpu number. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + cpu_logical_map(i) * 256)

static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
	if ((offset & 3) || (length & 3))

	if (offset >= sizeof(struct ip_rt_acct) * 256) {

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;

	offset /= sizeof(u32);

		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
		u32 *dst = (u32 *) buffer;

		/* Copy the first cpu. */
		memcpy(dst, src, length);

		/* Add the other cpus in, one int at a time. */
		for (i = 1; i < smp_num_cpus; i++) {
			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;

			for (j = 0; j < length / 4; j++)
void __init ip_rt_init(void)
	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages >> 8)) ^
			     (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)

	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
		panic("IP: failed to allocate ip_rt_acct\n");
	memset(ip_rt_acct, 0, PAGE_SIZE << order);

	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
						     sizeof(struct rtable),
						     0, SLAB_HWCACHE_ALIGN,
	if (!ipv4_dst_ops.kmem_cachep)
		panic("IP: failed to allocate ip_dst_cache\n");

	goal = num_physpages >> (26 - PAGE_SHIFT);

	for (order = 0; (1UL << order) < goal; order++)

		rt_hash_mask = (1UL << order) * PAGE_SIZE /
			       sizeof(struct rt_hash_bucket);
		while (rt_hash_mask & (rt_hash_mask - 1))
		rt_hash_table = (struct rt_hash_bucket *)
			__get_free_pages(GFP_ATOMIC, order);
	} while (rt_hash_table == NULL && --order > 0);

		panic("Failed to allocate IP route cache hash table\n");

	printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
	       (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);

	for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)

	for (i = 0; i <= rt_hash_mask; i++) {
		rt_hash_table[i].lock = RW_LOCK_UNLOCKED;
		rt_hash_table[i].chain = NULL;

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;
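/* Sizing example (illustrative, 4 KB pages): on a 128 MB machine
 * num_physpages is 32768, so goal = 32768 >> 14 = 2 pages; two pages of
 * 16-byte rt_hash_bucket entries give 512 buckets (rt_hash_log = 9).
 * gc_thresh then becomes 512 and ip_rt_max_size 8192, i.e. garbage
 * collection starts once the cache holds about one entry per bucket and
 * hard-caps at sixteen per bucket.
 */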
	rt_flush_timer.function = rt_run_flush;
	rt_periodic_timer.function = rt_check_expire;
	rt_secret_timer.function = rt_secret_rebuild;

	/* All the timers started at system startup tend
	   to synchronize. Perturb them a bit.
	 */
	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
	add_timer(&rt_periodic_timer);

	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
				  ip_rt_secret_interval;
	add_timer(&rt_secret_timer);

	proc_net_create("rt_cache", 0, rt_cache_get_info);
	create_proc_info_entry("rt_cache", 0, proc_net_stat,
			       rt_cache_stat_get_info);
#ifdef CONFIG_NET_CLS_ROUTE
	create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL);