/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.5 2006/09/05 08:03:56 michaelc Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <linux/sysctl.h>

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

int ip_rt_min_delay		= 2 * HZ;
int ip_rt_max_delay		= 10 * HZ;
int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
int ip_rt_gc_interval		= 60 * HZ;
int ip_rt_gc_min_interval	= HZ / 2;
int ip_rt_redirect_number	= 9;
int ip_rt_redirect_load		= HZ / 50;
int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
int ip_rt_error_cost		= HZ;
int ip_rt_error_burst		= 5 * HZ;
int ip_rt_gc_elasticity		= 1;
int ip_rt_mtu_expires		= 10 * 60 * HZ;
int ip_rt_min_pmtu		= 512 + 20 + 20;
int ip_rt_min_advmss		= 256;
int ip_rt_secret_interval	= 10 * 60 * HZ;
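
/* Note (added commentary, not in the original file): the silence window
 * above is the end of the backoff series used by ip_rt_send_redirect():
 * ip_rt_redirect_silence == ip_rt_redirect_load << (ip_rt_redirect_number + 1),
 * i.e. (HZ / 50) << 10, roughly 20 seconds of redirect-free traffic before
 * the redirect counter is allowed to rearm.
 */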
static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);

static struct dst_ops ipv4_dst_ops = {
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.entry_size =		sizeof(struct rtable),
};

#define ECN_OR_COST(class)	TC_PRIO_##class
__u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lookup made under RCU protection.
 */
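
/* Illustrative sketch (added commentary, not from the original source):
 * the lockless lookup this scheme implies. Readers walk a bucket chain
 * with no bucket lock taken; keys_match() here is a hypothetical stand-in
 * for the real flow comparison done by the lookup routines further down.
 *
 *	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
 *		smp_read_barrier_depends();
 *		if (keys_match(&rth->fl, fl)) {
 *			dst_hold(&rth->u.dst);   (rule 3: atomic increment)
 *			return rth;
 *		}
 *	}
 */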
struct rt_hash_bucket {
	struct rtable	*chain;
	spinlock_t	lock;
} __attribute__((__aligned__(8)));

static struct rt_hash_bucket 	*rt_hash_table;
static unsigned			rt_hash_mask;
static int			rt_hash_log;
static unsigned int		rt_hash_rnd;

struct rt_cache_stat *rt_cache_stat;

static int rt_intern_hash(unsigned hash, struct rtable *rth,
				struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
	return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
		& rt_hash_mask);
}
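
/* Callers fold the interface index into the source key before hashing,
 * e.g. rt_hash_code(daddr, saddr ^ (iif << 5), tos) on the input path
 * and rt_hash_code(daddr, saddr ^ (oif << 5), tos) on the output path
 * (see below), so identical address pairs on different interfaces land
 * in different buckets.
 */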
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rtable *r = NULL;
	struct rt_cache_iter_state *st = seq->private;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock();
		r = rt_hash_table[st->bucket].chain;
		if (r)
			break;
		rcu_read_unlock();
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = r->u.rt_next;
	smp_read_barrier_depends();
	while (!r) {
		rcu_read_unlock();
		if (--st->bucket < 0)
			break;
		rcu_read_lock();
		r = rt_hash_table[st->bucket].chain;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r = NULL;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		char temp[256];

		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst);
		seq_printf(seq, "%-127s\n", temp);
	}
	return 0;
}

static struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc = -ENOMEM;
	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		return rc;
	rc = seq_open(file, &rt_cache_seq_ops);
	if (rc) {
		kfree(s);
		return rc;
	}
	seq = file->private_data;
	memset(s, 0, sizeof(*s));
	seq->private = s;
	return 0;
}

static struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_private,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu;
		return per_cpu_ptr(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos + 1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu;
		return per_cpu_ptr(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit, st->in_slow_tot, st->in_slow_mc,
		   st->in_no_route, st->in_brd, st->in_martian_dst,
		   st->in_martian_src, st->out_hit, st->out_slow_tot,
		   st->out_slow_mc, st->gc_total, st->gc_ignored,
		   st->gc_goal_miss, st->gc_dst_overflow,
		   st->in_hlist_search, st->out_hlist_search);
	return 0;
}

static struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#endif /* CONFIG_PROC_FS */
static __inline__ void rt_free(struct rtable *rt)
{
	call_rcu(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;

	if (atomic_read(&rth->u.dst.__refcnt))
		return 0;

	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		return 1;

	age = jiffies - rth->u.dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		return 0;
	return 1;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
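
/* Worked example (added commentary, not from the original source): for an
 * unreferenced unicast output route (fl.iif == 0) last used 100 jiffies
 * ago, score = ~100 & ~(3<<30) keeps the low 30 bits (more recently used
 * means a higher counter), and bit 30 is set because it is an output
 * route. Bit 31 is set only for valuable entries (redirected, notify, or
 * expiring), so the eviction candidate in rt_intern_hash is the lowest
 * score: stale, input-path, non-valuable entries go first.
 */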
/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
	static int rover;
	int i = rover, t;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;

	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
	     t -= ip_rt_gc_timeout) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		spin_lock(&rt_hash_table[i].lock);
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(now, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
			*rthp = rth->u.rt_next;
			rt_free(rth);
		}
		spin_unlock(&rt_hash_table[i].lock);

		/* Fallback loop breaker. */
		if (time_after(jiffies, now))
			break;
	}
	rover = i;
	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
	int i;
	struct rtable *rth, *next;

	rt_deadline = 0;

	get_random_bytes(&rt_hash_rnd, 4);

	for (i = rt_hash_mask; i >= 0; i--) {
		spin_lock_bh(&rt_hash_table[i].lock);
		rth = rt_hash_table[i].chain;
		if (rth)
			rt_hash_table[i].chain = NULL;
		spin_unlock_bh(&rt_hash_table[i].lock);

		for (; rth; rth = next) {
			next = rth->u.rt_next;
			rt_free(rth);
		}
	}
}

static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;

void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

	if (delay < 0)
		delay = ip_rt_min_delay;

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If the flush timer is already running
		   and the flush request is not immediate (delay > 0):

		   if the deadline has not been reached yet, prolong the
		   timer to "delay", otherwise fire it at the deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay - ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now + delay);
	spin_unlock_bh(&rt_flush_lock);
}

static void rt_secret_rebuild(unsigned long dummy)
{
	unsigned long now = jiffies;

	rt_cache_flush(0);
	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   expire is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */
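
/* Illustrative numbers (added commentary, not from the original source):
 * with gc_thresh at one entry per bucket and ip_rt_gc_elasticity == 1, a
 * burst that pushes the cache well past gc_thresh makes goal > 0, so
 * equilibrium is raised by min(goal/2, number of buckets) and "expire"
 * is halved on every pass that misses its goal; once the cache drops back
 * below gc_thresh, expire is allowed to grow again by
 * ip_rt_gc_min_interval per successful call (see the end of
 * rt_garbage_collect below).
 */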
static int rt_garbage_collect(void)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate the number of entries we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(&rt_hash_table[k].lock);
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
				*rthp = rth->u.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(&rt_hash_table[k].lock);
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire has been reduced to zero; otherwise expire is halved.
		   - the table is not full.
		   - we are called from interrupt context.
		   - the jiffies check is just a fallback/debug loop breaker.
		   We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
	       fl1->oif == fl2->oif &&
	       fl1->iif == fl2->iif;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable	*rth, **rthp;
	unsigned long	now = jiffies;
	struct rtable	*cand, **candp;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(&rt_hash_table[hash].lock);
	while ((rth = *rthp) != NULL) {
		if (compare_keys(&rth->fl, &rt->fl)) {
			/* Put it first. */
			*rthp = rth->u.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			smp_wmb();
			rth->u.rt_next = rt_hash_table[hash].chain;
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			smp_wmb();
			rt_hash_table[hash].chain = rth;

			dst_hold(&rth->u.dst);
			rth->u.dst.lastuse = now;
			spin_unlock_bh(&rt_hash_table[hash].lock);

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(&rt_hash_table[hash].lock);

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(&rt_hash_table[hash].lock);
	*rp = rt;
	return 0;
}
void rt_bind_peer(struct rtable *rt, int create)
{
	static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));

	ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	spin_lock_bh(&rt_hash_table[hash].lock);
	ip_rt_put(rt);
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.rt_next)
		if (*rthp == rt) {
			*rthp = rt->u.rt_next;
			rt_free(rt);
			break;
		}
	spin_unlock_bh(&rt_hash_table[hash].lock);
}
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
		    u32 saddr, u8 tos, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	u32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };

	tos &= IPTOS_RT_MASK;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash_code(daddr,
						     skeys[i] ^ (ikeys[k] << 5),
						     tos);

			rthp = &rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = *rthp) != NULL) {
				struct rtable *rt;

				smp_read_barrier_depends();
				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.fl4_tos != tos ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0) {
					rthp = &rth->u.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;

				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
			"tos %02x\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
	in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable*)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash_code(rt->fl.fl4_dst,
						     rt->fl.fl4_src ^
							(rt->fl.oif << 5),
						     rt->fl.fl4_tos);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ip_rt_advice: redirect to "
					  "%u.%u.%u.%u/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}
/*
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
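
/* Worked example (added commentary, not from the original source): with
 * the defaults above, redirect number k (counted by rate_tokens) is sent
 * only once jiffies passes rate_last + (ip_rt_redirect_load << k), i.e.
 * 20ms, 40ms, 80ms, ... between redirects. After ip_rt_redirect_number
 * (9) unanswered redirects the sender goes quiet, and ip_rt_redirect_silence
 * (~20s) without triggering packets resets rate_tokens to zero,
 * restarting the sequence.
 */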
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
		case EINVAL:
		default:
			goto out;
		case EHOSTUNREACH:
			code = ICMP_HOST_UNREACH;
			break;
		case ENETUNREACH:
			code = ICMP_NET_UNREACH;
			break;
		case EACCES:
			code = ICMP_PKT_FILTERED;
			break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
	{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
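
/* Example (added commentary, not from the original source): when a router
 * reports frag-needed without a next-hop MTU, we guess from the offending
 * packet size by stepping down the RFC 1191 plateau table, e.g.
 * guess_mtu(1500) -> 1492 and guess_mtu(576) -> 296; each further failure
 * walks one plateau lower, bottoming out at the 68-byte IPv4 minimum.
 */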
unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	u32  skeys[2] = { iph->saddr, 0, };
	u32  daddr = iph->daddr;
	u8   tos = iph->tos & IPTOS_RT_MASK;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash_code(daddr, skeys[i], tos);

		rcu_read_lock();
		for (rth = rt_hash_table[hash].chain; rth;
		     rth = rth->u.rt_next) {
			smp_read_barrier_depends();
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst == daddr &&
			    rth->rt_src == iph->saddr &&
			    rth->fl.fl4_tos == tos &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
	}
}
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	u32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(&rt->fl, &res) == 0) {
#ifdef CONFIG_IP_ROUTE_NAT
		if (res.type == RTN_NAT)
			src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
						RT_SCOPE_UNIVERSE);
		else
#endif
			src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	u32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr)) {
		if (!LOCAL_MCAST(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map	= daddr;
	rth->rt_src_map	= saddr;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_flags	= RTCF_MULTICAST;
	if (our) {
		rth->u.dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;
e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}
/*
 *	NOTE. We drop all the packets that have a local source
 *	address, because every properly looped back packet
 *	must have the correct destination already attached by the
 *	output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 */
static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct in_device *out_dev = NULL;
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = skb->nfmark
#endif
				      } },
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	u32		spec_dst;
	int		err = -EINVAL;
	int		free_res = 0;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	if ((err = fib_lookup(&fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_inval;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

#ifdef CONFIG_IP_ROUTE_NAT
	/* Policy is applied before mapping destination,
	   but rerouting after the map should be made with the old source.
	 */

	if (1) {
		u32 src_map = saddr;
		if (res.r)
			src_map = fib_rules_policy(saddr, &res, &flags);

		if (res.type == RTN_NAT) {
			fl.fl4_dst = fib_rules_map_destination(daddr, &res);
			fib_res_put(&res);
			free_res = 0;
			if (fib_lookup(&fl, &res))
				goto e_inval;
			free_res = 1;
			if (res.type != RTN_UNICAST)
				goto e_inval;
			flags |= RTCF_DNAT;
		}
		fl.fl4_src = src_map;
	}
#endif

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_inval;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
#endif
	out_dev = in_dev_get(FIB_RES_DEV(res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
					 "Please, report\n");
		goto e_inval;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
				  &spec_dst, &itag);
	if (err < 0)
		goto martian_source;

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
		flags |= RTCF_DOREDIRECT;
	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT))
			goto e_inval;
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_src_map	= fl.fl4_src;
	rth->rt_dst_map	= fl.fl4_dst;
	if (flags&RTCF_DNAT)
		rth->rt_gateway	= fl.fl4_dst;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= out_dev->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, &res, itag);

	rth->rt_flags = flags;

intern:
	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
done:
	in_dev_put(in_dev);
	if (out_dev)
		in_dev_put(out_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map	= fl.fl4_dst;
	rth->rt_src_map	= fl.fl4_src;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags 	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	goto intern;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
			"%u.%u.%u.%u, on dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len) {
			int i;
			unsigned char *p = skb->mac.raw;
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
	goto e_inval;
}
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);

	rcu_read_lock();
	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
		smp_read_barrier_depends();
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif &&
		    rth->fl.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->fl.fl4_fwmark == skb->nfmark &&
#endif
		    rth->fl.fl4_tos == tos) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->dst = (struct dst_entry*)rth;
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result a host on a multicast
	   network acquires a lot of useless route cache entries, in effect
	   SDR messages from all over the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		struct in_device *in_dev;

		read_lock(&inetdev_lock);
		if ((in_dev = __in_dev_get(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
				skb->nh.iph->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				read_unlock(&inetdev_lock);
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		read_unlock(&inetdev_lock);
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
/*
 * Major route resolver routine.
 */

static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
{
	u32 tos	= oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = oldflp->fl4_fwmark
#endif
				      } },
			    .iif = loopback_dev.ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct rtable *rth;
	struct net_device *dev_out = NULL;
	struct in_device *in_dev = NULL;
	unsigned hash;
	int free_res = 0;
	int err;

	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (MULTICAST(oldflp->fl4_src) ||
		    BADCLASS(oldflp->fl4_src) ||
		    ZERONET(oldflp->fl4_src))
			goto out;

		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(oldflp->fl4_src);
		if (dev_out == NULL)
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(saddr) can return the wrong iface, if saddr is
		      assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0
		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind the socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are broken,
			   because we are not allowed to build a multicast path
			   with a loopback source addr (look, the routing cache
			   cannot know that ttl is zero, so that the packet
			   will not leave this host and the route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		dev_out = NULL;
	}
	if (oldflp->oif) {
		dev_out = dev_get_by_index(oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;
		if (__in_dev_get(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (MULTICAST(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = loopback_dev.ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(&fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch whether the destination is gatewayed,
			   rather than direct. Moreover, if MSG_DONTROUTE
			   is set, we send a packet, ignoring both routing
			   tables and ifaddr state. --ANK

			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_NAT)
		goto e_inval;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(&fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;

make_route:
	if (LOOPBACK(fl.fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
		goto e_inval;

	if (fl.fl4_dst == 0xFFFFFFFF)
		res.type = RTN_BROADCAST;
	else if (MULTICAST(fl.fl4_dst))
		res.type = RTN_MULTICAST;
	else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst))
		goto e_inval;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		goto e_inval;

	if (res.type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res.fi) {
			fib_info_put(res.fi);
			res.fi = NULL;
		}
	} else if (res.type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If the multicast route does not exist use
		   the default one, but do not gateway in this case.
		   Yes, it is a hack.
		 */
		if (res.fi && res.prefixlen < 4) {
			fib_info_put(res.fi);
			res.fi = NULL;
		}
	}
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
#endif
	rth->rt_dst	= fl.fl4_dst;
	rth->rt_src	= fl.fl4_src;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map	= fl.fl4_dst;
	rth->rt_src_map	= fl.fl4_src;
#endif
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->idev	= in_dev_get(dev_out);
	rth->rt_gateway = fl.fl4_dst;
	rth->rt_spec_dst= fl.fl4_src;

	rth->u.dst.output=ip_output;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl.fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl.fl4_src;
		if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res.type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !LOCAL_MCAST(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, &res, 0);

	rth->rt_flags = flags;

	hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos);
	err = rt_intern_hash(hash, rth, rp);
done:
	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
	if (in_dev)
		in_dev_put(in_dev);
out:	return err;

e_inval:
	err = -EINVAL;
	goto done;
e_nobufs:
	err = -ENOBUFS;
	goto done;
}
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);

	rcu_read_lock();
	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
		smp_read_barrier_depends();
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
#endif
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK))) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock();

	return ip_route_output_slow(rp, flp);
}

int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
	int err;

	if ((err = __ip_route_output_key(rp, flp)) != 0)
		return err;
	return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, NULL, 0) : 0;
}

int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(rp, flp)) != 0)
		return err;
	return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, sk, flags) : 0;
}
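
/* Usage sketch (added commentary, not from the original source): a typical
 * in-kernel caller builds a flowi key and lets the cache or the slow path
 * resolve it; dip, sip and tos below are placeholders:
 *
 *	struct rtable *rt;
 *	struct flowi fl = { .oif = 0,
 *			    .nl_u = { .ip4_u = { .daddr = dip,
 *						 .saddr = sip,
 *						 .tos = RT_TOS(tos) } } };
 *	if (ip_route_output_key(&rt, &fl) == 0) {
 *		... use rt->u.dst ...
 *		ip_rt_put(rt);
 *	}
 *
 * ip_rt_put() drops the reference taken by the lookup.
 */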
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct rtmsg *r;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;
	struct rta_cacheinfo ci;
#ifdef CONFIG_IP_MROUTE
	struct rtattr *eptr;
#endif
	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
	r = NLMSG_DATA(nlh);
	nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
#endif
	if (rt->fl.iif)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
	if (rt->rt_dst != rt->rt_gateway)
		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto rtattr_failure;
	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
	ci.rta_used	= rt->u.dst.__use;
	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
	if (rt->u.dst.expires)
		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
	else
		ci.rta_expires = 0;
	ci.rta_error	= rt->u.dst.error;
	ci.rta_id	= ci.rta_ts = ci.rta_tsage = 0;
	if (rt->peer) {
		ci.rta_id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ci.rta_ts = rt->peer->tcp_ts;
			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
		}
	}
#ifdef CONFIG_IP_MROUTE
	eptr = (struct rtattr*)skb->tail;
#endif
	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		u32 dst = rt->rt_dst;

		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
		    ipv4_devconf.mc_forwarding) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nlmsg_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nlmsg_failure;
					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
				}
			}
		} else
#endif
			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
	}

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct rtattr **rta = arg;
	struct rtmsg *rtm = NLMSG_DATA(nlh);
	struct rtable *rt = NULL;
	u32 dst = 0;
	u32 src = 0;
	int iif = 0;
	int err = -ENOBUFS;
	struct sk_buff *skb;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		goto out;

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb->mac.raw = skb->data;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	if (rta[RTA_SRC - 1])
		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
	if (rta[RTA_DST - 1])
		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
	if (rta[RTA_IIF - 1])
		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));

	if (iif) {
		struct net_device *dev = __dev_get_by_index(iif);
		err = -ENODEV;
		if (!dev)
			goto out_free;
		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();
		rt = (struct rtable*)skb->dst;
		if (!err && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
							 .saddr = src,
							 .tos = rtm->rtm_tos } } };
		int oif = 0;
		if (rta[RTA_OIF - 1])
			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
		fl.oif = oif;
		err = ip_route_output_key(&rt, &fl);
	}
	if (err)
		goto out_free;

	skb->dst = &rt->u.dst;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
				RTM_NEWROUTE, 0);
	if (!err)
		goto out_free;
	if (err < 0) {
		err = -EMSGSIZE;
		goto out_free;
	}

	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
	if (err > 0)
		err = 0;
out:	return err;

out_free:
	kfree_skb(skb);
	goto out;
}

int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;

	s_h = cb->args[0];
	s_idx = idx = cb->args[1];
	for (h = 0; h <= rt_hash_mask; h++) {
		if (h < s_h) continue;
		if (h > s_h)
			s_idx = 0;
		rcu_read_lock();
		for (rt = rt_hash_table[h].chain, idx = 0; rt;
		     rt = rt->u.rt_next, idx++) {
			smp_read_barrier_depends();
			if (idx < s_idx)
				continue;
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq,
					 RTM_NEWROUTE, 1) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				rcu_read_unlock();
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		rcu_read_unlock();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
#ifdef CONFIG_SYSCTL
static int flush_delay;

static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
					struct file *filp, void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
		rt_cache_flush(flush_delay);
		return 0;
	}

	return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
						int __user *name,
						int nlen,
						void __user *oldval,
						size_t __user *oldlenp,
						void __user *newval,
						size_t newlen,
						void **context)
{
	int delay;
	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int __user *)newval))
		return -EFAULT;
	rt_cache_flush(delay);
	return 0;
}
ctl_table ipv4_route_table[] = {
	{
		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
		.procname	= "flush",
		.data		= &flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= &ipv4_sysctl_rtcache_flush,
		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
		.procname	= "min_delay",
		.data		= &ip_rt_min_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
		.procname	= "max_delay",
		.data		= &ip_rt_max_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{ .ctl_name = 0 }
};
#endif
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/* IP route accounting ptr for this logical cpu number. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)

#ifdef CONFIG_PROC_FS
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
		u32 *dst = (u32 *) buffer;

		/* Copy first cpu. */
		*start = buffer;
		memcpy(dst, src, length);

		/* Add the other cpus in, one int at a time */
		for (i = 1; i < NR_CPUS; i++) {
			unsigned int j;

			if (!cpu_possible(i))
				continue;

			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;

			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif /* CONFIG_PROC_FS */
#endif /* CONFIG_NET_CLS_ROUTE */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);

int __init ip_rt_init(void)
{
	int i, order, goal, rc = 0;

	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
			     (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
	for (order = 0;
	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
		/* NOTHING */;
	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
	memset(ip_rt_acct, 0, PAGE_SIZE << order);
#endif

	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
						     sizeof(struct rtable),
						     0, SLAB_HWCACHE_ALIGN,
						     NULL, NULL);

	if (!ipv4_dst_ops.kmem_cachep)
		panic("IP: failed to allocate ip_dst_cache\n");

	goal = num_physpages >> (26 - PAGE_SHIFT);
	if (rhash_entries)
		goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
	for (order = 0; (1UL << order) < goal; order++)
		/* NOTHING */;

	do {
		rt_hash_mask = (1UL << order) * PAGE_SIZE /
			sizeof(struct rt_hash_bucket);
		while (rt_hash_mask & (rt_hash_mask - 1))
			rt_hash_mask--;
		rt_hash_table = (struct rt_hash_bucket *)
			__get_free_pages(GFP_ATOMIC, order);
	} while (rt_hash_table == NULL && --order > 0);

	if (!rt_hash_table)
		panic("Failed to allocate IP route cache hash table\n");

	printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
	       rt_hash_mask,
	       (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);

	for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
		/* NOTHING */;

	rt_hash_mask--;
	for (i = 0; i <= rt_hash_mask; i++) {
		rt_hash_table[i].lock = SPIN_LOCK_UNLOCKED;
		rt_hash_table[i].chain = NULL;
	}

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
#if defined(CONFIG_MIPS_BRCM)
#if defined(SDRAM_8M) //vic
	ip_rt_max_size = (rt_hash_mask + 1) * 1;
#elif (defined(SDRAM_16M) && defined(SUPPORT_TR69C)) //vic
	ip_rt_max_size = (rt_hash_mask + 1) * 1;
#else
	ip_rt_max_size = (rt_hash_mask + 1) * 2;
#endif
#else
	ip_rt_max_size = (rt_hash_mask + 1) * 16;
#endif

	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
	if (!rt_cache_stat)
		return -ENOMEM;

	devinet_init();
	ip_fib_init();

	init_timer(&rt_flush_timer);
	rt_flush_timer.function = rt_run_flush;
	init_timer(&rt_periodic_timer);
	rt_periodic_timer.function = rt_check_expire;
	init_timer(&rt_secret_timer);
	rt_secret_timer.function = rt_secret_rebuild;

	/* All the timers, started at system startup, tend
	   to synchronize. Perturb it a bit.
	 */
	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
					ip_rt_gc_interval;
	add_timer(&rt_periodic_timer);

	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
		ip_rt_secret_interval;
	add_timer(&rt_secret_timer);

#ifdef CONFIG_PROC_FS
	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
	    !proc_net_fops_create("rt_cache_stat", S_IRUGO, &rt_cpu_seq_fops)) {
		free_percpu(rt_cache_stat);
		return -ENOMEM;
	}

#ifdef CONFIG_NET_CLS_ROUTE
	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
#endif
#endif
	return rc;
}

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);