/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Version:     $Id: route.c,v 1.102.2.1 2002/01/12 07:43:57 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split into fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after a year in coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

int ip_rt_min_delay             = 2 * HZ;
int ip_rt_max_delay             = 10 * HZ;
int ip_rt_max_size;
int ip_rt_gc_timeout            = RT_GC_TIMEOUT;
int ip_rt_gc_interval           = 60 * HZ;
int ip_rt_gc_min_interval       = HZ / 2;
int ip_rt_redirect_number       = 9;
int ip_rt_redirect_load         = HZ / 50;
int ip_rt_redirect_silence      = ((HZ / 50) << (9 + 1));
int ip_rt_error_cost            = HZ;
int ip_rt_error_burst           = 5 * HZ;
int ip_rt_gc_elasticity         = 8;
int ip_rt_mtu_expires           = 10 * 60 * HZ;
int ip_rt_min_pmtu              = 512 + 20 + 20;
int ip_rt_min_advmss            = 256;
int ip_rt_secret_interval       = 10 * 60 * HZ;
static unsigned long rt_deadline;
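
/* A rough sense of scale for the tunables above, assuming the classic
 * HZ=100 tick: ip_rt_redirect_load = HZ/50 is 2 jiffies (20 ms), and
 * ip_rt_redirect_silence = (HZ/50) << 10 = 1024 * (HZ/50), i.e. roughly
 * 20 seconds independent of HZ.  ip_rt_min_pmtu = 512 + 20 + 20 leaves
 * room for 512 bytes of payload plus plain IP and TCP headers.
 */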

#define RTprint(a...)   printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
                                           struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static int rt_garbage_collect(void);


struct dst_ops ipv4_dst_ops = {
        family:                 AF_INET,
        protocol:               __constant_htons(ETH_P_IP),
        gc:                     rt_garbage_collect,
        check:                  ipv4_dst_check,
        reroute:                ipv4_dst_reroute,
        destroy:                ipv4_dst_destroy,
        negative_advice:        ipv4_negative_advice,
        link_failure:           ipv4_link_failure,
        entry_size:             sizeof(struct rtable),
};

#define ECN_OR_COST(class)      TC_PRIO_##class

__u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
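
/* Worked example (rt_tos2priority() in <net/route.h> indexes this table
 * with (tos & IPTOS_TOS_MASK) >> 1): IPTOS_LOWDELAY (0x10) maps to
 * index 8, TC_PRIO_INTERACTIVE, while IPTOS_THROUGHPUT (0x08) maps to
 * index 4, TC_PRIO_BULK.
 */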


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) BH-protected rwlocks protect the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
        struct rtable   *chain;
        rwlock_t        lock;
} __attribute__((__aligned__(8)));

static struct rt_hash_bucket    *rt_hash_table;
static unsigned                 rt_hash_mask;
static int                      rt_hash_log;
static unsigned int             rt_hash_rnd;

struct rt_cache_stat rt_cache_stat[NR_CPUS];

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
        return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
                & rt_hash_mask);
}

static int rt_cache_get_info(char *buffer, char **start, off_t offset,
                                int length)
{
        int len = 0;
        off_t pos = 128;
        char temp[256];
        struct rtable *r;
        int i;

        if (offset < 128) {
                sprintf(buffer, "%-127s\n",
                        "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                        "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                        "HHUptod\tSpecDst");
                len = 128;
        }

        for (i = rt_hash_mask; i >= 0; i--) {
                read_lock_bh(&rt_hash_table[i].lock);
                for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) {
                        /*
                         *      Spin through entries until we are ready
                         */
                        pos += 128;

                        if (pos <= offset) {
                                len = 0;
                                continue;
                        }
                        sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                                "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
                                r->u.dst.dev ? r->u.dst.dev->name : "*",
                                (unsigned long)r->rt_dst,
                                (unsigned long)r->rt_gateway,
                                r->rt_flags,
                                atomic_read(&r->u.dst.__refcnt),
                                r->u.dst.__use,
                                0,
                                (unsigned long)r->rt_src,
                                (r->u.dst.advmss ?
                                 (int) r->u.dst.advmss + 40 : 0),
                                r->u.dst.window,
                                (int)((r->u.dst.rtt >> 3) + r->u.dst.rttvar),
                                r->key.tos,
                                r->u.dst.hh ?
                                        atomic_read(&r->u.dst.hh->hh_refcnt) :
                                        -1,
                                r->u.dst.hh ?
                                        (r->u.dst.hh->hh_output ==
                                         dev_queue_xmit) : 0,
                                r->rt_spec_dst);
                        sprintf(buffer + len, "%-127s\n", temp);
                        len += 128;
                        if (pos >= offset+length) {
                                read_unlock_bh(&rt_hash_table[i].lock);
                                goto done;
                        }
                }
                read_unlock_bh(&rt_hash_table[i].lock);
        }

done:
        *start = buffer + len - (pos - offset);
        len = pos - offset;
        if (len > length)
                len = length;
        return len;
}
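
/* A sketch of the /proc arithmetic above: every record, header included,
 * is padded to exactly 128 bytes.  For a read at offset 300 of length
 * 200, entries whose running "pos" is still <= 300 are skipped; the
 * following ones are formatted until pos >= 500, and then
 * *start = buffer + len - (pos - offset) backs *start up into the
 * buffer so the caller sees exactly the bytes from offset 300 onward.
 */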

static int rt_cache_stat_get_info(char *buffer, char **start, off_t offset, int length)
{
        unsigned int dst_entries = atomic_read(&ipv4_dst_ops.entries);
        int i, lcpu;
        int len = 0;

        len += sprintf(buffer+len, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
        for (lcpu = 0; lcpu < smp_num_cpus; lcpu++) {
                i = cpu_logical_map(lcpu);

                len += sprintf(buffer+len, "%08x  %08x %08x %08x %08x %08x %08x %08x  %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                               dst_entries,
                               rt_cache_stat[i].in_hit,
                               rt_cache_stat[i].in_slow_tot,
                               rt_cache_stat[i].in_slow_mc,
                               rt_cache_stat[i].in_no_route,
                               rt_cache_stat[i].in_brd,
                               rt_cache_stat[i].in_martian_dst,
                               rt_cache_stat[i].in_martian_src,

                               rt_cache_stat[i].out_hit,
                               rt_cache_stat[i].out_slow_tot,
                               rt_cache_stat[i].out_slow_mc,

                               rt_cache_stat[i].gc_total,
                               rt_cache_stat[i].gc_ignored,
                               rt_cache_stat[i].gc_goal_miss,
                               rt_cache_stat[i].gc_dst_overflow,
                               rt_cache_stat[i].in_hlist_search,
                               rt_cache_stat[i].out_hlist_search

                        );
        }
        len -= offset;

        if (len > length)
                len = length;
        if (len < 0)
                len = 0;

        *start = buffer + offset;
        return len;
}

static __inline__ void rt_free(struct rtable *rt)
{
        dst_free(&rt->u.dst);
}

static __inline__ void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        dst_free(&rt->u.dst);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively, if they
           collide in the hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rth->key.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->u.dst.expires;
}

static __inline__ int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->u.dst.__refcnt))
                goto out;

        ret = 1;
        if (rth->u.dst.expires &&
            time_after_eq(jiffies, rth->u.dst.expires))
                goto out;

        age = jiffies - rth->u.dst.lastuse;
        ret = 0;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}
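
/* Reading the two timeouts above: tmo1 is the ordinary idle limit and
 * tmo2 the extended one for "valuable" routes.  For example, a
 * redirected entry idle for four minutes survives a pass with
 * tmo1 = 60s if rt_valuable() grants it tmo2 = 300s (the default
 * ip_rt_gc_timeout), while a colliding broadcast entry
 * (rt_fast_clean()) gets neither grace period and may expire
 * regardless of age.
 */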

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->u.dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (!rt->key.iif ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}
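
/* A worked instance of the scoring: for an unreferenced output route
 * (key.iif == 0) last used 100 jiffies ago,
 * score = (~100 & ~(3<<30)) | (1<<30) -- the inverted age fills the
 * low 30 bits, so the least recently used entry in a chain ends up
 * with the lowest score and becomes the eviction candidate picked by
 * rt_intern_hash().
 */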

/* This runs via a timer and thus is always in BH context. */
static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy)
{
        static int rover;
        int i = rover, t;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;

        for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
             t -= ip_rt_gc_timeout) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                write_lock(&rt_hash_table[i].lock);
                while ((rth = *rthp) != NULL) {
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(now, rth->u.dst.expires)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.rt_next;
                                continue;
                        }

                        /* Clean up aged-off entries. */
                        *rthp = rth->u.rt_next;
                        rt_free(rth);
                }
                write_unlock(&rt_hash_table[i].lock);

                /* Fallback loop breaker. */
                if (time_after(jiffies, now))
                        break;
        }
        rover = i;
        mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}

SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task);

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void SMP_TIMER_NAME(rt_run_flush)(unsigned long dummy)
{
        int i;
        struct rtable *rth, *next;

        rt_deadline = 0;

        get_random_bytes(&rt_hash_rnd, 4);

        for (i = rt_hash_mask; i >= 0; i--) {
                write_lock_bh(&rt_hash_table[i].lock);
                rth = rt_hash_table[i].chain;
                if (rth)
                        rt_hash_table[i].chain = NULL;
                write_unlock_bh(&rt_hash_table[i].lock);

                for (; rth; rth = next) {
                        next = rth->u.rt_next;
                        rt_free(rth);
                }
        }
}

SMP_TIMER_DEFINE(rt_run_flush, rt_cache_flush_task);

static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;

void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        spin_lock_bh(&rt_flush_lock);

        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If a flush timer is already running
                   and the flush request is not immediate (delay > 0):

                   if the deadline has not been reached, prolong the
                   timer to "delay", otherwise fire it at the deadline.
                 */

                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                spin_unlock_bh(&rt_flush_lock);
                SMP_TIMER_NAME(rt_run_flush)(0);
                return;
        }

        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now+delay);
        spin_unlock_bh(&rt_flush_lock);
}
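
/* Usage, in short: rt_cache_flush(-1) means "flush after
 * ip_rt_min_delay", rt_cache_flush(0) flushes synchronously, and a
 * pending flush is never deferred past rt_deadline, which is pinned
 * at most ip_rt_max_delay jiffies after the first request.
 */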

static void rt_secret_rebuild(unsigned long dummy)
{
        unsigned long now = jiffies;

        rt_cache_flush(0);
        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   stays approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle,
   expire is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */
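
/* A numeric sketch of that equilibrium, assuming rt_hash_log = 10
 * (1024 buckets) and the default ip_rt_gc_elasticity of 8: the goal
 * only goes positive once the cache holds more than 8 << 10 = 8192
 * entries; GC then tries to expire the excess, and "expire" is walked
 * up or down so that successive runs hover around that size.
 */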

static int rt_garbage_collect(void)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        int goal;

        /*
         * Garbage collection is pretty expensive,
         * do not run it too frequently.
         */

        rt_cache_stat[smp_processor_id()].gc_total++;

        if (now - last_gc < ip_rt_gc_min_interval &&
            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
                rt_cache_stat[smp_processor_id()].gc_ignored++;
                goto out;
        }

        /* Calculate the number of entries which we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
                (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                }
        } else {
                /* We are in a dangerous area. Try to reduce the cache
                 * really aggressively.
                 */
                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        write_lock_bh(&rt_hash_table[k].lock);
                        while ((rth = *rthp) != NULL) {
                                if (!rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                goal--;
                        }
                        write_unlock_bh(&rt_hash_table[k].lock);
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;

                /* The goal was not achieved. We stop the process if:

                   - expire has been reduced to zero; otherwise expire
                     is halved.
                   - the table is not full.
                   - we are called from interrupt context.
                   - the jiffies check is just a fallback/debug loop
                     breaker; we will not spin here for a long time
                     in any case.
                 */

                rt_cache_stat[smp_processor_id()].gc_goal_miss++;

                if (expire == 0)
                        break;

                expire >>= 1;
#if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %lu %d %d %d\n", expire,
                                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                goto out;
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        rt_cache_stat[smp_processor_id()].gc_dst_overflow++;
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %lu %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        rthp = &rt_hash_table[hash].chain;

        write_lock_bh(&rt_hash_table[hash].lock);
        while ((rth = *rthp) != NULL) {
                if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
                        /* Put it first */
                        *rthp = rth->u.rt_next;
                        rth->u.rt_next = rt_hash_table[hash].chain;
                        rt_hash_table[hash].chain = rth;

                        rth->u.dst.__use++;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.lastuse = now;
                        write_unlock_bh(&rt_hash_table[hash].lock);

                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be the average chain length;
                 * when it is exceeded, gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.rt_next;
                        rt_free(cand);
                }
        }

        /* Try to bind the route to an arp entry only if it is an output
           route or on the unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        write_unlock_bh(&rt_hash_table[hash].lock);

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* The neighbour tables are full and nothing
                           can be released. Try to shrink the route cache;
                           it most likely holds some neighbour records.
                         */
                        if (attempts-- > 0) {
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect();
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
        if (rt->u.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                       NIPQUAD(rt->rt_dst));
                for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                printk("\n");
        }
#endif
        rt_hash_table[hash].chain = rt;
        write_unlock_bh(&rt_hash_table[hash].lock);
        *rp = rt;
        return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
        static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
        struct inet_peer *peer;

        peer = inet_getpeer(rt->rt_dst, create);

        spin_lock_bh(&rt_peer_lock);
        if (rt->peer == NULL) {
                rt->peer = peer;
                peer = NULL;
        }
        spin_unlock_bh(&rt_peer_lock);
        if (peer)
                inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have little chance
 * of selecting an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (rt) {
                if (rt->peer == NULL)
                        rt_bind_peer(rt, 1);

                /* If a peer is attached to the destination, it is never
                   detached, so we need not grab a lock to dereference it.
                 */
                if (rt->peer) {
                        iph->id = htons(inet_getid(rt->peer));
                        return;
                }
        } else
                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));

        ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
        struct rtable **rthp;

        write_lock_bh(&rt_hash_table[hash].lock);
        ip_rt_put(rt);
        for (rthp = &rt_hash_table[hash].chain; *rthp;
             rthp = &(*rthp)->u.rt_next)
                if (*rthp == rt) {
                        *rthp = rt->u.rt_next;
                        rt_free(rt);
                        break;
                }
        write_unlock_bh(&rt_hash_table[hash].lock);
}

void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
                    u32 saddr, u8 tos, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        u32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };

        tos &= IPTOS_RT_MASK;

        if (!in_dev)
                return;

        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash_code(daddr,
                                                     skeys[i] ^ (ikeys[k] << 5),
                                                     tos);

                        rthp = &rt_hash_table[hash].chain;

                        read_lock(&rt_hash_table[hash].lock);
                        while ((rth = *rthp) != NULL) {
                                struct rtable *rt;

                                if (rth->key.dst != daddr ||
                                    rth->key.src != skeys[i] ||
                                    rth->key.tos != tos ||
                                    rth->key.oif != ikeys[k] ||
                                    rth->key.iif != 0) {
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }

                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                dst_hold(&rth->u.dst);
                                read_unlock(&rt_hash_table[hash].lock);

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.obsolete      = 0;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        read_unlock(&rt_hash_table[hash].lock);
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
                        "tos %02x\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
        in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable*)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->u.dst.expires) {
                        unsigned hash = rt_hash_code(rt->key.dst,
                                                     rt->key.src ^
                                                        (rt->key.oif << 5),
                                                     rt->key.tos);
#if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
                                          "%u.%u.%u.%u/%02x dropped\n",
                                NIPQUAD(rt->rt_dst), rt->key.tos);
#endif
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. needed" (breaks PMTU discovery) in icmp.c.
 */
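
/* Worked timing, assuming HZ=100 so ip_rt_redirect_load = 2 jiffies:
 * successive redirects must be spaced at least 2, 4, 8, ... jiffies
 * apart (ip_rt_redirect_load << rate_tokens); the ninth
 * (ip_rt_redirect_number) is the last one sent, and rate_tokens only
 * resets to zero after ip_rt_redirect_silence -- about 20 seconds --
 * without further suppressed redirects.
 */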

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

        if (!in_dev)
                return;

        if (!IN_DEV_TX_REDIRECTS(in_dev))
                goto out;

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
                rt->u.dst.rate_tokens = 0;

        /* Too many ignored redirects; do not send anything and set
         * u.dst.rate_last to the time of the last seen redirected packet.
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
                goto out;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (time_after(jiffies,
                       (rt->u.dst.rate_last +
                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (IN_DEV_LOG_MARTIANS(in_dev) &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
                                NIPQUAD(rt->rt_src), rt->rt_iif,
                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
        }
out:
        in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        unsigned long now;
        int code;

        switch (rt->u.dst.error) {
                case EINVAL:
                default:
                        goto out;
                case EHOSTUNREACH:
                        code = ICMP_HOST_UNREACH;
                        break;
                case ENETUNREACH:
                        code = ICMP_NET_UNREACH;
                        break;
                case EACCES:
                        code = ICMP_PKT_FILTERED;
                        break;
        }

        now = jiffies;
        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
                rt->u.dst.rate_tokens = ip_rt_error_burst;
        rt->u.dst.rate_last = now;
        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
                rt->u.dst.rate_tokens -= ip_rt_error_cost;
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
        }

out:    kfree_skb(skb);
        return 0;
}

/*
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        int i;

        for (i = 0; i < sizeof(mtu_plateau) / sizeof(mtu_plateau[0]); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
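
/* Example: guess_mtu(1500) returns 1492, the first plateau strictly
 * below the failing MTU; any old_mtu at or below 128 falls through to
 * the protocol minimum of 68.
 */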

unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
        int i;
        unsigned short old_mtu = ntohs(iph->tot_len);
        struct rtable *rth;
        u32  skeys[2] = { iph->saddr, 0, };
        u32  daddr = iph->daddr;
        u8   tos = iph->tos & IPTOS_RT_MASK;
        unsigned short est_mtu = 0;

        if (ipv4_config.no_pmtu_disc)
                return 0;

        for (i = 0; i < 2; i++) {
                unsigned hash = rt_hash_code(daddr, skeys[i], tos);

                read_lock(&rt_hash_table[hash].lock);
                for (rth = rt_hash_table[hash].chain; rth;
                     rth = rth->u.rt_next) {
                        if (rth->key.dst == daddr &&
                            rth->key.src == skeys[i] &&
                            rth->rt_dst  == daddr &&
                            rth->rt_src  == iph->saddr &&
                            rth->key.tos == tos &&
                            rth->key.iif == 0 &&
                            !(rth->u.dst.mxlock & (1 << RTAX_MTU))) {
                                unsigned short mtu = new_mtu;

                                if (new_mtu < 68 || new_mtu >= old_mtu) {

                                        /* BSD 4.2 compatibility hack :-( */
                                        if (mtu == 0 &&
                                            old_mtu >= rth->u.dst.pmtu &&
                                            old_mtu >= 68 + (iph->ihl << 2))
                                                old_mtu -= iph->ihl << 2;

                                        mtu = guess_mtu(old_mtu);
                                }
                                if (mtu <= rth->u.dst.pmtu) {
                                        if (mtu < rth->u.dst.pmtu) {
                                                dst_confirm(&rth->u.dst);
                                                if (mtu < ip_rt_min_pmtu) {
                                                        mtu = ip_rt_min_pmtu;
                                                        rth->u.dst.mxlock |=
                                                                (1 << RTAX_MTU);
                                                }
                                                rth->u.dst.pmtu = mtu;
                                                dst_set_expires(&rth->u.dst,
                                                        ip_rt_mtu_expires);
                                        }
                                        est_mtu = mtu;
                                }
                        }
                }
                read_unlock(&rt_hash_table[hash].lock);
        }
        return est_mtu ? : new_mtu;
}

void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
{
        if (dst->pmtu > mtu && mtu >= 68 &&
            !(dst->mxlock & (1 << RTAX_MTU))) {
                if (mtu < ip_rt_min_pmtu) {
                        mtu = ip_rt_min_pmtu;
                        dst->mxlock |= (1 << RTAX_MTU);
                }
                dst->pmtu = mtu;
                dst_set_expires(dst, ip_rt_mtu_expires);
        }
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        dst_release(dst);
        return NULL;
}

static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
                                          struct sk_buff *skb)
{
        return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;
        struct inet_peer *peer = rt->peer;

        if (peer) {
                rt->peer = NULL;
                inet_putpeer(peer);
        }
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = (struct rtable *) skb->dst;
        if (rt)
                dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
                NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
                skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" may be unaligned when it comes
   from IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
        u32 src;
        struct fib_result res;

        if (rt->key.iif == 0)
                src = rt->rt_src;
        else if (fib_lookup(&rt->key, &res) == 0) {
#ifdef CONFIG_IP_ROUTE_NAT
                if (res.type == RTN_NAT)
                        src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
                                                RT_SCOPE_UNIVERSE);
                else
#endif
                        src = FIB_RES_PREFSRC(res);
                fib_res_put(&res);
        } else
                src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
                                        RT_SCOPE_UNIVERSE);
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->u.dst.tclassid & 0xFFFF))
                rt->u.dst.tclassid |= tag & 0xFFFF;
        if (!(rt->u.dst.tclassid & 0xFFFF0000))
                rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
        struct fib_info *fi = res->fi;

        if (fi) {
                if (FIB_RES_GW(*res) &&
                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
                        rt->rt_gateway = FIB_RES_GW(*res);
                memcpy(&rt->u.dst.mxlock, fi->fib_metrics,
                        sizeof(fi->fib_metrics));
                if (fi->fib_mtu == 0) {
                        rt->u.dst.pmtu = rt->u.dst.dev->mtu;
                        if (rt->u.dst.mxlock & (1 << RTAX_MTU) &&
                            rt->rt_gateway != rt->rt_dst &&
                            rt->u.dst.pmtu > 576)
                                rt->u.dst.pmtu = 576;
                }
#ifdef CONFIG_NET_CLS_ROUTE
                rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
        } else
                rt->u.dst.pmtu  = rt->u.dst.dev->mtu;

        if (rt->u.dst.pmtu > IP_MAX_MTU)
                rt->u.dst.pmtu = IP_MAX_MTU;
        if (rt->u.dst.advmss == 0)
                rt->u.dst.advmss = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
                                       ip_rt_min_advmss);
        if (rt->u.dst.advmss > 65535 - 40)
                rt->u.dst.advmss = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, fib_rules_tclass(res));
#endif
        set_class_tag(rt, itag);
#endif
        rt->rt_type = res->type;
}
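
/* E.g. on a plain Ethernet device (mtu 1500) with no route metrics set,
 * the above leaves pmtu = 1500 and advmss = 1500 - 40 = 1460, the
 * classic TCP MSS, clamped overall to [ip_rt_min_advmss, 65535 - 40].
 */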

static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        unsigned hash;
        struct rtable *rth;
        u32 spec_dst;
        struct in_device *in_dev = in_dev_get(dev);
        u32 itag = 0;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (ZERONET(saddr)) {
                if (!LOCAL_MCAST(daddr))
                        goto e_inval;
                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
        } else if (fib_validate_source(saddr, 0, tos, 0,
                                        dev, &spec_dst, &itag) < 0)
                goto e_inval;

        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        rth->u.dst.output= ip_rt_bug;

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags= DST_HOST;
        rth->key.dst    = daddr;
        rth->rt_dst     = daddr;
        rth->key.tos    = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->key.fwmark = skb->nfmark;
#endif
        rth->key.src    = saddr;
        rth->rt_src     = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = daddr;
        rth->rt_src_map = saddr;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
        rth->u.dst.tclassid = itag;
#endif
        rth->rt_iif     =
        rth->key.iif    = dev->ifindex;
        rth->u.dst.dev  = &loopback_dev;
        dev_hold(rth->u.dst.dev);
        rth->key.oif    = 0;
        rth->rt_gateway = daddr;
        rth->rt_spec_dst= spec_dst;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_flags   = RTCF_MULTICAST;
        if (our) {
                rth->u.dst.input= ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->u.dst.input = ip_mr_input;
#endif
        rt_cache_stat[smp_processor_id()].in_slow_mc++;

        in_dev_put(in_dev);
        hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
        return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);

e_nobufs:
        in_dev_put(in_dev);
        return -ENOBUFS;

e_inval:
        in_dev_put(in_dev);
        return -EINVAL;
}
1387 /*
1388  *      NOTE. We drop all the packets that has local source
1389  *      addresses, because every properly looped back packet
1390  *      must have correct destination already attached by output routine.
1391  *
1392  *      Such approach solves two big problems:
1393  *      1. Not simplex devices are handled properly.
1394  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1395  */
1396
1397 int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1398                         u8 tos, struct net_device *dev)
1399 {
1400         struct rt_key   key;
1401         struct fib_result res;
1402         struct in_device *in_dev = in_dev_get(dev);
1403         struct in_device *out_dev = NULL;
1404         unsigned        flags = 0;
1405         u32             itag = 0;
1406         struct rtable * rth;
1407         unsigned        hash;
1408         u32             spec_dst;
1409         int             err = -EINVAL;
1410         int             free_res = 0;
1411
1412         /* IP on this device is disabled. */
1413
1414         if (!in_dev)
1415                 goto out;
1416
1417         key.dst         = daddr;
1418         key.src         = saddr;
1419         key.tos         = tos;
1420 #ifdef CONFIG_IP_ROUTE_FWMARK
1421         key.fwmark      = skb->nfmark;
1422 #endif
1423         key.iif         = dev->ifindex;
1424         key.oif         = 0;
1425         key.scope       = RT_SCOPE_UNIVERSE;
1426
1427         hash = rt_hash_code(daddr, saddr ^ (key.iif << 5), tos);
1428
        /* Check for the weirdest martians, which cannot be detected
           by fib_lookup.
         */

        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
                goto martian_source;

        if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
                goto brd_input;

        /* Accept zero addresses only for limited broadcast;
         * I do not even know whether to fix this or not. Waiting for
         * complaints :-)
         */
        if (ZERONET(saddr))
                goto martian_source;

        if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
                goto martian_destination;

        /*
         *      Now we are ready to route the packet.
         */
        if ((err = fib_lookup(&key, &res)) != 0) {
                if (!IN_DEV_FORWARD(in_dev))
                        goto e_inval;
                goto no_route;
        }
        free_res = 1;

        rt_cache_stat[smp_processor_id()].in_slow_tot++;

#ifdef CONFIG_IP_ROUTE_NAT
        /* Policy is applied before mapping the destination, but
           rerouting after the mapping must be done with the old source.
         */

        if (1) {
                u32 src_map = saddr;
                if (res.r)
                        src_map = fib_rules_policy(saddr, &res, &flags);

                if (res.type == RTN_NAT) {
                        key.dst = fib_rules_map_destination(daddr, &res);
                        fib_res_put(&res);
                        free_res = 0;
                        if (fib_lookup(&key, &res))
                                goto e_inval;
                        free_res = 1;
                        if (res.type != RTN_UNICAST)
                                goto e_inval;
                        flags |= RTCF_DNAT;
                }
                key.src = src_map;
        }
#endif

        if (res.type == RTN_BROADCAST)
                goto brd_input;

        if (res.type == RTN_LOCAL) {
                int result;
                result = fib_validate_source(saddr, daddr, tos,
                                             loopback_dev.ifindex,
                                             dev, &spec_dst, &itag);
                if (result < 0)
                        goto martian_source;
                if (result)
                        flags |= RTCF_DIRECTSRC;
                spec_dst = daddr;
                goto local_input;
        }

        if (!IN_DEV_FORWARD(in_dev))
                goto e_inval;
        if (res.type != RTN_UNICAST)
                goto martian_destination;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res.fi->fib_nhs > 1 && key.oif == 0)
                fib_select_multipath(&key, &res);
#endif
        out_dev = in_dev_get(FIB_RES_DEV(res));
        if (out_dev == NULL) {
                if (net_ratelimit())
                        printk(KERN_CRIT "Bug in ip_route_input_slow(). "
                                         "Please, report\n");
                goto e_inval;
        }

        err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
                                  &spec_dst, &itag);
        if (err < 0)
                goto martian_source;

        if (err)
                flags |= RTCF_DIRECTSRC;

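        /*
         * Heuristic (as implemented below): if the packet would be
         * forwarded straight back out the interface it arrived on, the
         * sender on shared media could have reached the next hop
         * directly, so the route is marked as a candidate for an ICMP
         * redirect.
         */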
        if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
                flags |= RTCF_DOREDIRECT;

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create a route if it is
                 * invalid for proxy arp. DNAT routes are always valid.
                 */
                if (out_dev == in_dev && !(flags & RTCF_DNAT))
                        goto e_inval;
        }

        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags= DST_HOST;
        rth->key.dst    = daddr;
        rth->rt_dst     = daddr;
        rth->key.tos    = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->key.fwmark = skb->nfmark;
#endif
        rth->key.src    = saddr;
        rth->rt_src     = saddr;
        rth->rt_gateway = daddr;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_src_map = key.src;
        rth->rt_dst_map = key.dst;
        if (flags&RTCF_DNAT)
                rth->rt_gateway = key.dst;
#endif
        rth->rt_iif     =
        rth->key.iif    = dev->ifindex;
        rth->u.dst.dev  = out_dev->dev;
        dev_hold(rth->u.dst.dev);
        rth->key.oif    = 0;
        rth->rt_spec_dst= spec_dst;

        rth->u.dst.input = ip_forward;
        rth->u.dst.output = ip_output;

        rt_set_nexthop(rth, &res, itag);

        rth->rt_flags = flags;

#ifdef CONFIG_NET_FASTROUTE
        if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
                struct net_device *odev = rth->u.dst.dev;
                if (odev != dev &&
                    dev->accept_fastpath &&
                    odev->mtu >= dev->mtu &&
                    dev->accept_fastpath(dev, &rth->u.dst) == 0)
                        rth->rt_flags |= RTCF_FAST;
        }
#endif

intern:
        err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
done:
        in_dev_put(in_dev);
        if (out_dev)
                in_dev_put(out_dev);
        if (free_res)
                fib_res_put(&res);
out:    return err;

brd_input:
        if (skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (ZERONET(saddr))
                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
        else {
                err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
                                          &itag);
                if (err < 0)
                        goto martian_source;
                if (err)
                        flags |= RTCF_DIRECTSRC;
        }
        flags |= RTCF_BROADCAST;
        res.type = RTN_BROADCAST;
        rt_cache_stat[smp_processor_id()].in_brd++;

local_input:
        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        rth->u.dst.output= ip_rt_bug;

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags= DST_HOST;
        rth->key.dst    = daddr;
        rth->rt_dst     = daddr;
        rth->key.tos    = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->key.fwmark = skb->nfmark;
#endif
        rth->key.src    = saddr;
        rth->rt_src     = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = key.dst;
        rth->rt_src_map = key.src;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
        rth->u.dst.tclassid = itag;
#endif
        rth->rt_iif     =
        rth->key.iif    = dev->ifindex;
        rth->u.dst.dev  = &loopback_dev;
        dev_hold(rth->u.dst.dev);
        rth->key.oif    = 0;
        rth->rt_gateway = daddr;
        rth->rt_spec_dst= spec_dst;
        rth->u.dst.input= ip_local_deliver;
        rth->rt_flags   = flags|RTCF_LOCAL;
        if (res.type == RTN_UNREACHABLE) {
                rth->u.dst.input= ip_error;
                rth->u.dst.error= -err;
                rth->rt_flags   &= ~RTCF_LOCAL;
        }
        rth->rt_type    = res.type;
        goto intern;

no_route:
        rt_cache_stat[smp_processor_id()].in_no_route++;
        spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
        res.type = RTN_UNREACHABLE;
        goto local_input;

        /*
         *      Do not cache martian addresses: they should be logged (RFC1812)
         */
martian_destination:
        rt_cache_stat[smp_processor_id()].in_martian_dst++;
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
                        "%u.%u.%u.%u, dev %s\n",
                        NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif
e_inval:
        err = -EINVAL;
        goto done;

e_nobufs:
        err = -ENOBUFS;
        goto done;

martian_source:

        rt_cache_stat[smp_processor_id()].in_martian_src++;
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      RFC1812 recommendation: if the source is martian,
                 *      the only hint we have is the MAC header.
                 */
                printk(KERN_WARNING "martian source %u.%u.%u.%u from "
                        "%u.%u.%u.%u, on dev %s\n",
                        NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
                if (dev->hard_header_len) {
                        int i;
                        unsigned char *p = skb->mac.raw;
                        printk(KERN_WARNING "ll header: ");
                        for (i = 0; i < dev->hard_header_len; i++, p++) {
                                printk("%02x", *p);
                                if (i < (dev->hard_header_len - 1))
                                        printk(":");
                        }
                        printk("\n");
                }
        }
#endif
        goto e_inval;
}

int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
                   u8 tos, struct net_device *dev)
{
        struct rtable * rth;
        unsigned        hash;
        int iif = dev->ifindex;

        tos &= IPTOS_RT_MASK;
        hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);

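        /*
         * Fast path: walk the hash chain under the read lock.  A hit
         * must match every field of the lookup key (dst, src, iif,
         * oif == 0, tos and, if configured, fwmark); the hash itself is
         * computed exactly as in the slow paths that inserted the entry.
         */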
        read_lock(&rt_hash_table[hash].lock);
        for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
                if (rth->key.dst == daddr &&
                    rth->key.src == saddr &&
                    rth->key.iif == iif &&
                    rth->key.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
                    rth->key.fwmark == skb->nfmark &&
#endif
                    rth->key.tos == tos) {
                        rth->u.dst.lastuse = jiffies;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.__use++;
                        rt_cache_stat[smp_processor_id()].in_hit++;
                        read_unlock(&rt_hash_table[hash].lock);
                        skb->dst = (struct dst_entry*)rth;
                        return 0;
                }
                rt_cache_stat[smp_processor_id()].in_hlist_search++;
        }
        read_unlock(&rt_hash_table[hash].lock);

        /* Multicast recognition logic is moved from the route cache to
           here.  The problem was that too many Ethernet cards have
           broken/missing hardware multicast filters :-(  As a result, a
           host on a multicast network acquires a lot of useless route
           cache entries, e.g. for SDR messages from all over the world.
           Now we try to get rid of them.  Really, provided the software
           IP multicast filter is organized reasonably (at least,
           hashed), it does not cause a slowdown compared with route
           cache reject entries.
           Note that multicast routers are not affected, because a route
           cache entry is created for them eventually.
         */
        if (MULTICAST(daddr)) {
                struct in_device *in_dev;

                read_lock(&inetdev_lock);
                if ((in_dev = __in_dev_get(dev)) != NULL) {
                        int our = ip_check_mc(in_dev, daddr, saddr);
                        if (our
#ifdef CONFIG_IP_MROUTE
                            || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
                            ) {
                                read_unlock(&inetdev_lock);
                                return ip_route_input_mc(skb, daddr, saddr,
                                                         tos, dev, our);
                        }
                }
                read_unlock(&inetdev_lock);
                return -EINVAL;
        }
        return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}

/*
 * Major route resolver routine.
 */

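/*
 * In outline (descriptive only): the requested source address is
 * validated first, an explicitly requested oif is honoured next, local
 * and loopback destinations are short-circuited, and everything else
 * goes through fib_lookup().  make_route finally builds the cache
 * entry, choosing input/output handlers according to the route type.
 */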
int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey)
{
        struct rt_key key;
        struct fib_result res;
        unsigned flags = 0;
        struct rtable *rth;
        struct net_device *dev_out = NULL;
        unsigned hash;
        int free_res = 0;
        int err;
        u32 tos;

        tos             = oldkey->tos & (IPTOS_RT_MASK | RTO_ONLINK);
        key.dst         = oldkey->dst;
        key.src         = oldkey->src;
        key.tos         = tos & IPTOS_RT_MASK;
        key.iif         = loopback_dev.ifindex;
        key.oif         = oldkey->oif;
#ifdef CONFIG_IP_ROUTE_FWMARK
        key.fwmark      = oldkey->fwmark;
#endif
        key.scope       = (tos & RTO_ONLINK) ? RT_SCOPE_LINK :
                                                RT_SCOPE_UNIVERSE;
        res.fi          = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
        res.r           = NULL;
#endif

        if (oldkey->src) {
                err = -EINVAL;
                if (MULTICAST(oldkey->src) ||
                    BADCLASS(oldkey->src) ||
                    ZERONET(oldkey->src))
                        goto out;

                /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
                dev_out = ip_dev_find(oldkey->src);
                if (dev_out == NULL)
                        goto out;

                /* I removed the check for oif == dev_out->oif here.
                   It was wrong for two reasons:
                   1. ip_dev_find(saddr) can return the wrong iface, if
                      saddr is assigned to multiple interfaces.
                   2. Moreover, we are allowed to send packets with the
                      saddr of another iface. --ANK
                 */

                if (oldkey->oif == 0
                    && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) {
                        /* Special hack: user can direct multicasts
                           and limited broadcast via the necessary interface
                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
                           This hack is not just for fun, it allows
                           vic, vat and friends to work.
                           They bind a socket to loopback, set the ttl to zero
                           and expect that it will work.
                           From the viewpoint of the routing cache they are
                           broken, because we are not allowed to build a
                           multicast path with a loopback source address
                           (look, the routing cache cannot know that the ttl
                           is zero, hence that the packet will not leave this
                           host and the route is valid).
                           Luckily, this hack is a good enough workaround.
                         */

                        key.oif = dev_out->ifindex;
                        goto make_route;
                }
                if (dev_out)
                        dev_put(dev_out);
                dev_out = NULL;
        }
        if (oldkey->oif) {
                dev_out = dev_get_by_index(oldkey->oif);
                err = -ENODEV;
                if (dev_out == NULL)
                        goto out;
                if (__in_dev_get(dev_out) == NULL) {
                        dev_put(dev_out);
                        goto out;       /* Wrong error code */
                }

                if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) {
                        if (!key.src)
                                key.src = inet_select_addr(dev_out, 0,
                                                                RT_SCOPE_LINK);
                        goto make_route;
                }
                if (!key.src) {
                        if (MULTICAST(oldkey->dst))
                                key.src = inet_select_addr(dev_out, 0,
                                                                key.scope);
                        else if (!oldkey->dst)
                                key.src = inet_select_addr(dev_out, 0,
                                                                RT_SCOPE_HOST);
                }
        }

        if (!key.dst) {
                key.dst = key.src;
                if (!key.dst)
                        key.dst = key.src = htonl(INADDR_LOOPBACK);
                if (dev_out)
                        dev_put(dev_out);
                dev_out = &loopback_dev;
                dev_hold(dev_out);
                key.oif = loopback_dev.ifindex;
                res.type = RTN_LOCAL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

        if (fib_lookup(&key, &res)) {
                res.fi = NULL;
                if (oldkey->oif) {
                        /* Apparently, the routing tables are wrong.
                           Assume that the destination is on-link.

                           WHY? DW.
                           Because we are allowed to send to an iface even
                           if it has NO routes and NO assigned addresses.
                           When oif is specified, the routing tables are
                           looked up with only one purpose: to catch whether
                           the destination is gatewayed, rather than direct.
                           Moreover, if MSG_DONTROUTE is set, we send a
                           packet ignoring both the routing tables and the
                           ifaddr state. --ANK

                           We could do this even if oif is unknown, as IPv6
                           likely does, but we do not.
                         */

                        if (key.src == 0)
                                key.src = inet_select_addr(dev_out, 0,
                                                           RT_SCOPE_LINK);
                        res.type = RTN_UNICAST;
                        goto make_route;
                }
                if (dev_out)
                        dev_put(dev_out);
                err = -ENETUNREACH;
                goto out;
        }
        free_res = 1;

        if (res.type == RTN_NAT)
                goto e_inval;

        if (res.type == RTN_LOCAL) {
                if (!key.src)
                        key.src = key.dst;
                if (dev_out)
                        dev_put(dev_out);
                dev_out = &loopback_dev;
                dev_hold(dev_out);
                key.oif = dev_out->ifindex;
                if (res.fi)
                        fib_info_put(res.fi);
                res.fi = NULL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res.fi->fib_nhs > 1 && key.oif == 0)
                fib_select_multipath(&key, &res);
        else
#endif
        if (!res.prefixlen && res.type == RTN_UNICAST && !key.oif)
                fib_select_default(&key, &res);

        if (!key.src)
                key.src = FIB_RES_PREFSRC(res);

        if (dev_out)
                dev_put(dev_out);
        dev_out = FIB_RES_DEV(res);
        dev_hold(dev_out);
        key.oif = dev_out->ifindex;

make_route:
        if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
                goto e_inval;

        if (key.dst == 0xFFFFFFFF)
                res.type = RTN_BROADCAST;
        else if (MULTICAST(key.dst))
                res.type = RTN_MULTICAST;
        else if (BADCLASS(key.dst) || ZERONET(key.dst))
                goto e_inval;

        if (dev_out->flags & IFF_LOOPBACK)
                flags |= RTCF_LOCAL;

        if (res.type == RTN_BROADCAST) {
                flags |= RTCF_BROADCAST | RTCF_LOCAL;
                if (res.fi) {
                        fib_info_put(res.fi);
                        res.fi = NULL;
                }
        } else if (res.type == RTN_MULTICAST) {
                flags |= RTCF_MULTICAST|RTCF_LOCAL;
                read_lock(&inetdev_lock);
                if (!__in_dev_get(dev_out) ||
                    !ip_check_mc(__in_dev_get(dev_out),oldkey->dst,oldkey->src))
                        flags &= ~RTCF_LOCAL;
                read_unlock(&inetdev_lock);
                /* If a multicast route does not exist, use the default
                   one, but do not gateway in this case.
                   Yes, it is a hack.
                 */
                if (res.fi && res.prefixlen < 4) {
                        fib_info_put(res.fi);
                        res.fi = NULL;
                }
        }

        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags= DST_HOST;
        rth->key.dst    = oldkey->dst;
        rth->key.tos    = tos;
        rth->key.src    = oldkey->src;
        rth->key.iif    = 0;
        rth->key.oif    = oldkey->oif;
#ifdef CONFIG_IP_ROUTE_FWMARK
        rth->key.fwmark = oldkey->fwmark;
#endif
        rth->rt_dst     = key.dst;
        rth->rt_src     = key.src;
#ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = key.dst;
        rth->rt_src_map = key.src;
#endif
        rth->rt_iif     = oldkey->oif ? : dev_out->ifindex;
        rth->u.dst.dev  = dev_out;
        dev_hold(dev_out);
        rth->rt_gateway = key.dst;
        rth->rt_spec_dst= key.src;

        rth->u.dst.output=ip_output;

        rt_cache_stat[smp_processor_id()].out_slow_tot++;

        if (flags & RTCF_LOCAL) {
                rth->u.dst.input = ip_local_deliver;
                rth->rt_spec_dst = key.dst;
        }
        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                rth->rt_spec_dst = key.src;
                if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
                        rth->u.dst.output = ip_mc_output;
                        rt_cache_stat[smp_processor_id()].out_slow_mc++;
                }
#ifdef CONFIG_IP_MROUTE
                if (res.type == RTN_MULTICAST) {
                        struct in_device *in_dev = in_dev_get(dev_out);
                        if (in_dev) {
                                if (IN_DEV_MFORWARD(in_dev) &&
                                    !LOCAL_MCAST(oldkey->dst)) {
                                        rth->u.dst.input = ip_mr_input;
                                        rth->u.dst.output = ip_mc_output;
                                }
                                in_dev_put(in_dev);
                        }
                }
#endif
        }

        rt_set_nexthop(rth, &res, 0);

        rth->rt_flags = flags;

        hash = rt_hash_code(oldkey->dst, oldkey->src ^ (oldkey->oif << 5), tos);
        err = rt_intern_hash(hash, rth, rp);
done:
        if (free_res)
                fib_res_put(&res);
        if (dev_out)
                dev_put(dev_out);
out:    return err;

e_inval:
        err = -EINVAL;
        goto done;
e_nobufs:
        err = -ENOBUFS;
        goto done;
}

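/*
 * ip_route_output_key() is the cache-first front end to the resolver
 * above.  A minimal caller sketch (illustrative only; daddr, saddr, tos
 * and oif stand for the caller's own values):
 *
 *      struct rtable *rt;
 *      int err = ip_route_output(&rt, daddr, saddr, tos, oif);
 *      if (err)
 *              return err;
 *      ...use rt->u.dst...
 *      ip_rt_put(rt);
 *
 * ip_route_output() is a thin wrapper (see include/net/route.h) that
 * fills an rt_key and calls ip_route_output_key().
 */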
int ip_route_output_key(struct rtable **rp, const struct rt_key *key)
{
        unsigned hash;
        struct rtable *rth;

        hash = rt_hash_code(key->dst, key->src ^ (key->oif << 5), key->tos);

        read_lock_bh(&rt_hash_table[hash].lock);
        for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
                if (rth->key.dst == key->dst &&
                    rth->key.src == key->src &&
                    rth->key.iif == 0 &&
                    rth->key.oif == key->oif &&
#ifdef CONFIG_IP_ROUTE_FWMARK
                    rth->key.fwmark == key->fwmark &&
#endif
                    !((rth->key.tos ^ key->tos) &
                            (IPTOS_RT_MASK | RTO_ONLINK))) {
                        rth->u.dst.lastuse = jiffies;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.__use++;
                        rt_cache_stat[smp_processor_id()].out_hit++;
                        read_unlock_bh(&rt_hash_table[hash].lock);
                        *rp = rth;
                        return 0;
                }
                rt_cache_stat[smp_processor_id()].out_hlist_search++;
        }
        read_unlock_bh(&rt_hash_table[hash].lock);

        return ip_route_output_slow(rp, key);
}

static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
                        int nowait)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        struct rtmsg *r;
        struct nlmsghdr  *nlh;
        unsigned char    *b = skb->tail;
        struct rta_cacheinfo ci;
#ifdef CONFIG_IP_MROUTE
        struct rtattr *eptr;
#endif
        nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
        r = NLMSG_DATA(nlh);
        nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
        r->rtm_family    = AF_INET;
        r->rtm_dst_len  = 32;
        r->rtm_src_len  = 0;
        r->rtm_tos      = rt->key.tos;
        r->rtm_table    = RT_TABLE_MAIN;
        r->rtm_type     = rt->rt_type;
        r->rtm_scope    = RT_SCOPE_UNIVERSE;
        r->rtm_protocol = RTPROT_UNSPEC;
        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
        if (rt->rt_flags & RTCF_NOTIFY)
                r->rtm_flags |= RTM_F_NOTIFY;
        RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
        if (rt->key.src) {
                r->rtm_src_len = 32;
                RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
        }
        if (rt->u.dst.dev)
                RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
        if (rt->u.dst.tclassid)
                RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
#endif
        if (rt->key.iif)
                RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
        else if (rt->rt_src != rt->key.src)
                RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
        if (rt->rt_dst != rt->rt_gateway)
                RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
        if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
                goto rtattr_failure;
        ci.rta_lastuse  = jiffies - rt->u.dst.lastuse;
        ci.rta_used     = rt->u.dst.__use;
        ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
        if (rt->u.dst.expires)
                ci.rta_expires = rt->u.dst.expires - jiffies;
        else
                ci.rta_expires = 0;
        ci.rta_error    = rt->u.dst.error;
        ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
        if (rt->peer) {
                ci.rta_id = rt->peer->ip_id_count;
                if (rt->peer->tcp_ts_stamp) {
                        ci.rta_ts = rt->peer->tcp_ts;
                        ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
                }
        }
#ifdef CONFIG_IP_MROUTE
        eptr = (struct rtattr*)skb->tail;
#endif
        RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
        if (rt->key.iif) {
#ifdef CONFIG_IP_MROUTE
                u32 dst = rt->rt_dst;

                if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
                    ipv4_devconf.mc_forwarding) {
                        int err = ipmr_get_route(skb, r, nowait);
                        if (err <= 0) {
                                if (!nowait) {
                                        if (err == 0)
                                                return 0;
                                        goto nlmsg_failure;
                                } else {
                                        if (err == -EMSGSIZE)
                                                goto nlmsg_failure;
                                        ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
                                }
                        }
                } else
#endif
                        RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
        }

        nlh->nlmsg_len = skb->tail - b;
        return skb->len;

nlmsg_failure:
rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -1;
}

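/*
 * RTM_GETROUTE handler.  Roughly: if an input interface is given, a
 * dummy skb is routed through ip_route_input() as if it had arrived on
 * that device; otherwise the query goes through ip_route_output().  The
 * resulting cache entry is then serialized back to the requester with
 * rt_fill_info().
 */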
int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
        struct rtattr **rta = arg;
        struct rtmsg *rtm = NLMSG_DATA(nlh);
        struct rtable *rt = NULL;
        u32 dst = 0;
        u32 src = 0;
        int iif = 0;
        int err = -ENOBUFS;
        struct sk_buff *skb;

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                goto out;

        /* Reserve room for dummy headers; this skb can pass through a
           good chunk of the routing engine.
         */
        skb->mac.raw = skb->nh.raw = skb->data;

        /* Bugfix: give ip_route_input() enough of an IP header not to gag. */
        skb->nh.iph->protocol = IPPROTO_ICMP;
        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

        if (rta[RTA_SRC - 1])
                memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
        if (rta[RTA_DST - 1])
                memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
        if (rta[RTA_IIF - 1])
                memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));

        if (iif) {
                struct net_device *dev = __dev_get_by_index(iif);
                err = -ENODEV;
                if (!dev)
                        goto out_free;
                skb->protocol   = htons(ETH_P_IP);
                skb->dev        = dev;
                local_bh_disable();
                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
                local_bh_enable();
                rt = (struct rtable*)skb->dst;
                if (!err && rt->u.dst.error)
                        err = -rt->u.dst.error;
        } else {
                int oif = 0;
                if (rta[RTA_OIF - 1])
                        memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
                err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
        }
        if (err)
                goto out_free;

        skb->dst = &rt->u.dst;
        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

        NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;

        err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
                                RTM_NEWROUTE, 0);
        if (!err)
                goto out_free;
        if (err < 0) {
                err = -EMSGSIZE;
                goto out_free;
        }

        err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
        if (err > 0)
                err = 0;
out:    return err;

out_free:
        kfree_skb(skb);
        goto out;
}

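/*
 * Netlink dump of the whole route cache.  The (hash bucket, chain
 * index) position reached so far is checkpointed in cb->args[0..1] so
 * that an interrupted dump can resume where it left off.
 */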
int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
{
        struct rtable *rt;
        int h, s_h;
        int idx, s_idx;

        s_h = cb->args[0];
        s_idx = idx = cb->args[1];
        for (h = 0; h <= rt_hash_mask; h++) {
                if (h < s_h) continue;
                if (h > s_h)
                        s_idx = 0;
                read_lock_bh(&rt_hash_table[h].lock);
                for (rt = rt_hash_table[h].chain, idx = 0; rt;
                     rt = rt->u.rt_next, idx++) {
                        if (idx < s_idx)
                                continue;
                        skb->dst = dst_clone(&rt->u.dst);
                        if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
                                         cb->nlh->nlmsg_seq,
                                         RTM_NEWROUTE, 1) <= 0) {
                                dst_release(xchg(&skb->dst, NULL));
                                read_unlock_bh(&rt_hash_table[h].lock);
                                goto done;
                        }
                        dst_release(xchg(&skb->dst, NULL));
                }
                read_unlock_bh(&rt_hash_table[h].lock);
        }

done:
        cb->args[0] = h;
        cb->args[1] = idx;
        return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(0);
}

#ifdef CONFIG_SYSCTL
static int flush_delay;

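/*
 * Writing a delay to the "flush" sysctl below triggers a route cache
 * flush; for example (illustrative only, assuming the usual procfs
 * layout):
 *
 *      echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * flushes the cache immediately.  Reading the file is not supported
 * and returns -EINVAL.
 */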
static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
                                        struct file *filp, void *buffer,
                                        size_t *lenp)
{
        if (write) {
                proc_dointvec(ctl, write, filp, buffer, lenp);
                rt_cache_flush(flush_delay);
                return 0;
        }

        return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name,
                                                int nlen, void *oldval,
                                                size_t *oldlenp, void *newval,
                                                size_t newlen, void **context)
{
        int delay;
        if (newlen != sizeof(int))
                return -EINVAL;
        if (get_user(delay, (int *)newval))
                return -EFAULT;
        rt_cache_flush(delay);
        return 0;
}

ctl_table ipv4_route_table[] = {
        {
                ctl_name:       NET_IPV4_ROUTE_FLUSH,
                procname:       "flush",
                data:           &flush_delay,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &ipv4_sysctl_rtcache_flush,
                strategy:       &ipv4_sysctl_rtcache_flush_strategy,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_MIN_DELAY,
                procname:       "min_delay",
                data:           &ip_rt_min_delay,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec_jiffies,
                strategy:       &sysctl_jiffies,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_MAX_DELAY,
                procname:       "max_delay",
                data:           &ip_rt_max_delay,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec_jiffies,
                strategy:       &sysctl_jiffies,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_GC_THRESH,
                procname:       "gc_thresh",
                data:           &ipv4_dst_ops.gc_thresh,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_MAX_SIZE,
                procname:       "max_size",
                data:           &ip_rt_max_size,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_GC_MIN_INTERVAL,
                procname:       "gc_min_interval",
                data:           &ip_rt_gc_min_interval,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec_jiffies,
                strategy:       &sysctl_jiffies,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_GC_TIMEOUT,
                procname:       "gc_timeout",
                data:           &ip_rt_gc_timeout,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec_jiffies,
                strategy:       &sysctl_jiffies,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_GC_INTERVAL,
                procname:       "gc_interval",
                data:           &ip_rt_gc_interval,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec_jiffies,
                strategy:       &sysctl_jiffies,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_REDIRECT_LOAD,
                procname:       "redirect_load",
                data:           &ip_rt_redirect_load,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_REDIRECT_NUMBER,
                procname:       "redirect_number",
                data:           &ip_rt_redirect_number,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_REDIRECT_SILENCE,
                procname:       "redirect_silence",
                data:           &ip_rt_redirect_silence,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_ERROR_COST,
                procname:       "error_cost",
                data:           &ip_rt_error_cost,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_ERROR_BURST,
                procname:       "error_burst",
                data:           &ip_rt_error_burst,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_GC_ELASTICITY,
                procname:       "gc_elasticity",
                data:           &ip_rt_gc_elasticity,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_MTU_EXPIRES,
                procname:       "mtu_expires",
                data:           &ip_rt_mtu_expires,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec_jiffies,
                strategy:       &sysctl_jiffies,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_MIN_PMTU,
                procname:       "min_pmtu",
                data:           &ip_rt_min_pmtu,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_MIN_ADVMSS,
                procname:       "min_adv_mss",
                data:           &ip_rt_min_advmss,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec,
        },
        {
                ctl_name:       NET_IPV4_ROUTE_SECRET_INTERVAL,
                procname:       "secret_interval",
                data:           &ip_rt_secret_interval,
                maxlen:         sizeof(int),
                mode:           0644,
                proc_handler:   &proc_dointvec_jiffies,
                strategy:       &sysctl_jiffies,
        },
        { 0 }
};
#endif

#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/* IP route accounting ptr for this logical cpu number. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + cpu_logical_map(i) * 256)

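/*
 * The /proc read handler below copies the 256 per-CPU accounting slots
 * of CPU 0 into the caller's buffer and then folds the remaining CPUs
 * in, one u32 counter at a time; offset and length must therefore be
 * word-aligned.
 */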
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
                           int length, int *eof, void *data)
{
        unsigned int i;

        if ((offset & 3) || (length & 3))
                return -EIO;

        if (offset >= sizeof(struct ip_rt_acct) * 256) {
                *eof = 1;
                return 0;
        }

        if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
                length = sizeof(struct ip_rt_acct) * 256 - offset;
                *eof = 1;
        }

        offset /= sizeof(u32);

        if (length > 0) {
                u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
                u32 *dst = (u32 *) buffer;

                /* Copy first cpu. */
                *start = buffer;
                memcpy(dst, src, length);

                /* Add the other cpus in, one int at a time */
                for (i = 1; i < smp_num_cpus; i++) {
                        unsigned int j;

                        src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;

                        for (j = 0; j < length/4; j++)
                                dst[j] += src[j];
                }
        }
        return length;
}
#endif

void __init ip_rt_init(void)
{
        int i, order, goal;

        rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
                             (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
        for (order = 0;
             (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
                /* NOTHING */;
        ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
        memset(ip_rt_acct, 0, PAGE_SIZE << order);
#endif

        ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
                                                     sizeof(struct rtable),
                                                     0, SLAB_HWCACHE_ALIGN,
                                                     NULL, NULL);

        if (!ipv4_dst_ops.kmem_cachep)
                panic("IP: failed to allocate ip_dst_cache\n");

        goal = num_physpages >> (26 - PAGE_SHIFT);

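        /*
         * Sizing sketch (illustrative arithmetic, not normative): goal
         * is the hash table size in pages, one page per 64MB of RAM
         * (num_physpages >> (26 - PAGE_SHIFT)).  With 256MB of RAM and
         * 4KB pages, num_physpages = 65536, so goal = 4 pages and the
         * loop below picks order = 2.  Assuming an 8-byte
         * rt_hash_bucket, that yields 2048 buckets; rt_hash_mask then
         * becomes 2047, gc_thresh 2048 and ip_rt_max_size 32768.
         */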
        for (order = 0; (1UL << order) < goal; order++)
                /* NOTHING */;

        do {
                rt_hash_mask = (1UL << order) * PAGE_SIZE /
                        sizeof(struct rt_hash_bucket);
                while (rt_hash_mask & (rt_hash_mask - 1))
                        rt_hash_mask--;
                rt_hash_table = (struct rt_hash_bucket *)
                        __get_free_pages(GFP_ATOMIC, order);
        } while (rt_hash_table == NULL && --order > 0);

        if (!rt_hash_table)
                panic("Failed to allocate IP route cache hash table\n");

        printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
               rt_hash_mask,
               (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);

        for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
                /* NOTHING */;

        rt_hash_mask--;
        for (i = 0; i <= rt_hash_mask; i++) {
                rt_hash_table[i].lock = RW_LOCK_UNLOCKED;
                rt_hash_table[i].chain = NULL;
        }

        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
        ip_rt_max_size = (rt_hash_mask + 1) * 16;

        devinet_init();
        ip_fib_init();

        rt_flush_timer.function = rt_run_flush;
        rt_periodic_timer.function = rt_check_expire;
        rt_secret_timer.function = rt_secret_rebuild;

        /* All the timers started at system startup tend to
           synchronize. Perturb them a bit.
         */
        rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
                                        ip_rt_gc_interval;
        add_timer(&rt_periodic_timer);

        rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
                ip_rt_secret_interval;
        add_timer(&rt_secret_timer);

        proc_net_create ("rt_cache", 0, rt_cache_get_info);
        create_proc_info_entry ("rt_cache", 0, proc_net_stat,
                                rt_cache_stat_get_info);
#ifdef CONFIG_NET_CLS_ROUTE
        create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL);
#endif
}