[bcm963xx.git] / kernel / linux / net / ipv4 / route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Version:     $Id: route.c,v 1.5 2006/09/05 08:03:56 michaelc Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

int ip_rt_min_delay             = 2 * HZ;
int ip_rt_max_delay             = 10 * HZ;
int ip_rt_max_size;
int ip_rt_gc_timeout            = RT_GC_TIMEOUT;
int ip_rt_gc_interval           = 60 * HZ;
int ip_rt_gc_min_interval       = HZ / 2;
int ip_rt_redirect_number       = 9;
int ip_rt_redirect_load         = HZ / 50;
int ip_rt_redirect_silence      = ((HZ / 50) << (9 + 1));
int ip_rt_error_cost            = HZ;
int ip_rt_error_burst           = 5 * HZ;
int ip_rt_gc_elasticity         = 1;
int ip_rt_mtu_expires           = 10 * 60 * HZ;
int ip_rt_min_pmtu              = 512 + 20 + 20;
int ip_rt_min_advmss            = 256;
int ip_rt_secret_interval       = 10 * 60 * HZ;
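
/*
 * Editor's note (not part of the original source, assuming HZ = 100):
 * ip_rt_redirect_silence above is the base redirect interval
 * (HZ/50 = 2 jiffies) shifted left by ip_rt_redirect_number plus one,
 * i.e. (9 + 1), giving 2048 jiffies, roughly 20 seconds of quiet
 * before the redirect backoff in ip_rt_send_redirect() resets.
 */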
static unsigned long rt_deadline;

#define RTprint(a...)   printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static void              ipv4_dst_ifdown(struct dst_entry *dst, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .entry_size =           sizeof(struct rtable),
};

#define ECN_OR_COST(class)      TC_PRIO_##class

__u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
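
/*
 * Editor's note (not part of the original source): the table above is
 * indexed by the four legacy TOS bits shifted right by one, so each
 * even/odd pair of TOS values shares a traffic-control class.  In
 * 2.6-era trees the lookup helper, expected to live in <net/route.h>
 * as rt_tos2priority(), amounts to:
 *
 *      skb->priority = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 */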


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
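
/*
 * Editor's note -- an illustrative sketch (not part of the original
 * file) of the lock-free reader side implied by the rules above, in
 * the shape actually used by ip_rt_frag_needed() further down:
 *
 *      rcu_read_lock();
 *      for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
 *              smp_read_barrier_depends();
 *              ... compare keys, then dst_hold(&rth->u.dst)
 *                  to take an atomic reference ...
 *      }
 *      rcu_read_unlock();
 */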

struct rt_hash_bucket {
        struct rtable   *chain;
        spinlock_t      lock;
} __attribute__((__aligned__(8)));

static struct rt_hash_bucket    *rt_hash_table;
static unsigned                 rt_hash_mask;
static int                      rt_hash_log;
static unsigned int             rt_hash_rnd;

struct rt_cache_stat *rt_cache_stat;

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
        return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
                & rt_hash_mask);
}
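
/*
 * Editor's note (not part of the original source): callers fold the
 * interface index into the source key before hashing, as in this call
 * taken verbatim from ip_route_input_mc() below:
 *
 *      hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
 */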

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
        int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
        struct rtable *r = NULL;
        struct rt_cache_iter_state *st = seq->private;

        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
                rcu_read_lock();
                r = rt_hash_table[st->bucket].chain;
                if (r)
                        break;
                rcu_read_unlock();
        }
        return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
        struct rt_cache_iter_state *st = seq->private;

        smp_read_barrier_depends();
        r = r->u.rt_next;
        while (!r) {
                rcu_read_unlock();
                if (--st->bucket < 0)
                        break;
                rcu_read_lock();
                r = rt_hash_table[st->bucket].chain;
        }
        return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
        struct rtable *r = rt_cache_get_first(seq);

        if (r)
                while (pos && (r = rt_cache_get_next(seq, r)))
                        --pos;
        return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct rtable *r = NULL;

        if (v == SEQ_START_TOKEN)
                r = rt_cache_get_first(seq);
        else
                r = rt_cache_get_next(seq, v);
        ++*pos;
        return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        if (v && v != SEQ_START_TOKEN)
                rcu_read_unlock();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        else {
                struct rtable *r = v;
                char temp[256];

                sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
                        r->u.dst.dev ? r->u.dst.dev->name : "*",
                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
                        dst_metric(&r->u.dst, RTAX_WINDOW),
                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
                        r->fl.fl4_tos,
                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
                                       dev_queue_xmit) : 0,
                        r->rt_spec_dst);
                seq_printf(seq, "%-127s\n", temp);
        }
        return 0;
}

static struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;
        rc = seq_open(file, &rt_cache_seq_ops);
        if (rc)
                goto out_kfree;
        seq          = file->private_data;
        seq->private = s;
        memset(s, 0, sizeof(*s));
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos + 1; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu;
                return per_cpu_ptr(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   atomic_read(&ipv4_dst_ops.entries),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
        call_rcu(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        call_rcu(&rt->u.dst.rcu_head, dst_rcu_free);
}
static __inline__ int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively if they
           collide in the hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rth->fl.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->u.dst.__refcnt))
                goto out;

        ret = 1;
        if (rth->u.dst.expires &&
            time_after_eq(jiffies, rth->u.dst.expires))
                goto out;

        age = jiffies - rth->u.dst.lastuse;
        ret = 0;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->u.dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (!rt->fl.iif ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}
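
/*
 * Editor's note, a worked example (not part of the original source):
 * the low 30 bits are an inverted age, so a route idle for one tick
 * scores ~1 & ~(3<<30) = 0x3FFFFFFE in them -- fresher entries score
 * higher.  Bit 30 is then set for output routes (fl.iif == 0) or
 * plain unicast entries, and bit 31 for redirected/notified/expiring
 * ones, so rt_intern_hash() below, which evicts the minimum-score
 * candidate, drops old input broadcast/multicast entries first.
 */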

/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
        static int rover;
        int i = rover, t;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;

        for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
             t -= ip_rt_gc_timeout) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                spin_lock(&rt_hash_table[i].lock);
                while ((rth = *rthp) != NULL) {
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(now, rth->u.dst.expires)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.rt_next;
                                continue;
                        }

                        /* Cleanup aged off entries. */
                        *rthp = rth->u.rt_next;
                        rt_free(rth);
                }
                spin_unlock(&rt_hash_table[i].lock);

                /* Fallback loop breaker. */
                if (time_after(jiffies, now))
                        break;
        }
        rover = i;
        mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
        int i;
        struct rtable *rth, *next;

        rt_deadline = 0;

        get_random_bytes(&rt_hash_rnd, 4);

        for (i = rt_hash_mask; i >= 0; i--) {
                spin_lock_bh(&rt_hash_table[i].lock);
                rth = rt_hash_table[i].chain;
                if (rth)
                        rt_hash_table[i].chain = NULL;
                spin_unlock_bh(&rt_hash_table[i].lock);

                for (; rth; rth = next) {
                        next = rth->u.rt_next;
                        rt_free(rth);
                }
        }
}

static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;

void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        spin_lock_bh(&rt_flush_lock);

        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If the flush timer is already running
                   and the flush request is not immediate (delay > 0):

                   if the deadline has not been reached, prolong the
                   timer to "delay", otherwise fire it at the deadline.
                 */

                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                spin_unlock_bh(&rt_flush_lock);
                rt_run_flush(0);
                return;
        }

        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now+delay);
        spin_unlock_bh(&rt_flush_lock);
}
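
/*
 * Editor's note, a worked example of the coalescing above with the
 * defaults (ip_rt_min_delay = 2*HZ, ip_rt_max_delay = 10*HZ; not part
 * of the original source): a flush requested with delay < 0 is
 * scheduled roughly 2 s out; repeat requests while the timer is
 * pending can only shorten it, and the rt_deadline bookkeeping
 * ensures the flush actually runs no later than about 10 s after the
 * first request set the deadline.
 */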

static void rt_secret_rebuild(unsigned long dummy)
{
        unsigned long now = jiffies;

        rt_cache_flush(0);
        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at an equilibrium point, where the number of aged-off entries
   stays approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   "expire" is large enough to keep enough warm entries, and when
   load increases it shrinks to limit the cache size.
 */
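
/*
 * Editor's note, a numeric example of the trigger below (not part of
 * the original source): with the default ip_rt_gc_elasticity = 1 and,
 * say, rt_hash_log = 10 (a 1024-bucket hash, the actual value is
 * chosen at init time), goal only becomes positive once the cache
 * holds more than (1 << 10) = 1024 entries, i.e. more than
 * ip_rt_gc_elasticity entries per bucket on average.
 */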

static int rt_garbage_collect(void)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        int goal;

        /*
         * Garbage collection is pretty expensive,
         * do not run it too frequently.
         */

        RT_CACHE_STAT_INC(gc_total);

        if (now - last_gc < ip_rt_gc_min_interval &&
            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
        }

        /* Calculate the number of entries we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
                (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                }
        } else {
                /* We are in a dangerous area. Try to reduce the cache
                 * really aggressively.
                 */
                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        spin_lock_bh(&rt_hash_table[k].lock);
                        while ((rth = *rthp) != NULL) {
                                if (!rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }
                                *rthp = rth->u.rt_next;
                                rt_free(rth);
                                goal--;
                        }
                        spin_unlock_bh(&rt_hash_table[k].lock);
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;
                /* Goal is not achieved. We stop the process if:

                   - expire is reduced to zero; otherwise expire is halved.
                   - the table is not full.
                   - we are called from interrupt context.
                   - the jiffies check is just a fallback/debug loop breaker.
                     We will not spin here for a long time in any case.
                 */

                RT_CACHE_STAT_INC(gc_goal_miss);

                if (expire == 0)
                        break;

                expire >>= 1;
#if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
                                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                goto out;
#if 0   //vic
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
#endif
        RT_CACHE_STAT_INC(gc_dst_overflow);
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
        return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
               fl1->oif     == fl2->oif &&
               fl1->iif     == fl2->iif;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(&rt_hash_table[hash].lock);
        while ((rth = *rthp) != NULL) {
                if (compare_keys(&rth->fl, &rt->fl)) {
                        /* Put it first */
                        *rthp = rth->u.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        smp_wmb();
                        rth->u.rt_next = rt_hash_table[hash].chain;
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        smp_wmb();
                        rt_hash_table[hash].chain = rth;

                        rth->u.dst.__use++;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.lastuse = now;
                        spin_unlock_bh(&rt_hash_table[hash].lock);

                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be the average chain
                 * length; when exceeded, gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.rt_next;
                        rt_free(cand);
                }
        }

        /* Try to bind the route to an ARP neighbour only if it is an
           output route or a unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        spin_unlock_bh(&rt_hash_table[hash].lock);

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* Neighbour tables are full and nothing
                           can be released. Try to shrink the route cache;
                           it most likely holds some neighbour records.
                         */
                        if (attempts-- > 0) {
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect();
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
        if (rt->u.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                       NIPQUAD(rt->rt_dst));
                for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                printk("\n");
        }
#endif
        rt_hash_table[hash].chain = rt;
        spin_unlock_bh(&rt_hash_table[hash].lock);
        *rp = rt;
        return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
        static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
        struct inet_peer *peer;

        peer = inet_getpeer(rt->rt_dst, create);

        spin_lock_bh(&rt_peer_lock);
        if (rt->peer == NULL) {
                rt->peer = peer;
                peer = NULL;
        }
        spin_unlock_bh(&rt_peer_lock);
        if (peer)
                inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct rtable *rt = (struct rtable *) dst;

        if (rt) {
                if (rt->peer == NULL)
                        rt_bind_peer(rt, 1);

                /* If a peer is attached to the destination, it is never
                   detached, so we need not grab a lock to dereference it.
                 */
                if (rt->peer) {
                        iph->id = htons(inet_getid(rt->peer, more));
                        return;
                }
        } else
                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));

        ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
        struct rtable **rthp;

        spin_lock_bh(&rt_hash_table[hash].lock);
        ip_rt_put(rt);
        for (rthp = &rt_hash_table[hash].chain; *rthp;
             rthp = &(*rthp)->u.rt_next)
                if (*rthp == rt) {
                        *rthp = rt->u.rt_next;
                        rt_free(rt);
                        break;
                }
        spin_unlock_bh(&rt_hash_table[hash].lock);
}

void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
                    u32 saddr, u8 tos, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        u32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };

        tos &= IPTOS_RT_MASK;

        if (!in_dev)
                return;

        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash_code(daddr,
                                                     skeys[i] ^ (ikeys[k] << 5),
                                                     tos);

                        rthp=&rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = *rthp) != NULL) {
                                struct rtable *rt;

                                smp_read_barrier_depends();
                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.fl4_tos != tos ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.rt_next;
                                        continue;
                                }

                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
                        "tos %02x\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
        in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable*)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->u.dst.expires) {
                        unsigned hash = rt_hash_code(rt->fl.fl4_dst,
                                                     rt->fl.fl4_src ^
                                                        (rt->fl.oif << 5),
                                                     rt->fl.fl4_tos);
#if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
                                          "%u.%u.%u.%u/%02x dropped\n",
                                NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}
/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
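
/*
 * Editor's note, the arithmetic of step 1 with the defaults above and
 * HZ = 100 assumed (not part of the original source): the k-th
 * redirect is sent only after ip_rt_redirect_load << rate_tokens
 * jiffies since the last one, i.e. 20 ms, 40 ms, 80 ms, ... doubling
 * until ip_rt_redirect_number = 9 redirects have been sent; after
 * ip_rt_redirect_silence = (HZ/50) << 10 jiffies (~20 s) of quiet,
 * rate_tokens is reset and the sequence starts over.
 */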

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

        if (!in_dev)
                return;

        if (!IN_DEV_TX_REDIRECTS(in_dev))
                goto out;

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
                rt->u.dst.rate_tokens = 0;

        /* Too many ignored redirects; do not send anything and
         * set u.dst.rate_last to the last seen redirected packet.
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
                goto out;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (time_after(jiffies,
                       (rt->u.dst.rate_last +
                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (IN_DEV_LOG_MARTIANS(in_dev) &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
                                NIPQUAD(rt->rt_src), rt->rt_iif,
                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
        }
out:
        in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable*)skb->dst;
        unsigned long now;
        int code;

        switch (rt->u.dst.error) {
                case EINVAL:
                default:
                        goto out;
                case EHOSTUNREACH:
                        code = ICMP_HOST_UNREACH;
                        break;
                case ENETUNREACH:
                        code = ICMP_NET_UNREACH;
                        break;
                case EACCES:
                        code = ICMP_PKT_FILTERED;
                        break;
        }

        now = jiffies;
        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
                rt->u.dst.rate_tokens = ip_rt_error_burst;
        rt->u.dst.rate_last = now;
        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
                rt->u.dst.rate_tokens -= ip_rt_error_cost;
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
        }

out:    kfree_skb(skb);
        return 0;
}

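/*
 * Editor's note on the rate limiter in ip_error() above (not part of
 * the original source): it is a classic token bucket.  Tokens accrue
 * one per elapsed jiffy, are capped at ip_rt_error_burst (5*HZ), and
 * each ICMP_DEST_UNREACH costs ip_rt_error_cost (HZ) tokens, so the
 * steady state is about one error per second with bursts of up to five.
 */
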
/*
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
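
/*
 * Editor's note, an example (not part of the original source):
 * guess_mtu(1500) returns 1492, the first plateau strictly below the
 * old MTU, and anything at or below the smallest plateau (128) falls
 * through to the IPv4 minimum of 68.
 */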

unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
        int i;
        unsigned short old_mtu = ntohs(iph->tot_len);
        struct rtable *rth;
        u32  skeys[2] = { iph->saddr, 0, };
        u32  daddr = iph->daddr;
        u8   tos = iph->tos & IPTOS_RT_MASK;
        unsigned short est_mtu = 0;

        if (ipv4_config.no_pmtu_disc)
                return 0;

        for (i = 0; i < 2; i++) {
                unsigned hash = rt_hash_code(daddr, skeys[i], tos);

                rcu_read_lock();
                for (rth = rt_hash_table[hash].chain; rth;
                     rth = rth->u.rt_next) {
                        smp_read_barrier_depends();
                        if (rth->fl.fl4_dst == daddr &&
                            rth->fl.fl4_src == skeys[i] &&
                            rth->rt_dst  == daddr &&
                            rth->rt_src  == iph->saddr &&
                            rth->fl.fl4_tos == tos &&
                            rth->fl.iif == 0 &&
                            !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
                                unsigned short mtu = new_mtu;

                                if (new_mtu < 68 || new_mtu >= old_mtu) {

                                        /* BSD 4.2 compatibility hack :-( */
                                        if (mtu == 0 &&
                                            old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
                                            old_mtu >= 68 + (iph->ihl << 2))
                                                old_mtu -= iph->ihl << 2;

                                        mtu = guess_mtu(old_mtu);
                                }
                                if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
                                        if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
                                                dst_confirm(&rth->u.dst);
                                                if (mtu < ip_rt_min_pmtu) {
                                                        mtu = ip_rt_min_pmtu;
                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
                                                                (1 << RTAX_MTU);
                                                }
                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
                                                dst_set_expires(&rth->u.dst,
                                                        ip_rt_mtu_expires);
                                        }
                                        est_mtu = mtu;
                                }
                        }
                }
                rcu_read_unlock();
        }
        return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
        if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
            !(dst_metric_locked(dst, RTAX_MTU))) {
                if (mtu < ip_rt_min_pmtu) {
                        mtu = ip_rt_min_pmtu;
                        dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
                }
                dst->metrics[RTAX_MTU-1] = mtu;
                dst_set_expires(dst, ip_rt_mtu_expires);
        }
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        dst_release(dst);
        return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;
        struct inet_peer *peer = rt->peer;
        struct in_device *idev = rt->idev;

        if (peer) {
                rt->peer = NULL;
                inet_putpeer(peer);
        }

        if (idev) {
                rt->idev = NULL;
                in_dev_put(idev);
        }
}

static void ipv4_dst_ifdown(struct dst_entry *dst, int how)
{
        struct rtable *rt = (struct rtable *) dst;
        struct in_device *idev = rt->idev;
        if (idev) {
                rt->idev = NULL;
                in_dev_put(idev);
        }
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = (struct rtable *) skb->dst;
        if (rt)
                dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;

        printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
                NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
                skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
        u32 src;
        struct fib_result res;

        if (rt->fl.iif == 0)
                src = rt->rt_src;
        else if (fib_lookup(&rt->fl, &res) == 0) {
#ifdef CONFIG_IP_ROUTE_NAT
                if (res.type == RTN_NAT)
                        src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
                                                RT_SCOPE_UNIVERSE);
                else
#endif
                        src = FIB_RES_PREFSRC(res);
                fib_res_put(&res);
        } else
                src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
                                        RT_SCOPE_UNIVERSE);
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->u.dst.tclassid & 0xFFFF))
                rt->u.dst.tclassid |= tag & 0xFFFF;
        if (!(rt->u.dst.tclassid & 0xFFFF0000))
                rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
        struct fib_info *fi = res->fi;

        if (fi) {
                if (FIB_RES_GW(*res) &&
                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
                        rt->rt_gateway = FIB_RES_GW(*res);
                memcpy(rt->u.dst.metrics, fi->fib_metrics,
                       sizeof(rt->u.dst.metrics));
                if (fi->fib_mtu == 0) {
                        rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
                        if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
                            rt->rt_gateway != rt->rt_dst &&
                            rt->u.dst.dev->mtu > 576)
                                rt->u.dst.metrics[RTAX_MTU-1] = 576;
                }
#ifdef CONFIG_NET_CLS_ROUTE
                rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
        } else
                rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

        if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
                rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
        if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
                rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
        if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
                rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
                                       ip_rt_min_advmss);
        if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
                rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, fib_rules_tclass(res));
#endif
        set_class_tag(rt, itag);
#endif
        rt->rt_type = res->type;
}
1457
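/* Build and cache a route for an incoming multicast packet.  "our" is
 * non-zero when this host is a member of the group, in which case the
 * packet is also delivered locally via ip_local_deliver().
 */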
1458 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1459                                 u8 tos, struct net_device *dev, int our)
1460 {
1461         unsigned hash;
1462         struct rtable *rth;
1463         u32 spec_dst;
1464         struct in_device *in_dev = in_dev_get(dev);
1465         u32 itag = 0;
1466
1467         /* Primary sanity checks. */
1468
1469         if (in_dev == NULL)
1470                 return -EINVAL;
1471
1472         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1473             skb->protocol != htons(ETH_P_IP))
1474                 goto e_inval;
1475
1476         if (ZERONET(saddr)) {
1477                 if (!LOCAL_MCAST(daddr))
1478                         goto e_inval;
1479                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1480         } else if (fib_validate_source(saddr, 0, tos, 0,
1481                                         dev, &spec_dst, &itag) < 0)
1482                 goto e_inval;
1483
1484         rth = dst_alloc(&ipv4_dst_ops);
1485         if (!rth)
1486                 goto e_nobufs;
1487
1488         rth->u.dst.output= ip_rt_bug;
1489
1490         atomic_set(&rth->u.dst.__refcnt, 1);
1491         rth->u.dst.flags= DST_HOST;
1492         if (in_dev->cnf.no_policy)
1493                 rth->u.dst.flags |= DST_NOPOLICY;
1494         rth->fl.fl4_dst = daddr;
1495         rth->rt_dst     = daddr;
1496         rth->fl.fl4_tos = tos;
1497 #ifdef CONFIG_IP_ROUTE_FWMARK
1498         rth->fl.fl4_fwmark= skb->nfmark;
1499 #endif
1500         rth->fl.fl4_src = saddr;
1501         rth->rt_src     = saddr;
1502 #ifdef CONFIG_IP_ROUTE_NAT
1503         rth->rt_dst_map = daddr;
1504         rth->rt_src_map = saddr;
1505 #endif
1506 #ifdef CONFIG_NET_CLS_ROUTE
1507         rth->u.dst.tclassid = itag;
1508 #endif
1509         rth->rt_iif     =
1510         rth->fl.iif     = dev->ifindex;
1511         rth->u.dst.dev  = &loopback_dev;
1512         dev_hold(rth->u.dst.dev);
1513         rth->idev       = in_dev_get(rth->u.dst.dev);
1514         rth->fl.oif     = 0;
1515         rth->rt_gateway = daddr;
1516         rth->rt_spec_dst= spec_dst;
1517         rth->rt_type    = RTN_MULTICAST;
1518         rth->rt_flags   = RTCF_MULTICAST;
1519         if (our) {
1520                 rth->u.dst.input= ip_local_deliver;
1521                 rth->rt_flags |= RTCF_LOCAL;
1522         }
1523
1524 #ifdef CONFIG_IP_MROUTE
1525         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1526                 rth->u.dst.input = ip_mr_input;
1527 #endif
1528         RT_CACHE_STAT_INC(in_slow_mc);
1529
1530         in_dev_put(in_dev);
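        /* Hash on (daddr, saddr ^ (ifindex << 5), tos), the same key used
         * by the fast-path lookup in ip_route_input(), then insert the new
         * entry into the cache and attach it to skb->dst.
         */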
1531         hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1532         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1533
1534 e_nobufs:
1535         in_dev_put(in_dev);
1536         return -ENOBUFS;
1537
1538 e_inval:
1539         in_dev_put(in_dev);
1540         return -EINVAL;
1541 }
1542
1543 /*
1544  *      NOTE. We drop all packets that have local source
1545  *      addresses, because every properly looped-back packet
1546  *      must already have the correct destination attached by the output routine.
1547  *
1548  *      This approach solves two big problems:
1549  *      1. Non-simplex devices are handled properly.
1550  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1551  */
1552
1553 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1554                         u8 tos, struct net_device *dev)
1555 {
1556         struct fib_result res;
1557         struct in_device *in_dev = in_dev_get(dev);
1558         struct in_device *out_dev = NULL;
1559         struct flowi fl = { .nl_u = { .ip4_u =
1560                                       { .daddr = daddr,
1561                                         .saddr = saddr,
1562                                         .tos = tos,
1563                                         .scope = RT_SCOPE_UNIVERSE,
1564 #ifdef CONFIG_IP_ROUTE_FWMARK
1565                                         .fwmark = skb->nfmark
1566 #endif
1567                                       } },
1568                             .iif = dev->ifindex };
1569         unsigned        flags = 0;
1570         u32             itag = 0;
1571         struct rtable * rth;
1572         unsigned        hash;
1573         u32             spec_dst;
1574         int             err = -EINVAL;
1575         int             free_res = 0;
1576
1577         /* IP on this device is disabled. */
1578
1579         if (!in_dev)
1580                 goto out;
1581
1582         hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
1583
1584         /* Check for the weirdest martians, which cannot be detected
1585            by fib_lookup.
1586          */
1587
1588         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1589                 goto martian_source;
1590
1591         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1592                 goto brd_input;
1593
1594         /* Accept zero addresses only for limited broadcast;
1595          * I do not even know whether to fix this or not. Waiting for complaints :-)
1596          */
1597         if (ZERONET(saddr))
1598                 goto martian_source;
1599
1600         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1601                 goto martian_destination;
1602
1603         /*
1604          *      Now we are ready to route packet.
1605          */
1606         if ((err = fib_lookup(&fl, &res)) != 0) {
1607                 if (!IN_DEV_FORWARD(in_dev))
1608                         goto e_inval;
1609                 goto no_route;
1610         }
1611         free_res = 1;
1612
1613         RT_CACHE_STAT_INC(in_slow_tot);
1614
1615 #ifdef CONFIG_IP_ROUTE_NAT
1616         /* Policy is applied before mapping the destination,
1617            but rerouting after the map must be done with the old source.
1618          */
1619
1620         if (1) {
1621                 u32 src_map = saddr;
1622                 if (res.r)
1623                         src_map = fib_rules_policy(saddr, &res, &flags);
1624
1625                 if (res.type == RTN_NAT) {
1626                         fl.fl4_dst = fib_rules_map_destination(daddr, &res);
1627                         fib_res_put(&res);
1628                         free_res = 0;
1629                         if (fib_lookup(&fl, &res))
1630                                 goto e_inval;
1631                         free_res = 1;
1632                         if (res.type != RTN_UNICAST)
1633                                 goto e_inval;
1634                         flags |= RTCF_DNAT;
1635                 }
1636                 fl.fl4_src = src_map;
1637         }
1638 #endif
1639
1640         if (res.type == RTN_BROADCAST)
1641                 goto brd_input;
1642
1643         if (res.type == RTN_LOCAL) {
1644                 int result;
1645                 result = fib_validate_source(saddr, daddr, tos,
1646                                              loopback_dev.ifindex,
1647                                              dev, &spec_dst, &itag);
1648                 if (result < 0)
1649                         goto martian_source;
1650                 if (result)
1651                         flags |= RTCF_DIRECTSRC;
1652                 spec_dst = daddr;
1653                 goto local_input;
1654         }
1655
1656         if (!IN_DEV_FORWARD(in_dev))
1657                 goto e_inval;
1658         if (res.type != RTN_UNICAST)
1659                 goto martian_destination;
1660
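        /* With multipath support a route may have several next hops; unless
         * an output interface was forced, let fib_select_multipath() make a
         * weighted choice among them.
         */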
1661 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1662         if (res.fi->fib_nhs > 1 && fl.oif == 0)
1663                 fib_select_multipath(&fl, &res);
1664 #endif
1665         out_dev = in_dev_get(FIB_RES_DEV(res));
1666         if (out_dev == NULL) {
1667                 if (net_ratelimit())
1668                         printk(KERN_CRIT "Bug in ip_route_input_slow(). "
1669                                          "Please, report\n");
1670                 goto e_inval;
1671         }
1672
1673         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev,
1674                                   &spec_dst, &itag);
1675         if (err < 0)
1676                 goto martian_source;
1677
1678         if (err)
1679                 flags |= RTCF_DIRECTSRC;
1680
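        /* The packet would leave through the interface it arrived on and
         * the sender is directly reachable: flag the route so the
         * forwarding path can send an ICMP redirect back to the sender.
         */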
1681         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1682             (IN_DEV_SHARED_MEDIA(out_dev) ||
1683              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1684                 flags |= RTCF_DOREDIRECT;
1685
1686         if (skb->protocol != htons(ETH_P_IP)) {
1687                 /* Not IP (i.e. ARP). Do not create a route if it is
1688                  * invalid for proxy ARP. DNAT routes are always valid.
1689                  */
1690                 if (out_dev == in_dev && !(flags & RTCF_DNAT))
1691                         goto e_inval;
1692         }
1693
1694         rth = dst_alloc(&ipv4_dst_ops);
1695         if (!rth)
1696                 goto e_nobufs;
1697
1698         atomic_set(&rth->u.dst.__refcnt, 1);
1699         rth->u.dst.flags= DST_HOST;
1700         if (in_dev->cnf.no_policy)
1701                 rth->u.dst.flags |= DST_NOPOLICY;
1702         if (in_dev->cnf.no_xfrm)
1703                 rth->u.dst.flags |= DST_NOXFRM;
1704         rth->fl.fl4_dst = daddr;
1705         rth->rt_dst     = daddr;
1706         rth->fl.fl4_tos = tos;
1707 #ifdef CONFIG_IP_ROUTE_FWMARK
1708         rth->fl.fl4_fwmark= skb->nfmark;
1709 #endif
1710         rth->fl.fl4_src = saddr;
1711         rth->rt_src     = saddr;
1712         rth->rt_gateway = daddr;
1713 #ifdef CONFIG_IP_ROUTE_NAT
1714         rth->rt_src_map = fl.fl4_src;
1715         rth->rt_dst_map = fl.fl4_dst;
1716         if (flags&RTCF_DNAT)
1717                 rth->rt_gateway = fl.fl4_dst;
1718 #endif
1719         rth->rt_iif     =
1720         rth->fl.iif     = dev->ifindex;
1721         rth->u.dst.dev  = out_dev->dev;
1722         dev_hold(rth->u.dst.dev);
1723         rth->idev       = in_dev_get(rth->u.dst.dev);
1724         rth->fl.oif     = 0;
1725         rth->rt_spec_dst= spec_dst;
1726
1727         rth->u.dst.input = ip_forward;
1728         rth->u.dst.output = ip_output;
1729
1730         rt_set_nexthop(rth, &res, itag);
1731
1732         rth->rt_flags = flags;
1733
1734 intern:
1735         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1736 done:
1737         in_dev_put(in_dev);
1738         if (out_dev)
1739                 in_dev_put(out_dev);
1740         if (free_res)
1741                 fib_res_put(&res);
1742 out:    return err;
1743
1744 brd_input:
1745         if (skb->protocol != htons(ETH_P_IP))
1746                 goto e_inval;
1747
1748         if (ZERONET(saddr))
1749                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1750         else {
1751                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1752                                           &itag);
1753                 if (err < 0)
1754                         goto martian_source;
1755                 if (err)
1756                         flags |= RTCF_DIRECTSRC;
1757         }
1758         flags |= RTCF_BROADCAST;
1759         res.type = RTN_BROADCAST;
1760         RT_CACHE_STAT_INC(in_brd);
1761
1762 local_input:
1763         rth = dst_alloc(&ipv4_dst_ops);
1764         if (!rth)
1765                 goto e_nobufs;
1766
1767         rth->u.dst.output= ip_rt_bug;
1768
1769         atomic_set(&rth->u.dst.__refcnt, 1);
1770         rth->u.dst.flags= DST_HOST;
1771         if (in_dev->cnf.no_policy)
1772                 rth->u.dst.flags |= DST_NOPOLICY;
1773         rth->fl.fl4_dst = daddr;
1774         rth->rt_dst     = daddr;
1775         rth->fl.fl4_tos = tos;
1776 #ifdef CONFIG_IP_ROUTE_FWMARK
1777         rth->fl.fl4_fwmark= skb->nfmark;
1778 #endif
1779         rth->fl.fl4_src = saddr;
1780         rth->rt_src     = saddr;
1781 #ifdef CONFIG_IP_ROUTE_NAT
1782         rth->rt_dst_map = fl.fl4_dst;
1783         rth->rt_src_map = fl.fl4_src;
1784 #endif
1785 #ifdef CONFIG_NET_CLS_ROUTE
1786         rth->u.dst.tclassid = itag;
1787 #endif
1788         rth->rt_iif     =
1789         rth->fl.iif     = dev->ifindex;
1790         rth->u.dst.dev  = &loopback_dev;
1791         dev_hold(rth->u.dst.dev);
1792         rth->idev       = in_dev_get(rth->u.dst.dev);
1793         rth->rt_gateway = daddr;
1794         rth->rt_spec_dst= spec_dst;
1795         rth->u.dst.input= ip_local_deliver;
1796         rth->rt_flags   = flags|RTCF_LOCAL;
1797         if (res.type == RTN_UNREACHABLE) {
1798                 rth->u.dst.input= ip_error;
1799                 rth->u.dst.error= -err;
1800                 rth->rt_flags   &= ~RTCF_LOCAL;
1801         }
1802         rth->rt_type    = res.type;
1803         goto intern;
1804
1805 no_route:
1806         RT_CACHE_STAT_INC(in_no_route);
1807         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1808         res.type = RTN_UNREACHABLE;
1809         goto local_input;
1810
1811         /*
1812          *      Do not cache martian addresses: they should be logged (RFC1812)
1813          */
1814 martian_destination:
1815         RT_CACHE_STAT_INC(in_martian_dst);
1816 #ifdef CONFIG_IP_ROUTE_VERBOSE
1817         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1818                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1819                         "%u.%u.%u.%u, dev %s\n",
1820                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1821 #endif
1822 e_inval:
1823         err = -EINVAL;
1824         goto done;
1825
1826 e_nobufs:
1827         err = -ENOBUFS;
1828         goto done;
1829
1830 martian_source:
1831
1832         RT_CACHE_STAT_INC(in_martian_src);
1833 #ifdef CONFIG_IP_ROUTE_VERBOSE
1834         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1835                 /*
1836                  *      RFC1812 recommendation: if the source is martian,
1837                  *      the only hint is the MAC header.
1838                  */
1839                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1840                         "%u.%u.%u.%u, on dev %s\n",
1841                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1842                 if (dev->hard_header_len) {
1843                         int i;
1844                         unsigned char *p = skb->mac.raw;
1845                         printk(KERN_WARNING "ll header: ");
1846                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1847                                 printk("%02x", *p);
1848                                 if (i < (dev->hard_header_len - 1))
1849                                         printk(":");
1850                         }
1851                         printk("\n");
1852                 }
1853         }
1854 #endif
1855         goto e_inval;
1856 }
1857
1858 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1859                    u8 tos, struct net_device *dev)
1860 {
1861         struct rtable * rth;
1862         unsigned        hash;
1863         int iif = dev->ifindex;
1864
1865         tos &= IPTOS_RT_MASK;
1866         hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
1867
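        /* Fast path: walk the hash chain under RCU.  A cache hit must match
         * the complete flow key: addresses, input interface, oif == 0,
         * firewall mark (if configured) and TOS.
         */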
1868         rcu_read_lock();
1869         for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
1870                 smp_read_barrier_depends();
1871                 if (rth->fl.fl4_dst == daddr &&
1872                     rth->fl.fl4_src == saddr &&
1873                     rth->fl.iif == iif &&
1874                     rth->fl.oif == 0 &&
1875 #ifdef CONFIG_IP_ROUTE_FWMARK
1876                     rth->fl.fl4_fwmark == skb->nfmark &&
1877 #endif
1878                     rth->fl.fl4_tos == tos) {
1879                         rth->u.dst.lastuse = jiffies;
1880                         dst_hold(&rth->u.dst);
1881                         rth->u.dst.__use++;
1882                         RT_CACHE_STAT_INC(in_hit);
1883                         rcu_read_unlock();
1884                         skb->dst = (struct dst_entry*)rth;
1885                         return 0;
1886                 }
1887                 RT_CACHE_STAT_INC(in_hlist_search);
1888         }
1889         rcu_read_unlock();
1890
1891         /* Multicast recognition logic has been moved from the route cache
1892            to here. The problem was that too many Ethernet cards have
1893            broken/missing hardware multicast filters :-( As a result, a host
1894            on a multicast network acquires a lot of useless route cache
1895            entries, e.g. for SDR messages from all over the world. Now we
1896            try to get rid of them. Really, provided the software IP multicast
1897            filter is organized reasonably (at least, hashed), it causes no
1898            slowdown compared with route cache reject entries.
1899            Note that multicast routers are not affected, because a
1900            route cache entry is created for them eventually.
1901          */
1902         if (MULTICAST(daddr)) {
1903                 struct in_device *in_dev;
1904
1905                 read_lock(&inetdev_lock);
1906                 if ((in_dev = __in_dev_get(dev)) != NULL) {
1907                         int our = ip_check_mc(in_dev, daddr, saddr,
1908                                 skb->nh.iph->protocol);
1909                         if (our
1910 #ifdef CONFIG_IP_MROUTE
1911                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1912 #endif
1913                             ) {
1914                                 read_unlock(&inetdev_lock);
1915                                 return ip_route_input_mc(skb, daddr, saddr,
1916                                                          tos, dev, our);
1917                         }
1918                 }
1919                 read_unlock(&inetdev_lock);
1920                 return -EINVAL;
1921         }
1922         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1923 }
1924
1925 /*
1926  * Major route resolver routine.
1927  */
1928
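/* Resolve an output route from the keys in oldflp.  RTO_ONLINK in the tos
 * field restricts the lookup to link scope, i.e. directly connected
 * destinations.
 */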
1929 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
1930 {
1931         u32 tos = oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK);
1932         struct flowi fl = { .nl_u = { .ip4_u =
1933                                       { .daddr = oldflp->fl4_dst,
1934                                         .saddr = oldflp->fl4_src,
1935                                         .tos = tos & IPTOS_RT_MASK,
1936                                         .scope = ((tos & RTO_ONLINK) ?
1937                                                   RT_SCOPE_LINK :
1938                                                   RT_SCOPE_UNIVERSE),
1939 #ifdef CONFIG_IP_ROUTE_FWMARK
1940                                         .fwmark = oldflp->fl4_fwmark
1941 #endif
1942                                       } },
1943                             .iif = loopback_dev.ifindex,
1944                             .oif = oldflp->oif };
1945         struct fib_result res;
1946         unsigned flags = 0;
1947         struct rtable *rth;
1948         struct net_device *dev_out = NULL;
1949         struct in_device *in_dev = NULL;
1950         unsigned hash;
1951         int free_res = 0;
1952         int err;
1953
1954         res.fi          = NULL;
1955 #ifdef CONFIG_IP_MULTIPLE_TABLES
1956         res.r           = NULL;
1957 #endif
1958
1959         if (oldflp->fl4_src) {
1960                 err = -EINVAL;
1961                 if (MULTICAST(oldflp->fl4_src) ||
1962                     BADCLASS(oldflp->fl4_src) ||
1963                     ZERONET(oldflp->fl4_src))
1964                         goto out;
1965
1966                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1967                 dev_out = ip_dev_find(oldflp->fl4_src);
1968                 if (dev_out == NULL)
1969                         goto out;
1970
1971                 /* I removed the check for oif == dev_out->oif here.
1972                    It was wrong for two reasons:
1973                    1. ip_dev_find(saddr) can return the wrong iface if saddr
1974                       is assigned to multiple interfaces.
1975                    2. Moreover, we are allowed to send packets with the saddr
1976                       of another iface. --ANK
1977                  */
1978
1979                 if (oldflp->oif == 0
1980                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
1981                         /* Special hack: the user can direct multicasts
1982                            and limited broadcast via the desired interface
1983                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1984                            This hack is not just for fun; it allows
1985                            vic, vat and friends to work.
1986                            They bind the socket to loopback, set the ttl to zero
1987                            and expect that it will work.
1988                            From the viewpoint of the routing cache they are broken,
1989                            because we are not allowed to build a multicast path
1990                            with a loopback source addr (look, the routing cache
1991                            cannot know that the ttl is zero, so the packet
1992                            will not leave this host and the route looks valid).
1993                            Luckily, this hack is a good workaround.
1994                          */
1995
1996                         fl.oif = dev_out->ifindex;
1997                         goto make_route;
1998                 }
1999                 if (dev_out)
2000                         dev_put(dev_out);
2001                 dev_out = NULL;
2002         }
2003         if (oldflp->oif) {
2004                 dev_out = dev_get_by_index(oldflp->oif);
2005                 err = -ENODEV;
2006                 if (dev_out == NULL)
2007                         goto out;
2008                 if (__in_dev_get(dev_out) == NULL) {
2009                         dev_put(dev_out);
2010                         goto out;       /* Wrong error code */
2011                 }
2012
2013                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2014                         if (!fl.fl4_src)
2015                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2016                                                               RT_SCOPE_LINK);
2017                         goto make_route;
2018                 }
2019                 if (!fl.fl4_src) {
2020                         if (MULTICAST(oldflp->fl4_dst))
2021                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2022                                                               fl.fl4_scope);
2023                         else if (!oldflp->fl4_dst)
2024                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2025                                                               RT_SCOPE_HOST);
2026                 }
2027         }
2028
2029         if (!fl.fl4_dst) {
2030                 fl.fl4_dst = fl.fl4_src;
2031                 if (!fl.fl4_dst)
2032                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2033                 if (dev_out)
2034                         dev_put(dev_out);
2035                 dev_out = &loopback_dev;
2036                 dev_hold(dev_out);
2037                 fl.oif = loopback_dev.ifindex;
2038                 res.type = RTN_LOCAL;
2039                 flags |= RTCF_LOCAL;
2040                 goto make_route;
2041         }
2042
2043         if (fib_lookup(&fl, &res)) {
2044                 res.fi = NULL;
2045                 if (oldflp->oif) {
2046                         /* Apparently, the routing tables are wrong. Assume
2047                            that the destination is on-link.
2048
2049                            WHY? DW.
2050                            Because we are allowed to send to an iface
2051                            even if it has NO routes and NO assigned
2052                            addresses. When oif is specified, the routing
2053                            tables are looked up with only one purpose:
2054                            to catch whether the destination is gatewayed
2055                            rather than direct. Moreover, if MSG_DONTROUTE is
2056                            set, we send the packet, ignoring both the routing
2057                            tables and the ifaddr state. --ANK
2058
2059
2060                            We could do this even if oif is unknown,
2061                            as IPv6 likely does, but we do not.
2062                          */
2063
2064                         if (fl.fl4_src == 0)
2065                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2066                                                               RT_SCOPE_LINK);
2067                         res.type = RTN_UNICAST;
2068                         goto make_route;
2069                 }
2070                 if (dev_out)
2071                         dev_put(dev_out);
2072                 err = -ENETUNREACH;
2073                 goto out;
2074         }
2075         free_res = 1;
2076
2077         if (res.type == RTN_NAT)
2078                 goto e_inval;
2079
2080         if (res.type == RTN_LOCAL) {
2081                 if (!fl.fl4_src)
2082                         fl.fl4_src = fl.fl4_dst;
2083                 if (dev_out)
2084                         dev_put(dev_out);
2085                 dev_out = &loopback_dev;
2086                 dev_hold(dev_out);
2087                 fl.oif = dev_out->ifindex;
2088                 if (res.fi)
2089                         fib_info_put(res.fi);
2090                 res.fi = NULL;
2091                 flags |= RTCF_LOCAL;
2092                 goto make_route;
2093         }
2094
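        /* Pick one of several next hops, or, if the match is the zero-length
         * (default) prefix, let fib_select_default() choose among the
         * configured default routes.
         */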
2095 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2096         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2097                 fib_select_multipath(&fl, &res);
2098         else
2099 #endif
2100         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2101                 fib_select_default(&fl, &res);
2102
2103         if (!fl.fl4_src)
2104                 fl.fl4_src = FIB_RES_PREFSRC(res);
2105
2106         if (dev_out)
2107                 dev_put(dev_out);
2108         dev_out = FIB_RES_DEV(res);
2109         dev_hold(dev_out);
2110         fl.oif = dev_out->ifindex;
2111
2112 make_route:
2113         if (LOOPBACK(fl.fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2114                 goto e_inval;
2115
2116         if (fl.fl4_dst == 0xFFFFFFFF)
2117                 res.type = RTN_BROADCAST;
2118         else if (MULTICAST(fl.fl4_dst))
2119                 res.type = RTN_MULTICAST;
2120         else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst))
2121                 goto e_inval;
2122
2123         if (dev_out->flags & IFF_LOOPBACK)
2124                 flags |= RTCF_LOCAL;
2125
2126         in_dev = in_dev_get(dev_out);
2127         if (!in_dev)
2128                 goto e_inval;
2129
2130         if (res.type == RTN_BROADCAST) {
2131                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2132                 if (res.fi) {
2133                         fib_info_put(res.fi);
2134                         res.fi = NULL;
2135                 }
2136         } else if (res.type == RTN_MULTICAST) {
2137                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2138                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto))
2139                         flags &= ~RTCF_LOCAL;
2140                 /* If a multicast route does not exist, use the
2141                    default one, but do not gateway in this case.
2142                    Yes, it is a hack.
2143                  */
2144                 if (res.fi && res.prefixlen < 4) {
2145                         fib_info_put(res.fi);
2146                         res.fi = NULL;
2147                 }
2148         }
2149
2150         rth = dst_alloc(&ipv4_dst_ops);
2151         if (!rth)
2152                 goto e_nobufs;
2153
2154         atomic_set(&rth->u.dst.__refcnt, 1);
2155         rth->u.dst.flags= DST_HOST;
2156         if (in_dev->cnf.no_xfrm)
2157                 rth->u.dst.flags |= DST_NOXFRM;
2158         if (in_dev->cnf.no_policy)
2159                 rth->u.dst.flags |= DST_NOPOLICY;
2160         rth->fl.fl4_dst = oldflp->fl4_dst;
2161         rth->fl.fl4_tos = tos;
2162         rth->fl.fl4_src = oldflp->fl4_src;
2163         rth->fl.oif     = oldflp->oif;
2164 #ifdef CONFIG_IP_ROUTE_FWMARK
2165         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2166 #endif
2167         rth->rt_dst     = fl.fl4_dst;
2168         rth->rt_src     = fl.fl4_src;
2169 #ifdef CONFIG_IP_ROUTE_NAT
2170         rth->rt_dst_map = fl.fl4_dst;
2171         rth->rt_src_map = fl.fl4_src;
2172 #endif
2173         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2174         rth->u.dst.dev  = dev_out;
2175         dev_hold(dev_out);
2176         rth->idev       = in_dev_get(dev_out);
2177         rth->rt_gateway = fl.fl4_dst;
2178         rth->rt_spec_dst= fl.fl4_src;
2179
2180         rth->u.dst.output=ip_output;
2181
2182         RT_CACHE_STAT_INC(out_slow_tot);
2183
2184         if (flags & RTCF_LOCAL) {
2185                 rth->u.dst.input = ip_local_deliver;
2186                 rth->rt_spec_dst = fl.fl4_dst;
2187         }
2188         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2189                 rth->rt_spec_dst = fl.fl4_src;
2190                 if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
2191                         rth->u.dst.output = ip_mc_output;
2192                         RT_CACHE_STAT_INC(out_slow_mc);
2193                 }
2194 #ifdef CONFIG_IP_MROUTE
2195                 if (res.type == RTN_MULTICAST) {
2196                         if (IN_DEV_MFORWARD(in_dev) &&
2197                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2198                                 rth->u.dst.input = ip_mr_input;
2199                                 rth->u.dst.output = ip_mc_output;
2200                         }
2201                 }
2202 #endif
2203         }
2204
2205         rt_set_nexthop(rth, &res, 0);
2206
2207
2208         rth->rt_flags = flags;
2209
2210         hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2211         err = rt_intern_hash(hash, rth, rp);
2212 done:
2213         if (free_res)
2214                 fib_res_put(&res);
2215         if (dev_out)
2216                 dev_put(dev_out);
2217         if (in_dev)
2218                 in_dev_put(in_dev);
2219 out:    return err;
2220
2221 e_inval:
2222         err = -EINVAL;
2223         goto done;
2224 e_nobufs:
2225         err = -ENOBUFS;
2226         goto done;
2227 }
2228
2229 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2230 {
2231         unsigned hash;
2232         struct rtable *rth;
2233
2234         hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2235
2236         rcu_read_lock();
2237         for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
2238                 smp_read_barrier_depends();
2239                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2240                     rth->fl.fl4_src == flp->fl4_src &&
2241                     rth->fl.iif == 0 &&
2242                     rth->fl.oif == flp->oif &&
2243 #ifdef CONFIG_IP_ROUTE_FWMARK
2244                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2245 #endif
2246                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2247                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2248                         rth->u.dst.lastuse = jiffies;
2249                         dst_hold(&rth->u.dst);
2250                         rth->u.dst.__use++;
2251                         RT_CACHE_STAT_INC(out_hit);
2252                         rcu_read_unlock();
2253                         *rp = rth;
2254                         return 0;
2255                 }
2256                 RT_CACHE_STAT_INC(out_hlist_search);
2257         }
2258         rcu_read_unlock();
2259
2260         return ip_route_output_slow(rp, flp);
2261 }
2262
2263 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2264 {
2265         int err;
2266
2267         if ((err = __ip_route_output_key(rp, flp)) != 0)
2268                 return err;
2269         return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, NULL, 0) : 0;
2270 }
2271
2272 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2273 {
2274         int err;
2275
2276         if ((err = __ip_route_output_key(rp, flp)) != 0)
2277                 return err;
2278         return flp->proto ? xfrm_lookup((struct dst_entry**)rp, flp, sk, flags) : 0;
2279 }
2280
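/* Fill an rtnetlink RTM_* message describing the cached route attached to
 * skb->dst: addresses, interfaces, gateway, metrics and an RTA_CACHEINFO
 * block with usage statistics.
 */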
2281 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2282                         int nowait)
2283 {
2284         struct rtable *rt = (struct rtable*)skb->dst;
2285         struct rtmsg *r;
2286         struct nlmsghdr  *nlh;
2287         unsigned char    *b = skb->tail;
2288         struct rta_cacheinfo ci;
2289 #ifdef CONFIG_IP_MROUTE
2290         struct rtattr *eptr;
2291 #endif
2292         nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2293         r = NLMSG_DATA(nlh);
2294         nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2295         r->rtm_family    = AF_INET;
2296         r->rtm_dst_len  = 32;
2297         r->rtm_src_len  = 0;
2298         r->rtm_tos      = rt->fl.fl4_tos;
2299         r->rtm_table    = RT_TABLE_MAIN;
2300         r->rtm_type     = rt->rt_type;
2301         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2302         r->rtm_protocol = RTPROT_UNSPEC;
2303         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2304         if (rt->rt_flags & RTCF_NOTIFY)
2305                 r->rtm_flags |= RTM_F_NOTIFY;
2306         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2307         if (rt->fl.fl4_src) {
2308                 r->rtm_src_len = 32;
2309                 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2310         }
2311         if (rt->u.dst.dev)
2312                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2313 #ifdef CONFIG_NET_CLS_ROUTE
2314         if (rt->u.dst.tclassid)
2315                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2316 #endif
2317         if (rt->fl.iif)
2318                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2319         else if (rt->rt_src != rt->fl.fl4_src)
2320                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2321         if (rt->rt_dst != rt->rt_gateway)
2322                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2323         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2324                 goto rtattr_failure;
2325         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2326         ci.rta_used     = rt->u.dst.__use;
2327         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2328         if (rt->u.dst.expires)
2329                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2330         else
2331                 ci.rta_expires = 0;
2332         ci.rta_error    = rt->u.dst.error;
2333         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2334         if (rt->peer) {
2335                 ci.rta_id = rt->peer->ip_id_count;
2336                 if (rt->peer->tcp_ts_stamp) {
2337                         ci.rta_ts = rt->peer->tcp_ts;
2338                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2339                 }
2340         }
2341 #ifdef CONFIG_IP_MROUTE
2342         eptr = (struct rtattr*)skb->tail;
2343 #endif
2344         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2345         if (rt->fl.iif) {
2346 #ifdef CONFIG_IP_MROUTE
2347                 u32 dst = rt->rt_dst;
2348
2349                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2350                     ipv4_devconf.mc_forwarding) {
2351                         int err = ipmr_get_route(skb, r, nowait);
2352                         if (err <= 0) {
2353                                 if (!nowait) {
2354                                         if (err == 0)
2355                                                 return 0;
2356                                         goto nlmsg_failure;
2357                                 } else {
2358                                         if (err == -EMSGSIZE)
2359                                                 goto nlmsg_failure;
2360                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2361                                 }
2362                         }
2363                 } else
2364 #endif
2365                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2366         }
2367
2368         nlh->nlmsg_len = skb->tail - b;
2369         return skb->len;
2370
2371 nlmsg_failure:
2372 rtattr_failure:
2373         skb_trim(skb, b - skb->data);
2374         return -1;
2375 }
2376
2377 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2378 {
2379         struct rtattr **rta = arg;
2380         struct rtmsg *rtm = NLMSG_DATA(nlh);
2381         struct rtable *rt = NULL;
2382         u32 dst = 0;
2383         u32 src = 0;
2384         int iif = 0;
2385         int err = -ENOBUFS;
2386         struct sk_buff *skb;
2387
2388         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2389         if (!skb)
2390                 goto out;
2391
2392         /* Reserve room for dummy headers; this skb can pass
2393            through a good chunk of the routing engine.
2394          */
2395         skb->mac.raw = skb->data;
2396         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2397
2398         if (rta[RTA_SRC - 1])
2399                 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2400         if (rta[RTA_DST - 1])
2401                 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2402         if (rta[RTA_IIF - 1])
2403                 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2404
2405         if (iif) {
2406                 struct net_device *dev = __dev_get_by_index(iif);
2407                 err = -ENODEV;
2408                 if (!dev)
2409                         goto out_free;
2410                 skb->protocol   = htons(ETH_P_IP);
2411                 skb->dev        = dev;
2412                 local_bh_disable();
2413                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2414                 local_bh_enable();
2415                 rt = (struct rtable*)skb->dst;
2416                 if (!err && rt->u.dst.error)
2417                         err = -rt->u.dst.error;
2418         } else {
2419                 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2420                                                          .saddr = src,
2421                                                          .tos = rtm->rtm_tos } } };
2422                 int oif = 0;
2423                 if (rta[RTA_OIF - 1])
2424                         memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2425                 fl.oif = oif;
2426                 err = ip_route_output_key(&rt, &fl);
2427         }
2428         if (err)
2429                 goto out_free;
2430
2431         skb->dst = &rt->u.dst;
2432         if (rtm->rtm_flags & RTM_F_NOTIFY)
2433                 rt->rt_flags |= RTCF_NOTIFY;
2434
2435         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2436
2437         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2438                                 RTM_NEWROUTE, 0);
2439         if (!err)
2440                 goto out_free;
2441         if (err < 0) {
2442                 err = -EMSGSIZE;
2443                 goto out_free;
2444         }
2445
2446         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2447         if (err > 0)
2448                 err = 0;
2449 out:    return err;
2450
2451 out_free:
2452         kfree_skb(skb);
2453         goto out;
2454 }
2455
2456 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2457 {
2458         struct rtable *rt;
2459         int h, s_h;
2460         int idx, s_idx;
2461
2462         s_h = cb->args[0];
2463         s_idx = idx = cb->args[1];
2464         for (h = 0; h <= rt_hash_mask; h++) {
2465                 if (h < s_h) continue;
2466                 if (h > s_h)
2467                         s_idx = 0;
2468                 rcu_read_lock();
2469                 for (rt = rt_hash_table[h].chain, idx = 0; rt;
2470                      rt = rt->u.rt_next, idx++) {
2471                         smp_read_barrier_depends();
2472                         if (idx < s_idx)
2473                                 continue;
2474                         skb->dst = dst_clone(&rt->u.dst);
2475                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2476                                          cb->nlh->nlmsg_seq,
2477                                          RTM_NEWROUTE, 1) <= 0) {
2478                                 dst_release(xchg(&skb->dst, NULL));
2479                                 rcu_read_unlock();
2480                                 goto done;
2481                         }
2482                         dst_release(xchg(&skb->dst, NULL));
2483                 }
2484                 rcu_read_unlock();
2485         }
2486
2487 done:
2488         cb->args[0] = h;
2489         cb->args[1] = idx;
2490         return skb->len;
2491 }
2492
2493 void ip_rt_multicast_event(struct in_device *in_dev)
2494 {
2495         rt_cache_flush(0);
2496 }
2497
2498 #ifdef CONFIG_SYSCTL
2499 static int flush_delay;
2500
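/* Writing a delay value to /proc/sys/net/ipv4/route/flush schedules a
 * flush of the route cache; reading the file is rejected with -EINVAL.
 */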
2501 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2502                                         struct file *filp, void __user *buffer,
2503                                         size_t *lenp, loff_t *ppos)
2504 {
2505         if (write) {
2506                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2507                 rt_cache_flush(flush_delay);
2508                 return 0;
2509         } 
2510
2511         return -EINVAL;
2512 }
2513
2514 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2515                                                 int __user *name,
2516                                                 int nlen,
2517                                                 void __user *oldval,
2518                                                 size_t __user *oldlenp,
2519                                                 void __user *newval,
2520                                                 size_t newlen,
2521                                                 void **context)
2522 {
2523         int delay;
2524         if (newlen != sizeof(int))
2525                 return -EINVAL;
2526         if (get_user(delay, (int __user *)newval))
2527                 return -EFAULT; 
2528         rt_cache_flush(delay); 
2529         return 0;
2530 }
2531
2532 ctl_table ipv4_route_table[] = {
2533         {
2534                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2535                 .procname       = "flush",
2536                 .data           = &flush_delay,
2537                 .maxlen         = sizeof(int),
2538                 .mode           = 0644,
2539                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2540                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2541         },
2542         {
2543                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2544                 .procname       = "min_delay",
2545                 .data           = &ip_rt_min_delay,
2546                 .maxlen         = sizeof(int),
2547                 .mode           = 0644,
2548                 .proc_handler   = &proc_dointvec_jiffies,
2549                 .strategy       = &sysctl_jiffies,
2550         },
2551         {
2552                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2553                 .procname       = "max_delay",
2554                 .data           = &ip_rt_max_delay,
2555                 .maxlen         = sizeof(int),
2556                 .mode           = 0644,
2557                 .proc_handler   = &proc_dointvec_jiffies,
2558                 .strategy       = &sysctl_jiffies,
2559         },
2560         {
2561                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2562                 .procname       = "gc_thresh",
2563                 .data           = &ipv4_dst_ops.gc_thresh,
2564                 .maxlen         = sizeof(int),
2565                 .mode           = 0644,
2566                 .proc_handler   = &proc_dointvec,
2567         },
2568         {
2569                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2570                 .procname       = "max_size",
2571                 .data           = &ip_rt_max_size,
2572                 .maxlen         = sizeof(int),
2573                 .mode           = 0644,
2574                 .proc_handler   = &proc_dointvec,
2575         },
2576         {
2577                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2578                 .procname       = "gc_min_interval",
2579                 .data           = &ip_rt_gc_min_interval,
2580                 .maxlen         = sizeof(int),
2581                 .mode           = 0644,
2582                 .proc_handler   = &proc_dointvec_jiffies,
2583                 .strategy       = &sysctl_jiffies,
2584         },
2585         {
2586                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2587                 .procname       = "gc_timeout",
2588                 .data           = &ip_rt_gc_timeout,
2589                 .maxlen         = sizeof(int),
2590                 .mode           = 0644,
2591                 .proc_handler   = &proc_dointvec_jiffies,
2592                 .strategy       = &sysctl_jiffies,
2593         },
2594         {
2595                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2596                 .procname       = "gc_interval",
2597                 .data           = &ip_rt_gc_interval,
2598                 .maxlen         = sizeof(int),
2599                 .mode           = 0644,
2600                 .proc_handler   = &proc_dointvec_jiffies,
2601                 .strategy       = &sysctl_jiffies,
2602         },
2603         {
2604                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2605                 .procname       = "redirect_load",
2606                 .data           = &ip_rt_redirect_load,
2607                 .maxlen         = sizeof(int),
2608                 .mode           = 0644,
2609                 .proc_handler   = &proc_dointvec,
2610         },
2611         {
2612                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2613                 .procname       = "redirect_number",
2614                 .data           = &ip_rt_redirect_number,
2615                 .maxlen         = sizeof(int),
2616                 .mode           = 0644,
2617                 .proc_handler   = &proc_dointvec,
2618         },
2619         {
2620                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2621                 .procname       = "redirect_silence",
2622                 .data           = &ip_rt_redirect_silence,
2623                 .maxlen         = sizeof(int),
2624                 .mode           = 0644,
2625                 .proc_handler   = &proc_dointvec,
2626         },
2627         {
2628                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2629                 .procname       = "error_cost",
2630                 .data           = &ip_rt_error_cost,
2631                 .maxlen         = sizeof(int),
2632                 .mode           = 0644,
2633                 .proc_handler   = &proc_dointvec,
2634         },
2635         {
2636                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2637                 .procname       = "error_burst",
2638                 .data           = &ip_rt_error_burst,
2639                 .maxlen         = sizeof(int),
2640                 .mode           = 0644,
2641                 .proc_handler   = &proc_dointvec,
2642         },
2643         {
2644                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2645                 .procname       = "gc_elasticity",
2646                 .data           = &ip_rt_gc_elasticity,
2647                 .maxlen         = sizeof(int),
2648                 .mode           = 0644,
2649                 .proc_handler   = &proc_dointvec,
2650         },
2651         {
2652                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2653                 .procname       = "mtu_expires",
2654                 .data           = &ip_rt_mtu_expires,
2655                 .maxlen         = sizeof(int),
2656                 .mode           = 0644,
2657                 .proc_handler   = &proc_dointvec_jiffies,
2658                 .strategy       = &sysctl_jiffies,
2659         },
2660         {
2661                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2662                 .procname       = "min_pmtu",
2663                 .data           = &ip_rt_min_pmtu,
2664                 .maxlen         = sizeof(int),
2665                 .mode           = 0644,
2666                 .proc_handler   = &proc_dointvec,
2667         },
2668         {
2669                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2670                 .procname       = "min_adv_mss",
2671                 .data           = &ip_rt_min_advmss,
2672                 .maxlen         = sizeof(int),
2673                 .mode           = 0644,
2674                 .proc_handler   = &proc_dointvec,
2675         },
2676         {
2677                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2678                 .procname       = "secret_interval",
2679                 .data           = &ip_rt_secret_interval,
2680                 .maxlen         = sizeof(int),
2681                 .mode           = 0644,
2682                 .proc_handler   = &proc_dointvec_jiffies,
2683                 .strategy       = &sysctl_jiffies,
2684         },
2685         { .ctl_name = 0 }
2686 };
2687 #endif
2688
2689 #ifdef CONFIG_NET_CLS_ROUTE
2690 struct ip_rt_acct *ip_rt_acct;
2691
2692 /* This code sucks.  But you should have seen it before! --RR */
2693
2694 /* IP route accounting ptr for this logical cpu number. */
2695 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
2696
2697 #ifdef CONFIG_PROC_FS
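/* /proc/net/rt_acct read handler: returns the 256-entry realm accounting
 * table as raw u32 words, summed over all CPUs.  Offset and length must
 * be word aligned.
 */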
2698 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2699                            int length, int *eof, void *data)
2700 {
2701         unsigned int i;
2702
2703         if ((offset & 3) || (length & 3))
2704                 return -EIO;
2705
2706         if (offset >= sizeof(struct ip_rt_acct) * 256) {
2707                 *eof = 1;
2708                 return 0;
2709         }
2710
2711         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2712                 length = sizeof(struct ip_rt_acct) * 256 - offset;
2713                 *eof = 1;
2714         }
2715
2716         offset /= sizeof(u32);
2717
2718         if (length > 0) {
2719                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2720                 u32 *dst = (u32 *) buffer;
2721
2722                 /* Copy first cpu. */
2723                 *start = buffer;
2724                 memcpy(dst, src, length);
2725
2726                 /* Add the other cpus in, one int at a time */
2727                 for_each_cpu(i) {
2728                         unsigned int j;
2729
2730                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2731
2732                         for (j = 0; j < length/4; j++)
2733                                 dst[j] += src[j];
2734                 }
2735         }
2736         return length;
2737 }
2738 #endif /* CONFIG_PROC_FS */
2739 #endif /* CONFIG_NET_CLS_ROUTE */
2740
2741 static __initdata unsigned long rhash_entries;
2742 static int __init set_rhash_entries(char *str)
2743 {
2744         if (!str)
2745                 return 0;
2746         rhash_entries = simple_strtoul(str, &str, 0);
2747         return 1;
2748 }
2749 __setup("rhash_entries=", set_rhash_entries);
2750
2751 int __init ip_rt_init(void)
2752 {
2753         int i, order, goal, rc = 0;
2754
2755         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2756                              (jiffies ^ (jiffies >> 7)));
2757
2758 #ifdef CONFIG_NET_CLS_ROUTE
2759         for (order = 0;
2760              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2761                 /* NOTHING */;
2762         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2763         if (!ip_rt_acct)
2764                 panic("IP: failed to allocate ip_rt_acct\n");
2765         memset(ip_rt_acct, 0, PAGE_SIZE << order);
2766 #endif
2767
2768         ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2769                                                      sizeof(struct rtable),
2770                                                      0, SLAB_HWCACHE_ALIGN,
2771                                                      NULL, NULL);
2772
2773         if (!ipv4_dst_ops.kmem_cachep)
2774                 panic("IP: failed to allocate ip_dst_cache\n");
2775
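        /* Size the hash table: by default aim at roughly one page of
         * buckets per 64MB of memory (num_physpages >> (26 - PAGE_SHIFT)
         * pages), unless overridden with the "rhash_entries=" boot
         * parameter.  The do/while loop below retries progressively
         * smaller allocations until one succeeds, keeping the bucket
         * count a power of two.
         */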
2776         goal = num_physpages >> (26 - PAGE_SHIFT);
2777         if (rhash_entries)
2778                 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
2779         for (order = 0; (1UL << order) < goal; order++)
2780                 /* NOTHING */;
2781
2782         do {
2783                 rt_hash_mask = (1UL << order) * PAGE_SIZE /
2784                         sizeof(struct rt_hash_bucket);
2785                 while (rt_hash_mask & (rt_hash_mask - 1))
2786                         rt_hash_mask--;
2787                 rt_hash_table = (struct rt_hash_bucket *)
2788                         __get_free_pages(GFP_ATOMIC, order);
2789         } while (rt_hash_table == NULL && --order > 0);
2790
2791         if (!rt_hash_table)
2792                 panic("Failed to allocate IP route cache hash table\n");
2793
2794         printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
2795                rt_hash_mask,
2796                (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
2797
2798         for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
2799                 /* NOTHING */;
2800
2801         rt_hash_mask--;
2802         for (i = 0; i <= rt_hash_mask; i++) {
2803                 rt_hash_table[i].lock = SPIN_LOCK_UNLOCKED;
2804                 rt_hash_table[i].chain = NULL;
2805         }
2806
2807         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2808 #if defined(CONFIG_MIPS_BRCM)
2809 #if defined(SDRAM_8M)  //vic
2810         ip_rt_max_size = (rt_hash_mask + 1) * 1;
2811 #elif (defined(SDRAM_16M) && defined(SUPPORT_TR69C))  //vic
2812         ip_rt_max_size = (rt_hash_mask + 1) * 1;
2813 #else
2814         ip_rt_max_size = (rt_hash_mask + 1) * 2;
2815 #endif
2816 #else
2817         ip_rt_max_size = (rt_hash_mask + 1) * 16;
2818 #endif
2819
2820         rt_cache_stat = alloc_percpu(struct rt_cache_stat);
2821         if (!rt_cache_stat)
2822                 return -ENOMEM;
2823
2824         devinet_init();
2825         ip_fib_init();
2826
2827         init_timer(&rt_flush_timer);
2828         rt_flush_timer.function = rt_run_flush;
2829         init_timer(&rt_periodic_timer);
2830         rt_periodic_timer.function = rt_check_expire;
2831         init_timer(&rt_secret_timer);
2832         rt_secret_timer.function = rt_secret_rebuild;
2833
2834         /* All the timers started at system startup tend
2835            to synchronize. Perturb them a bit.
2836          */
2837         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
2838                                         ip_rt_gc_interval;
2839         add_timer(&rt_periodic_timer);
2840
2841         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2842                 ip_rt_secret_interval;
2843         add_timer(&rt_secret_timer);
2844
2845 #ifdef CONFIG_PROC_FS
2846         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2847             !proc_net_fops_create("rt_cache_stat", S_IRUGO, &rt_cpu_seq_fops)) {
2848                 free_percpu(rt_cache_stat);
2849                 return -ENOMEM;
2850         }
2851
2852 #ifdef CONFIG_NET_CLS_ROUTE
2853         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
2854 #endif
2855 #endif
2856 #ifdef CONFIG_XFRM
2857         xfrm_init();
2858         xfrm4_init();
2859 #endif
2860         return rc;
2861 }
2862
2863 EXPORT_SYMBOL(__ip_select_ident);
2864 EXPORT_SYMBOL(ip_route_input);
2865 EXPORT_SYMBOL(ip_route_output_key);