net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.99.2.1 2002/03/10 04:26:08 davem Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *
  19  *      See ip_input.c for original log
  20  *
  21  *      Fixes:
  22  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  23  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  24  *              Bradford Johnson:       Fix faulty handling of some frames when
  25  *                                      no route is found.
  26  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  27  *                                      (in case if packet not accepted by
  28  *                                      output firewall rules)
  29  *              Mike McLagan    :       Routing by source
  30  *              Alexey Kuznetsov:       use new route cache
  31  *              Andi Kleen:             Fix broken PMTU recovery and remove
  32  *                                      some redundant tests.
  33  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  34  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  35  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  36  *                                      for decreased register pressure on x86
  37  *                                      and more readability.
  38  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  39  *                                      silently drop skb instead of failing with -EPERM.
  40  *              Detlev Wengorz  :       Copy protocol for fragments.
  41  */
  42
  43 #include <asm/uaccess.h>
  44 #include <asm/system.h>
  45 #include <linux/types.h>
  46 #include <linux/kernel.h>
  47 #include <linux/sched.h>
  48 #include <linux/mm.h>
  49 #include <linux/string.h>
  50 #include <linux/errno.h>
  51 #include <linux/config.h>
  52
  53 #include <linux/socket.h>
  54 #include <linux/sockios.h>
  55 #include <linux/in.h>
  56 #include <linux/inet.h>
  57 #include <linux/netdevice.h>
  58 #include <linux/etherdevice.h>
  59 #include <linux/proc_fs.h>
  60 #include <linux/stat.h>
  61 #include <linux/init.h>
  62
  63 #include <net/snmp.h>
  64 #include <net/ip.h>
  65 #include <net/protocol.h>
  66 #include <net/route.h>
  67 #include <net/tcp.h>
  68 #include <net/udp.h>
  69 #include <linux/skbuff.h>
  70 #include <net/sock.h>
  71 #include <net/arp.h>
  72 #include <net/icmp.h>
  73 #include <net/raw.h>
  74 #include <net/checksum.h>
  75 #include <net/inetpeer.h>
  76 #include <linux/igmp.h>
  77 #include <linux/netfilter_ipv4.h>
  78 #include <linux/mroute.h>
  79 #include <linux/netlink.h>
  80
  81 /*
  82  *      Shall we try to damage output packets if routing dev changes?
  83  */
  84
  85 int sysctl_ip_dynaddr = 0;	/* knob for the question asked above; 0 by default. NOTE(review): no user visible in this part of the file */
  86 int sysctl_ip_default_ttl = IPDEFTTL;	/* starts out as IPDEFTTL. NOTE(review): presumably the system-wide default TTL -- confirm against users */
  87
  88 /* Generate a checksum for an outgoing IP datagram. */
  89 __inline__ void ip_send_check(struct iphdr *iph)
  90 {
  91         iph->check = 0;
  92         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  93 }
  94
  95 /* dev_loopback_xmit for use with netfilter. */
  96 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
  97 {
  98         newskb->mac.raw = newskb->data;
  99         __skb_pull(newskb, newskb->nh.raw - newskb->data);
 100         newskb->pkt_type = PACKET_LOOPBACK;
 101         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 102         BUG_TRAP(newskb->dst);
 103
 104 #ifdef CONFIG_NETFILTER_DEBUG
 105         nf_debug_ip_loopback_xmit(newskb);
 106 #endif
 107         netif_rx(newskb);
 108         return 0;
 109 }
 110
 111 /* Don't just hand NF_HOOK skb->dst->output, in case netfilter hook
 112    changes route */
 113 static inline int
 114 output_maybe_reroute(struct sk_buff *skb)
 115 {
 116         return skb->dst->output(skb);
 117 }
 118
 119 /*
 120  *              Add an ip header to a skbuff and send it out.
 121  */
 122 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 123                           u32 saddr, u32 daddr, struct ip_options *opt)
 124 {
 125         struct rtable *rt = (struct rtable *)skb->dst;
 126         struct iphdr *iph;
 127
 128         /* Build the IP header. */
 129         if (opt)
 130                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
 131         else
 132                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
 133
 134         iph->version  = 4;
 135         iph->ihl      = 5;
 136         iph->tos      = sk->protinfo.af_inet.tos;
 137         if (ip_dont_fragment(sk, &rt->u.dst))
 138                 iph->frag_off = htons(IP_DF);
 139         else
 140                 iph->frag_off = 0;
 141         iph->ttl      = sk->protinfo.af_inet.ttl;
 142         iph->daddr    = rt->rt_dst;
 143         iph->saddr    = rt->rt_src;
 144         iph->protocol = sk->protocol;
 145         iph->tot_len  = htons(skb->len);
 146         ip_select_ident(iph, &rt->u.dst, sk);
 147         skb->nh.iph   = iph;
 148
 149         if (opt && opt->optlen) {
 150                 iph->ihl += opt->optlen>>2;
 151                 ip_options_build(skb, opt, daddr, rt, 0);
 152         }
 153         ip_send_check(iph);
 154
 155         /* Send it out. */
 156         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 157                        output_maybe_reroute);
 158 }
 159
 160 static inline int ip_finish_output2(struct sk_buff *skb)
 161 {
 162         struct dst_entry *dst = skb->dst;
 163         struct hh_cache *hh = dst->hh;
 164
 165 #ifdef CONFIG_NETFILTER_DEBUG
 166         nf_debug_ip_finish_output2(skb);
 167 #endif /*CONFIG_NETFILTER_DEBUG*/
 168
 169         if (hh) {
 170                 int hh_alen;
 171
 172                 read_lock_bh(&hh->hh_lock);
 173                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
 174                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
 175                 read_unlock_bh(&hh->hh_lock);
 176                 skb_push(skb, hh->hh_len);
 177                 return hh->hh_output(skb);
 178         } else if (dst->neighbour)
 179                 return dst->neighbour->output(skb);
 180
 181         if (net_ratelimit())
 182                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 183         kfree_skb(skb);
 184         return -EINVAL;
 185 }
 186
 187 static __inline__ int __ip_finish_output(struct sk_buff *skb)
 188 {
 189         struct net_device *dev = skb->dst->dev;
 190
 191         skb->dev = dev;
 192         skb->protocol = htons(ETH_P_IP);
 193
 194         return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 195                        ip_finish_output2);
 196 }
 197
 198 int ip_finish_output(struct sk_buff *skb)
 199 {
 200         return __ip_finish_output(skb);
 201 }
 202
 203 int ip_mc_output(struct sk_buff *skb)
 204 {
 205         struct sock *sk = skb->sk;
 206         struct rtable *rt = (struct rtable*)skb->dst;
 207         struct net_device *dev = rt->u.dst.dev;
 208
 209         /*
 210          *      If the indicated interface is up and running, send the packet.
 211          */
 212         IP_INC_STATS(IpOutRequests);
 213 #ifdef CONFIG_IP_ROUTE_NAT
 214         if (rt->rt_flags & RTCF_NAT)
 215                 ip_do_nat(skb);
 216 #endif
 217
 218         skb->dev = dev;
 219         skb->protocol = htons(ETH_P_IP);
 220
 221         /*
 222          *      Multicasts are looped back for other local users
 223          */
 224
 225         if (rt->rt_flags&RTCF_MULTICAST) {
 226                 if ((!sk || sk->protinfo.af_inet.mc_loop)
 227 #ifdef CONFIG_IP_MROUTE
 228                 /* Small optimization: do not loopback not local frames,
 229                    which returned after forwarding; they will be  dropped
 230                    by ip_mr_input in any case.
 231                    Note, that local frames are looped back to be delivered
 232                    to local recipients.
 233
 234                    This check is duplicated in ip_mr_input at the moment.
 235                  */
 236                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 237 #endif
 238                 ) {
 239                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 240                         if (newskb)
 241                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 242                                         newskb->dev,
 243                                         ip_dev_loopback_xmit);
 244                 }
 245
 246                 /* Multicasts with ttl 0 must not go beyond the host */
 247
 248                 if (skb->nh.iph->ttl == 0) {
 249                         kfree_skb(skb);
 250                         return 0;
 251                 }
 252         }
 253
 254         if (rt->rt_flags&RTCF_BROADCAST) {
 255                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 256                 if (newskb)
 257                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 258                                 newskb->dev, ip_dev_loopback_xmit);
 259         }
 260
 261         return __ip_finish_output(skb);
 262 }
 263
 264 int ip_output(struct sk_buff *skb)
 265 {
 266 #ifdef CONFIG_IP_ROUTE_NAT
 267         struct rtable *rt = (struct rtable*)skb->dst;
 268 #endif
 269
 270         IP_INC_STATS(IpOutRequests);
 271
 272 #ifdef CONFIG_IP_ROUTE_NAT
 273         if (rt->rt_flags&RTCF_NAT)
 274                 ip_do_nat(skb);
 275 #endif
 276
 277         return __ip_finish_output(skb);
 278 }
 279
 280 /* Queues a packet to be sent, and starts the transmitter if necessary.
 281  * This routine also needs to put in the total length and compute the
 282  * checksum.  We use to do this in two stages, ip_build_header() then
 283  * this, but that scheme created a mess when routes disappeared etc.
 284  * So we do it all here, and the TCP send engine has been changed to
 285  * match. (No more unroutable FIN disasters, etc. wheee...)  This will
 286  * most likely make other reliable transport layers above IP easier
 287  * to implement under Linux.
 288  */
 289 static inline int ip_queue_xmit2(struct sk_buff *skb)
 290 {
 291         struct sock *sk = skb->sk;
 292         struct rtable *rt = (struct rtable *)skb->dst;
 293         struct net_device *dev;
 294         struct iphdr *iph = skb->nh.iph;
 295
 296         dev = rt->u.dst.dev;
 297
 298         /* This can happen when the transport layer has segments queued
 299          * with a cached route, and by the time we get here things are
 300          * re-routed to a device with a different MTU than the original
 301          * device.  Sick, but we must cover it.
 302          */
 303         if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
 304                 struct sk_buff *skb2;
 305
 306                 skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
 307                 kfree_skb(skb);
 308                 if (skb2 == NULL)
 309                         return -ENOMEM;
 310                 if (sk)
 311                         skb_set_owner_w(skb2, sk);
 312                 skb = skb2;
 313                 iph = skb->nh.iph;
 314         }
 315
 316         if (skb->len > rt->u.dst.pmtu)
 317                 goto fragment;
 318
 319         ip_select_ident(iph, &rt->u.dst, sk);
 320
 321         /* Add an IP checksum. */
 322         ip_send_check(iph);
 323
 324         skb->priority = sk->priority;
 325         return skb->dst->output(skb);
 326
 327 fragment:
 328         if (ip_dont_fragment(sk, &rt->u.dst)) {
 329                 /* Reject packet ONLY if TCP might fragment
 330                  * it itself, if were careful enough.
 331                  */
 332                 NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big (len[%u] pmtu[%u]) to self\n",
 333                                 skb->len, rt->u.dst.pmtu));
 334
 335                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 336                           htonl(rt->u.dst.pmtu));
 337                 kfree_skb(skb);
 338                 return -EMSGSIZE;
 339         }
 340         ip_select_ident(iph, &rt->u.dst, sk);
 341         if (skb->ip_summed == CHECKSUM_HW &&
 342             (skb = skb_checksum_help(skb)) == NULL)
 343                 return -ENOMEM;
 344         return ip_fragment(skb, skb->dst->output);
 345 }
 346
 347 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 348 {
 349         struct sock *sk = skb->sk;
 350         struct ip_options *opt = sk->protinfo.af_inet.opt;
 351         struct rtable *rt;
 352         struct iphdr *iph;
 353
 354         /* Skip all of this if the packet is already routed,
 355          * f.e. by something like SCTP.
 356          */
 357         rt = (struct rtable *) skb->dst;
 358         if (rt != NULL)
 359                 goto packet_routed;
 360
 361         /* Make sure we can route this packet. */
 362         rt = (struct rtable *)__sk_dst_check(sk, 0);
 363         if (rt == NULL) {
 364                 u32 daddr;
 365
 366                 /* Use correct destination address if we have options. */
 367                 daddr = sk->daddr;
 368                 if(opt && opt->srr)
 369                         daddr = opt->faddr;
 370
 371                 /* If this fails, retransmit mechanism of transport layer will
 372                  * keep trying until route appears or the connection times itself
 373                  * out.
 374                  */
 375                 if (ip_route_output(&rt, daddr, sk->saddr,
 376                                     RT_CONN_FLAGS(sk),
 377                                     sk->bound_dev_if))
 378                         goto no_route;
 379                 __sk_dst_set(sk, &rt->u.dst);
 380                 sk->route_caps = rt->u.dst.dev->features;
 381         }
 382         skb->dst = dst_clone(&rt->u.dst);
 383
 384 packet_routed:
 385         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 386                 goto no_route;
 387
 388         /* OK, we know where to send it, allocate and build IP header. */
 389         iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 390         *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (sk->protinfo.af_inet.tos & 0xff));
 391         iph->tot_len = htons(skb->len);
 392         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 393                 iph->frag_off = htons(IP_DF);
 394         else
 395                 iph->frag_off = 0;
 396         iph->ttl      = sk->protinfo.af_inet.ttl;
 397         iph->protocol = sk->protocol;
 398         iph->saddr    = rt->rt_src;
 399         iph->daddr    = rt->rt_dst;
 400         skb->nh.iph   = iph;
 401         /* Transport layer set skb->h.foo itself. */
 402
 403         if(opt && opt->optlen) {
 404                 iph->ihl += opt->optlen >> 2;
 405                 ip_options_build(skb, opt, sk->daddr, rt, 0);
 406         }
 407
 408         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 409                        ip_queue_xmit2);
 410
 411 no_route:
 412         IP_INC_STATS(IpOutNoRoutes);
 413         kfree_skb(skb);
 414         return -EHOSTUNREACH;
 415 }
 416
  417 /*
  418  *      Build and send a packet, with as little as one copy
  419  *
  420  *      Doesn't care much about ip options... option length can be
  421  *      different for fragment at 0 and other fragments.
  422  *
  423  *      Note that the fragment at the highest offset is sent first,
  424  *      so the getfrag routine can fill in the TCP/UDP checksum header
  425  *      field in the last fragment it sends... actually it also helps
  426  *      the reassemblers, they can put most packets in at the head of
  427  *      the fragment queue, and they know the total size in advance. This
  428  *      last feature will measurably improve the Linux fragment handler one
  429  *      day.
  430  *
  431  *      The callback has five args: an arbitrary pointer (copy of frag),
  432  *      the buffer to copy the data into (char *), the offset to copy
  433  *      from, the length to be copied, and the skb being filled in.
  434  *      A non-zero return from the callback is reported as -EFAULT.
       *
       *      'length' is passed in including sizeof(struct iphdr); the
       *      header size is subtracted again on entry.
       *
       *      Returns 0 on success (also when MSG_PROBE is set, in which case
       *      nothing is sent), -EMSGSIZE if the datagram exceeds 0xFFFF bytes
       *      or would need fragmenting while the socket is in IP_PMTUDISC_DO
       *      mode, otherwise the error from allocation, the callback or the
       *      output hook.
  435  */
  436
  437 static int ip_build_xmit_slow(struct sock *sk,
  438                   int getfrag (const void *,
  439                                char *,
  440                                unsigned int,
  441                                unsigned int,
  442                                struct sk_buff *),
  443                   const void *frag,
  444                   unsigned length,
  445                   struct ipcm_cookie *ipc,
  446                   struct rtable *rt,
  447                   int flags)
  448 {
  449         unsigned int fraglen, maxfraglen, fragheaderlen;
  450         int err;
  451         int offset, mf;
  452         int mtu;
  453         u16 id;
  454
              /* Link-level header room, rounded up to a multiple of 16. */
  455         int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
  456         int nfrags=0;
  457         struct ip_options *opt = ipc->opt;
  458         int df = 0;
  459
  460         mtu = rt->u.dst.pmtu;
  461         if (ip_dont_fragment(sk, &rt->u.dst))
  462                 df = htons(IP_DF);
  463
              /* From here on 'length' counts payload bytes only. */
  464         length -= sizeof(struct iphdr);
  465
              /*
               *      maxfraglen is the largest fragment (header included) whose
               *      payload part is a multiple of 8 bytes, since frag_off below
               *      is stored as offset>>3.
               */
  466         if (opt) {
  467                 fragheaderlen = sizeof(struct iphdr) + opt->optlen;
  468                 maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
  469         } else {
  470                 fragheaderlen = sizeof(struct iphdr);
  471
  472                 /*
  473                  *      Fragheaderlen is the size of 'overhead' on each buffer. Now work
  474                  *      out the size of the frames to send.
  475                  */
  476
  477                 maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
  478         }
  479
              /* Header plus payload must not exceed 0xFFFF bytes. */
  480         if (length + fragheaderlen > 0xFFFF) {
  481                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
  482                 return -EMSGSIZE;
  483         }
  484
  485         /*
  486          *      Start at the end of the frame by handling the remainder.
               *      'offset' becomes the payload offset of the final fragment.
  487          */
  488
  489         offset = length - (length % (maxfraglen - fragheaderlen));
  490
  491         /*
  492          *      Amount of memory to allocate for final fragment.
  493          */
  494
  495         fraglen = length - offset + fragheaderlen;
  496
              /*
               *      Payload is an exact multiple of the per-fragment payload:
               *      the tail would be empty, so the final fragment is a
               *      full-sized one instead.
               */
  497         if (length-offset==0) {
  498                 fraglen = maxfraglen;
  499                 offset -= maxfraglen-fragheaderlen;
  500         }
  501
  502         /*
  503          *      The last fragment will not have MF (more fragments) set.
  504          */
  505
  506         mf = 0;
  507
  508         /*
  509          *      Don't fragment packets for path mtu discovery.
               *      (offset > 0 means more than one fragment would be needed.)
  510          */
  511
  512         if (offset > 0 && sk->protinfo.af_inet.pmtudisc==IP_PMTUDISC_DO) {
  513                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
  514                 return -EMSGSIZE;
  515         }
              /* MSG_PROBE: report success without sending anything. */
  516         if (flags&MSG_PROBE)
  517                 goto out;
  518
  519         /*
  520          *      Begin outputting the bytes.
  521          */
  522
              /* Provisional ident from the per-socket counter; may be replaced
               * when the first fragment is built below. */
  523         id = sk->protinfo.af_inet.id++;
  524
  525         do {
  526                 char *data;
  527                 struct sk_buff * skb;
  528
  529                 /*
  530                  *      Get the memory we require with some space left for alignment.
  531                  */
  532                 if (!(flags & MSG_DONTWAIT) || nfrags == 0) {
  533                         skb = sock_alloc_send_skb(sk, fraglen + hh_len + 15,
  534                                                   (flags & MSG_DONTWAIT), &err);
  535                 } else {
  536                         /* On a non-blocking write, we check for send buffer
  537                          * usage on the first fragment only.
  538                          */
  539                         skb = sock_wmalloc(sk, fraglen + hh_len + 15, 1,
  540                                            sk->allocation);
  541                         if (!skb)
  542                                 err = -ENOBUFS;
  543                 }
  544                 if (skb == NULL)
  545                         goto error;
  546
  547                 /*
  548                  *      Fill in the control structures
  549                  */
  550
  551                 skb->priority = sk->priority;
  552                 skb->dst = dst_clone(&rt->u.dst);
  553                 skb_reserve(skb, hh_len);
  554
  555                 /*
  556                  *      Find where to start putting bytes.
  557                  */
  558
  559                 data = skb_put(skb, fraglen);
  560                 skb->nh.iph = (struct iphdr *)data;
  561
  562                 /*
  563                  *      Write the IP header. It is built unconditionally here;
                       *      the visible caller, ip_build_xmit(), only takes this
                       *      path for sockets without hdrincl.
  564                  */
  565
  566                 {
  567                         struct iphdr *iph = (struct iphdr *)data;
  568
  569                         iph->version = 4;
  570                         iph->ihl = 5;
  571                         if (opt) {
  572                                 iph->ihl += opt->optlen>>2;
  573                                 ip_options_build(skb, opt,
  574                                                  ipc->addr, rt, offset);
  575                         }
  576                         iph->tos = sk->protinfo.af_inet.tos;
  577                         iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
  578                         iph->frag_off = htons(offset>>3)|mf|df;
  579                         iph->id = id;
                              /* mf is still 0 only on the first pass, i.e. for
                               * the fragment at the highest offset. */
  580                         if (!mf) {
  581                                 if (offset || !df) {
  582                                         /* Select an unpredictable ident only
  583                                          * for packets without DF or having
  584                                          * been fragmented.
  585                                          */
  586                                         __ip_select_ident(iph, &rt->u.dst);
                                              /* Remaining fragments reuse this ident. */
  587                                         id = iph->id;
  588                                 }
  589
  590                                 /*
  591                                  *      Any further fragments will have MF set.
  592                                  */
  593                                 mf = htons(IP_MF);
  594                         }
  595                         if (rt->rt_type == RTN_MULTICAST)
  596                                 iph->ttl = sk->protinfo.af_inet.mc_ttl;
  597                         else
  598                                 iph->ttl = sk->protinfo.af_inet.ttl;
  599                         iph->protocol = sk->protocol;
  600                         iph->check = 0;
  601                         iph->saddr = rt->rt_src;
  602                         iph->daddr = rt->rt_dst;
  603                         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
                              /* Step past the header to where the payload goes. */
  604                         data += iph->ihl*4;
  605                 }
  606
  607                 /*
  608                  *      User data callback
  609                  */
  610
  611                 if (getfrag(frag, data, offset, fraglen-fragheaderlen, skb)) {
  612                         err = -EFAULT;
  613                         kfree_skb(skb);
  614                         goto error;
  615                 }
  616
                      /* Move back one fragment; all remaining ones are full-sized. */
  617                 offset -= (maxfraglen-fragheaderlen);
  618                 fraglen = maxfraglen;
  619
  620                 nfrags++;
  621
  622                 err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
  623                               skb->dst->dev, output_maybe_reroute);
  624                 if (err) {
                              /* A positive result is only turned into an error
                               * for sockets with recverr set; otherwise it is
                               * treated as success. */
  625                         if (err > 0)
  626                                 err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
  627                         if (err)
  628                                 goto error;
  629                 }
  630         } while (offset >= 0);
  631
              /* Fragments are only counted when more than one was produced. */
  632         if (nfrags>1)
  633                 ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
  634 out:
  635         return 0;
  636
  637 error:
  638         IP_INC_STATS(IpOutDiscards);
  639         if (nfrags>1)
  640                 ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
  641         return err;
  642 }
 643
 644 /*
 645  *      Fast path for unfragmented packets.
 646  */
 647 int ip_build_xmit(struct sock *sk,
 648                   int getfrag (const void *,
 649                                char *,
 650                                unsigned int,
 651                                unsigned int,
 652                                struct sk_buff *),
 653                   const void *frag,
 654                   unsigned length,
 655                   struct ipcm_cookie *ipc,
 656                   struct rtable *rt,
 657                   int flags)
 658 {
 659         int err;
 660         struct sk_buff *skb;
 661         int df;
 662         struct iphdr *iph;
 663
 664         /*
 665          *      Try the simple case first. This leaves fragmented frames, and by
 666          *      choice RAW frames within 20 bytes of maximum size(rare) to the long path
 667          */
 668
 669         if (!sk->protinfo.af_inet.hdrincl) {
 670                 length += sizeof(struct iphdr);
 671
 672                 /*
 673                  *      Check for slow path.
 674                  */
 675                 if (length > rt->u.dst.pmtu || ipc->opt != NULL)
 676                         return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);
 677         } else {
 678                 if (length > rt->u.dst.dev->mtu) {
 679                         ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
 680                         return -EMSGSIZE;
 681                 }
 682         }
 683         if (flags&MSG_PROBE)
 684                 goto out;
 685
 686         /*
 687          *      Do path mtu discovery if needed.
 688          */
 689         df = 0;
 690         if (ip_dont_fragment(sk, &rt->u.dst))
 691                 df = htons(IP_DF);
 692
 693         /*
 694          *      Fast path for unfragmented frames without options.
 695          */
 696         {
 697         int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
 698
 699         skb = sock_alloc_send_skb(sk, length+hh_len+15,
 700                                   flags&MSG_DONTWAIT, &err);
 701         if(skb==NULL)
 702                 goto error;
 703         skb_reserve(skb, hh_len);
 704         }
 705
 706         skb->priority = sk->priority;
 707         skb->dst = dst_clone(&rt->u.dst);
 708
 709         skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
 710
 711         if(!sk->protinfo.af_inet.hdrincl) {
 712                 iph->version=4;
 713                 iph->ihl=5;
 714                 iph->tos=sk->protinfo.af_inet.tos;
 715                 iph->tot_len = htons(length);
 716                 iph->frag_off = df;
 717                 iph->ttl=sk->protinfo.af_inet.mc_ttl;
 718                 ip_select_ident(iph, &rt->u.dst, sk);
 719                 if (rt->rt_type != RTN_MULTICAST)
 720                         iph->ttl=sk->protinfo.af_inet.ttl;
 721                 iph->protocol=sk->protocol;
 722                 iph->saddr=rt->rt_src;
 723                 iph->daddr=rt->rt_dst;
 724                 iph->check=0;
 725                 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
 726                 err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4, skb);
 727         }
 728         else
 729                 err = getfrag(frag, (void *)iph, 0, length, skb);
 730
 731         if (err)
 732                 goto error_fault;
 733
 734         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 735                       output_maybe_reroute);
 736         if (err > 0)
 737                 err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
 738         if (err)
 739                 goto error;
 740 out:
 741         return 0;
 742
 743 error_fault:
 744         err = -EFAULT;
 745         kfree_skb(skb);
 746 error:
 747         IP_INC_STATS(IpOutDiscards);
 748         return err;
 749 }
 750
 751 /*
 752  *      This IP datagram is too large to be sent in one piece.  Break it up into
 753  *      smaller pieces (each of size equal to IP header plus
 754  *      a block of the data of the original IP data part) that will yet fit in a
 755  *      single device frame, and queue such a frame for sending.
 756  *
 757  *      Yes this is inefficient, feel free to submit a quicker one.
 758  */
 759
 760 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 761 {
 762         struct iphdr *iph;
 763         int raw = 0;
 764         int ptr;
 765         struct net_device *dev;
 766         struct sk_buff *skb2;
 767         unsigned int mtu, hlen, left, len;
 768         int offset;
 769         int not_last_frag;
 770         struct rtable *rt = (struct rtable*)skb->dst;
 771         int err = 0;
 772
 773         dev = rt->u.dst.dev;
 774
 775         /*
 776          *      Point into the IP datagram header.
 777          */
 778
 779         iph = skb->nh.iph;
 780
 781         /*
 782          *      Setup starting values.
 783          */
 784
 785         hlen = iph->ihl * 4;
 786         left = skb->len - hlen;         /* Space per frame */
 787         mtu = rt->u.dst.pmtu - hlen;    /* Size of data space */
 788         ptr = raw + hlen;               /* Where to start from */
 789
 790         /*
 791          *      Fragment the datagram.
 792          */
 793
 794         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 795         not_last_frag = iph->frag_off & htons(IP_MF);
 796
 797         /*
 798          *      Keep copying data until we run out.
 799          */
 800
 801         while(left > 0) {
 802                 len = left;
 803                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 804                 if (len > mtu)
 805                         len = mtu;
 806                 /* IF: we are not sending upto and including the packet end
 807                    then align the next start on an eight byte boundary */
 808                 if (len < left) {
 809                         len &= ~7;
 810                 }
 811                 /*
 812                  *      Allocate buffer.
 813                  */
 814
 815                 if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) {
 816                         NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
 817                         err = -ENOMEM;
 818                         goto fail;
 819                 }
 820
 821                 /*
 822                  *      Set up data on packet
 823                  */
 824
 825                 skb2->pkt_type = skb->pkt_type;
 826                 skb2->priority = skb->priority;
 827                 skb_reserve(skb2, (dev->hard_header_len+15)&~15);
 828                 skb_put(skb2, len + hlen);
 829                 skb2->nh.raw = skb2->data;
 830                 skb2->h.raw = skb2->data + hlen;
 831                 skb2->protocol = skb->protocol;
 832                 skb2->security = skb->security;
 833
 834                 /*
 835                  *      Charge the memory for the fragment to any owner
 836                  *      it might possess
 837                  */
 838
 839                 if (skb->sk)
 840                         skb_set_owner_w(skb2, skb->sk);
 841                 skb2->dst = dst_clone(skb->dst);
 842                 skb2->dev = skb->dev;
 843
 844                 /*
 845                  *      Copy the packet header into the new buffer.
 846                  */
 847
 848                 memcpy(skb2->nh.raw, skb->data, hlen);
 849
 850                 /*
 851                  *      Copy a block of the IP datagram.
 852                  */
 853                 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
 854                         BUG();
 855                 left -= len;
 856
 857                 /*
 858                  *      Fill in the new header fields.
 859                  */
 860                 iph = skb2->nh.iph;
 861                 iph->frag_off = htons((offset >> 3));
 862
 863                 /* ANK: dirty, but effective trick. Upgrade options only if
 864                  * the segment to be fragmented was THE FIRST (otherwise,
 865                  * options are already fixed) and make it ONCE
 866                  * on the initial skb, so that all the following fragments
 867                  * will inherit fixed options.
 868                  */
 869                 if (offset == 0)
 870                         ip_options_fragment(skb);
 871
 872                 /* Copy the flags to each fragment. */
 873                 IPCB(skb2)->flags = IPCB(skb)->flags;
 874
 875                 /*
 876                  *      Added AC : If we are fragmenting a fragment that's not the
 877                  *                 last fragment then keep MF on each bit
 878                  */
 879                 if (left > 0 || not_last_frag)
 880                         iph->frag_off |= htons(IP_MF);
 881                 ptr += len;
 882                 offset += len;
 883
 884 #ifdef CONFIG_NET_SCHED
 885                 skb2->tc_index = skb->tc_index;
 886 #endif
 887 #ifdef CONFIG_NETFILTER
 888                 skb2->nfmark = skb->nfmark;
 889                 skb2->nfcache = skb->nfcache;
 890                 /* Connection association is same as pre-frag packet */
 891                 skb2->nfct = skb->nfct;
 892                 nf_conntrack_get(skb2->nfct);
 893 #ifdef CONFIG_NETFILTER_DEBUG
 894                 skb2->nf_debug = skb->nf_debug;
 895 #endif
 896 #endif
 897
 898                 /*
 899                  *      Put this fragment into the sending queue.
 900                  */
 901
 902                 IP_INC_STATS(IpFragCreates);
 903
 904                 iph->tot_len = htons(len + hlen);
 905
 906                 ip_send_check(iph);
 907
 908                 err = output(skb2);
 909                 if (err)
 910                         goto fail;
 911         }
 912         kfree_skb(skb);
 913         IP_INC_STATS(IpFragOKs);
 914         return err;
 915
 916 fail:
 917         kfree_skb(skb);
 918         IP_INC_STATS(IpFragFails);
 919         return err;
 920 }
 921
 922 /*
 923  *      Fetch data from kernel space and fill in checksum if needed.
 924  */
 925 static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
 926                               unsigned int fraglen, struct sk_buff *skb)
 927 {
 928         struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
 929         u16 *pktp = (u16 *)to;
 930         struct iovec *iov;
 931         int len;
 932         int hdrflag = 1;
 933
 934         iov = &dp->iov[0];
 935         if (offset >= iov->iov_len) {
 936                 offset -= iov->iov_len;
 937                 iov++;
 938                 hdrflag = 0;
 939         }
 940         len = iov->iov_len - offset;
 941         if (fraglen > len) { /* overlapping. */
 942                 dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
 943                                              dp->csum);
 944                 offset = 0;
 945                 fraglen -= len;
 946                 to += len;
 947                 iov++;
 948         }
 949
 950         dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen,
 951                                              dp->csum);
 952
 953         if (hdrflag && dp->csumoffset)
 954                 *(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */
 955         return 0;
 956 }
 957
 958 /*
 959  *      Generic function to send a packet as reply to another packet.
 960  *      Used to send TCP resets so far. ICMP should use this function too.
 961  *
 962  *      Should run single threaded per socket because it uses the sock
 963  *      structure to pass arguments.
 964  */
 965 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
 966                    unsigned int len)
 967 {
 968         struct {
 969                 struct ip_options       opt;
 970                 char                    data[40];
 971         } replyopts;
 972         struct ipcm_cookie ipc;
 973         u32 daddr;
 974         struct rtable *rt = (struct rtable*)skb->dst;
 975
 976         if (ip_options_echo(&replyopts.opt, skb))
 977                 return;
 978
 979         daddr = ipc.addr = rt->rt_src;
 980         ipc.opt = NULL;
 981
 982         if (replyopts.opt.optlen) {
 983                 ipc.opt = &replyopts.opt;
 984
 985                 if (ipc.opt->srr)
 986                         daddr = replyopts.opt.faddr;
 987         }
 988
 989         if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
 990                 return;
 991
 992         /* And let IP do all the hard work.
 993
 994            This chunk is not reenterable, hence spinlock.
 995            Note that it uses the fact, that this function is called
 996            with locally disabled BH and that sk cannot be already spinlocked.
 997          */
 998         bh_lock_sock(sk);
 999         sk->protinfo.af_inet.tos = skb->nh.iph->tos;
1000         sk->priority = skb->priority;
1001         sk->protocol = skb->nh.iph->protocol;
1002         ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
1003         bh_unlock_sock(sk);
1004
1005         ip_rt_put(rt);
1006 }
1007
1008 /*
1009  *      IP protocol layer initialiser
1010  */
1011
1012 static struct packet_type ip_packet_type =
1013 {
             /*
              * Packet type descriptor for IP; ip_init() hands it to
              * dev_add_pack().  The members are initialised positionally,
              * so their order must follow the declaration of
              * struct packet_type (not visible in this file).
              */
1014         __constant_htons(ETH_P_IP),     /* ETH_P_IP passed through __constant_htons() */
1015         NULL,   /* All devices */
1016         ip_rcv, /* presumably the receive callback for this type -- verify against struct packet_type */
1017         (void*)1,       /* non-NULL marker value; its meaning is not visible here -- TODO confirm */
1018         NULL,   /* NOTE(review): looks like a link filled in on registration -- confirm */
1019 };
1020
1021 /*
1022  *      IP registers the packet type and then calls the subprotocol initialisers
1023  */
1024
1025 void __init ip_init(void)
1026 {
             /*
              * Takes no arguments and returns nothing.  The steps run in a
              * fixed order: register ip_packet_type first, then call
              * ip_rt_init() and inet_initpeers(), and finally create the
              * proc entries.
              */
1027         dev_add_pack(&ip_packet_type);
1028
1029         ip_rt_init();
1030         inet_initpeers();
1031
             /* The "igmp" proc entry exists only when CONFIG_IP_MULTICAST is set. */
1032 #ifdef CONFIG_IP_MULTICAST
1033         proc_net_create("igmp", 0, ip_mc_procinfo);
1034 #endif
             /*
              * NOTE(review): "mcfilter" is created outside the
              * CONFIG_IP_MULTICAST guard -- confirm that ip_mcf_procinfo is
              * built in configurations without multicast support.
              */
1035         proc_net_create("mcfilter", 0, ip_mcf_procinfo);
1036 }