net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.1.1.1 2005/04/11 02:51:13 jack Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *
  19  *      See ip_input.c for original log
  20  *
  21  *      Fixes:
  22  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  23  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  24  *              Bradford Johnson:       Fix faulty handling of some frames when
  25  *                                      no route is found.
  26  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  27  *                                      (in case if packet not accepted by
  28  *                                      output firewall rules)
  29  *              Mike McLagan    :       Routing by source
  30  *              Alexey Kuznetsov:       use new route cache
  31  *              Andi Kleen:             Fix broken PMTU recovery and remove
  32  *                                      some redundant tests.
  33  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  34  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  35  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  36  *                                      for decreased register pressure on x86
  37  *                                      and more readability.
  38  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  39  *                                      silently drop skb instead of failing with -EPERM.
  40  *              Detlev Wengorz  :       Copy protocol for fragments.
  41  */
  42
  43 #include <asm/uaccess.h>
  44 #include <asm/system.h>
  45 #include <linux/types.h>
  46 #include <linux/kernel.h>
  47 #include <linux/sched.h>
  48 #include <linux/mm.h>
  49 #include <linux/string.h>
  50 #include <linux/errno.h>
  51 #include <linux/config.h>
  52
  53 #include <linux/socket.h>
  54 #include <linux/sockios.h>
  55 #include <linux/in.h>
  56 #include <linux/inet.h>
  57 #include <linux/netdevice.h>
  58 #include <linux/etherdevice.h>
  59 #include <linux/proc_fs.h>
  60 #include <linux/stat.h>
  61 #include <linux/init.h>
  62
  63 #include <net/snmp.h>
  64 #include <net/ip.h>
  65 #include <net/protocol.h>
  66 #include <net/route.h>
  67 #include <net/tcp.h>
  68 #include <net/udp.h>
  69 #include <linux/skbuff.h>
  70 #include <net/sock.h>
  71 #include <net/arp.h>
  72 #include <net/icmp.h>
  73 #include <net/raw.h>
  74 #include <net/checksum.h>
  75 #include <net/inetpeer.h>
  76 #include <linux/igmp.h>
  77 #include <linux/netfilter_ipv4.h>
  78 #include <linux/mroute.h>
  79 #include <linux/netlink.h>
  80
  81 /*
  82  *      Shall we try to damage output packets if routing dev changes?
  83  */
  84
  85 int sysctl_ip_dynaddr = 0;              /* initialised to 0; NOTE(review): not referenced in the visible part of this file */
  86 int sysctl_ip_default_ttl = IPDEFTTL;   /* initialised from IPDEFTTL; presumably the system-wide default TTL - confirm against users */
  87
  88 /* Generate a checksum for an outgoing IP datagram. */
  89 __inline__ void ip_send_check(struct iphdr *iph)
  90 {
  91         iph->check = 0;
  92         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  93 }
  94
  95 /* dev_loopback_xmit for use with netfilter. */
  96 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
  97 {
  98         newskb->mac.raw = newskb->data;
  99         __skb_pull(newskb, newskb->nh.raw - newskb->data);
 100         newskb->pkt_type = PACKET_LOOPBACK;
 101         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 102         BUG_TRAP(newskb->dst);
 103
 104 #ifdef CONFIG_NETFILTER_DEBUG
 105         nf_debug_ip_loopback_xmit(newskb);
 106 #endif
 107         netif_rx(newskb);
 108         return 0;
 109 }
 110
 111 /* Don't just hand NF_HOOK skb->dst->output, in case netfilter hook
 112    changes route */
 113 static inline int
 114 output_maybe_reroute(struct sk_buff *skb)
 115 {
 116         return skb->dst->output(skb);
 117 }
 118
 119 /*
 120  *              Add an ip header to a skbuff and send it out.
 121  */
 122 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 123                           u32 saddr, u32 daddr, struct ip_options *opt)
 124 {
 125         struct rtable *rt = (struct rtable *)skb->dst;
 126         struct iphdr *iph;
 127
 128         /* Build the IP header. */
 129         if (opt)
 130                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
 131         else
 132                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
 133
 134         iph->version  = 4;
 135         iph->ihl      = 5;
 136         iph->tos      = sk->protinfo.af_inet.tos;
 137         if (ip_dont_fragment(sk, &rt->u.dst))
 138                 iph->frag_off = htons(IP_DF);
 139         else
 140                 iph->frag_off = 0;
 141         iph->ttl      = sk->protinfo.af_inet.ttl;
 142         iph->daddr    = rt->rt_dst;
 143         iph->saddr    = rt->rt_src;
 144         iph->protocol = sk->protocol;
 145         iph->tot_len  = htons(skb->len);
 146         ip_select_ident(iph, &rt->u.dst, sk);
 147         skb->nh.iph   = iph;
 148
 149         if (opt && opt->optlen) {
 150                 iph->ihl += opt->optlen>>2;
 151                 ip_options_build(skb, opt, daddr, rt, 0);
 152         }
 153         ip_send_check(iph);
 154
 155         /* Send it out. */
 156         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 157                        output_maybe_reroute);
 158 }
 159
 160 static inline int ip_finish_output2(struct sk_buff *skb)
 161 {
 162         struct dst_entry *dst = skb->dst;
 163         struct hh_cache *hh = dst->hh;
 164
 165 #ifdef CONFIG_NETFILTER_DEBUG
 166         nf_debug_ip_finish_output2(skb);
 167 #endif /*CONFIG_NETFILTER_DEBUG*/
 168
 169         if (hh) {
 170                 read_lock_bh(&hh->hh_lock);
 171                 memcpy(skb->data - 16, hh->hh_data, 16);
 172                 read_unlock_bh(&hh->hh_lock);
 173                 skb_push(skb, hh->hh_len);
 174                 return hh->hh_output(skb);
 175         } else if (dst->neighbour)
 176                 return dst->neighbour->output(skb);
 177
 178         if (net_ratelimit())
 179                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 180         kfree_skb(skb);
 181         return -EINVAL;
 182 }
 183
 184 __inline__ int ip_finish_output(struct sk_buff *skb)
 185 {
 186         struct net_device *dev = skb->dst->dev;
 187
 188         skb->dev = dev;
 189         skb->protocol = htons(ETH_P_IP);
 190
 191         return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 192                        ip_finish_output2);
 193 }
 194
 195 int ip_mc_output(struct sk_buff *skb)
 196 {
 197         struct sock *sk = skb->sk;
 198         struct rtable *rt = (struct rtable*)skb->dst;
 199         struct net_device *dev = rt->u.dst.dev;
 200
 201         /*
 202          *      If the indicated interface is up and running, send the packet.
 203          */
 204         IP_INC_STATS(IpOutRequests);
 205 #ifdef CONFIG_IP_ROUTE_NAT
 206         if (rt->rt_flags & RTCF_NAT)
 207                 ip_do_nat(skb);
 208 #endif
 209
 210         skb->dev = dev;
 211         skb->protocol = htons(ETH_P_IP);
 212
 213         /*
 214          *      Multicasts are looped back for other local users
 215          */
 216
 217         if (rt->rt_flags&RTCF_MULTICAST) {
 218                 if ((!sk || sk->protinfo.af_inet.mc_loop)
 219 #ifdef CONFIG_IP_MROUTE
 220                 /* Small optimization: do not loopback not local frames,
 221                    which returned after forwarding; they will be  dropped
 222                    by ip_mr_input in any case.
 223                    Note, that local frames are looped back to be delivered
 224                    to local recipients.
 225
 226                    This check is duplicated in ip_mr_input at the moment.
 227                  */
 228                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 229 #endif
 230                 ) {
 231                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 232                         if (newskb)
 233                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 234                                         newskb->dev,
 235                                         ip_dev_loopback_xmit);
 236                 }
 237
 238                 /* Multicasts with ttl 0 must not go beyond the host */
 239
 240                 if (skb->nh.iph->ttl == 0) {
 241                         kfree_skb(skb);
 242                         return 0;
 243                 }
 244         }
 245
 246         if (rt->rt_flags&RTCF_BROADCAST) {
 247                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 248                 if (newskb)
 249                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 250                                 newskb->dev, ip_dev_loopback_xmit);
 251         }
 252
 253         return ip_finish_output(skb);
 254 }
 255
 256 int ip_output(struct sk_buff *skb)
 257 {
 258 #ifdef CONFIG_IP_ROUTE_NAT
 259         struct rtable *rt = (struct rtable*)skb->dst;
 260 #endif
 261
 262         IP_INC_STATS(IpOutRequests);
 263
 264 #ifdef CONFIG_IP_ROUTE_NAT
 265         if (rt->rt_flags&RTCF_NAT)
 266                 ip_do_nat(skb);
 267 #endif
 268
 269         return ip_finish_output(skb);
 270 }
 271
 272 /* Queues a packet to be sent, and starts the transmitter if necessary.
 273  * This routine also needs to put in the total length and compute the
 274  * checksum.  We use to do this in two stages, ip_build_header() then
 275  * this, but that scheme created a mess when routes disappeared etc.
 276  * So we do it all here, and the TCP send engine has been changed to
 277  * match. (No more unroutable FIN disasters, etc. wheee...)  This will
 278  * most likely make other reliable transport layers above IP easier
 279  * to implement under Linux.
 280  */
 281 static inline int ip_queue_xmit2(struct sk_buff *skb)
 282 {
 283         struct sock *sk = skb->sk;
 284         struct rtable *rt = (struct rtable *)skb->dst;
 285         struct net_device *dev;
 286         struct iphdr *iph = skb->nh.iph;
 287
 288         dev = rt->u.dst.dev;
 289
 290         /* This can happen when the transport layer has segments queued
 291          * with a cached route, and by the time we get here things are
 292          * re-routed to a device with a different MTU than the original
 293          * device.  Sick, but we must cover it.
 294          */
 295         if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
 296                 struct sk_buff *skb2;
 297
 298                 skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
 299                 kfree_skb(skb);
 300                 if (skb2 == NULL)
 301                         return -ENOMEM;
 302                 if (sk)
 303                         skb_set_owner_w(skb2, sk);
 304                 skb = skb2;
 305                 iph = skb->nh.iph;
 306         }
 307
 308         if (skb->len > rt->u.dst.pmtu)
 309                 goto fragment;
 310
 311         ip_select_ident(iph, &rt->u.dst, sk);
 312
 313         /* Add an IP checksum. */
 314         ip_send_check(iph);
 315
 316         skb->priority = sk->priority;
 317         return skb->dst->output(skb);
 318
 319 fragment:
 320         if (ip_dont_fragment(sk, &rt->u.dst)) {
 321                 /* Reject packet ONLY if TCP might fragment
 322                  * it itself, if were careful enough.
 323                  */
 324                 NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big (len[%u] pmtu[%u]) to self\n",
 325                                 skb->len, rt->u.dst.pmtu));
 326
 327                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 328                           htonl(rt->u.dst.pmtu));
 329                 kfree_skb(skb);
 330                 return -EMSGSIZE;
 331         }
 332         ip_select_ident(iph, &rt->u.dst, sk);
 333         if (skb->ip_summed == CHECKSUM_HW &&
 334             (skb = skb_checksum_help(skb)) == NULL)
 335                 return -ENOMEM;
 336         return ip_fragment(skb, skb->dst->output);
 337 }
 338
 339 int ip_queue_xmit(struct sk_buff *skb)
 340 {
 341         struct sock *sk = skb->sk;
 342         struct ip_options *opt = sk->protinfo.af_inet.opt;
 343         struct rtable *rt;
 344         struct iphdr *iph;
 345
 346         /* Skip all of this if the packet is already routed,
 347          * f.e. by something like SCTP.
 348          */
 349         rt = (struct rtable *) skb->dst;
 350         if (rt != NULL)
 351                 goto packet_routed;
 352
 353         /* Make sure we can route this packet. */
 354         rt = (struct rtable *)__sk_dst_check(sk, 0);
 355         if (rt == NULL) {
 356                 u32 daddr;
 357
 358                 /* Use correct destination address if we have options. */
 359                 daddr = sk->daddr;
 360                 if(opt && opt->srr)
 361                         daddr = opt->faddr;
 362
 363                 /* If this fails, retransmit mechanism of transport layer will
 364                  * keep trying until route appears or the connection times itself
 365                  * out.
 366                  */
 367                 if (ip_route_output(&rt, daddr, sk->saddr,
 368                                     RT_CONN_FLAGS(sk),
 369                                     sk->bound_dev_if))
 370                         goto no_route;
 371                 __sk_dst_set(sk, &rt->u.dst);
 372                 sk->route_caps = rt->u.dst.dev->features;
 373         }
 374         skb->dst = dst_clone(&rt->u.dst);
 375
 376 packet_routed:
 377         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 378                 goto no_route;
 379
 380         /* OK, we know where to send it, allocate and build IP header. */
 381         iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 382         *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (sk->protinfo.af_inet.tos & 0xff));
 383         iph->tot_len = htons(skb->len);
 384         if (ip_dont_fragment(sk, &rt->u.dst))
 385                 iph->frag_off = htons(IP_DF);
 386         else
 387                 iph->frag_off = 0;
 388         iph->ttl      = sk->protinfo.af_inet.ttl;
 389         iph->protocol = sk->protocol;
 390         iph->saddr    = rt->rt_src;
 391         iph->daddr    = rt->rt_dst;
 392         skb->nh.iph   = iph;
 393         /* Transport layer set skb->h.foo itself. */
 394
 395         if(opt && opt->optlen) {
 396                 iph->ihl += opt->optlen >> 2;
 397                 ip_options_build(skb, opt, sk->daddr, rt, 0);
 398         }
 399
 400         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 401                        ip_queue_xmit2);
 402
 403 no_route:
 404         IP_INC_STATS(IpOutNoRoutes);
 405         kfree_skb(skb);
 406         return -EHOSTUNREACH;
 407 }
 408
 409 /*
 410  *      Build and send a packet, with as little as one copy
 411  *
 412  *      Doesn't care much about ip options... option length can be
 413  *      different for fragment at 0 and other fragments.
 414  *
 415  *      Note that the fragment at the highest offset is sent first,
 416  *      so the getfrag routine can fill in the TCP/UDP checksum header
 417  *      field in the last fragment it sends... actually it also helps
 418  *      the reassemblers, they can put most packets in at the head of
 419  *      the fragment queue, and they know the total size in advance. This
 420  *      last feature will measurably improve the Linux fragment handler one
 421  *      day.
 422  *
 423  *      The callback has five args, an arbitrary pointer (copy of frag),
 424  *      the source IP address (may depend on the routing table), the
 425  *      destination address (char *), the offset to copy from, and the
 426  *      length to be copied.
 427  */
 428
 429 static int ip_build_xmit_slow(struct sock *sk,
 430                   int getfrag (const void *,
 431                                char *,
 432                                unsigned int,
 433                                unsigned int),
 434                   const void *frag,
 435                   unsigned length,
 436                   struct ipcm_cookie *ipc,
 437                   struct rtable *rt,
 438                   int flags)
 439 {
 440         unsigned int fraglen, maxfraglen, fragheaderlen;
 441         int err;
 442         int offset, mf;
 443         int mtu;
 444         u16 id;
 445
 446         int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
 447         int nfrags=0;
 448         struct ip_options *opt = ipc->opt;
 449         int df = 0;
 450
 451         mtu = rt->u.dst.pmtu;
 452         if (ip_dont_fragment(sk, &rt->u.dst))
 453                 df = htons(IP_DF);
 454
 455         length -= sizeof(struct iphdr);
 456
 457         if (opt) {
 458                 fragheaderlen = sizeof(struct iphdr) + opt->optlen;
 459                 maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
 460         } else {
 461                 fragheaderlen = sizeof(struct iphdr);
 462
 463                 /*
 464                  *      Fragheaderlen is the size of 'overhead' on each buffer. Now work
 465                  *      out the size of the frames to send.
 466                  */
 467
 468                 maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
 469         }
 470
 471         if (length + fragheaderlen > 0xFFFF) {
 472                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
 473                 return -EMSGSIZE;
 474         }
 475
 476         /*
 477          *      Start at the end of the frame by handling the remainder.
 478          */
 479
 480         offset = length - (length % (maxfraglen - fragheaderlen));
 481
 482         /*
 483          *      Amount of memory to allocate for final fragment.
 484          */
 485
 486         fraglen = length - offset + fragheaderlen;
 487
 488         if (length-offset==0) {
 489                 fraglen = maxfraglen;
 490                 offset -= maxfraglen-fragheaderlen;
 491         }
 492
 493         /*
 494          *      The last fragment will not have MF (more fragments) set.
 495          */
 496
 497         mf = 0;
 498
 499         /*
 500          *      Don't fragment packets for path mtu discovery.
 501          */
 502
 503         if (offset > 0 && sk->protinfo.af_inet.pmtudisc==IP_PMTUDISC_DO) {
 504                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
 505                 return -EMSGSIZE;
 506         }
 507         if (flags&MSG_PROBE)
 508                 goto out;
 509
 510         /*
 511          *      Begin outputting the bytes.
 512          */
 513
 514         id = sk->protinfo.af_inet.id++;
 515
 516         do {
 517                 char *data;
 518                 struct sk_buff * skb;
 519
 520                 /*
 521                  *      Get the memory we require with some space left for alignment.
 522                  */
 523                 if (!(flags & MSG_DONTWAIT) || nfrags == 0) {
 524                         skb = sock_alloc_send_skb(sk, fraglen + hh_len + 15,
 525                                                   (flags & MSG_DONTWAIT), &err);
 526                 } else {
 527                         /* On a non-blocking write, we check for send buffer
 528                          * usage on the first fragment only.
 529                          */
 530                         skb = sock_wmalloc(sk, fraglen + hh_len + 15, 1,
 531                                            sk->allocation);
 532                         if (!skb)
 533                                 err = -ENOBUFS;
 534                 }
 535                 if (skb == NULL)
 536                         goto error;
 537
 538                 /*
 539                  *      Fill in the control structures
 540                  */
 541
 542                 skb->priority = sk->priority;
 543                 skb->dst = dst_clone(&rt->u.dst);
 544                 skb_reserve(skb, hh_len);
 545
 546                 /*
 547                  *      Find where to start putting bytes.
 548                  */
 549
 550                 data = skb_put(skb, fraglen);
 551                 skb->nh.iph = (struct iphdr *)data;
 552
 553                 /*
 554                  *      Only write IP header onto non-raw packets
 555                  */
 556
 557                 {
 558                         struct iphdr *iph = (struct iphdr *)data;
 559
 560                         iph->version = 4;
 561                         iph->ihl = 5;
 562                         if (opt) {
 563                                 iph->ihl += opt->optlen>>2;
 564                                 ip_options_build(skb, opt,
 565                                                  ipc->addr, rt, offset);
 566                         }
 567                         iph->tos = sk->protinfo.af_inet.tos;
 568                         iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
 569                         iph->frag_off = htons(offset>>3)|mf|df;
 570                         iph->id = id;
 571                         if (!mf) {
 572                                 if (offset || !df) {
 573                                         /* Select an unpredictable ident only
 574                                          * for packets without DF or having
 575                                          * been fragmented.
 576                                          */
 577                                         __ip_select_ident(iph, &rt->u.dst);
 578                                         id = iph->id;
 579                                 }
 580
 581                                 /*
 582                                  *      Any further fragments will have MF set.
 583                                  */
 584                                 mf = htons(IP_MF);
 585                         }
 586                         if (rt->rt_type == RTN_MULTICAST)
 587                                 iph->ttl = sk->protinfo.af_inet.mc_ttl;
 588                         else
 589                                 iph->ttl = sk->protinfo.af_inet.ttl;
 590                         iph->protocol = sk->protocol;
 591                         iph->check = 0;
 592                         iph->saddr = rt->rt_src;
 593                         iph->daddr = rt->rt_dst;
 594                         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
 595                         data += iph->ihl*4;
 596                 }
 597
 598                 /*
 599                  *      User data callback
 600                  */
 601
 602                 if (getfrag(frag, data, offset, fraglen-fragheaderlen)) {
 603                         err = -EFAULT;
 604                         kfree_skb(skb);
 605                         goto error;
 606                 }
 607
 608                 offset -= (maxfraglen-fragheaderlen);
 609                 fraglen = maxfraglen;
 610
 611                 nfrags++;
 612
 613                 err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
 614                               skb->dst->dev, output_maybe_reroute);
 615                 if (err) {
 616                         if (err > 0)
 617                                 err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
 618                         if (err)
 619                                 goto error;
 620                 }
 621         } while (offset >= 0);
 622
 623         if (nfrags>1)
 624                 ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
 625 out:
 626         return 0;
 627
 628 error:
 629         IP_INC_STATS(IpOutDiscards);
 630         if (nfrags>1)
 631                 ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
 632         return err;
 633 }
 634
 635 /*
 636  *      Fast path for unfragmented packets.
 637  */
 638 int ip_build_xmit(struct sock *sk,
 639                   int getfrag (const void *,
 640                                char *,
 641                                unsigned int,
 642                                unsigned int),
 643                   const void *frag,
 644                   unsigned length,
 645                   struct ipcm_cookie *ipc,
 646                   struct rtable *rt,
 647                   int flags)
 648 {
 649         int err;
 650         struct sk_buff *skb;
 651         int df;
 652         struct iphdr *iph;
 653
 654         /*
 655          *      Try the simple case first. This leaves fragmented frames, and by
 656          *      choice RAW frames within 20 bytes of maximum size(rare) to the long path
 657          */
 658
 659         if (!sk->protinfo.af_inet.hdrincl) {
 660                 length += sizeof(struct iphdr);
 661
 662                 /*
 663                  *      Check for slow path.
 664                  */
 665                 if (length > rt->u.dst.pmtu || ipc->opt != NULL)
 666                         return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);
 667         } else {
 668                 if (length > rt->u.dst.dev->mtu) {
 669                         ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
 670                         return -EMSGSIZE;
 671                 }
 672         }
 673         if (flags&MSG_PROBE)
 674                 goto out;
 675
 676         /*
 677          *      Do path mtu discovery if needed.
 678          */
 679         df = 0;
 680         if (ip_dont_fragment(sk, &rt->u.dst))
 681                 df = htons(IP_DF);
 682
 683         /*
 684          *      Fast path for unfragmented frames without options.
 685          */
 686         {
 687         int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
 688
 689         skb = sock_alloc_send_skb(sk, length+hh_len+15,
 690                                   flags&MSG_DONTWAIT, &err);
 691         if(skb==NULL)
 692                 goto error;
 693         skb_reserve(skb, hh_len);
 694         }
 695
 696         skb->priority = sk->priority;
 697         skb->dst = dst_clone(&rt->u.dst);
 698
 699         skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
 700
 701         if(!sk->protinfo.af_inet.hdrincl) {
 702                 iph->version=4;
 703                 iph->ihl=5;
 704                 iph->tos=sk->protinfo.af_inet.tos;
 705                 iph->tot_len = htons(length);
 706                 iph->frag_off = df;
 707                 iph->ttl=sk->protinfo.af_inet.mc_ttl;
 708                 ip_select_ident(iph, &rt->u.dst, sk);
 709                 if (rt->rt_type != RTN_MULTICAST)
 710                         iph->ttl=sk->protinfo.af_inet.ttl;
 711                 iph->protocol=sk->protocol;
 712                 iph->saddr=rt->rt_src;
 713                 iph->daddr=rt->rt_dst;
 714                 iph->check=0;
 715                 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
 716                 err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4);
 717         }
 718         else
 719                 err = getfrag(frag, (void *)iph, 0, length);
 720
 721         if (err)
 722                 goto error_fault;
 723
 724         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 725                       output_maybe_reroute);
 726         if (err > 0)
 727                 err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
 728         if (err)
 729                 goto error;
 730 out:
 731         return 0;
 732
 733 error_fault:
 734         err = -EFAULT;
 735         kfree_skb(skb);
 736 error:
 737         IP_INC_STATS(IpOutDiscards);
 738         return err;
 739 }
 740
 741 /*
 742  *      This IP datagram is too large to be sent in one piece.  Break it up into
 743  *      smaller pieces (each of size equal to IP header plus
 744  *      a block of the data of the original IP data part) that will yet fit in a
 745  *      single device frame, and queue such a frame for sending.
 746  *
 747  *      Yes this is inefficient, feel free to submit a quicker one.
 748  */
 749
 750 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 751 {
 752         struct iphdr *iph;
 753         int raw = 0;
 754         int ptr;
 755         struct net_device *dev;
 756         struct sk_buff *skb2;
 757         unsigned int mtu, hlen, left, len;
 758         int offset;
 759         int not_last_frag;
 760         struct rtable *rt = (struct rtable*)skb->dst;
 761         int err = 0;
 762
 763         dev = rt->u.dst.dev;
 764
 765         /*
 766          *      Point into the IP datagram header.
 767          */
 768
 769         iph = skb->nh.iph;
 770
 771         /*
 772          *      Setup starting values.
 773          */
 774
 775         hlen = iph->ihl * 4;
 776         left = skb->len - hlen;         /* Space per frame */
 777         mtu = rt->u.dst.pmtu - hlen;    /* Size of data space */
 778         ptr = raw + hlen;               /* Where to start from */
 779
 780         /*
 781          *      Fragment the datagram.
 782          */
 783
 784         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 785         not_last_frag = iph->frag_off & htons(IP_MF);
 786
 787         /*
 788          *      Keep copying data until we run out.
 789          */
 790
 791         while(left > 0) {
 792                 len = left;
 793                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 794                 if (len > mtu)
 795                         len = mtu;
 796                 /* IF: we are not sending upto and including the packet end
 797                    then align the next start on an eight byte boundary */
 798                 if (len < left) {
 799                         len &= ~7;
 800                 }
 801                 /*
 802                  *      Allocate buffer.
 803                  */
 804
 805                 if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) {
 806                         NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
 807                         err = -ENOMEM;
 808                         goto fail;
 809                 }
 810
 811                 /*
 812                  *      Set up data on packet
 813                  */
 814
 815                 skb2->pkt_type = skb->pkt_type;
 816                 skb2->priority = skb->priority;
 817                 skb_reserve(skb2, (dev->hard_header_len+15)&~15);
 818                 skb_put(skb2, len + hlen);
 819                 skb2->nh.raw = skb2->data;
 820                 skb2->h.raw = skb2->data + hlen;
 821                 skb2->protocol = skb->protocol;
 822                 skb2->security = skb->security;
 823
 824                 /*
 825                  *      Charge the memory for the fragment to any owner
 826                  *      it might possess
 827                  */
 828
 829                 if (skb->sk)
 830                         skb_set_owner_w(skb2, skb->sk);
 831                 skb2->dst = dst_clone(skb->dst);
 832                 skb2->dev = skb->dev;
 833
 834                 /*
 835                  *      Copy the packet header into the new buffer.
 836                  */
 837
 838                 memcpy(skb2->nh.raw, skb->data, hlen);
 839
 840                 /*
 841                  *      Copy a block of the IP datagram.
 842                  */
 843                 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
 844                         BUG();
 845                 left -= len;
 846
 847                 /*
 848                  *      Fill in the new header fields.
 849                  */
 850                 iph = skb2->nh.iph;
 851                 iph->frag_off = htons((offset >> 3));
 852
 853                 /* ANK: dirty, but effective trick. Upgrade options only if
 854                  * the segment to be fragmented was THE FIRST (otherwise,
 855                  * options are already fixed) and make it ONCE
 856                  * on the initial skb, so that all the following fragments
 857                  * will inherit fixed options.
 858                  */
 859                 if (offset == 0)
 860                         ip_options_fragment(skb);
 861
 862                 /* Copy the flags to each fragment. */
 863                 IPCB(skb2)->flags = IPCB(skb)->flags;
 864
 865                 /*
 866                  *      Added AC : If we are fragmenting a fragment that's not the
 867                  *                 last fragment then keep MF on each bit
 868                  */
 869                 if (left > 0 || not_last_frag)
 870                         iph->frag_off |= htons(IP_MF);
 871                 ptr += len;
 872                 offset += len;
 873
 874 #ifdef CONFIG_NET_SCHED
 875                 skb2->tc_index = skb->tc_index;
 876 #endif
 877 #ifdef CONFIG_NETFILTER
 878                 skb2->nfmark = skb->nfmark;
 879                 /* Connection association is same as pre-frag packet */
 880                 skb2->nfct = skb->nfct;
 881                 nf_conntrack_get(skb2->nfct);
 882 #ifdef CONFIG_NETFILTER_DEBUG
 883                 skb2->nf_debug = skb->nf_debug;
 884 #endif
 885 #endif
 886
 887                 /*
 888                  *      Put this fragment into the sending queue.
 889                  */
 890
 891                 IP_INC_STATS(IpFragCreates);
 892
 893                 iph->tot_len = htons(len + hlen);
 894
 895                 ip_send_check(iph);
 896
 897                 err = output(skb2);
 898                 if (err)
 899                         goto fail;
 900         }
 901         kfree_skb(skb);
 902         IP_INC_STATS(IpFragOKs);
 903         return err;
 904
 905 fail:
 906         kfree_skb(skb);
 907         IP_INC_STATS(IpFragFails);
 908         return err;
 909 }
 910
 911 /*
 912  *      Fetch data from kernel space and fill in checksum if needed.
 913  */
 914 static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
 915                               unsigned int fraglen)
 916 {
 917         struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
 918         u16 *pktp = (u16 *)to;
 919         struct iovec *iov;
 920         int len;
 921         int hdrflag = 1;
 922
 923         iov = &dp->iov[0];
 924         if (offset >= iov->iov_len) {
 925                 offset -= iov->iov_len;
 926                 iov++;
 927                 hdrflag = 0;
 928         }
 929         len = iov->iov_len - offset;
 930         if (fraglen > len) { /* overlapping. */
 931                 dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
 932                                              dp->csum);
 933                 offset = 0;
 934                 fraglen -= len;
 935                 to += len;
 936                 iov++;
 937         }
 938
 939         dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen,
 940                                              dp->csum);
 941
 942         if (hdrflag && dp->csumoffset)
 943                 *(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */
 944         return 0;
 945 }
 946
 947 /*
 948  *      Generic function to send a packet as reply to another packet.
 949  *      Used to send TCP resets so far. ICMP should use this function too.
 950  *
 951  *      Should run single threaded per socket because it uses the sock
 952  *      structure to pass arguments.
 953  */
 954 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
 955                    unsigned int len)
 956 {
 957         struct {
 958                 struct ip_options       opt;
 959                 char                    data[40];
 960         } replyopts;
 961         struct ipcm_cookie ipc;
 962         u32 daddr;
 963         struct rtable *rt = (struct rtable*)skb->dst;
 964
 965         if (ip_options_echo(&replyopts.opt, skb))
 966                 return;
 967
 968         daddr = ipc.addr = rt->rt_src;
 969         ipc.opt = NULL;
 970
 971         if (replyopts.opt.optlen) {
 972                 ipc.opt = &replyopts.opt;
 973
 974                 if (ipc.opt->srr)
 975                         daddr = replyopts.opt.faddr;
 976         }
 977
 978         if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
 979                 return;
 980
 981         /* And let IP do all the hard work.
 982
 983            This chunk is not reenterable, hence spinlock.
 984            Note that it uses the fact, that this function is called
 985            with locally disabled BH and that sk cannot be already spinlocked.
 986          */
 987         bh_lock_sock(sk);
 988         sk->protinfo.af_inet.tos = skb->nh.iph->tos;
 989         sk->priority = skb->priority;
 990         sk->protocol = skb->nh.iph->protocol;
 991         ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
 992         bh_unlock_sock(sk);
 993
 994         ip_rt_put(rt);
 995 }
 996
 997 /*
 998  *      IP protocol layer initialiser
 999  */
1000
1001 static struct packet_type ip_packet_type =
1002 {
1003         __constant_htons(ETH_P_IP),
1004         NULL,   /* All devices */
1005         ip_rcv,
1006         (void*)1,
1007         NULL,
1008 };
1009
1010 /*
1011  *      IP registers the packet type and then calls the subprotocol initialisers
1012  */
1013
1014 void __init ip_init(void)
1015 {
1016         dev_add_pack(&ip_packet_type);
1017
1018         ip_rt_init();
1019         inet_initpeers();
1020
1021 #ifdef CONFIG_IP_MULTICAST
1022         proc_net_create("igmp", 0, ip_mc_procinfo);
1023 #endif
1024 }