kernel/linux/net/ipv4/ip_output.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              The Internet Protocol (IP) output module.
 *
 * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Donald Becker, <becker@super.org>
 *              Alan Cox, <Alan.Cox@linux.org>
 *              Richard Underwood
 *              Stefan Becker, <stefanb@yello.ping.de>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *      See ip_input.c for original log
 *
 *      Fixes:
 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
 *              Bradford Johnson:       Fix faulty handling of some frames when
 *                                      no route is found.
 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
 *                                      (in case packet is not accepted by
 *                                      output firewall rules)
 *              Mike McLagan    :       Routing by source
 *              Alexey Kuznetsov:       use new route cache
 *              Andi Kleen:             Fix broken PMTU recovery and remove
 *                                      some redundant tests.
 *              Vitaly E. Lavrov:       Transparent proxy revived after year coma.
 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
 *              Andi Kleen      :       Split fast and slow ip_build_xmit path
 *                                      for decreased register pressure on x86
 *                                      and more readability.
 *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
 *                                      silently drop skb instead of failing with -EPERM.
 *              Detlev Wengorz  :       Copy protocol for fragments.
 *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
 *                                      datagrams.
 *              Hirokazu Takahashi:     sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>

/*
 *      Shall we try to damage output packets if routing dev changes?
 */

int sysctl_ip_dynaddr;
int sysctl_ip_default_ttl = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
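
/*
 * Note: ip_fast_csum() sums iph->ihl 32-bit words, so any IP options are
 * covered automatically.  The checksum must be recomputed whenever a header
 * field changes; a minimal sketch (assuming a path that has just decremented
 * the TTL, as the forwarding code does):
 *
 *      iph->ttl--;
 *      ip_send_check(iph);
 */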

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        __skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);

#ifdef CONFIG_NETFILTER_DEBUG
        nf_debug_ip_loopback_xmit(newskb);
#endif
        netif_rx(newskb);
        return 0;
}
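
/*
 * netif_rx() queues the clone on the normal receive path, so looped-back
 * multicast/broadcast copies re-enter the stack as if they had arrived from
 * the wire and get delivered to local listeners.
 */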

static inline int ip_select_ttl(struct inet_opt *inet, struct dst_entry *dst)
{
        int ttl = inet->uc_ttl;

        if (ttl < 0)
                ttl = dst_metric(dst, RTAX_HOPLIMIT);
        return ttl;
}
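
/*
 * A negative uc_ttl means "no per-socket override"; the TTL then comes from
 * the route's RTAX_HOPLIMIT metric, which the IPv4 routing code initializes
 * to sysctl_ip_default_ttl unless something overrides it.
 */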

/*
 *              Add an IP header to an skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                          u32 saddr, u32 daddr, struct ip_options *opt)
{
        struct inet_opt *inet = inet_sk(sk);
        struct rtable *rt = (struct rtable *)skb->dst;
        struct iphdr *iph;

        /* Build the IP header. */
        if (opt)
                iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
        else
                iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

        iph->version  = 4;
        iph->ihl      = 5;
        iph->tos      = inet->tos;
        if (ip_dont_fragment(sk, &rt->u.dst))
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->daddr    = rt->rt_dst;
        iph->saddr    = rt->rt_src;
        iph->protocol = sk->sk_protocol;
        iph->tot_len  = htons(skb->len);
        ip_select_ident(iph, &rt->u.dst, sk);
        skb->nh.iph   = iph;

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, daddr, rt, 0);
        }
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        /* Send it out. */
        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);
}
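
/*
 * Used by callers that already hold a route and a fully built payload, e.g.
 * TCP when answering a connection request.  Once the NF_IP_LOCAL_OUT hook
 * accepts the packet, dst_output() invokes skb->dst->output(), which for
 * unicast routes is ip_output() below.
 */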

static inline int ip_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;
        struct net_device *dev = dst->dev;
        int hh_len = LL_RESERVED_SPACE(dev);

        /* Be paranoid, rather than too clever. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                if (skb2 == NULL) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                kfree_skb(skb);
                skb = skb2;
        }

#ifdef CONFIG_NETFILTER_DEBUG
        nf_debug_ip_finish_output2(skb);
#endif /*CONFIG_NETFILTER_DEBUG*/

        if (hh) {
                int hh_alen;

                read_lock_bh(&hh->hh_lock);
                hh_alen = HH_DATA_ALIGN(hh->hh_len);
                memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);

        if (net_ratelimit())
                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
        kfree_skb(skb);
        return -EINVAL;
}
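
/*
 * If the neighbour entry carries a cached hardware header (hh_cache), it is
 * memcpy'd in front of the IP header under hh_lock and the frame goes
 * straight to hh->hh_output() (normally dev_queue_xmit).  Otherwise
 * neighbour->output() is called, which may first have to resolve the
 * link-layer address, e.g. via ARP.
 */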

int ip_finish_output(struct sk_buff *skb)
{
        struct net_device *dev = skb->dst->dev;

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
                       ip_finish_output2);
}

int ip_mc_output(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct sock *sk = skb->sk;
        struct rtable *rt = (struct rtable *)skb->dst;
        struct net_device *dev = rt->u.dst.dev;

        /*
         *      If the indicated interface is up and running, send the packet.
         */
        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        /*
         *      Multicasts are looped back for other local users
         */

        if (rt->rt_flags&RTCF_MULTICAST) {
                if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loop back non-local frames
                   that came back after forwarding; ip_mr_input will drop
                   them in any case.
                   Note that local frames are looped back so that they are
                   delivered to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
                ) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        if (newskb)
                                NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip_dev_loopback_xmit);
                }

                /* Multicasts with ttl 0 must not go beyond the host */

                if (skb->nh.iph->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                newskb->dev, ip_dev_loopback_xmit);
        }

        if (skb->len > dst_pmtu(&rt->u.dst) || skb_shinfo(skb)->frag_list)
                return ip_fragment(skb, ip_finish_output);
        else
                return ip_finish_output(skb);
}

int ip_output(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;

        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        if ((skb->len > dst_pmtu(skb->dst) || skb_shinfo(skb)->frag_list) &&
            !skb_shinfo(skb)->tso_size)
                return ip_fragment(skb, ip_finish_output);
        else
                return ip_finish_output(skb);
}
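
/*
 * ip_output() is the dst->output() routine for unicast routes.  An skb is
 * software-fragmented only when it exceeds the path MTU (or carries a
 * frag_list) and is not a TSO packet; with tso_size set, segmentation is
 * left to the NIC.
 */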

int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
        struct sock *sk = skb->sk;
        struct inet_opt *inet = inet_sk(sk);
        struct ip_options *opt = inet->opt;
        struct rtable *rt;
        struct iphdr *iph;
        u32 mtu;

        /* Skip all of this if the packet is already routed,
         * e.g. by something like SCTP.
         */
        rt = (struct rtable *)skb->dst;
        if (rt != NULL)
                goto packet_routed;

        /* Make sure we can route this packet. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt == NULL) {
                u32 daddr;

                /* Use correct destination address if we have options. */
                daddr = inet->daddr;
                if (opt && opt->srr)
                        daddr = opt->faddr;

                {
                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
                                            .nl_u = { .ip4_u =
                                                      { .daddr = daddr,
                                                        .saddr = inet->saddr,
                                                        .tos = RT_CONN_FLAGS(sk) } },
                                            .proto = sk->sk_protocol,
                                            .uli_u = { .ports =
                                                       { .sport = inet->sport,
                                                         .dport = inet->dport } } };

                        /* If this fails, the transport layer's retransmit
                         * mechanism will keep trying until a route appears
                         * or the connection times out.
                         */
                        if (ip_route_output_flow(&rt, &fl, sk, 0))
                                goto no_route;
                }
                __sk_dst_set(sk, &rt->u.dst);
                tcp_v4_setup_caps(sk, &rt->u.dst);
        }
        skb->dst = dst_clone(&rt->u.dst);

packet_routed:
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
                goto no_route;

        /* OK, we know where to send it, allocate and build IP header. */
        iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
        iph->tot_len = htons(skb->len);
        if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->protocol = sk->sk_protocol;
        iph->saddr    = rt->rt_src;
        iph->daddr    = rt->rt_dst;
        skb->nh.iph   = iph;
        /* The transport layer sets skb->h.foo itself. */

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, inet->daddr, rt, 0);
        }

        mtu = dst_pmtu(&rt->u.dst);
        if (skb->len > mtu && (sk->sk_route_caps & NETIF_F_TSO)) {
                unsigned int hlen;

                /* Hack zone: all this must be done by TCP. */
                hlen = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
                skb_shinfo(skb)->tso_size = mtu - hlen;
                skb_shinfo(skb)->tso_segs =
                        (skb->len - hlen + skb_shinfo(skb)->tso_size - 1) /
                                skb_shinfo(skb)->tso_size - 1;
        }
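
        /* tso_size is the payload carried per on-wire segment; tso_segs
         * works out to ceil((skb->len - hlen) / tso_size) - 1, i.e. the
         * number of extra segments the NIC will emit beyond the first.
         * ip_select_ident_more() below advances the IP ID counter by that
         * amount so each emitted segment gets a distinct ID.
         */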

        ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);

        /* Add an IP checksum. */
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);

no_route:
        IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EHOSTUNREACH;
}


static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        to->security = from->security;
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;

        /* Copy the flags to each fragment. */
        IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
        to->nfmark = from->nfmark;
        to->nfcache = from->nfcache;
        /* Connection association is same as pre-frag packet */
        nf_conntrack_put(to->nfct);
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
#ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
#endif
#ifdef CONFIG_NETFILTER_DEBUG
        to->nf_debug = from->nf_debug;
#endif
#endif
}

/*
 *      This IP datagram is too large to be sent in one piece.  Break it up
 *      into smaller pieces (each the size of the IP header plus a block of
 *      the original datagram's data) so that each piece still fits in a
 *      single device frame, and queue such frames for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct iphdr *iph;
        int raw = 0;
        int ptr;
        struct net_device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len, ll_rs;
        int offset;
        int not_last_frag;
        struct rtable *rt = (struct rtable *)skb->dst;
        int err = 0;

        dev = rt->u.dst.dev;

        /*
         *      Point into the IP datagram header.
         */

        iph = skb->nh.iph;

        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(dst_pmtu(&rt->u.dst)));
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        /*
         *      Setup starting values.
         */

        hlen = iph->ihl * 4;
        mtu = dst_pmtu(&rt->u.dst) - hlen;      /* Size of data space */

        /* When a frag_list is given, use it.  First, check its validity:
         * some transformers could create a wrong frag_list or break an
         * existing one; that is not prohibited.  In such a case, fall back
         * to copying.
         *
         * LATER: this step can be merged into the real generation of
         * fragments; we can switch to copying when we see the first bad
         * fragment.
         */
        if (skb_shinfo(skb)->frag_list) {
                struct sk_buff *frag;
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;
                }

                /* Everything is OK. Generate! */

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                iph->tot_len = htons(first_len);
                iph->frag_off |= htons(IP_MF);
                ip_send_check(iph);

                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one goes down. */
                        if (frag) {
                                frag->h.raw = frag->data;
                                frag->nh.raw = __skb_push(frag, hlen);
                                memcpy(frag->nh.raw, iph, hlen);
                                iph = frag->nh.iph;
                                iph->tot_len = htons(frag->len);
                                if (frag->dst)
                                        atomic_dec(&frag->dst->__refcnt);

                                ip_copy_metadata(frag, skb);
                                if (offset == 0)
                                        ip_options_fragment(frag);
                                offset += skb->len - hlen;
                                iph->frag_off = htons(offset >> 3);
                                if (frag->next != NULL)
                                        iph->frag_off |= htons(IP_MF);
                                /* Ready, complete checksum */
                                ip_send_check(iph);
                        }

                        err = output(skb);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                if (err == 0) {
                        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }
                IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = raw + hlen;               /* Where to start from */

#ifdef CONFIG_BRIDGE_NETFILTER
        /* for bridged IP traffic encapsulated inside e.g. a vlan header,
         * we need to make room for the encapsulating header */
        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
        mtu -= nf_bridge_pad(skb);
#else
        ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
#endif
        /*
         *      Fragment the datagram.
         */

        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *      Keep copying data until we run out.
         */

        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
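
                /* The IP fragment-offset field counts in units of 8 bytes,
                 * so every fragment except the last must carry a multiple
                 * of 8 payload bytes; hence the mask above.
                 */
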
                /*
                 *      Allocate buffer.
                 */

                if ((skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip_copy_metadata(skb2, skb);
                skb_reserve(skb2, ll_rs);
                skb_put(skb2, len + hlen);
                skb2->nh.raw = skb2->data;
                skb2->h.raw = skb2->data + hlen;

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */

                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */

                memcpy(skb2->nh.raw, skb->data, hlen);

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
                        BUG();
                left -= len;

                /*
                 *      Fill in the new header fields.
                 */
                iph = skb2->nh.iph;
                iph->frag_off = htons((offset >> 3));

                /* ANK: a dirty but effective trick.  Update the options only
                 * if the segment to be fragmented was THE FIRST (otherwise,
                 * the options are already fixed), and do it ONCE on the
                 * initial skb, so that all the following fragments inherit
                 * the fixed options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb);

                /*
                 *      Added AC: if we are fragmenting a fragment that's not
                 *                the last fragment then keep MF set on each
                 *                fragment.
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */

                IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);

                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                err = output(skb2);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}

int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
        struct iovec *iov = from;

        if (skb->ip_summed == CHECKSUM_HW) {
                if (memcpy_fromiovecend(to, iov, offset, len) < 0)
                        return -EFAULT;
        } else {
                unsigned int csum = 0;
                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, odd);
        }
        return 0;
}

static inline unsigned int
csum_page(struct page *page, int offset, int copy)
{
        char *kaddr;
        unsigned int csum;
        kaddr = kmap(page);
        csum = csum_partial(kaddr + offset, copy, 0);
        kunmap(page);
        return csum;
}
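
/*
 * kmap()/kunmap() are needed because the page may live in highmem and thus
 * has no permanent kernel mapping; the callers then fold the per-chunk sum
 * into skb->csum with csum_block_add() at the correct (odd/even) offset.
 */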

/*
 *      ip_append_data() and ip_append_page() can build one large IP datagram
 *      from many pieces of data.  Each piece is held on the socket until
 *      ip_push_pending_frames() is called.  A piece can be a page or
 *      non-page data.
 *
 *      Not just UDP: other transport protocols - e.g. raw sockets - can
 *      potentially use this interface too.
 *
 *      LATER: length must be adjusted by pad at tail, when it is required.
 */
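/*
 * Usage sketch (hedged; loosely modelled on what udp_sendmsg() does, with
 * locking and most error handling omitted):
 *
 *      err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len,
 *                           sizeof(struct udphdr), &ipc, rt,
 *                           msg->msg_flags);
 *      if (err)
 *              ip_flush_pending_frames(sk);       // drop everything queued
 *      else if (!(msg->msg_flags & MSG_MORE))
 *              err = ip_push_pending_frames(sk);  // build IP header, send
 */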
int ip_append_data(struct sock *sk,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
                   struct ipcm_cookie *ipc, struct rtable *rt,
                   unsigned int flags)
{
        struct inet_opt *inet = inet_sk(sk);
        struct sk_buff *skb;

        struct ip_options *opt = NULL;
        int hh_len;
        int exthdrlen;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking.
                 */
                opt = ipc->opt;
                if (opt) {
                        if (inet->cork.opt == NULL) {
                                inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
                                if (unlikely(inet->cork.opt == NULL))
                                        return -ENOBUFS;
                        }
                        memcpy(inet->cork.opt, opt, sizeof(struct ip_options) + opt->optlen);
                        inet->cork.flags |= IPCORK_OPT;
                        inet->cork.addr = ipc->addr;
                }
                dst_hold(&rt->u.dst);
                inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
                inet->cork.rt = rt;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                if ((exthdrlen = rt->u.dst.header_len) != 0) {
                        length += exthdrlen;
                        transhdrlen += exthdrlen;
                }
        } else {
                rt = inet->cork.rt;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = inet->cork.opt;

                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }
        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu - exthdrlen);
                return -EMSGSIZE;
        }
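
        /* maxfraglen rounds the per-fragment payload (mtu - fragheaderlen)
         * down to a multiple of 8 and adds the header back, giving the
         * largest fragment size that keeps offsets 8-byte aligned; the
         * 0xFFFF bound comes from the 16-bit tot_len field of the IP header.
         */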

        /*
         * transhdrlen > 0 means that this is the first fragment and we wish
         * it not to be fragmented later.
         */
        if (transhdrlen &&
            length + fragheaderlen <= maxfraglen &&
            rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
            !exthdrlen)
                csummode = CHECKSUM_HW;

        inet->cork.length += length;

        /* So, what's going on in the loop below?
         *
         * We use the calculated fragment length to generate a chained skb,
         * each segment of which is an IP fragment ready for sending to the
         * network after an appropriate IP header is added.
         *
         * Known mistake:
         *
         *    If mtu-fragheaderlen is not 0 modulo 8, we generate an extra
         *    small fragment of length (mtu-fragheaderlen)%8, even though
         *    it is not necessary.  Not a big bug, but needs a fix.
         */

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                if ((copy = maxfraglen - skb->len) <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int alloclen;
                        BUG_TRAP(copy == 0);

alloc_new_skb:
                        datalen = maxfraglen - fragheaderlen;
                        if (datalen > length)
                                datalen = length;

                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = maxfraglen;
                        else
                                alloclen = datalen + fragheaderlen;

                        /* The last fragment gets additional space at the
                         * tail.  Note that with MSG_MORE we overallocate on
                         * fragments, because we have no idea which fragment
                         * will be the last.
                         */
                        if (datalen == length)
                                alloclen += rt->u.dst.trailer_len;

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len + 15,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len + 15, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fraglen);
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;

                        copy = datalen - transhdrlen;
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                        offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error:
        inet->cork.length -= length;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

ssize_t ip_append_page(struct sock *sk, struct page *page,
                       int offset, size_t size, int flags)
{
        struct inet_opt *inet = inet_sk(sk);
        struct sk_buff *skb;
        struct rtable *rt;
        struct ip_options *opt = NULL;
        int hh_len;
        int mtu;
        int len;
        int err;
        unsigned int maxfraglen, fragheaderlen;

        if (inet->hdrincl)
                return -EPERM;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue))
                return -EINVAL;

        rt = inet->cork.rt;
        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (!(rt->u.dst.dev->features&NETIF_F_SG))
                return -EOPNOTSUPP;

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
        mtu = inet->cork.fragsize;

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
                return -EMSGSIZE;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                return -EINVAL;

        inet->cork.length += size;

        while (size > 0) {
                int i;
                if ((len = maxfraglen - skb->len) <= 0) {
                        char *data;
                        struct iphdr *iph;
                        BUG_TRAP(len == 0);

                        skb = sock_wmalloc(sk, fragheaderlen + hh_len + 15, 1,
                                           sk->sk_allocation);
                        if (unlikely(!skb)) {
                                err = -ENOBUFS;
                                goto error;
                        }

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fragheaderlen);
                        skb->nh.iph = iph = (struct iphdr *)data;
                        data += fragheaderlen;
                        skb->h.raw = data;

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                i = skb_shinfo(skb)->nr_frags;
                if (len > size)
                        len = size;
                if (skb_can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += len;
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, len);
                } else {
                        err = -EMSGSIZE;
                        goto error;
                }

                if (skb->ip_summed == CHECKSUM_NONE) {
                        unsigned int csum;
                        csum = csum_page(page, offset, len);
                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
                }

                skb->len += len;
                skb->data_len += len;
                offset += len;
                size -= len;
        }
        return 0;

error:
        inet->cork.length -= size;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

/*
 *      Combine all pending IP fragments on the socket into one IP datagram
 *      and push them out.
 */
int ip_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct inet_opt *inet = inet_sk(sk);
        struct ip_options *opt = NULL;
        struct rtable *rt = inet->cork.rt;
        struct iphdr *iph;
        int df = 0;
        __u8 ttl;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
         * allow fragmenting the frame generated here.  No matter how
         * transforms change the size of the packet, it will still come out.
         */
        if (inet->pmtudisc != IP_PMTUDISC_DO)
                skb->local_df = 1;

        /* The DF bit is set when we want to see DF on outgoing frames.
         * If local_df is set too, we still allow this frame to be
         * fragmented locally. */
        if (inet->pmtudisc == IP_PMTUDISC_DO ||
            (!skb_shinfo(skb)->frag_list && ip_dont_fragment(sk, &rt->u.dst)))
                df = htons(IP_DF);

        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (rt->rt_type == RTN_MULTICAST)
                ttl = inet->mc_ttl;
        else
                ttl = ip_select_ttl(inet, &rt->u.dst);

        iph = (struct iphdr *)skb->data;
        iph->version = 4;
        iph->ihl = 5;
        if (opt) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, inet->cork.addr, rt, 0);
        }
        iph->tos = inet->tos;
        iph->tot_len = htons(skb->len);
        iph->frag_off = df;
        if (!df) {
                __ip_select_ident(iph, &rt->u.dst, 0);
        } else {
                iph->id = htons(inet->id++);
        }
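
        /* With DF set, no fragment reassembly ever matches on the ID, so a
         * cheap per-socket counter is enough; without DF,
         * __ip_select_ident() draws the ID from the destination's inet_peer
         * state so that fragments sent from different sockets to the same
         * peer remain distinguishable.
         */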
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;
        ip_send_check(iph);

        skb->priority = sk->sk_priority;
        skb->dst = dst_clone(&rt->u.dst);

        /* Netfilter gets the whole, not-yet-fragmented skb. */
        err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
                      skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
                        err = inet->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        if (inet->cork.opt) {
                kfree(inet->cork.opt);
                inet->cork.opt = NULL;
        }
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
        return err;

error:
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        goto out;
}

/*
 *      Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
        struct inet_opt *inet = inet_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
                kfree_skb(skb);

        inet->cork.flags &= ~IPCORK_OPT;
        if (inet->cork.opt) {
                kfree(inet->cork.opt);
                inet->cork.opt = NULL;
        }
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
}


/*
 *      Fetch data from kernel space and fill in the checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
                              int len, int odd, struct sk_buff *skb)
{
        unsigned int csum;

        csum = csum_partial_copy_nocheck(dptr + offset, to, len, 0);
        skb->csum = csum_block_add(skb->csum, csum, odd);
        return 0;
}

/*
 *      Generic function to send a packet as a reply to another packet.
 *      Used to send TCP resets so far.  ICMP should use this function too.
 *
 *      Should run single threaded per socket because it uses the sock
 *      structure to pass arguments.
 *
 *      LATER: switch from ip_build_xmit to ip_append_*
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
                   unsigned int len)
{
        struct inet_opt *inet = inet_sk(sk);
        struct {
                struct ip_options       opt;
                char                    data[40];
        } replyopts;
        struct ipcm_cookie ipc;
        u32 daddr;
        struct rtable *rt = (struct rtable *)skb->dst;

        if (ip_options_echo(&replyopts.opt, skb))
                return;

        daddr = ipc.addr = rt->rt_src;
        ipc.opt = NULL;

        if (replyopts.opt.optlen) {
                ipc.opt = &replyopts.opt;

                if (ipc.opt->srr)
                        daddr = replyopts.opt.faddr;
        }

        {
                struct flowi fl = { .nl_u = { .ip4_u =
                                              { .daddr = daddr,
                                                .saddr = rt->rt_spec_dst,
                                                .tos = RT_TOS(skb->nh.iph->tos) } },
                                    /* Not quite clean, but right. */
                                    .uli_u = { .ports =
                                               { .sport = skb->h.th->dest,
                                                 .dport = skb->h.th->source } },
                                    .proto = sk->sk_protocol };
                if (ip_route_output_key(&rt, &fl))
                        return;
        }

        /* And let IP do all the hard work.

           This chunk is not reentrant, hence the spinlock.  Note that it
           relies on the fact that this function is called with BHs locally
           disabled and that sk cannot already be locked.
         */
        bh_lock_sock(sk);
        inet->tos = skb->nh.iph->tos;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = skb->nh.iph->protocol;
        ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                       &ipc, rt, MSG_DONTWAIT);
        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                if (arg->csumoffset >= 0)
                        *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
                skb->ip_summed = CHECKSUM_NONE;
                ip_push_pending_frames(sk);
        }

        bh_unlock_sock(sk);

        ip_rt_put(rt);
}

/*
 *      IP protocol layer initialiser
 */

static struct packet_type ip_packet_type = {
        .type = __constant_htons(ETH_P_IP),
        .func = ip_rcv,
};

/*
 *      IP registers the packet type and then calls the subprotocol initialisers
 */

void __init ip_init(void)
{
        dev_add_pack(&ip_packet_type);

        ip_rt_init();
        inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
        igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_finish_output);
EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);

#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_ip_default_ttl);
#endif