2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <pedro_m@yahoo.com>
8 * $Id: ip6_output.c,v 1.33 2001/09/20 00:35:35 davem Exp $
10 * Based on linux/net/ipv4/ip_output.c
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
18 * A.N.Kuznetsov : arithmetics in fragmentation.
19 * extension headers are implemented.
20 * route changes now work.
21 * ip6_forward does not confuse sniffers.
24 * H. von Brand : Added missing #include <linux/string.h>
25 * Imran Patel : frag id should be in NBO
28 #include <linux/config.h>
29 #include <linux/errno.h>
30 #include <linux/types.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/route.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
46 #include <net/ndisc.h>
47 #include <net/protocol.h>
48 #include <net/ip6_route.h>
49 #include <net/addrconf.h>
50 #include <net/rawv6.h>
/*
 * Stamp *fhdr with the next global IPv6 fragmentation id, stored in
 * network byte order (htonl).  A single file-local counter is shared
 * by all senders and protected by ip6_id_lock; the value 0 is skipped
 * so a zero identification never appears on the wire.
 * NOTE(review): this excerpt is sparse -- braces/lines are elided.
 */
53 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
55 static u32 ipv6_fragmentation_id = 1;
56 static spinlock_t ip6_id_lock = SPIN_LOCK_UNLOCKED;
58 spin_lock_bh(&ip6_id_lock);
59 fhdr->identification = htonl(ipv6_fragmentation_id);
/* wrap back to 1, never 0 */
60 if (++ipv6_fragmentation_id == 0)
61 ipv6_fragmentation_id = 1;
62 spin_unlock_bh(&ip6_id_lock);
/*
 * Final output step: prepend the link-layer header and hand the skb to
 * the device.  If the route has a cached hardware header (dst->hh),
 * copy it in front of the packet under hh_lock and call hh_output();
 * otherwise fall back to the neighbour's output method.
 * NOTE(review): the hh != NULL test and error path are elided in this
 * excerpt -- confirm against the full source.
 */
65 static inline int ip6_output_finish(struct sk_buff *skb)
68 struct dst_entry *dst = skb->dst;
69 struct hh_cache *hh = dst->hh;
74 read_lock_bh(&hh->hh_lock);
/* hh_data holds an aligned copy; copy hh_alen bytes but only
 * expose hh_len bytes of actual header via skb_push. */
75 hh_alen = HH_DATA_ALIGN(hh->hh_len);
76 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
77 read_unlock_bh(&hh->hh_lock);
78 skb_push(skb, hh->hh_len);
79 return hh->hh_output(skb);
80 } else if (dst->neighbour)
81 return dst->neighbour->output(skb);
/* dev_loopback_xmit for use with netfilter.
 * Prepares a cloned multicast skb for local delivery: rewinds the MAC
 * pointer, pulls the buffer to the network header, marks the packet as
 * loopback, and skips checksum verification (we generated it).
 */
89 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
91 newskb->mac.raw = newskb->data;
92 __skb_pull(newskb, newskb->nh.raw - newskb->data);
93 newskb->pkt_type = PACKET_LOOPBACK;
94 newskb->ip_summed = CHECKSUM_UNNECESSARY;
/* a loopback clone must still carry a route */
95 BUG_TRAP(newskb->dst);
/*
 * Transmit a fully built IPv6 packet through the POST_ROUTING netfilter
 * hook to ip6_output_finish().  For multicast destinations that the
 * sending host itself listens to (and mc_loop allows), a clone is first
 * looped back via ip6_dev_loopback_xmit(); a hop limit of 0 on a
 * multicast packet is handled specially (path elided in this excerpt).
 */
102 int ip6_output(struct sk_buff *skb)
104 struct dst_entry *dst = skb->dst;
105 struct net_device *dev = dst->dev;
107 skb->protocol = htons(ETH_P_IPV6);
110 if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
/* loop a copy back to ourselves if we are a member of the group,
 * unless the socket disabled multicast loopback */
111 if (!(dev->flags&IFF_LOOPBACK) &&
112 (skb->sk == NULL || skb->sk->net_pinfo.af_inet6.mc_loop) &&
113 ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
114 &skb->nh.ipv6h->saddr)) {
115 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117 /* Do not check for IFF_ALLMULTI; multicast routing
118 is not supported in any case.
121 NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
123 ip6_dev_loopback_xmit);
125 if (skb->nh.ipv6h->hop_limit == 0) {
131 IP6_INC_STATS(Ip6OutMcastPkts);
134 return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
138 #ifdef CONFIG_NETFILTER
/*
 * Re-route an skb whose headers a netfilter hook may have rewritten.
 * Rebuilds a flow from the current IPv6 header (and the socket's bound
 * device, if any), performs a fresh route lookup, and replaces the old
 * dst.  NOTE(review): the error return and the skb->dst reassignment
 * are elided in this excerpt.
 */
139 int ip6_route_me_harder(struct sk_buff *skb)
141 struct ipv6hdr *iph = skb->nh.ipv6h;
142 struct dst_entry *dst;
145 fl.proto = iph->nexthdr;
146 fl.fl6_dst = &iph->daddr;
147 fl.fl6_src = &iph->saddr;
148 fl.oif = skb->sk ? skb->sk->bound_dev_if : 0;
149 fl.fl6_flowlabel = 0;
/* ports are irrelevant for plain re-routing */
150 fl.uli_u.ports.dport = 0;
151 fl.uli_u.ports.sport = 0;
153 dst = ip6_route_output(skb->sk, &fl);
157 printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n");
162 /* Drop old route. */
163 dst_release(skb->dst);
/*
 * LOCAL_OUT completion: if netfilter altered the packet (NFC_ALTERED),
 * redo the route lookup before emitting via the route's output method.
 */
170 static inline int ip6_maybe_reroute(struct sk_buff *skb)
172 #ifdef CONFIG_NETFILTER
173 if (skb->nfcache & NFC_ALTERED){
174 if (ip6_route_me_harder(skb) != 0){
179 #endif /* CONFIG_NETFILTER */
180 return skb->dst->output(skb);
184 * xmit an sk_buff (used by TCP)
/*
 * Build the IPv6 header (and any extension headers from opt) in front
 * of an skb and send it through the LOCAL_OUT netfilter hook.
 * Reallocates headroom if the extension headers do not fit, selects the
 * hop limit from the socket or the route, and falls back to an ICMPv6
 * "packet too big" to ourselves when the result exceeds the path MTU.
 */
187 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
188 struct ipv6_txoptions *opt)
190 struct ipv6_pinfo * np = sk ? &sk->net_pinfo.af_inet6 : NULL;
191 struct in6_addr *first_hop = fl->nl_u.ip6_u.daddr;
192 struct dst_entry *dst = skb->dst;
194 u8 proto = fl->proto;
195 int seg_len = skb->len;
201 /* First: exthdrs may take lots of space (~8K for now)
202 MAX_HEADER is not enough.
204 head_room = opt->opt_nflen + opt->opt_flen;
205 seg_len += head_room;
/* room for the fixed header plus a 16-byte-aligned link header */
206 head_room += sizeof(struct ipv6hdr) + ((dst->dev->hard_header_len + 15)&~15);
208 if (skb_headroom(skb) < head_room) {
209 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
215 skb_set_owner_w(skb, sk);
/* push fragmentable then non-fragmentable extension headers;
 * a routing header may rewrite first_hop */
218 ipv6_push_frag_opts(skb, opt, &proto);
220 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
223 hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
226 * Fill in the IPv6 header
/* version 6 in the top nibble, caller's flow label in the low bits */
229 *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
/* hop limit: socket setting if available, else the route's default */
232 hlimit = np->hop_limit;
234 hlimit = ((struct rt6_info*)dst)->rt6i_hoplimit;
236 hdr->payload_len = htons(seg_len);
237 hdr->nexthdr = proto;
238 hdr->hop_limit = hlimit;
240 ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr);
241 ipv6_addr_copy(&hdr->daddr, first_hop);
243 if (skb->len <= dst->pmtu) {
244 IP6_INC_STATS(Ip6OutRequests);
245 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
/* too big for the path MTU: tell ourselves so the caller's socket
 * learns the correct MTU */
249 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
251 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev);
257 * To avoid extra problems ND packets are sent through this
258 * routine. It's code duplication but I really want to avoid
259 * extra checks since ipv6_build_header is used by TCP (which
260 * is for us performance critical)
/*
 * Build a bare IPv6 header for a Neighbour Discovery packet:
 * version 6, zero flow label, payload length 'len', next header
 * 'proto', hop limit from the socket, and the given addresses.
 */
263 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
264 struct in6_addr *saddr, struct in6_addr *daddr,
267 struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
271 skb->protocol = htons(ETH_P_IPV6);
274 totlen = len + sizeof(struct ipv6hdr);
276 hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
/* version 6, no flow label */
279 *(u32*)hdr = htonl(0x60000000);
281 hdr->payload_len = htons(len);
282 hdr->nexthdr = proto;
283 hdr->hop_limit = np->hop_limit;
285 ipv6_addr_copy(&hdr->saddr, saddr);
286 ipv6_addr_copy(&hdr->daddr, daddr);
/*
 * Append and fill the fixed IPv6 header for a packet of total length
 * 'pktlength' (header included), taking proto, flow label and the
 * addresses from the flow.  Returns a pointer to the header so the
 * caller can chain extension headers off hdr->nexthdr.
 */
291 static struct ipv6hdr * ip6_bld_1(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
292 int hlimit, unsigned pktlength)
296 skb->nh.raw = skb_put(skb, sizeof(struct ipv6hdr));
/* version 6 in the top nibble plus the caller's flow label */
299 *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);
301 hdr->payload_len = htons(pktlength - sizeof(struct ipv6hdr));
302 hdr->hop_limit = hlimit;
303 hdr->nexthdr = fl->proto;
305 ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr);
306 ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr);
/*
 * Append a Fragment extension header, splice it into the header chain
 * (the previous header's nexthdr becomes NEXTHDR_FRAGMENT, the fragment
 * header inherits the old value), set the offset, and pick a fresh
 * identification.  Returns the address of the fragment header's
 * nexthdr byte for further chaining.
 */
310 static __inline__ u8 * ipv6_build_fraghdr(struct sk_buff *skb, u8* prev_hdr, unsigned offset)
312 struct frag_hdr *fhdr;
314 fhdr = (struct frag_hdr *) skb_put(skb, sizeof(struct frag_hdr));
316 fhdr->nexthdr = *prev_hdr;
317 *prev_hdr = NEXTHDR_FRAGMENT;
/* local update only; the same pointer is also the return value */
318 prev_hdr = &fhdr->nexthdr;
321 fhdr->frag_off = htons(offset);
322 ipv6_select_ident(skb, fhdr);
323 return &fhdr->nexthdr;
/*
 * Fragmented transmit path for datagram sockets (UDP/ICMP/raw).
 * Splits 'length' bytes obtained via getfrag() into MTU-sized IPv6
 * fragments.  Strategy: build the LAST fragment first (it carries the
 * caller-visible checksum data), then clone it to produce the earlier
 * fragments from back to front, patching offset/M-flag in each clone,
 * and send the last fragment at the end.
 * NOTE(review): many lines (error paths, loop structure) are elided in
 * this excerpt -- comments describe only the visible statements.
 */
326 static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag,
327 const void *data, struct dst_entry *dst,
328 struct flowi *fl, struct ipv6_txoptions *opt,
329 struct in6_addr *final_dst,
330 int hlimit, int flags, unsigned length, int mtu)
333 struct sk_buff *last_skb;
347 * Extension header order:
348 * Hop-by-hop -> Dest0 -> Routing -> Fragment -> Auth -> Dest1 -> rest (...)
350 * We must build the non-fragmented part that
351 * will be in every packet... this also means
352 * that other extension headers (Dest, Auth, etc)
353 * must be considered in the data to be fragmented
356 unfrag_len = sizeof(struct ipv6hdr) + sizeof(struct frag_hdr);
/* non-fragmentable options replicate into every fragment;
 * fragmentable options count toward the fragmented payload */
360 unfrag_len += opt->opt_nflen;
361 last_len += opt->opt_flen;
365 * Length of fragmented part on every packet but
366 * the last must be an:
367 * "integer multiple of 8 octects".
370 frag_len = (mtu - unfrag_len) & ~0x7;
372 /* Unfragmentable part exceeds mtu. */
374 ipv6_local_error(sk, EMSGSIZE, fl, mtu);
/* number of full-size fragments preceding the last one */
378 nfrags = last_len / frag_len;
381 * We must send from end to start because of
382 * UDP/ICMP checksums. We do a funny trick:
383 * fill the last skb first with the fixed
384 * header (and its data) and then use it
385 * to create the following segments and send it
386 * in the end. If the peer is checking the M_flag
387 * to trigger the reassembly code then this
388 * might be a good idea.
391 frag_off = nfrags * frag_len;
392 last_len -= frag_off;
396 frag_off -= frag_len;
401 /* And it is implementation problem: for now we assume, that
402 all the exthdrs will fit to the first fragment.
405 if (frag_len < opt->opt_flen) {
406 ipv6_local_error(sk, EMSGSIZE, fl, mtu);
/* fragmentable options occupy the front of the first fragment,
 * so user data in it starts after them */
409 data_off = frag_off - opt->opt_flen;
415 last_skb = sock_alloc_send_skb(sk, unfrag_len + frag_len +
416 dst->dev->hard_header_len + 15,
417 flags & MSG_DONTWAIT, &err);
419 if (last_skb == NULL)
422 last_skb->dst = dst_clone(dst);
424 skb_reserve(last_skb, (dst->dev->hard_header_len + 15) & ~15);
/* fixed header + non-fragmentable opts + fragment header, once */
426 hdr = ip6_bld_1(sk, last_skb, fl, hlimit, frag_len+unfrag_len);
427 prev_hdr = &hdr->nexthdr;
429 if (opt && opt->opt_nflen)
430 prev_hdr = ipv6_build_nfrag_opts(last_skb, prev_hdr, opt, final_dst, 0);
432 prev_hdr = ipv6_build_fraghdr(last_skb, prev_hdr, frag_off);
/* remember where the fragment header sits so clones can patch it */
433 fhdr_dist = prev_hdr - last_skb->data;
435 err = getfrag(data, &hdr->saddr, last_skb->tail, data_off, last_len);
441 struct frag_hdr *fhdr2;
/* each earlier fragment starts life as a copy of the last one */
443 skb = skb_copy(last_skb, sk->allocation);
446 IP6_INC_STATS(Ip6FragFails);
451 frag_off -= frag_len;
452 data_off -= frag_len;
454 fhdr2 = (struct frag_hdr *) (skb->data + fhdr_dist);
/* non-final fragment: set offset and the M (more fragments) bit */
457 fhdr2->frag_off = htons(frag_off | 1);
459 /* Write fragmentable exthdrs to the first chunk */
460 if (nfrags == 0 && opt && opt->opt_flen) {
461 ipv6_build_frag_opts(skb, &fhdr2->nexthdr, opt);
462 frag_len -= opt->opt_flen;
466 err = getfrag(data, &hdr->saddr,skb_put(skb, frag_len),
474 IP6_INC_STATS(Ip6FragCreates);
475 IP6_INC_STATS(Ip6OutRequests);
476 err = NF_HOOK(PF_INET6,NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
485 IP6_INC_STATS(Ip6FragFails);
/* the last fragment's payload is shorter; fix its length field */
490 hdr->payload_len = htons(unfrag_len + last_len - sizeof(struct ipv6hdr));
493 * update last_skb to reflect the getfrag we did
497 skb_put(last_skb, last_len);
499 IP6_INC_STATS(Ip6FragCreates);
500 IP6_INC_STATS(Ip6FragOKs);
501 IP6_INC_STATS(Ip6OutRequests);
502 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, last_skb, NULL,dst->dev, ip6_maybe_reroute);
/*
 * Main datagram transmit entry (UDP/ICMP/raw): resolve a route, pick
 * source address and hop limit, then either build and send a single
 * packet (with a jumbo payload option when length exceeds 64K) or hand
 * off to ip6_frag_xmit() when the packet exceeds the MTU.
 * NOTE(review): numerous lines (locals, error paths, mtu computation)
 * are elided in this excerpt -- comments cover visible statements only.
 */
505 int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data,
506 struct flowi *fl, unsigned length,
507 struct ipv6_txoptions *opt, int hlimit, int flags)
509 struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
510 struct in6_addr *final_dst = NULL;
511 struct dst_entry *dst;
513 unsigned int pktlength, jumbolen, mtu;
514 struct in6_addr saddr;
/* with a source route, route toward the first hop and keep the real
 * destination for the routing header */
516 if (opt && opt->srcrt) {
517 struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
518 final_dst = fl->fl6_dst;
519 fl->fl6_dst = rt0->addr;
522 if (!fl->oif && ipv6_addr_is_multicast(fl->nl_u.ip6_u.daddr))
523 fl->oif = np->mcast_oif;
/* try the socket's cached route first */
525 dst = __sk_dst_check(sk, np->dst_cookie);
527 struct rt6_info *rt = (struct rt6_info*)dst;
529 /* Yes, checking route validity in not connected
530 case is not very simple. Take into account,
531 that we do not support routing by source, TOS,
532 and MSG_DONTROUTE --ANK (980726)
534 1. If route was host route, check that
535 cached destination is current.
536 If it is network route, we still may
537 check its validity using saved pointer
538 to the last used address: daddr_cache.
539 We do not want to save whole address now,
540 (because main consumer of this service
541 is tcp, which has not this problem),
542 so that the last trick works only on connected
544 2. oif also should be the same.
547 if (((rt->rt6i_dst.plen != 128 ||
548 ipv6_addr_cmp(fl->fl6_dst, &rt->rt6i_dst.addr))
549 && (np->daddr_cache == NULL ||
550 ipv6_addr_cmp(fl->fl6_dst, np->daddr_cache)))
551 || (fl->oif && fl->oif != dst->dev->ifindex)) {
/* cached route unusable: full lookup */
558 dst = ip6_route_output(sk, fl);
561 IP6_INC_STATS(Ip6OutNoRoutes);
/* no source given: derive one from the chosen route */
566 if (fl->fl6_src == NULL) {
567 err = ipv6_get_saddr(dst, fl->fl6_dst, &saddr);
571 printk(KERN_DEBUG "ip6_build_xmit: "
572 "no available source address\n");
576 fl->fl6_src = &saddr;
/* hop limit preference: mcast_hops for multicast, else socket's
 * hop_limit, else the route default */
581 if (ipv6_addr_is_multicast(fl->fl6_dst))
582 hlimit = np->mcast_hops;
584 hlimit = np->hop_limit;
586 hlimit = ((struct rt6_info*)dst)->rt6i_hoplimit;
/* hdrincl raw sockets supply the IPv6 header themselves */
591 if (!sk->protinfo.af_inet.hdrincl) {
592 pktlength += sizeof(struct ipv6hdr);
594 pktlength += opt->opt_flen + opt->opt_nflen;
596 if (pktlength > 0xFFFF + sizeof(struct ipv6hdr)) {
598 It is assumed, that in the case of hdrincl
599 jumbo option is supplied by user.
/* payload exceeds 16-bit length field: use a jumbo option */
602 jumbolen = pktlength - sizeof(struct ipv6hdr);
607 if (np->frag_size < mtu) {
610 else if (np->pmtudisc == IPV6_PMTUDISC_DONT)
614 /* Critical arithmetic overflow check.
615 FIXME: may gcc optimize it out? --ANK (980726)
617 if (pktlength < length) {
618 ipv6_local_error(sk, EMSGSIZE, fl, mtu);
623 if (flags&MSG_CONFIRM)
/* single-packet fast path */
626 if (pktlength <= mtu) {
629 struct net_device *dev = dst->dev;
635 skb = sock_alloc_send_skb(sk, pktlength + 15 +
636 dev->hard_header_len,
637 flags & MSG_DONTWAIT, &err);
640 IP6_INC_STATS(Ip6OutDiscards);
644 skb->dst = dst_clone(dst);
646 skb_reserve(skb, (dev->hard_header_len + 15) & ~15);
648 hdr = (struct ipv6hdr *) skb->tail;
651 if (!sk->protinfo.af_inet.hdrincl) {
/* jumbo packets carry payload_len 0 in the fixed header */
652 ip6_bld_1(sk, skb, fl, hlimit,
653 jumbolen ? sizeof(struct ipv6hdr) : pktlength);
655 if (opt || jumbolen) {
656 u8 *prev_hdr = &hdr->nexthdr;
657 prev_hdr = ipv6_build_nfrag_opts(skb, prev_hdr, opt, final_dst, jumbolen);
658 if (opt && opt->opt_flen)
659 ipv6_build_frag_opts(skb, prev_hdr, opt);
/* copy the user payload after all headers */
663 skb_put(skb, length);
664 err = getfrag(data, &hdr->saddr,
665 ((char *) hdr) + (pktlength - length),
669 IP6_INC_STATS(Ip6OutRequests);
670 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
/* too big and fragmentation not permitted */
676 if (sk->protinfo.af_inet.hdrincl || jumbolen ||
677 np->pmtudisc == IPV6_PMTUDISC_DO) {
678 ipv6_local_error(sk, EMSGSIZE, fl, mtu);
683 err = ip6_frag_xmit(sk, getfrag, data, dst, fl, opt, final_dst, hlimit,
/* cache the route on the socket; remember daddr only if connected */
691 ip6_dst_store(sk, dst, fl->nl_u.ip6_u.daddr == &np->daddr ? &np->daddr : NULL);
693 err = np->recverr ? net_xmit_errno(err) : 0;
/*
 * Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain with a matching selector 'sel'.  Each matching socket
 * but the last receives a clone; the original skb goes to the last
 * match.  Returns whether the packet was consumed (return statements
 * elided in this excerpt).
 */
697 int ip6_call_ra_chain(struct sk_buff *skb, int sel)
699 struct ip6_ra_chain *ra;
700 struct sock *last = NULL;
702 read_lock(&ip6_ra_lock);
703 for (ra = ip6_ra_chain; ra; ra = ra->next) {
704 struct sock *sk = ra->sk;
705 if (sk && ra->sel == sel) {
/* a previous match gets a clone; keep the original for the
 * final matching socket */
707 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
709 rawv6_rcv(last, skb2);
716 rawv6_rcv(last, skb);
717 read_unlock(&ip6_ra_lock);
720 read_unlock(&ip6_ra_lock);
/* Netfilter FORWARD-hook completion: emit via the route's output. */
724 static inline int ip6_forward_finish(struct sk_buff *skb)
726 return skb->dst->output(skb);
/*
 * Forward a received IPv6 packet: honour Router Alert delivery, check
 * and decrement the hop limit, possibly emit a Redirect when the packet
 * leaves on the interface it arrived on, enforce the outgoing path MTU,
 * and pass the packet through the FORWARD netfilter hook.
 * NOTE(review): several branches (drops, hop-limit decrement) are
 * elided in this excerpt.
 */
729 int ip6_forward(struct sk_buff *skb)
731 struct dst_entry *dst = skb->dst;
732 struct ipv6hdr *hdr = skb->nh.ipv6h;
733 struct inet6_skb_parm *opt =(struct inet6_skb_parm*)skb->cb;
/* forwarding globally disabled */
735 if (ipv6_devconf.forwarding == 0)
/* we will modify the packet; any checksum state is stale */
738 skb->ip_summed = CHECKSUM_NONE;
741 * We DO NOT make any processing on
742 * RA packets, pushing them to user level AS IS
743 * without ane WARRANTY that application will be able
744 * to interpret them. The reason is that we
745 * cannot make anything clever here.
747 * We are not end-node, so that if packet contains
748 * AH/ESP, we cannot make anything.
749 * Defragmentation also would be mistake, RA packets
750 * cannot be fragmented, because there is no warranty
751 * that different fragments will go along one path. --ANK
/* Router Alert option value lives at opt->ra within the header */
754 u8 *ptr = skb->nh.raw + opt->ra;
755 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
760 * check and decrement ttl
762 if (hdr->hop_limit <= 1) {
763 /* Force OUTPUT device used as source address */
765 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
772 /* IPv6 specs say nothing about it, but it is clear that we cannot
773 send redirects to source routed frames.
775 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
776 struct in6_addr *target = NULL;
778 struct neighbour *n = dst->neighbour;
781 * incoming and outgoing devices are the same
/* redirect target: the gateway for indirect routes, otherwise
 * the final destination itself */
785 rt = (struct rt6_info *) dst;
786 if ((rt->rt6i_flags & RTF_GATEWAY))
787 target = (struct in6_addr*)&n->primary_key;
789 target = &hdr->daddr;
791 /* Limit redirects both by destination (here)
792 and by source (inside ndisc_send_redirect)
794 if (xrlim_allow(dst, 1*HZ))
795 ndisc_send_redirect(skb, n, target);
796 } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
797 |IPV6_ADDR_LINKLOCAL)) {
798 /* This check is security critical. */
/* oversize for outgoing path MTU: bounce Packet Too Big */
802 if (skb->len > dst->pmtu) {
803 /* Again, force OUTPUT device used as source address */
805 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev);
806 IP6_INC_STATS_BH(Ip6InTooBigErrors);
/* ensure a private, writable copy before mangling hop_limit */
811 if (skb_cow(skb, dst->dev->hard_header_len))
816 /* Mangling hops number delayed to point after skb COW */
820 IP6_INC_STATS_BH(Ip6OutForwDatagrams);
821 return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
824 IP6_INC_STATS_BH(Ip6InAddrErrors);