include upstream ip1000a driver version 2.09f
[linux-2.4.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder. 
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/config.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/sched.h>
17 #include <linux/kernel.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31
32 #include <net/sock.h>
33 #include <net/ip.h>
34 #include <net/icmp.h>
35 #include <net/protocol.h>
36 #include <net/ipip.h>
37 #include <net/arp.h>
38 #include <net/checksum.h>
39 #include <net/inet_ecn.h>
40
41 #ifdef CONFIG_IPV6
42 #include <net/ipv6.h>
43 #include <net/ip6_fib.h>
44 #include <net/ip6_route.h>
45 #endif
46
47 /*
48    Problems & solutions
49    --------------------
50
51    1. The most important issue is detecting local dead loops.
52    They would cause complete host lockup in transmit, which
53    would be "resolved" by stack overflow or, if queueing is enabled,
54    with infinite looping in net_bh.
55
56    We cannot track such dead loops during route installation,
57    it is infeasible task. The most general solutions would be
58    to keep skb->encapsulation counter (sort of local ttl),
59    and silently drop packet when it expires. It is the best
60    solution, but it supposes maintaining a new variable in ALL
61    skb, even if no tunneling is used.
62
63    Current solution: t->recursion lock breaks dead loops. It looks 
64    like dev->tbusy flag, but I preferred new variable, because
65    the semantics is different. One day, when hard_start_xmit
66    will be multithreaded we will have to use skb->encapsulation.
67
68
69
70    2. Networking dead loops would not kill routers, but would really
71    kill network. IP hop limit plays role of "t->recursion" in this case,
72    if we copy it from packet being encapsulated to upper header.
73    It is very good solution, but it introduces two problems:
74
75    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
76      do not work over tunnels.
77    - traceroute does not work. I planned to relay ICMP from tunnel,
78      so that this problem would be solved and traceroute output
79      would even more informative. This idea appeared to be wrong:
80      only Linux complies to rfc1812 now (yes, guys, Linux is the only
81      true router now :-)), all routers (at least, in neighbourhood of mine)
82      return only 8 bytes of payload. It is the end.
83
84    Hence, if we want that OSPF worked or traceroute said something reasonable,
85    we should search for another solution.
86
87    One of them is to parse packet trying to detect inner encapsulation
88    made by our node. It is difficult or even impossible, especially,
89    taking into account fragmentation. To be short, it is not a solution at all.
90
91    Current solution: The solution was UNEXPECTEDLY SIMPLE.
92    We force DF flag on tunnels with preconfigured hop limit,
93    that is ALL. :-) Well, it does not remove the problem completely,
94    but exponential growth of network traffic is changed to linear
95    (branches, that exceed pmtu are pruned) and tunnel mtu
96    quickly degrades to a value <68, where looping stops.
97    Yes, it is not good if there exists a router in the loop,
98    which does not force DF, even when encapsulating packets have DF set.
99    But it is not our problem! Nobody could accuse us, we made
100    all that we could make. Even if it is your gated who injected
101    fatal route to network, even if it were you who configured
102    fatal static route: you are innocent. :-)
103
104
105
106    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
107    practically identical code. It would be good to glue them
108    together, but it is not very evident, how to make them modular.
109    sit is integral part of IPv6, ipip and gre are naturally modular.
110    We could extract common parts (hash table, ioctl etc)
111    to a separate module (ip_tunnel.c).
112
113    Alexey Kuznetsov.
114  */
115
116 static int ipgre_tunnel_init(struct net_device *dev);
117
118 /* Fallback tunnel: no source, no destination, no key, no options */
119
120 static int ipgre_fb_tunnel_init(struct net_device *dev);
121
/* Statically allocated catch-all device "gre0".  Positional initializer:
 * only the name and the init hook (ipgre_fb_tunnel_init) are non-NULL;
 * the zero fields presumably cover the leading 2.4 struct net_device
 * members (mem/irq/base_addr etc.) -- verify against the struct layout
 * before reordering. */
122 static struct net_device ipgre_fb_tunnel_dev = {
123         "gre0", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipgre_fb_tunnel_init,
124 };
125
/* Tunnel state for the fallback device above.  ipgre_tunnel_lookup()
 * returns this tunnel for keyless packets that match no configured
 * tunnel, provided gre0 is administratively up. */
126 static struct ip_tunnel ipgre_fb_tunnel = {
127         NULL, &ipgre_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"gre0", }
128 };
129
130 /* Tunnel hash table */
131
132 /*
133    4 hash tables:
134
135    3: (remote,local)
136    2: (remote,*)
137    1: (*,local)
138    0: (*,*)
139
140    We require exact key match i.e. if a key is present in packet
141    it will match only tunnel with the same key; if it is not present,
142    it will match only keyless tunnel.
143
144    All keyless packets, if not matched to configured keyless tunnels,
145    will match fallback tunnel.
146  */
147
148 #define HASH_SIZE  16
149 #define HASH(addr) ((addr^(addr>>4))&0xF)
150
151 static struct ip_tunnel *tunnels[4][HASH_SIZE];
152
153 #define tunnels_r_l     (tunnels[3])
154 #define tunnels_r       (tunnels[2])
155 #define tunnels_l       (tunnels[1])
156 #define tunnels_wc      (tunnels[0])
157
158 static rwlock_t ipgre_lock = RW_LOCK_UNLOCKED;
159
160 /* Given src, dst and key, find the appropriate input tunnel. */
161
162 static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key)
163 {
164         unsigned h0 = HASH(remote);
165         unsigned h1 = HASH(key);
166         struct ip_tunnel *t;
167
168         for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
169                 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
170                         if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
171                                 return t;
172                 }
173         }
174         for (t = tunnels_r[h0^h1]; t; t = t->next) {
175                 if (remote == t->parms.iph.daddr) {
176                         if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
177                                 return t;
178                 }
179         }
180         for (t = tunnels_l[h1]; t; t = t->next) {
181                 if (local == t->parms.iph.saddr ||
182                      (local == t->parms.iph.daddr && MULTICAST(local))) {
183                         if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
184                                 return t;
185                 }
186         }
187         for (t = tunnels_wc[h1]; t; t = t->next) {
188                 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
189                         return t;
190         }
191         if (ipgre_fb_tunnel_dev.flags&IFF_UP)
192                 return &ipgre_fb_tunnel;
193         return NULL;
194 }
195
196 static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
197 {
198         u32 remote = t->parms.iph.daddr;
199         u32 local = t->parms.iph.saddr;
200         u32 key = t->parms.i_key;
201         unsigned h = HASH(key);
202         int prio = 0;
203
204         if (local)
205                 prio |= 1;
206         if (remote && !MULTICAST(remote)) {
207                 prio |= 2;
208                 h ^= HASH(remote);
209         }
210
211         return &tunnels[prio][h];
212 }
213
/*
 * Insert tunnel t at the head of its hash chain.
 *
 * NOTE(review): t->next is written before ipgre_lock is taken; this is
 * only safe because chain writers are presumably serialized externally
 * (rtnl) -- the write lock here exists to exclude concurrent readers of
 * the chain head.  TODO confirm against callers.
 */
214 static void ipgre_tunnel_link(struct ip_tunnel *t)
215 {
216         struct ip_tunnel **tp = ipgre_bucket(t);
217
218         t->next = *tp;
219         write_lock_bh(&ipgre_lock);
220         *tp = t;
221         write_unlock_bh(&ipgre_lock);
222 }
223
/*
 * Remove tunnel t from its hash chain, if present.
 *
 * The chain is walked without the lock (writers presumably serialized
 * externally, see ipgre_tunnel_link); only the single pointer splice
 * that readers could observe is done under the write lock.
 */
224 static void ipgre_tunnel_unlink(struct ip_tunnel *t)
225 {
226         struct ip_tunnel **tp;
227
228         for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
229                 if (t == *tp) {
230                         write_lock_bh(&ipgre_lock);
231                         *tp = t->next;
232                         write_unlock_bh(&ipgre_lock);
233                         break;
234                 }
235         }
236 }
237
/*
 * Find the tunnel matching parms exactly (saddr, daddr, key); if none
 * exists and create != 0, allocate, register and hash a new one.
 *
 * The net_device and its ip_tunnel are carved out of a single kmalloc()
 * block (priv = dev+1), and a module reference is held for the lifetime
 * of every dynamically created tunnel (dropped in the destructor).
 * Returns the tunnel, or NULL on lookup miss / allocation / registration
 * failure.
 */
238 static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
239 {
240         u32 remote = parms->iph.daddr;
241         u32 local = parms->iph.saddr;
242         u32 key = parms->i_key;
243         struct ip_tunnel *t, **tp, *nt;
244         struct net_device *dev;
245         unsigned h = HASH(key);
246         int prio = 0;
247
            /* Same bucket selection as ipgre_bucket(), inlined here. */
248         if (local)
249                 prio |= 1;
250         if (remote && !MULTICAST(remote)) {
251                 prio |= 2;
252                 h ^= HASH(remote);
253         }
254         for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
255                 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
256                         if (key == t->parms.i_key)
257                                 return t;
258                 }
259         }
260         if (!create)
261                 return NULL;
262
263         MOD_INC_USE_COUNT;
            /* One allocation holds both the device and the tunnel state. */
264         dev = kmalloc(sizeof(*dev) + sizeof(*t), GFP_KERNEL);
265         if (dev == NULL) {
266                 MOD_DEC_USE_COUNT;
267                 return NULL;
268         }
269         memset(dev, 0, sizeof(*dev) + sizeof(*t));
270         dev->priv = (void*)(dev+1);
271         nt = (struct ip_tunnel*)dev->priv;
272         nt->dev = dev;
273         dev->init = ipgre_tunnel_init;
274         dev->features |= NETIF_F_DYNALLOC;
275         memcpy(&nt->parms, parms, sizeof(*parms));
276         nt->parms.name[IFNAMSIZ-1] = '\0';
277         strcpy(dev->name, nt->parms.name);
            /* No name requested: pick the first free "gre%d" (1..99). */
278         if (dev->name[0] == 0) {
279                 int i;
280                 for (i=1; i<100; i++) {
281                         sprintf(dev->name, "gre%d", i);
282                         if (__dev_get_by_name(dev->name) == NULL)
283                                 break;
284                 }
285                 if (i==100)
286                         goto failed;
287                 memcpy(nt->parms.name, dev->name, IFNAMSIZ);
288         }
289         if (register_netdevice(dev) < 0)
290                 goto failed;
291
292         dev_hold(dev);
293         ipgre_tunnel_link(nt);
294         /* Do not decrement MOD_USE_COUNT here. */
295         return nt;
296
297 failed:
298         kfree(dev);
299         MOD_DEC_USE_COUNT;
300         return NULL;
301 }
302
303 static void ipgre_tunnel_destructor(struct net_device *dev)
304 {
305         if (dev != &ipgre_fb_tunnel_dev) {
306                 MOD_DEC_USE_COUNT;
307         }
308 }
309
310 static void ipgre_tunnel_uninit(struct net_device *dev)
311 {
312         ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv);
313         dev_put(dev);
314 }
315
316
/*
 * ICMP error handler for IPPROTO_GRE: an ICMP error arrived quoting one
 * of our outgoing GRE packets.  Parse the quoted outer IP + GRE header,
 * look up the tunnel it belongs to and record a soft error
 * (err_count/err_time) that ipgre_tunnel_xmit() later relays to the
 * sender as dst_link_failure().
 *
 * NOTE(review): the #else branch (precise ICMP relaying) is compiled out
 * and references identifiers (dp, len) that no longer exist in this
 * function's prototype -- it would not build if enabled.
 */
317 void ipgre_err(struct sk_buff *skb, u32 info)
318 {
319 #ifndef I_WISH_WORLD_WERE_PERFECT
320
321 /* It is not :-( All the routers (except for Linux) return only
322    8 bytes of packet payload. It means, that precise relaying of
323    ICMP in the real Internet is absolutely infeasible.
324
325    Moreover, Cisco "wise men" put GRE key to the third word
326    in GRE header. It makes impossible maintaining even soft state for keyed
327    GRE tunnels with enabled checksum. Tell them "thank you".
328
329    Well, I wonder, rfc1812 was written by a Cisco employee,
330    what the hell do these idiots break standards established
331    by themselves???
332  */
333
            /* skb->data points at the quoted outer IP header. */
334         struct iphdr *iph = (struct iphdr*)skb->data;
335         u16          *p = (u16*)(skb->data+(iph->ihl<<2));
336         int grehlen = (iph->ihl<<2) + 4;
337         int type = skb->h.icmph->type;
338         int code = skb->h.icmph->code;
339         struct ip_tunnel *t;
340         u16 flags;
341
342         flags = p[0];
343         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
344                 if (flags&(GRE_VERSION|GRE_ROUTING))
345                         return;
                /* grehlen grows to cover the key field (and the checksum
                   word that precedes it) so the key can be read below. */
346                 if (flags&GRE_KEY) {
347                         grehlen += 4;
348                         if (flags&GRE_CSUM)
349                                 grehlen += 4;
350                 }
351         }
352
353         /* If only 8 bytes returned, keyed message will be dropped here */
354         if (skb_headlen(skb) < grehlen)
355                 return;
356
357         switch (type) {
358         default:
359         case ICMP_PARAMETERPROB:
360                 return;
361
362         case ICMP_DEST_UNREACH:
363                 switch (code) {
364                 case ICMP_SR_FAILED:
365                 case ICMP_PORT_UNREACH:
366                         /* Impossible event. */
367                         return;
368                 case ICMP_FRAG_NEEDED:
369                         /* Soft state for pmtu is maintained by IP core. */
370                         return;
371                 default:
372                         /* All others are translated to HOST_UNREACH.
373                            rfc2003 contains "deep thoughts" about NET_UNREACH,
374                            I believe they are just ether pollution. --ANK
375                          */
376                         break;
377                 }
378                 break;
379         case ICMP_TIME_EXCEEDED:
380                 if (code != ICMP_EXC_TTL)
381                         return;
382                 break;
383         }
384
385         read_lock(&ipgre_lock);
            /* The key, when present, is the last 32-bit word inside grehlen. */
386         t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((u32*)p) + (grehlen>>2) - 1) : 0);
387         if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr))
388                 goto out;
389
390         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
391                 goto out;
392
            /* Count errors inside one IPTUNNEL_ERR_TIMEO window. */
393         if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
394                 t->err_count++;
395         else
396                 t->err_count = 1;
397         t->err_time = jiffies;
398 out:
399         read_unlock(&ipgre_lock);
400         return;
401 #else
402         struct iphdr *iph = (struct iphdr*)dp;
403         struct iphdr *eiph;
404         u16          *p = (u16*)(dp+(iph->ihl<<2));
405         int type = skb->h.icmph->type;
406         int code = skb->h.icmph->code;
407         int rel_type = 0;
408         int rel_code = 0;
409         int rel_info = 0;
410         u16 flags;
411         int grehlen = (iph->ihl<<2) + 4;
412         struct sk_buff *skb2;
413         struct rtable *rt;
414
415         if (p[1] != htons(ETH_P_IP))
416                 return;
417
418         flags = p[0];
419         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
420                 if (flags&(GRE_VERSION|GRE_ROUTING))
421                         return;
422                 if (flags&GRE_CSUM)
423                         grehlen += 4;
424                 if (flags&GRE_KEY)
425                         grehlen += 4;
426                 if (flags&GRE_SEQ)
427                         grehlen += 4;
428         }
429         if (len < grehlen + sizeof(struct iphdr))
430                 return;
431         eiph = (struct iphdr*)(dp + grehlen);
432
433         switch (type) {
434         default:
435                 return;
436         case ICMP_PARAMETERPROB:
437                 if (skb->h.icmph->un.gateway < (iph->ihl<<2))
438                         return;
439
440                 /* So... This guy found something strange INSIDE encapsulated
441                    packet. Well, he is fool, but what can we do ?
442                  */
443                 rel_type = ICMP_PARAMETERPROB;
444                 rel_info = skb->h.icmph->un.gateway - grehlen;
445                 break;
446
447         case ICMP_DEST_UNREACH:
448                 switch (code) {
449                 case ICMP_SR_FAILED:
450                 case ICMP_PORT_UNREACH:
451                         /* Impossible event. */
452                         return;
453                 case ICMP_FRAG_NEEDED:
454                         /* And it is the only really necessary thing :-) */
455                         rel_info = ntohs(skb->h.icmph->un.frag.mtu);
456                         if (rel_info < grehlen+68)
457                                 return;
458                         rel_info -= grehlen;
459                         /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
460                         if (rel_info > ntohs(eiph->tot_len))
461                                 return;
462                         break;
463                 default:
464                         /* All others are translated to HOST_UNREACH.
465                            rfc2003 contains "deep thoughts" about NET_UNREACH,
466                            I believe, it is just ether pollution. --ANK
467                          */
468                         rel_type = ICMP_DEST_UNREACH;
469                         rel_code = ICMP_HOST_UNREACH;
470                         break;
471                 }
472                 break;
473         case ICMP_TIME_EXCEEDED:
474                 if (code != ICMP_EXC_TTL)
475                         return;
476                 break;
477         }
478
479         /* Prepare fake skb to feed it to icmp_send */
480         skb2 = skb_clone(skb, GFP_ATOMIC);
481         if (skb2 == NULL)
482                 return;
483         dst_release(skb2->dst);
484         skb2->dst = NULL;
485         skb_pull(skb2, skb->data - (u8*)eiph);
486         skb2->nh.raw = skb2->data;
487
488         /* Try to guess incoming interface */
489         if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) {
490                 kfree_skb(skb2);
491                 return;
492         }
493         skb2->dev = rt->u.dst.dev;
494
495         /* route "incoming" packet */
496         if (rt->rt_flags&RTCF_LOCAL) {
497                 ip_rt_put(rt);
498                 rt = NULL;
499                 if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) ||
500                     rt->u.dst.dev->type != ARPHRD_IPGRE) {
501                         ip_rt_put(rt);
502                         kfree_skb(skb2);
503                         return;
504                 }
505         } else {
506                 ip_rt_put(rt);
507                 if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
508                     skb2->dst->dev->type != ARPHRD_IPGRE) {
509                         kfree_skb(skb2);
510                         return;
511                 }
512         }
513
514         /* change mtu on this route */
515         if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
516                 if (rel_info > skb2->dst->pmtu) {
517                         kfree_skb(skb2);
518                         return;
519                 }
520                 skb2->dst->pmtu = rel_info;
521                 rel_info = htonl(rel_info);
522         } else if (type == ICMP_TIME_EXCEEDED) {
523                 struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
524                 if (t->parms.iph.ttl) {
525                         rel_type = ICMP_DEST_UNREACH;
526                         rel_code = ICMP_HOST_UNREACH;
527                 }
528         }
529
530         icmp_send(skb2, rel_type, rel_code, rel_info);
531         kfree_skb(skb2);
532 #endif
533 }
534
535 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
536 {
537         if (INET_ECN_is_ce(iph->tos)) {
538                 if (skb->protocol == htons(ETH_P_IP)) {
539                         if (INET_ECN_is_not_ce(skb->nh.iph->tos))
540                                 IP_ECN_set_ce(skb->nh.iph);
541                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
542                         if (INET_ECN_is_not_ce(ip6_get_dsfield(skb->nh.ipv6h)))
543                                 IP6_ECN_set_ce(skb->nh.ipv6h);
544                 }
545         }
546 }
547
548 static inline u8
549 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
550 {
551         u8 inner = 0;
552         if (skb->protocol == htons(ETH_P_IP))
553                 inner = old_iph->tos;
554         else if (skb->protocol == htons(ETH_P_IPV6))
555                 inner = ip6_get_dsfield((struct ipv6hdr*)old_iph);
556         return INET_ECN_encapsulate(tos, inner);
557 }
558
/*
 * Receive handler for IPPROTO_GRE packets.
 *
 * Validates the GRE header (version 0, no routing header), verifies the
 * optional checksum, extracts the optional key and sequence number, then
 * looks up the owning tunnel by (saddr, daddr, key).  On a match the GRE
 * header is stripped, per-tunnel stats and in-order sequence state are
 * updated, and the inner packet is requeued via netif_rx() as if it had
 * arrived on the tunnel device.  With no match, ICMP protocol-unreachable
 * is returned to the sender and the packet is dropped.
 */
559 int ipgre_rcv(struct sk_buff *skb)
560 {
561         struct iphdr *iph;
562         u8     *h;
563         u16    flags;
564         u16    csum = 0;
565         u32    key = 0;
566         u32    seqno = 0;
567         struct ip_tunnel *tunnel;
            /* Byte offset of the inner payload: 4-byte base GRE header,
               grown below by 4 for each optional field present. */
568         int    offset = 4;
569
            /* 16 = largest possible GRE header (flags+proto+csum+key+seq). */
570         if (!pskb_may_pull(skb, 16))
571                 goto drop_nolock;
572
573         iph = skb->nh.iph;
574         h = skb->data;
575         flags = *(u16*)h;
576
577         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
578                 /* - Version must be 0.
579                    - We do not support routing headers.
580                  */
581                 if (flags&(GRE_VERSION|GRE_ROUTING))
582                         goto drop_nolock;
583
584                 if (flags&GRE_CSUM) {
585                         if (skb->ip_summed == CHECKSUM_HW) {
586                                 csum = (u16)csum_fold(skb->csum);
                                /* Hardware sum disagrees: recompute in
                                   software via the CHECKSUM_NONE path. */
587                                 if (csum)
588                                         skb->ip_summed = CHECKSUM_NONE;
589                         }
590                         if (skb->ip_summed == CHECKSUM_NONE) {
591                                 skb->csum = skb_checksum(skb, 0, skb->len, 0);
592                                 skb->ip_summed = CHECKSUM_HW;
593                                 csum = (u16)csum_fold(skb->csum);
594                         }
595                         offset += 4;
596                 }
597                 if (flags&GRE_KEY) {
598                         key = *(u32*)(h + offset);
599                         offset += 4;
600                 }
601                 if (flags&GRE_SEQ) {
602                         seqno = ntohl(*(u32*)(h + offset));
603                         offset += 4;
604                 }
605         }
606
607         read_lock(&ipgre_lock);
608         if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
609                 skb->mac.raw = skb->nh.raw;
610                 skb->nh.raw = __pskb_pull(skb, offset);
611                 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
                /* Subtract the stripped GRE header from the hw checksum. */
612                 if (skb->ip_summed == CHECKSUM_HW)
613                         skb->csum = csum_sub(skb->csum,
614                                              csum_partial(skb->mac.raw, skb->nh.raw-skb->mac.raw, 0));
615                 skb->protocol = *(u16*)(h + 2);
616                 skb->pkt_type = PACKET_HOST;
617 #ifdef CONFIG_NET_IPGRE_BROADCAST
618                 if (MULTICAST(iph->daddr)) {
619                         /* Looped back packet, drop it! */
620                         if (((struct rtable*)skb->dst)->key.iif == 0)
621                                 goto drop;
622                         tunnel->stat.multicast++;
623                         skb->pkt_type = PACKET_BROADCAST;
624                 }
625 #endif
626
                /* Bad checksum, or checksum required but absent. */
627                 if (((flags&GRE_CSUM) && csum) ||
628                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
629                         tunnel->stat.rx_crc_errors++;
630                         tunnel->stat.rx_errors++;
631                         goto drop;
632                 }
633                 if (tunnel->parms.i_flags&GRE_SEQ) {
                        /* Drop missing or out-of-order sequence numbers. */
634                         if (!(flags&GRE_SEQ) ||
635                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
636                                 tunnel->stat.rx_fifo_errors++;
637                                 tunnel->stat.rx_errors++;
638                                 goto drop;
639                         }
640                         tunnel->i_seqno = seqno + 1;
641                 }
642                 tunnel->stat.rx_packets++;
643                 tunnel->stat.rx_bytes += skb->len;
644                 skb->dev = tunnel->dev;
645                 dst_release(skb->dst);
646                 skb->dst = NULL;
647                 nf_reset(skb);
648                 ipgre_ecn_decapsulate(iph, skb);
649                 netif_rx(skb);
650                 read_unlock(&ipgre_lock);
651                 return(0);
652         }
653         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
654
655 drop:
656         read_unlock(&ipgre_lock);
657 drop_nolock:
658         kfree_skb(skb);
659         return(0);
660 }
661
662 /* Need this wrapper because NF_HOOK takes the function address */
            /* Thin forwarding shim around ip_send(); no extra behavior. */
663 static inline int do_ip_send(struct sk_buff *skb)
664 {
665         return ip_send(skb);
666 }
667
/*
 * hard_start_xmit for a GRE tunnel device: encapsulate skb in
 * IP+GRE and send it through the underlying device.
 *
 * Outline: break local transmit loops via tunnel->recursion; resolve the
 * outer destination (from tunnel config, or per-packet for NBMA/multicast
 * tunnels); route the outer packet and enforce PMTU (relaying ICMP
 * frag-needed / ICMPv6 packet-too-big to the sender); replay soft errors
 * recorded by ipgre_err() as dst_link_failure(); expand headroom if
 * needed; then build the outer IP header plus GRE flags/key/seq/checksum
 * and hand off via IPTUNNEL_XMIT().  Always returns 0; on error the skb
 * is freed and tx_errors is bumped.
 */
668 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
669 {
670         struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
671         struct net_device_stats *stats = &tunnel->stat;
672         struct iphdr  *old_iph = skb->nh.iph;
673         struct iphdr  *tiph;
674         u8     tos;
675         u16    df;
676         struct rtable *rt;                      /* Route to the other host */
677         struct net_device *tdev;                        /* Device to other host */
678         struct iphdr  *iph;                     /* Our new IP header */
679         int    max_headroom;                    /* The extra header space needed */
680         int    gre_hlen;
681         u32    dst;
682         int    mtu;
683
            /* Re-entered from our own output path: a local dead loop. */
684         if (tunnel->recursion++) {
685                 tunnel->stat.collisions++;
686                 goto tx_error;
687         }
688
            /* With a hard_header the outer header was prebuilt by
               ipgre_header(); otherwise use the configured parameters. */
689         if (dev->hard_header) {
690                 gre_hlen = 0;
691                 tiph = (struct iphdr*)skb->data;
692         } else {
693                 gre_hlen = tunnel->hlen;
694                 tiph = &tunnel->parms.iph;
695         }
696
697         if ((dst = tiph->daddr) == 0) {
698                 /* NBMA tunnel */
699
700                 if (skb->dst == NULL) {
701                         tunnel->stat.tx_fifo_errors++;
702                         goto tx_error;
703                 }
704
705                 if (skb->protocol == htons(ETH_P_IP)) {
706                         rt = (struct rtable*)skb->dst;
707                         if ((dst = rt->rt_gateway) == 0)
708                                 goto tx_error_icmp;
709                 }
710 #ifdef CONFIG_IPV6
711                 else if (skb->protocol == htons(ETH_P_IPV6)) {
712                         struct in6_addr *addr6;
713                         int addr_type;
714                         struct neighbour *neigh = skb->dst->neighbour;
715
716                         if (neigh == NULL)
717                                 goto tx_error;
718
                        /* Derive the IPv4 endpoint from a v4-compatible
                           IPv6 neighbour/destination address. */
719                         addr6 = (struct in6_addr*)&neigh->primary_key;
720                         addr_type = ipv6_addr_type(addr6);
721
722                         if (addr_type == IPV6_ADDR_ANY) {
723                                 addr6 = &skb->nh.ipv6h->daddr;
724                                 addr_type = ipv6_addr_type(addr6);
725                         }
726
727                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
728                                 goto tx_error_icmp;
729
730                         dst = addr6->s6_addr32[3];
731                 }
732 #endif
733                 else
734                         goto tx_error;
735         }
736
            /* Low bit of configured TOS means "inherit from inner header". */
737         tos = tiph->tos;
738         if (tos&1) {
739                 if (skb->protocol == htons(ETH_P_IP))
740                         tos = old_iph->tos;
741                 tos &= ~1;
742         }
743
744         if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) {
745                 tunnel->stat.tx_carrier_errors++;
746                 goto tx_error;
747         }
748         tdev = rt->u.dst.dev;
749
            /* Outer route points back at ourselves: another loop. */
750         if (tdev == dev) {
751                 ip_rt_put(rt);
752                 tunnel->stat.collisions++;
753                 goto tx_error;
754         }
755
756         df = tiph->frag_off;
757         if (df)
758                 mtu = rt->u.dst.pmtu - tunnel->hlen;
759         else
760                 mtu = skb->dst ? skb->dst->pmtu : dev->mtu;
761
762         if (skb->protocol == htons(ETH_P_IP)) {
763                 if (skb->dst && mtu < skb->dst->pmtu && mtu >= 68)
764                         skb->dst->pmtu = mtu;
765
766                 df |= (old_iph->frag_off&htons(IP_DF));
767
768                 if ((old_iph->frag_off&htons(IP_DF)) &&
769                     mtu < ntohs(old_iph->tot_len)) {
770                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
771                         ip_rt_put(rt);
772                         goto tx_error;
773                 }
774         }
775 #ifdef CONFIG_IPV6
776         else if (skb->protocol == htons(ETH_P_IPV6)) {
777                 struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
778
779                 if (rt6 && mtu < rt6->u.dst.pmtu && mtu >= IPV6_MIN_MTU) {
780                         if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
781                             rt6->rt6i_dst.plen == 128) {
782                                 rt6->rt6i_flags |= RTF_MODIFIED;
783                                 skb->dst->pmtu = mtu;
784                         }
785                 }
786
787                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
788                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
789                         ip_rt_put(rt);
790                         goto tx_error;
791                 }
792         }
793 #endif
794
            /* Relay soft errors recorded by ipgre_err() back to senders,
               one per packet, while the error window is fresh. */
795         if (tunnel->err_count > 0) {
796                 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
797                         tunnel->err_count--;
798
799                         dst_link_failure(skb);
800                 } else
801                         tunnel->err_count = 0;
802         }
803
804         max_headroom = ((tdev->hard_header_len+15)&~15)+ gre_hlen;
805
806         if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
807                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
808                 if (!new_skb) {
809                         ip_rt_put(rt);
810                         stats->tx_dropped++;
811                         dev_kfree_skb(skb);
812                         tunnel->recursion--;
813                         return 0;
814                 }
815                 if (skb->sk)
816                         skb_set_owner_w(new_skb, skb->sk);
817                 dev_kfree_skb(skb);
818                 skb = new_skb;
819                 old_iph = skb->nh.iph;
820                 }
821
822         skb->h.raw = skb->nh.raw;
823         skb->nh.raw = skb_push(skb, gre_hlen);
824         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
825         dst_release(skb->dst);
826         skb->dst = &rt->u.dst;
827
828         /*
829          *      Push down and install the IPIP header.
830          */
831
832         iph                     =       skb->nh.iph;
833         iph->version            =       4;
834         iph->ihl                =       sizeof(struct iphdr) >> 2;
835         iph->frag_off           =       df;
836         iph->protocol           =       IPPROTO_GRE;
837         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
838         iph->daddr              =       rt->rt_dst;
839         iph->saddr              =       rt->rt_src;
840
            /* TTL 0 in config means "inherit from inner packet". */
841         if ((iph->ttl = tiph->ttl) == 0) {
842                 if (skb->protocol == htons(ETH_P_IP))
843                         iph->ttl = old_iph->ttl;
844 #ifdef CONFIG_IPV6
845                 else if (skb->protocol == htons(ETH_P_IPV6))
846                         iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
847 #endif
848                 else
849                         iph->ttl = sysctl_ip_default_ttl;
850         }
851
852         ((u16*)(iph+1))[0] = tunnel->parms.o_flags;
853         ((u16*)(iph+1))[1] = skb->protocol;
854
            /* Optional fields are written back-to-front from the end of
               the GRE header: seq, then key, then checksum. */
855         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
856                 u32 *ptr = (u32*)(((u8*)iph) + tunnel->hlen - 4);
857
858                 if (tunnel->parms.o_flags&GRE_SEQ) {
859                         ++tunnel->o_seqno;
860                         *ptr = htonl(tunnel->o_seqno);
861                         ptr--;
862                 }
863                 if (tunnel->parms.o_flags&GRE_KEY) {
864                         *ptr = tunnel->parms.o_key;
865                         ptr--;
866                 }
867                 if (tunnel->parms.o_flags&GRE_CSUM) {
868                         *ptr = 0;
869                         *(__u16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
870                 }
871         }
872
873         nf_reset(skb);
874
875         IPTUNNEL_XMIT();
876         tunnel->recursion--;
877         return 0;
878
879 tx_error_icmp:
880         dst_link_failure(skb);
881
882 tx_error:
883         stats->tx_errors++;
884         dev_kfree_skb(skb);
885         tunnel->recursion--;
886         return 0;
887 }
888
/*
 * ipgre_tunnel_ioctl - SIOC{GET,ADD,CHG,DEL}TUNNEL handler for GRE devices.
 *
 * Tunnel parameters travel via ifr->ifr_ifru.ifru_data as a
 * struct ip_tunnel_parm in user memory.  Returns 0 or a negative errno.
 * The module use count is pinned for the duration of the call.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;

	MOD_INC_USE_COUNT;

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == &ipgre_fb_tunnel_dev) {
			/* On the fallback device the caller selects the
			   tunnel by the parameters it passes in. */
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(&p, 0);
		}
		if (t == NULL)
			t = (struct ip_tunnel*)dev->priv;
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Only plain IPv4 (ihl==5, at most DF set) carrying GRE is
		   accepted; GRE version/routing bits are unsupported. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		/* Keys are only meaningful when the matching KEY flag is set. */
		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		/* For ADD this also creates the tunnel if it does not exist. */
		t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);

		if (dev != &ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL &&
		    t != &ipgre_fb_tunnel) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags=0;

				t = (struct ip_tunnel*)dev->priv;

				if (MULTICAST(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* Broadcast vs point-to-point character of the
				   device cannot change on the fly. */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Re-hash the tunnel under its new
				   addresses/keys. */
				ipgre_tunnel_unlink(t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
			}
			/* Hand the effective parameters back to the caller. */
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == &ipgre_fb_tunnel_dev) {
			/* Deleting via the fallback device: look the target
			   tunnel up by the supplied parameters. */
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
				goto done;
			err = -EPERM;
			/* The fallback tunnel itself may never be deleted. */
			if (t == &ipgre_fb_tunnel)
				goto done;
			dev = t->dev;
		}
		err = unregister_netdevice(dev);
		break;

	default:
		err = -EINVAL;
	}

done:
	MOD_DEC_USE_COUNT;
	return err;
}
1014
1015 static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
1016 {
1017         return &(((struct ip_tunnel*)dev->priv)->stat);
1018 }
1019
1020 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1021 {
1022         struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
1023         if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
1024                 return -EINVAL;
1025         dev->mtu = new_mtu;
1026         return 0;
1027 }
1028
1029 #ifdef CONFIG_NET_IPGRE_BROADCAST
1030 /* Nice toy. Unfortunately, useless in real life :-)
1031    It allows one to construct a virtual multiprotocol broadcast "LAN"
1032    over the Internet, provided multicast routing is tuned.
1033
1034
1035    I have no idea whether this bicycle was invented before me,
1036    so I had to set ARPHRD_IPGRE to a random value.
1037    I have an impression, that Cisco could make something similar,
1038    but this feature is apparently missing in IOS<=11.2(8).
1039    
1040    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1041    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1042
1043    ping -t 255 224.66.66.66
1044
1045    If nobody answers, mbone does not work.
1046
1047    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1048    ip addr add 10.66.66.<somewhat>/24 dev Universe
1049    ifconfig Universe up
1050    ifconfig Universe add fe80::<Your_real_addr>/10
1051    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1052    ftp 10.66.66.66
1053    ...
1054    ftp fec0:6666:6666::193.233.7.65
1055    ...
1056
1057  */
1058
1059 static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
1060                         void *daddr, void *saddr, unsigned len)
1061 {
1062         struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1063         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1064         u16 *p = (u16*)(iph+1);
1065
1066         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1067         p[0]            = t->parms.o_flags;
1068         p[1]            = htons(type);
1069
1070         /*
1071          *      Set the source hardware address. 
1072          */
1073          
1074         if (saddr)
1075                 memcpy(&iph->saddr, saddr, 4);
1076
1077         if (daddr) {
1078                 memcpy(&iph->daddr, daddr, 4);
1079                 return t->hlen;
1080         }
1081         if (iph->daddr && !MULTICAST(iph->daddr))
1082                 return t->hlen;
1083         
1084         return -t->hlen;
1085 }
1086
1087 static int ipgre_open(struct net_device *dev)
1088 {
1089         struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1090
1091         MOD_INC_USE_COUNT;
1092         if (MULTICAST(t->parms.iph.daddr)) {
1093                 struct rtable *rt;
1094                 if (ip_route_output(&rt, t->parms.iph.daddr,
1095                                     t->parms.iph.saddr, RT_TOS(t->parms.iph.tos), 
1096                                     t->parms.link)) {
1097                         MOD_DEC_USE_COUNT;
1098                         return -EADDRNOTAVAIL;
1099                 }
1100                 dev = rt->u.dst.dev;
1101                 ip_rt_put(rt);
1102                 if (__in_dev_get(dev) == NULL) {
1103                         MOD_DEC_USE_COUNT;
1104                         return -EADDRNOTAVAIL;
1105                 }
1106                 t->mlink = dev->ifindex;
1107                 ip_mc_inc_group(__in_dev_get(dev), t->parms.iph.daddr);
1108         }
1109         return 0;
1110 }
1111
1112 static int ipgre_close(struct net_device *dev)
1113 {
1114         struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1115         if (MULTICAST(t->parms.iph.daddr) && t->mlink) {
1116                 struct in_device *in_dev = inetdev_by_index(t->mlink);
1117                 if (in_dev) {
1118                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1119                         in_dev_put(in_dev);
1120                 }
1121         }
1122         MOD_DEC_USE_COUNT;
1123         return 0;
1124 }
1125
1126 #endif
1127
1128 static void ipgre_tunnel_init_gen(struct net_device *dev)
1129 {
1130         struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
1131
1132         dev->uninit             = ipgre_tunnel_uninit;
1133         dev->destructor         = ipgre_tunnel_destructor;
1134         dev->hard_start_xmit    = ipgre_tunnel_xmit;
1135         dev->get_stats          = ipgre_tunnel_get_stats;
1136         dev->do_ioctl           = ipgre_tunnel_ioctl;
1137         dev->change_mtu         = ipgre_tunnel_change_mtu;
1138
1139         dev->type               = ARPHRD_IPGRE;
1140         dev->hard_header_len    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1141         dev->mtu                = 1500 - sizeof(struct iphdr) - 4;
1142         dev->flags              = IFF_NOARP;
1143         dev->iflink             = 0;
1144         dev->addr_len           = 4;
1145         memcpy(dev->dev_addr, &t->parms.iph.saddr, 4);
1146         memcpy(dev->broadcast, &t->parms.iph.daddr, 4);
1147 }
1148
/*
 * ipgre_tunnel_init - per-device initialization for a configured tunnel.
 *
 * Routes towards the tunnel destination to guess the outgoing device,
 * derives mtu and hard_header_len from it, and precalculates the full
 * GRE encapsulation length in tunnel->hlen.  Returns 0 or -EINVAL.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;	/* fallback when no underlying device is found */
	int mtu = 1500;			/* likewise: assume Ethernet-sized path */
	int addend = sizeof(struct iphdr) + 4;	/* outer IP + basic GRE header */

	tunnel = (struct ip_tunnel*)dev->priv;
	iph = &tunnel->parms.iph;

	ipgre_tunnel_init_gen(dev);

	/* Guess output device to choose reasonable mtu and hard_header_len */

	if (iph->daddr) {
		struct rtable *rt;
		if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) {
			tdev = rt->u.dst.dev;
			ip_rt_put(rt);
		}

		dev->flags |= IFF_POINTOPOINT;

#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (MULTICAST(iph->daddr)) {
			/* Broadcast tunnels need a local address to join
			   the multicast group at open time. */
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->hard_header = ipgre_header;
			dev->open = ipgre_open;
			dev->stop = ipgre_close;
		}
#endif
	}

	/* No route found: fall back to the explicitly bound device, if any. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->hard_header_len = hlen + addend;
	dev->mtu = mtu - addend;
	tunnel->hlen = addend;
	return 0;
}
1209
1210 #ifdef MODULE
/* Pin the module while the fallback device is up. */
static int ipgre_fb_tunnel_open(struct net_device *dev)
{
	MOD_INC_USE_COUNT;
	return 0;
}
1216
/* Release the module reference taken in ipgre_fb_tunnel_open(). */
static int ipgre_fb_tunnel_close(struct net_device *dev)
{
	MOD_DEC_USE_COUNT;
	return 0;
}
1222 #endif
1223
/*
 * ipgre_fb_tunnel_init - set up the always-present fallback device (gre0).
 *
 * The fallback tunnel receives GRE packets that match no configured
 * tunnel and serves as the ioctl entry point for creating new ones.
 */
int __init ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
	struct iphdr *iph;

	ipgre_tunnel_init_gen(dev);
#ifdef MODULE
	dev->open		= ipgre_fb_tunnel_open;
	dev->stop		= ipgre_fb_tunnel_close;
#endif

	/* Wildcard template header: plain IPv4 + GRE, no options. */
	iph = &ipgre_fb_tunnel.parms.iph;
	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	/* Hold the device and publish the tunnel in the wildcard hash slot. */
	dev_hold(dev);
	tunnels_wc[0]		= &ipgre_fb_tunnel;
	return 0;
}
1245
1246
/*
 * Protocol hook registered with the IPv4 stack: delivers incoming GRE
 * packets to ipgre_rcv() and related ICMP errors to ipgre_err().
 */
static struct inet_protocol ipgre_protocol = {
  ipgre_rcv,             /* GRE handler          */
  ipgre_err,             /* TUNNEL error control */
  0,                    /* next                 */
  IPPROTO_GRE,          /* protocol ID          */
  0,                    /* copy                 */
  NULL,                 /* data                 */
  "GRE"                 /* name                 */
};
1256
1257
1258 /*
1259  *      And now the modules code and kernel interface.
1260  */
1261
1262 #ifdef MODULE
1263 int init_module(void) 
1264 #else
1265 int __init ipgre_init(void)
1266 #endif
1267 {
1268         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1269
1270         ipgre_fb_tunnel_dev.priv = (void*)&ipgre_fb_tunnel;
1271         register_netdev(&ipgre_fb_tunnel_dev);
1272         inet_add_protocol(&ipgre_protocol);
1273         return 0;
1274 }
1275
1276 #ifdef MODULE
1277
1278 void cleanup_module(void)
1279 {
1280         if ( inet_del_protocol(&ipgre_protocol) < 0 )
1281                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1282
1283         unregister_netdev(&ipgre_fb_tunnel_dev);
1284 }
1285
1286 #endif
1287 MODULE_LICENSE("GPL");