/*
 * Source: USR9113 GPL release 1.0 (www.usr.com/support/gpl/USR9113_release1.0.tar.gz)
 * File:   bcm963xx.git / kernel/linux/net/ipv4/netfilter/ip_nat_core.c
 */
1 /* NAT for netfilter; shared with compatibility layer. */
2
3 /* (C) 1999-2001 Paul `Rusty' Russell
4  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  */
10
11 #include <linux/module.h>
12 #include <linux/types.h>
13 #include <linux/timer.h>
14 #include <linux/skbuff.h>
15 #include <linux/netfilter_ipv4.h>
16 #include <linux/vmalloc.h>
17 #include <net/checksum.h>
18 #include <net/icmp.h>
19 #include <net/ip.h>
20 #include <net/tcp.h>  /* For tcp_prot in getorigdst */
21 #include <linux/icmp.h>
22 #include <linux/udp.h>
23
24 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
25 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
26
27 #include <linux/netfilter_ipv4/ip_conntrack.h>
28 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
29 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
30 #include <linux/netfilter_ipv4/ip_nat.h>
31 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
32 #include <linux/netfilter_ipv4/ip_nat_core.h>
33 #include <linux/netfilter_ipv4/ip_nat_helper.h>
34 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
35 #include <linux/netfilter_ipv4/listhelp.h>
36
37 #if 0
38 #define DEBUGP printk
39 #else
40 #define DEBUGP(format, args...)
41 #endif
42
/* Protects the NAT hash tables, and the protocol/helper lists below. */
DECLARE_RWLOCK(ip_nat_lock);
DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);

/* Calculated at init based on memory size */
static unsigned int ip_nat_htable_size;

/* Hash tables: conntracks keyed by original source manip, and by the
   outgoing src-ip/dst-ip/proto triple (used for least-used selection). */
static struct list_head *bysource;
static struct list_head *byipsproto;
LIST_HEAD(protos);
LIST_HEAD(helpers);

/* Fallback handler for protocols with no specific NAT module loaded. */
extern struct ip_nat_protocol unknown_nat_protocol;
55
56 /* We keep extra hashes for each conntrack, for fast searching. */
57 static inline size_t
58 hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
59 {
60         /* Modified src and dst, to ensure we don't create two
61            identical streams. */
62         return (src + dst + proto) % ip_nat_htable_size;
63 }
64
65 static inline size_t
66 hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
67 {
68         /* Original src, to ensure we map it consistently if poss. */
69         return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
70 }
71
/* Noone using conntrack by the time this called. */
/* Destructor hook: unlink this conntrack's NAT entries from both hash
 * tables.  A no-op if NAT was never set up for it. */
static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
{
        struct ip_nat_info *info = &conn->nat.info;
        unsigned int hs, hp;

        /* Never hashed (no manip installed) -- nothing to remove. */
        if (!info->initialized)
                return;

        IP_NF_ASSERT(info->bysource.conntrack);
        IP_NF_ASSERT(info->byipsproto.conntrack);

        /* Recompute the same bucket indices place_in_hashes() used:
           bysource keys on the ORIGINAL source manip... */
        hs = hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src,
                         conn->tuplehash[IP_CT_DIR_ORIGINAL]
                         .tuple.dst.protonum);

        /* ...byipsproto keys on the REPLY tuple's src/dst/proto. */
        hp = hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip,
                              conn->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip,
                              conn->tuplehash[IP_CT_DIR_REPLY]
                              .tuple.dst.protonum);

        WRITE_LOCK(&ip_nat_lock);
        LIST_DELETE(&bysource[hs], &info->bysource);
        LIST_DELETE(&byipsproto[hp], &info->byipsproto);
        WRITE_UNLOCK(&ip_nat_lock);
}
98
99 /* We do checksum mangling, so if they were wrong before they're still
100  * wrong.  Also works for incomplete packets (eg. ICMP dest
101  * unreachables.) */
102 u_int16_t
103 ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
104 {
105         u_int32_t diffs[] = { oldvalinv, newval };
106         return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
107                                       oldcheck^0xFFFF));
108 }
109
110 static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
111 {
112         return i->protonum == proto;
113 }
114
/* Look up the NAT protocol handler for a protocol number.  Falls back
 * to unknown_nat_protocol, so the result is never NULL.  Caller must
 * hold ip_nat_lock (read). */
struct ip_nat_protocol *
find_nat_proto(u_int16_t protonum)
{
        struct ip_nat_protocol *i;

        MUST_BE_READ_LOCKED(&ip_nat_lock);
        i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
        if (!i)
                i = &unknown_nat_protocol;
        return i;
}
126
127 /* Is this tuple already taken? (not by us) */
128 int
129 ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
130                   const struct ip_conntrack *ignored_conntrack)
131 {
132         /* Conntrack tracking doesn't keep track of outgoing tuples; only
133            incoming ones.  NAT means they don't have a fixed mapping,
134            so we invert the tuple and look for the incoming reply.
135
136            We could keep a separate hash if this proves too slow. */
137         struct ip_conntrack_tuple reply;
138
139         invert_tuplepr(&reply, tuple);
140         return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
141 }
142
143 /* Does tuple + the source manip come within the range mr */
144 static int
145 in_range(const struct ip_conntrack_tuple *tuple,
146          const struct ip_conntrack_manip *manip,
147          const struct ip_nat_multi_range *mr)
148 {
149         struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
150         unsigned int i;
151         struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };
152
153         for (i = 0; i < mr->rangesize; i++) {
154                 /* If we are allowed to map IPs, then we must be in the
155                    range specified, otherwise we must be unchanged. */
156                 if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
157                         if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
158                             || (ntohl(newtuple.src.ip)
159                                 > ntohl(mr->range[i].max_ip)))
160                                 continue;
161                 } else {
162                         if (newtuple.src.ip != tuple->src.ip)
163                                 continue;
164                 }
165
166                 if (!(mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
167                     || proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
168                                        &mr->range[i].min, &mr->range[i].max))
169                         return 1;
170         }
171         return 0;
172 }
173
174 static inline int
175 src_cmp(const struct ip_nat_hash *i,
176         const struct ip_conntrack_tuple *tuple,
177         const struct ip_nat_multi_range *mr)
178 {
179         return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
180                 == tuple->dst.protonum
181                 && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
182                 == tuple->src.ip
183                 && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
184                 == tuple->src.u.all
185                 && in_range(tuple,
186                             &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
187                             .tuple.src,
188                             mr));
189 }
190
/* Only called for SRC manip */
/* If this source is already mapped (and the mapping fits in mr),
 * return the existing manip so the same external binding is reused;
 * NULL otherwise.  Caller must hold ip_nat_lock (read). */
static struct ip_conntrack_manip *
find_appropriate_src(const struct ip_conntrack_tuple *tuple,
                     const struct ip_nat_multi_range *mr)
{
        unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
        struct ip_nat_hash *i;

        MUST_BE_READ_LOCKED(&ip_nat_lock);
        i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
        if (i)
                return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
        else
                return NULL;
}
206
#ifdef CONFIG_IP_NF_NAT_LOCAL
/* If it's really a local destination manip, it may need to do a
   source manip too. */
/* Route to the (new) destination var_ip and report the source address
 * the stack would choose for it via *other_ipp.  Returns 1 on success,
 * 0 if no route exists (caller skips that candidate). */
static int
do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
{
        struct flowi fl = { .nl_u = { .ip4_u = { .daddr = var_ip } } };
        struct rtable *rt;

        /* FIXME: IPTOS_TOS(iph->tos) --RR */
        if (ip_route_output_key(&rt, &fl) != 0) {
                DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
                       NIPQUAD(var_ip));
                return 0;
        }

        /* Take the route's preferred source, then release the ref. */
        *other_ipp = rt->rt_src;
        ip_rt_put(rt);
        return 1;
}
#endif
228
229 /* Simple way to iterate through all. */
230 static inline int fake_cmp(const struct ip_nat_hash *i,
231                            u_int32_t src, u_int32_t dst, u_int16_t protonum,
232                            unsigned int *score,
233                            const struct ip_conntrack *conntrack)
234 {
235         /* Compare backwards: we're dealing with OUTGOING tuples, and
236            inside the conntrack is the REPLY tuple.  Don't count this
237            conntrack. */
238         if (i->conntrack != conntrack
239             && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
240             && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
241             && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
242                 == protonum))
243                 (*score)++;
244         return 0;
245 }
246
/* Count how many other conntracks currently map onto the outgoing
 * src/dst/proto triple (excluding @conntrack itself).  Used as the
 * load "score" when picking the least-used IP.  Caller must hold
 * ip_nat_lock (read). */
static inline unsigned int
count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
           const struct ip_conntrack *conntrack)
{
        unsigned int score = 0;
        unsigned int h;

        MUST_BE_READ_LOCKED(&ip_nat_lock);
        h = hash_by_ipsproto(src, dst, protonum);
        /* fake_cmp never "matches"; it just increments score per entry. */
        LIST_FIND(&byipsproto[h], fake_cmp, struct ip_nat_hash *,
                  src, dst, protonum, &score, conntrack);

        return score;
}
261
/* For [FUTURE] fragmentation handling, we want the least-used
   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
   1-65535, we don't do pro-rata allocation based on ports; we choose
   the ip with the lowest src-ip/dst-ip/proto usage.

   If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
   range), we eliminate that and try again.  This is not the most
   efficient approach, but if you're worried about that, don't hand us
   ranges you don't really have.  */
/* On success, *tuple has the chosen IP filled in and the winning range
 * is returned.  If every range is FULL, returns NULL and *tuple is the
 * zero-initialized best.tuple (callers test the return value). */
static struct ip_nat_range *
find_best_ips_proto(struct ip_conntrack_tuple *tuple,
                    const struct ip_nat_multi_range *mr,
                    const struct ip_conntrack *conntrack,
                    unsigned int hooknum)
{
        unsigned int i;
        struct {
                const struct ip_nat_range *range;
                unsigned int score;
                struct ip_conntrack_tuple tuple;
        } best = { NULL,  0xFFFFFFFF };
        u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
        /* Per-call increment spreads starting points across the range;
           persistence across calls is intentional (static). */
        static unsigned int randomness;

        /* var_ipp is the address we're choosing; other_ipp is the one
           do_extra_mangle may rewrite (and we must restore). */
        if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
                var_ipp = &tuple->src.ip;
                saved_ip = tuple->dst.ip;
                other_ipp = &tuple->dst.ip;
        } else {
                var_ipp = &tuple->dst.ip;
                saved_ip = tuple->src.ip;
                other_ipp = &tuple->src.ip;
        }
        /* Don't do do_extra_mangle unless necessary (overrides
           explicit socket bindings, for example) */
        orig_dstip = tuple->dst.ip;

        IP_NF_ASSERT(mr->rangesize >= 1);
        for (i = 0; i < mr->rangesize; i++) {
                /* Host order */
                u_int32_t minip, maxip, j;

                /* Don't do ranges which are already eliminated. */
                if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
                        continue;
                }

                if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
                        minip = ntohl(mr->range[i].min_ip);
                        maxip = ntohl(mr->range[i].max_ip);
                } else
                        minip = maxip = ntohl(*var_ipp);

                randomness++;
                for (j = 0; j < maxip - minip + 1; j++) {
                        unsigned int score;

                        *var_ipp = htonl(minip + (randomness + j) 
                                         % (maxip - minip + 1));

                        /* Reset the other ip in case it was mangled by
                         * do_extra_mangle last time. */
                        *other_ipp = saved_ip;

#ifdef CONFIG_IP_NF_NAT_LOCAL
                        if (hooknum == NF_IP_LOCAL_OUT
                            && *var_ipp != orig_dstip
                            && !do_extra_mangle(*var_ipp, other_ipp)) {
                                DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
                                       i, NIPQUAD(*var_ipp));
                                /* Can't route?  This whole range part is
                                 * probably screwed, but keep trying
                                 * anyway. */
                                continue;
                        }
#endif

                        /* Count how many others map onto this. */
                        score = count_maps(tuple->src.ip, tuple->dst.ip,
                                           tuple->dst.protonum, conntrack);
                        if (score < best.score) {
                                /* Optimization: doesn't get any better than
                                   this. */
                                if (score == 0)
                                        return (struct ip_nat_range *)
                                                &mr->range[i];

                                best.score = score;
                                best.tuple = *tuple;
                                best.range = &mr->range[i];
                        }
                }
        }
        *tuple = best.tuple;

        /* Discard const. */
        return (struct ip_nat_range *)best.range;
}
361
/* Fast version doesn't iterate through hash chains, but only handles
   common case of single IP address (null NAT, masquerade) */
/* Falls back to the full search for multi-range or multi-IP specs;
 * otherwise picks range[0]'s single IP directly.  May return NULL only
 * via the LOCAL_OUT do_extra_mangle failure path. */
static struct ip_nat_range *
find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
                         const struct ip_nat_multi_range *mr,
                         const struct ip_conntrack *conntrack,
                         unsigned int hooknum)
{
        /* Anything other than "one range, one (or unmapped) IP" needs
           the scored search. */
        if (mr->rangesize != 1
            || (mr->range[0].flags & IP_NAT_RANGE_FULL)
            || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
                && mr->range[0].min_ip != mr->range[0].max_ip))
                return find_best_ips_proto(tuple, mr, conntrack, hooknum);

        if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
                if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
                        tuple->src.ip = mr->range[0].min_ip;
                else {
                        /* Only do extra mangle when required (breaks
                           socket binding) */
#ifdef CONFIG_IP_NF_NAT_LOCAL
                        if (tuple->dst.ip != mr->range[0].min_ip
                            && hooknum == NF_IP_LOCAL_OUT
                            && !do_extra_mangle(mr->range[0].min_ip,
                                                &tuple->src.ip))
                                return NULL;
#endif
                        tuple->dst.ip = mr->range[0].min_ip;
                }
        }

        /* Discard const. */
        return (struct ip_nat_range *)&mr->range[0];
}
396
/* Find a tuple within mrr that is not currently in use: try (1) an
 * existing source mapping, (2) the least-used IP as-is, (3) per-proto
 * (port) remapping, (4) for DST manips, implicit source remapping.
 * Returns 1 with *tuple filled on success, 0 when the range is
 * exhausted.  NOTE: temporarily sets IP_NAT_RANGE_FULL in the caller's
 * (const-cast) ranges to eliminate them, but always clears the flags
 * before returning. */
static int
get_unique_tuple(struct ip_conntrack_tuple *tuple,
                 const struct ip_conntrack_tuple *orig_tuple,
                 const struct ip_nat_multi_range *mrr,
                 struct ip_conntrack *conntrack,
                 unsigned int hooknum)
{
        struct ip_nat_protocol *proto
                = find_nat_proto(orig_tuple->dst.protonum);
        struct ip_nat_range *rptr;
        unsigned int i;
        int ret;

        /* We temporarily use flags for marking full parts, but we
           always clean up afterwards */
        struct ip_nat_multi_range *mr = (void *)mrr;

        /* 1) If this srcip/proto/src-proto-part is currently mapped,
           and that same mapping gives a unique tuple within the given
           range, use that.

           This is only required for source (ie. NAT/masq) mappings.
           So far, we don't do local source mappings, so multiple
           manips not an issue.  */
        if (hooknum == NF_IP_POST_ROUTING) {
                struct ip_conntrack_manip *manip;

                manip = find_appropriate_src(orig_tuple, mr);
                if (manip) {
                        /* Apply same source manipulation. */
                        *tuple = ((struct ip_conntrack_tuple)
                                  { *manip, orig_tuple->dst });
                        DEBUGP("get_unique_tuple: Found current src map\n");
                        if (!ip_nat_used_tuple(tuple, conntrack))
                                return 1;
                }
        }

        /* 2) Select the least-used IP/proto combination in the given
           range.
        */
        *tuple = *orig_tuple;
        while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
               != NULL) {
                DEBUGP("Found best for "); DUMP_TUPLE(tuple);
                /* 3) The per-protocol part of the manip is made to
                   map into the range to make a unique tuple. */

                /* Only bother mapping if it's not already in range
                   and unique */
                if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
                     || proto->in_range(tuple, HOOK2MANIP(hooknum),
                                        &rptr->min, &rptr->max))
                    && !ip_nat_used_tuple(tuple, conntrack)) {
                        ret = 1;
                        goto clear_fulls;
                } else {
                        if (proto->unique_tuple(tuple, rptr,
                                                HOOK2MANIP(hooknum),
                                                conntrack)) {
                                /* Must be unique. */
                                IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
                                                                conntrack));
                                ret = 1;
                                goto clear_fulls;
                        } else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
                                /* Try implicit source NAT; protocol
                                   may be able to play with ports to
                                   make it unique. */
                                struct ip_nat_range r
                                        = { IP_NAT_RANGE_MAP_IPS, 
                                            tuple->src.ip, tuple->src.ip,
                                            { 0 }, { 0 } };
                                DEBUGP("Trying implicit mapping\n");
                                if (proto->unique_tuple(tuple, &r,
                                                        IP_NAT_MANIP_SRC,
                                                        conntrack)) {
                                        /* Must be unique. */
                                        IP_NF_ASSERT(!ip_nat_used_tuple
                                                     (tuple, conntrack));
                                        ret = 1;
                                        goto clear_fulls;
                                }
                        }
                        DEBUGP("Protocol can't get unique tuple %u.\n",
                               hooknum);
                }

                /* Eliminate that from range, and try again. */
                rptr->flags |= IP_NAT_RANGE_FULL;
                *tuple = *orig_tuple;
        }

        ret = 0;

 clear_fulls:
        /* Clear full flags. */
        IP_NF_ASSERT(mr->rangesize >= 1);
        for (i = 0; i < mr->rangesize; i++)
                mr->range[i].flags &= ~IP_NAT_RANGE_FULL;

        return ret;
}
500
/* LIST_FIND predicate: does tuple match this helper's registered
 * tuple template under its mask? */
static inline int
helper_cmp(const struct ip_nat_helper *helper,
           const struct ip_conntrack_tuple *tuple)
{
        return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
}
507
/* Where to manip the reply packets (will be reverse manip). */
/* A manip installed at one hook is undone on replies at its pair:
 * PRE_ROUTING <-> POST_ROUTING, and (with local NAT) LOCAL_OUT <->
 * LOCAL_IN. */
static unsigned int opposite_hook[NF_IP_NUMHOOKS]
= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
    [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
#ifdef CONFIG_IP_NF_NAT_LOCAL
    [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
    [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
#endif
};
517
/* Set up NAT for this conntrack at the given hook: pick a unique
 * mapped tuple in mr, update the conntrack's reply tuple (retrying if
 * we lose the race), record the forward/reverse manips, bind a helper,
 * and (re)insert into the NAT hashes.  Returns NF_ACCEPT, or NF_DROP
 * if no unique tuple can be found.  Caller must hold ip_nat_lock
 * (write). */
unsigned int
ip_nat_setup_info(struct ip_conntrack *conntrack,
                  const struct ip_nat_multi_range *mr,
                  unsigned int hooknum)
{
        struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
        struct ip_conntrack_tuple orig_tp;
        struct ip_nat_info *info = &conntrack->nat.info;
        /* Non-zero if already hashed (second manip): replace, not place. */
        int in_hashes = info->initialized;

        MUST_BE_WRITE_LOCKED(&ip_nat_lock);
        IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
                     || hooknum == NF_IP_POST_ROUTING
                     || hooknum == NF_IP_LOCAL_IN
                     || hooknum == NF_IP_LOCAL_OUT);
        IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
        IP_NF_ASSERT(!(info->initialized & (1 << HOOK2MANIP(hooknum))));

        /* What we've got will look like inverse of reply. Normally
           this is what is in the conntrack, except for prior
           manipulations (future optimization: if num_manips == 0,
           orig_tp =
           conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
        invert_tuplepr(&orig_tp,
                       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);

#if 0
        {
        unsigned int i;

        DEBUGP("Hook %u (%s), ", hooknum,
               HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
        DUMP_TUPLE(&orig_tp);
        DEBUGP("Range %p: ", mr);
        for (i = 0; i < mr->rangesize; i++) {
                DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
                       i,
                       (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
                       ? " MAP_IPS" : "",
                       (mr->range[i].flags
                        & IP_NAT_RANGE_PROTO_SPECIFIED)
                       ? " PROTO_SPECIFIED" : "",
                       (mr->range[i].flags & IP_NAT_RANGE_FULL)
                       ? " FULL" : "",
                       NIPQUAD(mr->range[i].min_ip),
                       NIPQUAD(mr->range[i].max_ip),
                       mr->range[i].min.all,
                       mr->range[i].max.all);
        }
        }
#endif

        do {
                if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
                                      hooknum)) {
                        DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
                               conntrack);
                        return NF_DROP;
                }

#if 0
                DEBUGP("Hook %u (%s) %p\n", hooknum,
                       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
                       conntrack);
                DEBUGP("Original: ");
                DUMP_TUPLE(&orig_tp);
                DEBUGP("New: ");
                DUMP_TUPLE(&new_tuple);
#endif

                /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
                   the original (A/B/C/D') and the mangled one (E/F/G/H').

                   We're only allowed to work with the SRC per-proto
                   part, so we create inverses of both to start, then
                   derive the other fields we need.  */

                /* Reply connection: simply invert the new tuple
                   (G/H/E/F') */
                invert_tuplepr(&reply, &new_tuple);

                /* Alter conntrack table so it recognizes replies.
                   If fail this race (reply tuple now used), repeat. */
        } while (!ip_conntrack_alter_reply(conntrack, &reply));

        /* FIXME: We can simply used existing conntrack reply tuple
           here --RR */
        /* Create inverse of original: C/D/A/B' */
        invert_tuplepr(&inv_tuple, &orig_tp);

        /* Has source changed?. */
        if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
                /* In this direction, a source manip. */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_ORIGINAL, hooknum,
                           IP_NAT_MANIP_SRC, new_tuple.src });

                IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

                /* In the reverse direction, a destination manip. */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_REPLY, opposite_hook[hooknum],
                           IP_NAT_MANIP_DST, orig_tp.src });
                IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
        }

        /* Has destination changed? */
        if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
                /* In this direction, a destination manip */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_ORIGINAL, hooknum,
                           IP_NAT_MANIP_DST, reply.src });

                IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);

                /* In the reverse direction, a source manip. */
                info->manips[info->num_manips++] =
                        ((struct ip_nat_info_manip)
                         { IP_CT_DIR_REPLY, opposite_hook[hooknum],
                           IP_NAT_MANIP_SRC, inv_tuple.src });
                IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
        }

        /* If there's a helper, assign it; based on new tuple. */
        if (!conntrack->master)
                info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
                                         &reply);

        /* It's done. */
        info->initialized |= (1 << HOOK2MANIP(hooknum));

        if (in_hashes) {
                IP_NF_ASSERT(info->bysource.conntrack);
                replace_in_hashes(conntrack, info);
        } else {
                place_in_hashes(conntrack, info);
        }

        return NF_ACCEPT;
}
661
/* Re-insert an already-hashed conntrack after its tuples changed
 * (second manip).  Caller must hold ip_nat_lock (write). */
void replace_in_hashes(struct ip_conntrack *conntrack,
                       struct ip_nat_info *info)
{
        /* Source has changed, so replace in hashes. */
        unsigned int srchash
                = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.src,
                              conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.dst.protonum);
        /* We place packet as seen OUTGOING in byips_proto hash
           (ie. reverse dst and src of reply packet. */
        unsigned int ipsprotohash
                = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.src.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.protonum);

        IP_NF_ASSERT(info->bysource.conntrack == conntrack);
        MUST_BE_WRITE_LOCKED(&ip_nat_lock);

        /* Unlink from old buckets, then prepend into the new ones. */
        list_del(&info->bysource.list);
        list_del(&info->byipsproto.list);

        list_prepend(&bysource[srchash], &info->bysource);
        list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
}
690
/* First-time insertion of a conntrack into both NAT hashes; also sets
 * the back-pointers used by cleanup.  Caller must hold ip_nat_lock
 * (write). */
void place_in_hashes(struct ip_conntrack *conntrack,
                     struct ip_nat_info *info)
{
        unsigned int srchash
                = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.src,
                              conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.dst.protonum);
        /* We place packet as seen OUTGOING in byips_proto hash
           (ie. reverse dst and src of reply packet. */
        unsigned int ipsprotohash
                = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.src.ip,
                                   conntrack->tuplehash[IP_CT_DIR_REPLY]
                                   .tuple.dst.protonum);

        /* Must not already be hashed. */
        IP_NF_ASSERT(!info->bysource.conntrack);

        MUST_BE_WRITE_LOCKED(&ip_nat_lock);
        info->byipsproto.conntrack = conntrack;
        info->bysource.conntrack = conntrack;

        list_prepend(&bysource[srchash], &info->bysource);
        list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
}
718
719 /* Returns true if succeeded. */
720 static int
721 manip_pkt(u_int16_t proto,
722           struct sk_buff **pskb,
723           unsigned int iphdroff,
724           const struct ip_conntrack_manip *manip,
725           enum ip_nat_manip_type maniptype)
726 {
727         struct iphdr *iph;
728
729         (*pskb)->nfcache |= NFC_ALTERED;
730         if (!skb_ip_make_writable(pskb, iphdroff+sizeof(iph)))
731                 return 0;
732
733         iph = (void *)(*pskb)->data + iphdroff;
734
735         /* Manipulate protcol part. */
736         if (!find_nat_proto(proto)->manip_pkt(pskb,
737                                               iphdroff + iph->ihl*4,
738                                               manip, maniptype))
739                 return 0;
740
741         iph = (void *)(*pskb)->data + iphdroff;
742
743         if (maniptype == IP_NAT_MANIP_SRC) {
744                 iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
745                                                 iph->check);
746                 iph->saddr = manip->ip;
747         } else {
748                 iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
749                                                 iph->check);
750                 iph->daddr = manip->ip;
751         }
752         return 1;
753 }
754
755 static inline int exp_for_packet(struct ip_conntrack_expect *exp,
756                                  struct sk_buff *skb)
757 {
758         struct ip_conntrack_protocol *proto;
759         int ret = 1;
760
761         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
762         proto = __ip_ct_find_proto(skb->nh.iph->protocol);
763         if (proto->exp_matches_pkt)
764                 ret = proto->exp_matches_pkt(exp, skb);
765
766         return ret;
767 }
768
/* Do packet manipulations according to binding. */
/* Apply every manip recorded for this conntrack that matches the
 * packet's direction and the current hook, then (if a NAT helper is
 * registered) let the helper mangle the payload for each outstanding
 * expectation, and finally adjust TCP sequence numbers once per packet.
 * Returns an NF_* verdict (NF_ACCEPT, NF_DROP, or whatever the helper
 * returned). */
unsigned int
do_bindings(struct ip_conntrack *ct,
	    enum ip_conntrack_info ctinfo,
	    struct ip_nat_info *info,
	    unsigned int hooknum,
	    struct sk_buff **pskb)
{
	unsigned int i;
	struct ip_nat_helper *helper;
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	/* Cache the protocol: manip_pkt below may reallocate *pskb. */
	int proto = (*pskb)->nh.iph->protocol;

	/* Need nat lock to protect against modification, but neither
	   conntrack (referenced) and helper (deleted with
	   synchronize_bh()) can vanish. */
	READ_LOCK(&ip_nat_lock);
	for (i = 0; i < info->num_manips; i++) {
		/* Only apply manips recorded for this direction+hook. */
		if (info->manips[i].direction == dir
		    && info->manips[i].hooknum == hooknum) {
			DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
			       *pskb,
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "SRC" : "DST",
			       NIPQUAD(info->manips[i].manip.ip),
			       htons(info->manips[i].manip.u.all));
			if (!manip_pkt(proto, pskb, 0,
				       &info->manips[i].manip,
				       info->manips[i].maniptype)) {
				/* Mangling failed: drop the packet. */
				READ_UNLOCK(&ip_nat_lock);
				return NF_DROP;
			}
		}
	}
	/* Snapshot the helper pointer under the lock; the helper itself
	   cannot vanish afterwards (see comment above). */
	helper = info->helper;
	READ_UNLOCK(&ip_nat_lock);

	if (helper) {
		struct ip_conntrack_expect *exp = NULL;
		struct list_head *cur_item;
		int ret = NF_ACCEPT;
		int helper_called = 0;

		DEBUGP("do_bindings: helper existing for (%p)\n", ct);

		/* Always defragged for helpers */
		IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
			       & htons(IP_MF|IP_OFFSET)));

		/* Have to grab read lock before sibling_list traversal */
		READ_LOCK(&ip_conntrack_lock);
		list_for_each_prev(cur_item, &ct->sibling_list) {
			exp = list_entry(cur_item, struct ip_conntrack_expect,
					 expected_list);

			/* if this expectation is already established, skip */
			if (exp->sibling)
				continue;

			if (exp_for_packet(exp, *pskb)) {
				/* FIXME: May be true multiple times in the
				 * case of UDP!! */
				DEBUGP("calling nat helper (exp=%p) for packet\n", exp);
				ret = helper->help(ct, exp, info, ctinfo,
						   hooknum, pskb);
				if (ret != NF_ACCEPT) {
					/* Early return: drop the lock. */
					READ_UNLOCK(&ip_conntrack_lock);
					return ret;
				}
				helper_called = 1;
			}
		}
		/* Helper might want to manip the packet even when there is no
		 * matching expectation for this packet */
		if (!helper_called && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
			DEBUGP("calling nat helper for packet without expectation\n");
			ret = helper->help(ct, NULL, info, ctinfo,
					   hooknum, pskb);
			if (ret != NF_ACCEPT) {
				READ_UNLOCK(&ip_conntrack_lock);
				return ret;
			}
		}
		READ_UNLOCK(&ip_conntrack_lock);

		/* Adjust sequence number only once per packet
		 * (helper is called at all hooks) */
		if (proto == IPPROTO_TCP
		    && (hooknum == NF_IP_POST_ROUTING
			|| hooknum == NF_IP_LOCAL_IN)) {
			DEBUGP("ip_nat_core: adjusting sequence number\n");
			/* future: put this in a l4-proto specific function,
			 * and call this function here. */
			if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
				ret = NF_DROP;
		}

		return ret;

	} else
		return NF_ACCEPT;

	/* not reached */
}
873
/* NAT an ICMP error packet: both the offending packet embedded in the
 * ICMP payload and the outer IP header are rewritten according to this
 * conntrack's recorded manips.  The inner packet was never src/dst
 * reversed, so each manip is applied to it with the *opposite* type;
 * the outer header is then NATed as if it were an ordinary reply.
 * Returns 1 on success, 0 to make the caller drop the packet. */
int
icmp_reply_translation(struct sk_buff **pskb,
		       struct ip_conntrack *conntrack,
		       unsigned int hooknum,
		       int dir)
{
	/* Layout of the ICMP payload: the ICMP header immediately
	   followed by the embedded (offending) IP header. */
	struct {
		struct icmphdr icmp;
		struct iphdr ip;
	} *inside;
	unsigned int i;
	struct ip_nat_info *info = &conntrack->nat.info;
	int hdrlen;

	if (!skb_ip_make_writable(pskb,(*pskb)->nh.iph->ihl*4+sizeof(*inside)))
		return 0;
	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

	/* We're actually going to mangle it beyond trivial checksum
	   adjustment, so make sure the current checksum is correct. */
	if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
		hdrlen = (*pskb)->nh.iph->ihl * 4;
		/* A non-zero folded sum means a corrupt packet: bail. */
		if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
						(*pskb)->len - hdrlen, 0)))
			return 0;
	}

	/* Must be RELATED */
	IP_NF_ASSERT((*pskb)->nfct
		     - ((struct ip_conntrack *)(*pskb)->nfct->master)->infos
		     == IP_CT_RELATED
		     || (*pskb)->nfct
		     - ((struct ip_conntrack *)(*pskb)->nfct->master)->infos
		     == IP_CT_RELATED+IP_CT_IS_REPLY);

	/* Redirects on non-null nats must be dropped, else they'll
	   start talking to each other without our translation, and be
	   confused... --RR */
	if (inside->icmp.type == ICMP_REDIRECT) {
		/* Don't care about races here. */
		if (info->initialized
		    != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
		    || info->num_manips != 0)
			return 0;
	}

	DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
	       *pskb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
	/* Note: May not be from a NAT'd host, but probably safest to
	   do translation always as if it came from the host itself
	   (even though a "host unreachable" coming from the host
	   itself is a bit weird).

	   More explanation: some people use NAT for anonymizing.
	   Also, CERT recommends dropping all packets from private IP
	   addresses (although ICMP errors from internal links with
	   such addresses are not too uncommon, as Alan Cox points
	   out) */

	READ_LOCK(&ip_nat_lock);
	for (i = 0; i < info->num_manips; i++) {
		DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
		       i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
		       "ORIG" : "REPLY", info->manips[i].hooknum);

		if (info->manips[i].direction != dir)
			continue;

		/* Mapping the inner packet is just like a normal
		   packet, except it was never src/dst reversed, so
		   where we would normally apply a dst manip, we apply
		   a src, and vice versa. */
		if (info->manips[i].hooknum == hooknum) {
			DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "DST" : "SRC",
			       NIPQUAD(info->manips[i].manip.ip),
			       ntohs(info->manips[i].manip.u.udp.port));
			/* Note the inverted maniptype (!) for the inner
			   packet, per the comment above. */
			if (!manip_pkt(inside->ip.protocol, pskb,
				       (*pskb)->nh.iph->ihl*4
				       + sizeof(inside->icmp),
				       &info->manips[i].manip,
				       !info->manips[i].maniptype))
				goto unlock_fail;

			/* Outer packet needs to have IP header NATed like
			   it's a reply. */

			/* Use mapping to map outer packet: 0 give no
			   per-proto mapping */
			DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "SRC" : "DST",
			       NIPQUAD(info->manips[i].manip.ip));
			if (!manip_pkt(0, pskb, 0,
				       &info->manips[i].manip,
				       info->manips[i].maniptype))
				goto unlock_fail;
		}
	}
	READ_UNLOCK(&ip_nat_lock);

	hdrlen = (*pskb)->nh.iph->ihl * 4;

	/* manip_pkt may have reallocated the skb; recompute 'inside'. */
	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

	/* Recompute the ICMP checksum over the mangled payload. */
	inside->icmp.checksum = 0;
	inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
						       (*pskb)->len - hdrlen,
						       0));
	return 1;

 unlock_fail:
	READ_UNLOCK(&ip_nat_lock);
	return 0;
}
990
991 int __init ip_nat_init(void)
992 {
993         size_t i;
994
995         /* Leave them the same for the moment. */
996         ip_nat_htable_size = ip_conntrack_htable_size;
997
998         /* One vmalloc for both hash tables */
999         bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
1000         if (!bysource) {
1001                 return -ENOMEM;
1002         }
1003         byipsproto = bysource + ip_nat_htable_size;
1004
1005         /* Sew in builtin protocols. */
1006         WRITE_LOCK(&ip_nat_lock);
1007         list_append(&protos, &ip_nat_protocol_tcp);
1008         list_append(&protos, &ip_nat_protocol_udp);
1009         list_append(&protos, &ip_nat_protocol_icmp);
1010         WRITE_UNLOCK(&ip_nat_lock);
1011
1012         for (i = 0; i < ip_nat_htable_size; i++) {
1013                 INIT_LIST_HEAD(&bysource[i]);
1014                 INIT_LIST_HEAD(&byipsproto[i]);
1015         }
1016
1017         /* FIXME: Man, this is a hack.  <SIGH> */
1018         IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
1019         ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
1020         
1021         /* Initialize fake conntrack so that NAT will skip it */
1022         ip_conntrack_untracked.nat.info.initialized |= 
1023                 (1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST);
1024
1025         return 0;
1026 }
1027
1028 /* Clear NAT section of all conntracks, in case we're loaded again. */
1029 static int clean_nat(const struct ip_conntrack *i, void *data)
1030 {
1031         memset((void *)&i->nat, 0, sizeof(i->nat));
1032         return 0;
1033 }
1034
/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
/* Tear down the NAT subsystem: wipe the NAT area of every existing
 * conntrack, unhook the destruction callback, and release the hash
 * tables. */
void ip_nat_cleanup(void)
{
	/* Zero each conntrack's nat section so stale state cannot leak
	   into a subsequent module load. */
	ip_ct_selective_cleanup(&clean_nat, NULL);
	ip_conntrack_destroyed = NULL;
	/* byipsproto lives inside this same allocation (see
	   ip_nat_init), so one vfree releases both tables. */
	vfree(bysource);
}