/*
 * (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2011 Patrick McHardy <kaber@trash.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <net/xfrm.h>
#include <linux/jhash.h>
#include <linux/rtnetlink.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_l3proto.h>
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_nat.h>

#include "nf_internals.h"
static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];

static DEFINE_MUTEX(nf_nat_proto_mutex);
static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
						__read_mostly;
static unsigned int nat_net_id __read_mostly;

static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
static unsigned int nf_nat_hash_rnd __read_mostly;
struct nf_nat_lookup_hook_priv {
	struct nf_hook_entries __rcu *entries;

	struct rcu_head rcu_head;
};

struct nf_nat_hooks_net {
	struct nf_hook_ops *nat_hook_ops;
	unsigned int users;
};

struct nat_net {
	struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO];
};
inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family)
{
	return rcu_dereference(nf_nat_l3protos[family]);
}
#ifdef CONFIG_XFRM
static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
{
	const struct nf_nat_l3proto *l3proto;
	const struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	enum ip_conntrack_dir dir;
	unsigned long statusbit;
	u8 family;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct == NULL)
		return;

	family = nf_ct_l3num(ct);
	l3proto = __nf_nat_l3proto_find(family);
	if (l3proto == NULL)
		return;

	dir = CTINFO2DIR(ctinfo);
	if (dir == IP_CT_DIR_ORIGINAL)
		statusbit = IPS_DST_NAT;
	else
		statusbit = IPS_SRC_NAT;

	l3proto->decode_session(skb, ct, dir, statusbit, fl);
}
int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
{
	struct flowi fl;
	unsigned int hh_len;
	struct dst_entry *dst;
	struct sock *sk = skb->sk;
	int err;

	err = xfrm_decode_session(skb, &fl, family);
	if (err < 0)
		return err;

	dst = skb_dst(skb);
	if (dst->xfrm)
		dst = ((struct xfrm_dst *)dst)->route;
	if (!dst_hold_safe(dst))
		return -EHOSTUNREACH;

	if (sk && !net_eq(net, sock_net(sk)))
		sk = NULL;

	dst = xfrm_lookup(net, dst, &fl, sk, 0);
	if (IS_ERR(dst))
		return PTR_ERR(dst);

	skb_dst_drop(skb);
	skb_dst_set(skb, dst);

	/* Change in oif may mean change in hh_len. */
	hh_len = skb_dst(skb)->dev->hard_header_len;
	if (skb_headroom(skb) < hh_len &&
	    pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
		return -ENOMEM;
	return 0;
}
EXPORT_SYMBOL(nf_xfrm_me_harder);
#endif /* CONFIG_XFRM */
/* We keep an extra hash for each conntrack, for fast searching. */
static unsigned int
hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
	unsigned int hash;

	get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));

	/* Original src, to ensure we map it consistently if poss. */
	hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
		      tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));

	return reciprocal_scale(hash, nf_nat_htable_size);
}
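
/* Illustration (not part of the original file): reciprocal_scale() maps
 * the 32-bit hash onto [0, nf_nat_htable_size) with a multiply and a
 * shift instead of a modulo, essentially:
 *
 *	static inline u32 bucket_of(u32 hash, u32 buckets)
 *	{
 *		return (u32)(((u64)hash * buckets) >> 32);
 *	}
 *
 * e.g. with nf_nat_htable_size == 4096, hash 0x80000000 lands in bucket
 * 2048. See reciprocal_scale() in include/linux/kernel.h for the real
 * definition; bucket_of() above is only a sketch.
 */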
/* Is this tuple already taken? (not by us) */
int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
		  const struct nf_conn *ignored_conntrack)
{
	/* Conntrack tracking doesn't keep track of outgoing tuples; only
	 * incoming ones.  NAT means they don't have a fixed mapping,
	 * so we invert the tuple and look for the incoming reply.
	 *
	 * We could keep a separate hash if this proves too slow.
	 */
	struct nf_conntrack_tuple reply;

	nf_ct_invert_tuplepr(&reply, tuple);
	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}
EXPORT_SYMBOL(nf_nat_used_tuple);
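
/* Worked example (illustrative): for an outgoing UDP tuple
 * 10.0.0.1:1234 -> 8.8.8.8:53, the inverted tuple is
 * 8.8.8.8:53 -> 10.0.0.1:1234, i.e. the reply direction that conntrack
 * actually indexes. A candidate tuple is therefore "used" iff some
 * other connection already expects that reply.
 */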
static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
				 const struct nf_nat_range2 *range)
{
	if (t->src.l3num == NFPROTO_IPV4)
		return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
		       ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);

	return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
	       ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
}
/* Is the manipulable part of the tuple between min and max incl? */
static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
			     enum nf_nat_manip_type maniptype,
			     const union nf_conntrack_man_proto *min,
			     const union nf_conntrack_man_proto *max)
{
	__be16 port;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP: /* fallthrough */
	case IPPROTO_ICMPV6:
		return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
		       ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
	case IPPROTO_GRE: /* all fall through */
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
	case IPPROTO_SCTP:
	case IPPROTO_DCCP:
		if (maniptype == NF_NAT_MANIP_SRC)
			port = tuple->src.u.all;
		else
			port = tuple->dst.u.all;

		return ntohs(port) >= ntohs(min->all) &&
		       ntohs(port) <= ntohs(max->all);
	default:
		return true;
	}
}
/* If we source map this tuple so reply looks like reply_tuple, will
 * that meet the constraints of range.
 */
static int in_range(const struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range2 *range)
{
	/* If we are supposed to map IPs, then we must be in the
	 * range specified, otherwise let this drag us onto a new src IP.
	 */
	if (range->flags & NF_NAT_RANGE_MAP_IPS &&
	    !nf_nat_inet_in_range(tuple, range))
		return 0;

	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
		return 1;

	return l4proto_in_range(tuple, NF_NAT_MANIP_SRC,
				&range->min_proto, &range->max_proto);
}
static inline int
same_src(const struct nf_conn *ct,
	 const struct nf_conntrack_tuple *tuple)
{
	const struct nf_conntrack_tuple *t;

	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
	return (t->dst.protonum == tuple->dst.protonum &&
		nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) &&
		t->src.u.all == tuple->src.u.all);
}
/* Only called for SRC manip */
static int
find_appropriate_src(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *tuple,
		     struct nf_conntrack_tuple *result,
		     const struct nf_nat_range2 *range)
{
	unsigned int h = hash_by_src(net, tuple);
	const struct nf_conn *ct;

	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
		if (same_src(ct, tuple) &&
		    net_eq(net, nf_ct_net(ct)) &&
		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
			/* Copy source part from reply tuple. */
			nf_ct_invert_tuplepr(result,
				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
			result->dst = tuple->dst;

			if (in_range(result, range))
				return 1;
		}
	}
	return 0;
}
/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 */
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
		    struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range2 *range,
		    const struct nf_conn *ct,
		    enum nf_nat_manip_type maniptype)
{
	union nf_inet_addr *var_ipp;
	unsigned int i, max;
	/* Host order */
	u32 minip, maxip, j, dist;
	bool full_range;

	/* No IP mapping?  Do nothing. */
	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
		return;

	if (maniptype == NF_NAT_MANIP_SRC)
		var_ipp = &tuple->src.u3;
	else
		var_ipp = &tuple->dst.u3;

	/* Fast path: only one choice. */
	if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
		*var_ipp = range->min_addr;
		return;
	}

	if (nf_ct_l3num(ct) == NFPROTO_IPV4)
		max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
	else
		max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;

	/* Hashing source and destination IPs gives a fairly even
	 * spread in practice (if there are a small number of IPs
	 * involved, there usually aren't that many connections
	 * anyway).  The consistency means that servers see the same
	 * client coming from the same IP (some Internet Banking sites
	 * like this), even across reboots.
	 */
	j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
		   range->flags & NF_NAT_RANGE_PERSISTENT ?
			0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);

	full_range = false;
	for (i = 0; i <= max; i++) {
		/* If first bytes of the address are at the maximum, use the
		 * distance. Otherwise use the full range.
		 */
		if (!full_range) {
			minip = ntohl((__force __be32)range->min_addr.all[i]);
			maxip = ntohl((__force __be32)range->max_addr.all[i]);
			dist  = maxip - minip + 1;
		} else {
			minip = 0;
			dist  = ~0;
		}

		var_ipp->all[i] = (__force __u32)
			htonl(minip + reciprocal_scale(j, dist));
		if (var_ipp->all[i] != range->max_addr.all[i])
			full_range = true;

		if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
			j ^= (__force u32)tuple->dst.u3.all[i];
	}
}
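
/* Worked example (illustrative, not from the original source): for an
 * IPv4 SNAT range 10.0.0.1-10.0.0.14 there is a single 32-bit word to
 * pick, with dist == 14 candidates, so the chosen source address is
 *
 *	htonl(minip + reciprocal_scale(j, 14))
 *
 * where minip == ntohl(10.0.0.1). Because j is seeded from the packet's
 * addresses, the same client talking to the same server keeps the same
 * external address; with NF_NAT_RANGE_PERSISTENT the destination is
 * ignored, so the mapping is stable across all destinations too.
 */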
/* Alter the per-proto part of the tuple (depending on maniptype), to
 * give a unique tuple in the given range if possible.
 *
 * Per-protocol part of tuple is initialized to the incoming packet.
 */
static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
					const struct nf_nat_range2 *range,
					enum nf_nat_manip_type maniptype,
					const struct nf_conn *ct)
{
	unsigned int range_size, min, max, i, attempts;
	__be16 *keyptr;
	u16 off;
	static const unsigned int max_attempts = 128;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP: /* fallthrough */
	case IPPROTO_ICMPV6:
		/* id is same for either direction... */
		keyptr = &tuple->src.u.icmp.id;
		min = ntohs(range->min_proto.icmp.id);
		range_size = ntohs(range->max_proto.icmp.id) -
			     ntohs(range->min_proto.icmp.id) + 1;
		goto find_free_id;
#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
	case IPPROTO_GRE:
		/* If there is no master conntrack we are not PPTP,
		 * do not change tuples.
		 */
		if (!ct->master)
			return;

		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.gre.key;
		else
			keyptr = &tuple->dst.u.gre.key;

		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 1;
			range_size = 65535;
		} else {
			min = ntohs(range->min_proto.gre.key);
			range_size = ntohs(range->max_proto.gre.key) - min + 1;
		}
		goto find_free_id;
#endif
	case IPPROTO_UDP: /* fallthrough */
	case IPPROTO_UDPLITE: /* fallthrough */
	case IPPROTO_TCP: /* fallthrough */
	case IPPROTO_SCTP: /* fallthrough */
	case IPPROTO_DCCP: /* fallthrough */
		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.all;
		else
			keyptr = &tuple->dst.u.all;
		break;
	default:
		return;
	}

	/* If no range specified... */
	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
		/* If it's dst rewrite, can't change port */
		if (maniptype == NF_NAT_MANIP_DST)
			return;

		if (ntohs(*keyptr) < 1024) {
			/* Loose convention: >> 512 is credential passing */
			if (ntohs(*keyptr) < 512) {
				min = 1;
				range_size = 511 - min + 1;
			} else {
				min = 600;
				range_size = 1023 - min + 1;
			}
		} else {
			min = 1024;
			range_size = 65535 - 1024 + 1;
		}
	} else {
		min = ntohs(range->min_proto.all);
		max = ntohs(range->max_proto.all);
		if (unlikely(max < min))
			swap(max, min);
		range_size = max - min + 1;
	}

find_free_id:
	if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
		off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
	else
		off = prandom_u32();

	attempts = range_size;
	if (attempts > max_attempts)
		attempts = max_attempts;

	/* We are in softirq; doing a search of the entire range risks
	 * soft lockup when all tuples are already used.
	 *
	 * If we can't find any free port from first offset, pick a new
	 * one and try again, with ever smaller search window.
	 */
another_round:
	for (i = 0; i < attempts; i++, off++) {
		*keyptr = htons(min + off % range_size);
		if (!nf_nat_used_tuple(tuple, ct))
			return;
	}

	if (attempts >= range_size || attempts < 16)
		return;
	attempts /= 2;
	off = prandom_u32();
	goto another_round;
}
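
/* Worked example (illustrative): with the default unprivileged port
 * range (min = 1024, range_size = 64512) and a fully exhausted port
 * space, the rounds above try at most 128 + 64 + 32 + 16 + 8 = 248
 * candidate ports before giving up, instead of scanning all 64512
 * ports in softirq context.
 */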
/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __ip_conntrack_confirm and drop the packet. */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
		 const struct nf_conntrack_tuple *orig_tuple,
		 const struct nf_nat_range2 *range,
		 struct nf_conn *ct,
		 enum nf_nat_manip_type maniptype)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);

	zone = nf_ct_zone(ct);

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	 * and that same mapping gives a unique tuple within the given
	 * range, use that.
	 *
	 * This is only required for source (ie. NAT/masq) mappings.
	 * So far, we don't do local source mappings, so multiple
	 * manips not an issue.
	 */
	if (maniptype == NF_NAT_MANIP_SRC &&
	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		/* try the original tuple first */
		if (in_range(orig_tuple, range)) {
			if (!nf_nat_used_tuple(orig_tuple, ct)) {
				*tuple = *orig_tuple;
				return;
			}
		} else if (find_appropriate_src(net, zone,
						orig_tuple, tuple, range)) {
			pr_debug("get_unique_tuple: Found current src map\n");
			if (!nf_nat_used_tuple(tuple, ct))
				return;
		}
	}

	/* 2) Select the least-used IP/proto combination in the given range */
	*tuple = *orig_tuple;
	find_best_ips_proto(zone, tuple, range, ct, maniptype);

	/* 3) The per-protocol part of the manip is made to map into
	 * the range to make a unique tuple.
	 */

	/* Only bother mapping if it's not already in range and unique */
	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
			if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
			    l4proto_in_range(tuple, maniptype,
					     &range->min_proto,
					     &range->max_proto) &&
			    (range->min_proto.all == range->max_proto.all ||
			     !nf_nat_used_tuple(tuple, ct)))
				return;
		} else if (!nf_nat_used_tuple(tuple, ct)) {
			return;
		}
	}

	/* Last chance: get protocol to try to obtain unique tuple. */
	nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);
}
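
/* Example flow (illustrative): masquerading 192.168.1.5:5000 ->
 * 1.2.3.4:80 behind 203.0.113.9, step 1 reuses an existing
 * 192.168.1.5 -> 203.0.113.9 source mapping if it yields a free tuple,
 * step 2 otherwise rewrites the source address to 203.0.113.9, and
 * step 3 only has to move the source port off 5000 if
 * 203.0.113.9:5000 -> 1.2.3.4:80 is already taken.
 */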
struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
{
	struct nf_conn_nat *nat = nfct_nat(ct);

	if (nat)
		return nat;

	if (!nf_ct_is_confirmed(ct))
		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);

	return nat;
}
EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
		  const struct nf_nat_range2 *range,
		  enum nf_nat_manip_type maniptype)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_tuple curr_tuple, new_tuple;

	/* Can't setup nat info for confirmed ct. */
	if (nf_ct_is_confirmed(ct))
		return NF_ACCEPT;

	WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
		maniptype != NF_NAT_MANIP_DST);

	if (WARN_ON(nf_nat_initialized(ct, maniptype)))
		return NF_DROP;

	/* What we've got will look like inverse of reply. Normally
	 * this is what is in the conntrack, except for prior
	 * manipulations (future optimization: if num_manips == 0,
	 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
	 */
	nf_ct_invert_tuplepr(&curr_tuple,
			     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
		struct nf_conntrack_tuple reply;

		/* Alter conntrack table so will recognize replies. */
		nf_ct_invert_tuplepr(&reply, &new_tuple);
		nf_conntrack_alter_reply(ct, &reply);

		/* Non-atomic: we own this at the moment. */
		if (maniptype == NF_NAT_MANIP_SRC)
			ct->status |= IPS_SRC_NAT;
		else
			ct->status |= IPS_DST_NAT;

		if (nfct_help(ct) && !nfct_seqadj(ct))
			if (!nfct_seqadj_ext_add(ct))
				return NF_DROP;
	}

	if (maniptype == NF_NAT_MANIP_SRC) {
		unsigned int srchash;
		spinlock_t *lock;

		srchash = hash_by_src(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
		spin_lock_bh(lock);
		hlist_add_head_rcu(&ct->nat_bysource,
				   &nf_nat_bysource[srchash]);
		spin_unlock_bh(lock);
	}

	/* It's done. */
	if (maniptype == NF_NAT_MANIP_DST)
		ct->status |= IPS_DST_NAT_DONE;
	else
		ct->status |= IPS_SRC_NAT_DONE;

	return NF_ACCEPT;
}
EXPORT_SYMBOL(nf_nat_setup_info);
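
/* Usage sketch (illustrative, not part of this file): a source-NAT
 * target built on this API would, from its hook on an unconfirmed
 * conntrack, do roughly:
 *
 *	union nf_inet_addr newsrc = { .ip = htonl(0xcb007109) };
 *	struct nf_nat_range2 range = {
 *		.flags		= NF_NAT_RANGE_MAP_IPS,
 *		.min_addr	= newsrc,
 *		.max_addr	= newsrc,
 *	};
 *
 *	return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
 *
 * 'newsrc' here is a hypothetical 203.0.113.9; see the real callers in
 * net/netfilter/xt_nat.c.
 */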
static unsigned int
__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
{
	/* Force range to this IP; let proto decide mapping for
	 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
	 * Use reply in case it's already been mangled (eg local packet).
	 */
	union nf_inet_addr ip =
		(manip == NF_NAT_MANIP_SRC ?
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
	struct nf_nat_range2 range = {
		.flags		= NF_NAT_RANGE_MAP_IPS,
		.min_addr	= ip,
		.max_addr	= ip,
	};
	return nf_nat_setup_info(ct, &range, manip);
}

unsigned int
nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
	return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
}
EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
static unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct,
				     enum nf_nat_manip_type mtype,
				     enum ip_conntrack_dir dir)
{
	const struct nf_nat_l3proto *l3proto;
	struct nf_conntrack_tuple target;

	/* We are aiming to look like inverse of other direction. */
	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);

	l3proto = __nf_nat_l3proto_find(target.src.l3num);
	if (!l3proto->manip_pkt(skb, 0, &target, mtype))
		return NF_DROP;

	return NF_ACCEPT;
}
/* Do packet manipulations according to nf_nat_setup_info. */
unsigned int nf_nat_packet(struct nf_conn *ct,
			   enum ip_conntrack_info ctinfo,
			   unsigned int hooknum,
			   struct sk_buff *skb)
{
	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	unsigned int verdict = NF_ACCEPT;
	unsigned long statusbit;

	if (mtype == NF_NAT_MANIP_SRC)
		statusbit = IPS_SRC_NAT;
	else
		statusbit = IPS_DST_NAT;

	/* Invert if this is reply dir. */
	if (dir == IP_CT_DIR_REPLY)
		statusbit ^= IPS_NAT_MASK;

	/* Non-atomic: these bits don't change. */
	if (ct->status & statusbit)
		verdict = nf_nat_manip_pkt(skb, ct, mtype, dir);

	return verdict;
}
EXPORT_SYMBOL_GPL(nf_nat_packet);
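
/* Illustrative truth table for the statusbit logic above (not in the
 * original source): since IPS_NAT_MASK == IPS_SRC_NAT | IPS_DST_NAT,
 * the XOR flips SRC <-> DST for reply-direction packets:
 *
 *	manip	dir		bit tested
 *	SRC	ORIGINAL	IPS_SRC_NAT
 *	SRC	REPLY		IPS_DST_NAT
 *	DST	ORIGINAL	IPS_DST_NAT
 *	DST	REPLY		IPS_SRC_NAT
 */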
unsigned int
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
	       const struct nf_hook_state *state)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	struct nf_conn_nat *nat;
	/* maniptype == SRC for postrouting. */
	enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);

	ct = nf_ct_get(skb, &ctinfo);
	/* Can't track?  It's not due to stress, or conntrack would
	 * have dropped it.  Hence it's the user's responsibility to
	 * packet filter it out, or implement conntrack/NAT for that
	 * protocol. 8) --RR
	 */
	if (!ct)
		return NF_ACCEPT;

	nat = nfct_nat(ct);

	switch (ctinfo) {
	case IP_CT_RELATED:
	case IP_CT_RELATED_REPLY:
		/* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */
	case IP_CT_NEW:
		/* Seen it before?  This can happen for loopback, retrans,
		 * or local packets.
		 */
		if (!nf_nat_initialized(ct, maniptype)) {
			struct nf_nat_lookup_hook_priv *lpriv = priv;
			struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
			unsigned int ret;
			int i;

			if (!e)
				goto null_bind;

			for (i = 0; i < e->num_hook_entries; i++) {
				ret = e->hooks[i].hook(e->hooks[i].priv, skb,
						       state);
				if (ret != NF_ACCEPT)
					return ret;
				if (nf_nat_initialized(ct, maniptype))
					goto do_nat;
			}
null_bind:
			ret = nf_nat_alloc_null_binding(ct, state->hook);
			if (ret != NF_ACCEPT)
				return ret;
		} else {
			pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n",
				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
				 ct, ct->status);
			if (nf_nat_oif_changed(state->hook, ctinfo, nat,
					       state->out))
				goto oif_changed;
		}
		break;
	default:
		/* ESTABLISHED */
		WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
			ctinfo != IP_CT_ESTABLISHED_REPLY);
		if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
			goto oif_changed;
	}
do_nat:
	return nf_nat_packet(ct, ctinfo, state->hook, skb);
oif_changed:
	nf_ct_kill_acct(ct, ctinfo, skb);
	return NF_DROP;
}
EXPORT_SYMBOL_GPL(nf_nat_inet_fn);
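
/* Sketch (illustrative, simplified): each address family wraps
 * nf_nat_inet_fn() and deals with ICMP errors itself, along the lines
 * of:
 *
 *	static unsigned int
 *	nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
 *		       const struct nf_hook_state *state)
 *	{
 *		// related ICMP errors get their embedded tuple
 *		// translated first, then the generic logic above runs
 *		return nf_nat_inet_fn(priv, skb, state);
 *	}
 *
 * see nf_nat_l3proto_ipv4.c for the real version.
 */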
struct nf_nat_proto_clean {
	u8	l3proto;
	u8	l4proto;
};

/* kill conntracks with affected NAT section */
static int nf_nat_proto_remove(struct nf_conn *i, void *data)
{
	const struct nf_nat_proto_clean *clean = data;

	if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
	    (clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
		return 0;

	return i->status & IPS_NAT_MASK ? 1 : 0;
}
static void __nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
	unsigned int h;

	h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
	hlist_del_rcu(&ct->nat_bysource);
	spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
}
static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
{
	if (nf_nat_proto_remove(ct, data))
		return 1;

	/* This module is being removed and conntrack has nat null binding.
	 * Remove it from bysource hash, as the table will be freed soon.
	 *
	 * Else, when the conntrack is destroyed, nf_nat_cleanup_conntrack()
	 * will delete entry from already-freed table.
	 */
	if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status))
		__nf_nat_cleanup_conntrack(ct);

	/* don't delete conntrack.  Although that would make things a lot
	 * simpler, we'd end up flushing all conntracks on nat rmmod.
	 */
	return 0;
}
static void nf_nat_l3proto_clean(u8 l3proto)
{
	struct nf_nat_proto_clean clean = {
		.l3proto = l3proto,
	};

	nf_ct_iterate_destroy(nf_nat_proto_remove, &clean);
}
int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
{
	RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
	return 0;
}
EXPORT_SYMBOL_GPL(nf_nat_l3proto_register);

void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto)
{
	mutex_lock(&nf_nat_proto_mutex);
	RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], NULL);
	mutex_unlock(&nf_nat_proto_mutex);
	synchronize_rcu();

	nf_nat_l3proto_clean(l3proto->l3proto);
}
EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
/* No one using conntrack by the time this is called. */
static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
	if (ct->status & IPS_SRC_NAT_DONE)
		__nf_nat_cleanup_conntrack(ct);
}

static struct nf_ct_ext_type nat_extend __read_mostly = {
	.len		= sizeof(struct nf_conn_nat),
	.align		= __alignof__(struct nf_conn_nat),
	.destroy	= nf_nat_cleanup_conntrack,
	.id		= NF_CT_EXT_NAT,
};
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 },
	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 },
};

static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
					  struct nf_nat_range2 *range)
{
	if (tb[CTA_PROTONAT_PORT_MIN]) {
		range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
		range->max_proto.all = range->min_proto.all;
		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
	}
	if (tb[CTA_PROTONAT_PORT_MAX]) {
		range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
	}
	return 0;
}

static int nfnetlink_parse_nat_proto(struct nlattr *attr,
				     const struct nf_conn *ct,
				     struct nf_nat_range2 *range)
{
	struct nlattr *tb[CTA_PROTONAT_MAX+1];
	int err;

	err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr,
			       protonat_nla_policy, NULL);
	if (err < 0)
		return err;

	return nf_nat_l4proto_nlattr_to_range(tb, range);
}
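
/* Attribute layout example (illustrative): a ctnetlink message carrying
 *
 *	CTA_NAT_V4_MINIP	= 203.0.113.9
 *	CTA_NAT_PROTO (nested)
 *		CTA_PROTONAT_PORT_MIN	= 4000
 *		CTA_PROTONAT_PORT_MAX	= 4099
 *
 * parses into a range with NF_NAT_RANGE_PROTO_SPECIFIED set and
 * min_proto.all/max_proto.all covering ports 4000-4099; the IP part is
 * filled in by the per-family nlattr_to_range() callback used below.
 */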
static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
	[CTA_NAT_V4_MINIP]	= { .type = NLA_U32 },
	[CTA_NAT_V4_MAXIP]	= { .type = NLA_U32 },
	[CTA_NAT_V6_MINIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_V6_MAXIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_PROTO]		= { .type = NLA_NESTED },
};

static int
nfnetlink_parse_nat(const struct nlattr *nat,
		    const struct nf_conn *ct, struct nf_nat_range2 *range,
		    const struct nf_nat_l3proto *l3proto)
{
	struct nlattr *tb[CTA_NAT_MAX+1];
	int err;

	memset(range, 0, sizeof(*range));

	err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy, NULL);
	if (err < 0)
		return err;

	err = l3proto->nlattr_to_range(tb, range);
	if (err < 0)
		return err;

	if (!tb[CTA_NAT_PROTO])
		return 0;

	return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
}
/* This function is called under rcu_read_lock() */
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
{
	struct nf_nat_range2 range;
	const struct nf_nat_l3proto *l3proto;
	int err;

	/* Should not happen, restricted to creating new conntracks
	 * via ctnetlink.
	 */
	if (WARN_ON_ONCE(nf_nat_initialized(ct, manip)))
		return -EEXIST;

	/* Make sure that L3 NAT is there by when we call nf_nat_setup_info to
	 * attach the null binding, otherwise this may oops.
	 */
	l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
	if (l3proto == NULL)
		return -EAGAIN;

	/* No NAT information has been passed, allocate the null-binding */
	if (attr == NULL)
		return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0;

	err = nfnetlink_parse_nat(attr, ct, &range, l3proto);
	if (err < 0)
		return err;

	return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
}
#else
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
{
	return -EOPNOTSUPP;
}
#endif /* CONFIG_NF_CT_NETLINK */
static struct nf_ct_helper_expectfn follow_master_nat = {
	.name		= "nat-follow-master",
	.expectfn	= nf_nat_follow_master,
};
int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops,
		       const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count)
{
	struct nat_net *nat_net = net_generic(net, nat_net_id);
	struct nf_nat_hooks_net *nat_proto_net;
	struct nf_nat_lookup_hook_priv *priv;
	unsigned int hooknum = ops->hooknum;
	struct nf_hook_ops *nat_ops;
	int i, ret;

	if (WARN_ON_ONCE(ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
		return -EINVAL;

	nat_proto_net = &nat_net->nat_proto_net[ops->pf];

	for (i = 0; i < ops_count; i++) {
		if (WARN_ON(orig_nat_ops[i].pf != ops->pf))
			return -EINVAL;
		if (orig_nat_ops[i].hooknum == hooknum) {
			hooknum = i;
			break;
		}
	}
	if (WARN_ON_ONCE(i == ops_count))
		return -EINVAL;

	mutex_lock(&nf_nat_proto_mutex);
	if (!nat_proto_net->nat_hook_ops) {
		WARN_ON(nat_proto_net->users != 0);

		nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL);
		if (!nat_ops) {
			mutex_unlock(&nf_nat_proto_mutex);
			return -ENOMEM;
		}
		for (i = 0; i < ops_count; i++) {
			priv = kzalloc(sizeof(*priv), GFP_KERNEL);
			if (priv) {
				nat_ops[i].priv = priv;
				continue;
			}
			mutex_unlock(&nf_nat_proto_mutex);
			while (i)
				kfree(nat_ops[--i].priv);
			kfree(nat_ops);
			return -ENOMEM;
		}
		ret = nf_register_net_hooks(net, nat_ops, ops_count);
		if (ret < 0) {
			mutex_unlock(&nf_nat_proto_mutex);
			for (i = 0; i < ops_count; i++)
				kfree(nat_ops[i].priv);
			kfree(nat_ops);
			return ret;
		}
		nat_proto_net->nat_hook_ops = nat_ops;
	}

	nat_ops = nat_proto_net->nat_hook_ops;
	priv = nat_ops[hooknum].priv;
	if (WARN_ON_ONCE(!priv)) {
		mutex_unlock(&nf_nat_proto_mutex);
		return -EOPNOTSUPP;
	}

	ret = nf_hook_entries_insert_raw(&priv->entries, ops);
	if (ret == 0)
		nat_proto_net->users++;

	mutex_unlock(&nf_nat_proto_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(nf_nat_register_fn);
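
/* Usage sketch (illustrative): the per-family glue registers a ruleset
 * hook against its static template of NAT hook ops, roughly:
 *
 *	ret = nf_nat_register_fn(net, my_ops, nf_nat_ipv4_ops,
 *				 ARRAY_SIZE(nf_nat_ipv4_ops));
 *
 * where 'my_ops' is the caller's nf_hook_ops and nf_nat_ipv4_ops is the
 * family's hook template (names per the IPv4 glue; treat them as
 * assumptions). The first registration instantiates the real netfilter
 * hooks; later ones only extend the per-hook entry list.
 */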
void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops,
			  unsigned int ops_count)
{
	struct nat_net *nat_net = net_generic(net, nat_net_id);
	struct nf_nat_hooks_net *nat_proto_net;
	struct nf_nat_lookup_hook_priv *priv;
	struct nf_hook_ops *nat_ops;
	int hooknum = ops->hooknum;
	int i;

	if (ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net))
		return;

	nat_proto_net = &nat_net->nat_proto_net[ops->pf];

	mutex_lock(&nf_nat_proto_mutex);
	if (WARN_ON(nat_proto_net->users == 0))
		goto unlock;

	nat_proto_net->users--;

	nat_ops = nat_proto_net->nat_hook_ops;
	for (i = 0; i < ops_count; i++) {
		if (nat_ops[i].hooknum == hooknum) {
			hooknum = i;
			break;
		}
	}
	if (WARN_ON_ONCE(i == ops_count))
		goto unlock;
	priv = nat_ops[hooknum].priv;
	nf_hook_entries_delete_raw(&priv->entries, ops);

	if (nat_proto_net->users == 0) {
		nf_unregister_net_hooks(net, nat_ops, ops_count);

		for (i = 0; i < ops_count; i++) {
			priv = nat_ops[i].priv;
			kfree_rcu(priv, rcu_head);
		}

		nat_proto_net->nat_hook_ops = NULL;
		kfree(nat_ops);
	}
unlock:
	mutex_unlock(&nf_nat_proto_mutex);
}
EXPORT_SYMBOL_GPL(nf_nat_unregister_fn);
static struct pernet_operations nat_net_ops = {
	.id = &nat_net_id,
	.size = sizeof(struct nat_net),
};

static struct nf_nat_hook nat_hook = {
	.parse_nat_setup	= nfnetlink_parse_nat_setup,
#ifdef CONFIG_XFRM
	.decode_session		= __nf_nat_decode_session,
#endif
	.manip_pkt		= nf_nat_manip_pkt,
};
static int __init nf_nat_init(void)
{
	int ret, i;

	/* Leave them the same for the moment. */
	nf_nat_htable_size = nf_conntrack_htable_size;
	if (nf_nat_htable_size < CONNTRACK_LOCKS)
		nf_nat_htable_size = CONNTRACK_LOCKS;

	nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
	if (!nf_nat_bysource)
		return -ENOMEM;

	ret = nf_ct_extend_register(&nat_extend);
	if (ret < 0) {
		kvfree(nf_nat_bysource);
		pr_err("Unable to register extension\n");
		return ret;
	}

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_lock_init(&nf_nat_locks[i]);

	ret = register_pernet_subsys(&nat_net_ops);
	if (ret < 0) {
		nf_ct_extend_unregister(&nat_extend);
		kvfree(nf_nat_bysource);
		return ret;
	}

	nf_ct_helper_expectfn_register(&follow_master_nat);

	WARN_ON(nf_nat_hook != NULL);
	RCU_INIT_POINTER(nf_nat_hook, &nat_hook);

	return 0;
}
static void __exit nf_nat_cleanup(void)
{
	struct nf_nat_proto_clean clean = {};

	nf_ct_iterate_destroy(nf_nat_proto_clean, &clean);

	nf_ct_extend_unregister(&nat_extend);
	nf_ct_helper_expectfn_unregister(&follow_master_nat);
	RCU_INIT_POINTER(nf_nat_hook, NULL);

	synchronize_net();
	kvfree(nf_nat_bysource);
	unregister_pernet_subsys(&nat_net_ops);
}

MODULE_LICENSE("GPL");

module_init(nf_nat_init);
module_exit(nf_nat_cleanup);