/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *      - new API and handling of conntrack/nat helpers
 *      - now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *      - add usage/reference counts to ip_conntrack_expect
 *      - export ip_conntrack[_expect]_{find_get,put} functions
 * */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
/* For ERR_PTR().  Yeah, I know... --RR */
#include <linux/fs.h>

/* This rwlock protects the main hash table, protocol/helper/expected
   registrations, and conntrack timers. */
#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define IP_CONNTRACK_VERSION    "2.1"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DECLARE_RWLOCK(ip_conntrack_lock);
DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
LIST_HEAD(protocol_list);
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
#ifdef CONFIG_MIPS_BRCM
int ip_conntrack_max = 0;
#else
static int ip_conntrack_max = 0;
#endif
static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep;
struct ip_conntrack ip_conntrack_untracked;

extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;

static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
                              u_int8_t protocol)
{
        return protocol == curr->proto;
}

struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        p = LIST_FIND(&protocol_list, proto_cmpfn,
                      struct ip_conntrack_protocol *, protocol);
        if (!p)
                p = &ip_conntrack_generic_protocol;

        return p;
}

struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        READ_LOCK(&ip_conntrack_lock);
        p = __ip_ct_find_proto(protocol);
        READ_UNLOCK(&ip_conntrack_lock);
        return p;
}

inline void
ip_conntrack_put(struct ip_conntrack *ct)
{
        IP_NF_ASSERT(ct);
        IP_NF_ASSERT(ct->infos[0].master);
        /* nf_conntrack_put wants to go via an info struct, so feed it
           one at random. */
        nf_conntrack_put(&ct->infos[0]);
}

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;

static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
#if 0
        dump_tuple(tuple);
#endif
        return (jhash_3words(tuple->src.ip,
                             (tuple->dst.ip ^ tuple->dst.protonum),
                             (tuple->src.u.all | (tuple->dst.u.all << 16)),
                             ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
}
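
/* Illustration of the word packing above: for a TCP tuple
 * 192.168.1.2:1025 -> 10.0.0.1:80, the three mixed words are
 *
 *     a = src.ip                  (192.168.1.2)
 *     b = dst.ip ^ protonum       (10.0.0.1 ^ IPPROTO_TCP)
 *     c = sport | (dport << 16)   (1025 | (80 << 16), network order)
 *
 * jhash_3words() folds in ip_conntrack_hash_rnd (seeded lazily in
 * init_conntrack() below), so remote hosts cannot predict which
 * bucket a flow lands in and flood a single hash chain. */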

int
get_tuple(const struct iphdr *iph,
          const struct sk_buff *skb,
          unsigned int dataoff,
          struct ip_conntrack_tuple *tuple,
          const struct ip_conntrack_protocol *protocol)
{
        /* Should never happen */
        if (iph->frag_off & htons(IP_OFFSET)) {
                printk("ip_conntrack_core: Frag of proto %u.\n",
                       iph->protocol);
                return 0;
        }

        tuple->src.ip = iph->saddr;
        tuple->dst.ip = iph->daddr;
        tuple->dst.protonum = iph->protocol;
        tuple->src.u.all = tuple->dst.u.all = 0;

        return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

static int
invert_tuple(struct ip_conntrack_tuple *inverse,
             const struct ip_conntrack_tuple *orig,
             const struct ip_conntrack_protocol *protocol)
{
        inverse->src.ip = orig->dst.ip;
        inverse->dst.ip = orig->src.ip;
        inverse->dst.protonum = orig->dst.protonum;

        inverse->src.u.all = inverse->dst.u.all = 0;

        return protocol->invert_tuple(inverse, orig);
}
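
/* For illustration: inverting 192.168.1.2:1025 -> 10.0.0.1:80/TCP
 * yields 10.0.0.1:80 -> 192.168.1.2:1025/TCP; the per-protocol
 * invert_tuple() callback fills in the port (or ICMP id/type)
 * fields zeroed above. */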

/* ip_conntrack_expect helper functions */

/* Compare tuple parts depending on mask. */
static inline int expect_cmp(const struct ip_conntrack_expect *i,
                             const struct ip_conntrack_tuple *tuple)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
        return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
}

static void
destroy_expect(struct ip_conntrack_expect *exp)
{
        DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
        IP_NF_ASSERT(atomic_read(&exp->use) == 0);
        IP_NF_ASSERT(!timer_pending(&exp->timeout));

        kfree(exp);
}

inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
        IP_NF_ASSERT(exp);

        if (atomic_dec_and_test(&exp->use)) {
                /* usage count dropped to zero */
                destroy_expect(exp);
        }
}

static inline struct ip_conntrack_expect *
__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
        return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
                         struct ip_conntrack_expect *, tuple);
}

/* Find an expectation corresponding to a tuple. */
struct ip_conntrack_expect *
ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *exp;

        READ_LOCK(&ip_conntrack_lock);
        READ_LOCK(&ip_conntrack_expect_tuple_lock);
        exp = __ip_ct_expect_find(tuple);
        if (exp)
                atomic_inc(&exp->use);
        READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
        READ_UNLOCK(&ip_conntrack_lock);

        return exp;
}

/* remove one specific expectation from all lists and drop refcount,
 * does _NOT_ delete the timer. */
static void __unexpect_related(struct ip_conntrack_expect *expect)
{
        DEBUGP("unexpect_related(%p)\n", expect);
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

        /* we're not allowed to unexpect a confirmed expectation! */
        IP_NF_ASSERT(!expect->sibling);

        /* delete from global and local lists */
        list_del(&expect->list);
        list_del(&expect->expected_list);

        /* decrement expect-count of master conntrack */
        if (expect->expectant)
                expect->expectant->expecting--;

        ip_conntrack_expect_put(expect);
}

/* remove one specific expectation from all lists, drop refcount
 * and expire timer.
 * This function can _NOT_ be called for confirmed expects! */
static void unexpect_related(struct ip_conntrack_expect *expect)
{
        IP_NF_ASSERT(expect->expectant);
        IP_NF_ASSERT(expect->expectant->helper);
        /* if we are supposed to have a timer, but we can't delete
         * it: race condition.  __unexpect_related will
         * be called by the timeout function */
        if (expect->expectant->helper->timeout
            && !del_timer(&expect->timeout))
                return;

        __unexpect_related(expect);
}

/* delete all unconfirmed expectations for this conntrack */
static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
{
        struct list_head *exp_entry, *next;
        struct ip_conntrack_expect *exp;

        DEBUGP("remove_expectations(%p)\n", ct);

        list_for_each_safe(exp_entry, next, &ct->sibling_list) {
                exp = list_entry(exp_entry, struct ip_conntrack_expect,
                                 expected_list);

                /* we skip established expectations, as we want to delete
                 * the un-established ones only */
                if (exp->sibling) {
                        DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
                        if (drop_refcount) {
                                /* Indicate that this expectation's parent is dead */
                                ip_conntrack_put(exp->expectant);
                                exp->expectant = NULL;
                        }
                        continue;
                }

                IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
                IP_NF_ASSERT(exp->expectant == ct);

                /* delete expectation from global and private lists */
                unexpect_related(exp);
        }
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
        unsigned int ho, hr;

        DEBUGP("clean_from_lists(%p)\n", ct);
        MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);

        ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
        LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
        LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

        /* Destroy all un-established, pending expectations */
        remove_expectations(ct, 1);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
        struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
        struct ip_conntrack_protocol *proto;

        DEBUGP("destroy_conntrack(%p)\n", ct);
        IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
        IP_NF_ASSERT(!timer_pending(&ct->timeout));

        /* To make sure we don't get any weird locking issues here:
         * destroy_conntrack() MUST NOT be called with a write lock
         * to ip_conntrack_lock!!! -HW */
        proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
        if (proto && proto->destroy)
                proto->destroy(ct);

        if (ip_conntrack_destroyed)
                ip_conntrack_destroyed(ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* Make sure we don't leave any orphaned expectations lying around */
        if (ct->expecting)
                remove_expectations(ct, 1);

        /* Delete our master expectation */
        if (ct->master) {
                if (ct->master->expectant) {
                        /* can't call __unexpect_related here,
                         * since it would screw up expect_list */
                        list_del(&ct->master->expected_list);
                        master = ct->master->expectant;
                }
                kfree(ct->master);
        }
        WRITE_UNLOCK(&ip_conntrack_lock);

        if (master)
                ip_conntrack_put(master);

        DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
        kmem_cache_free(ip_conntrack_cachep, ct);
        atomic_dec(&ip_conntrack_count);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
        struct ip_conntrack *ct = (void *)ul_conntrack;

        WRITE_LOCK(&ip_conntrack_lock);
        clean_from_lists(ct);
        WRITE_UNLOCK(&ip_conntrack_lock);
        ip_conntrack_put(ct);
}

static inline int
conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
                    const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        return i->ctrack != ignored_conntrack
                && ip_ct_tuple_equal(tuple, &i->tuple);
}

static struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int hash = hash_conntrack(tuple);

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        h = LIST_FIND(&ip_conntrack_hash[hash],
                      conntrack_tuple_cmp,
                      struct ip_conntrack_tuple_hash *,
                      tuple, ignored_conntrack);
        return h;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
                      const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        return h;
}

static inline struct ip_conntrack *
__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack *ct
                = (struct ip_conntrack *)nfct->master;

        /* ctinfo is the index of the nfct inside the conntrack */
        *ctinfo = nfct - ct->infos;
        IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
        return ct;
}

/* Return conntrack and conntrack_info given skb->nfct->master */
struct ip_conntrack *
ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
{
        if (skb->nfct)
                return __ip_conntrack_get(skb->nfct, ctinfo);
        return NULL;
}

/* Confirm a connection given skb->nfct; places it in hash table */
int
__ip_conntrack_confirm(struct nf_ct_info *nfct)
{
        unsigned int hash, repl_hash;
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = __ip_conntrack_get(nfct, &ctinfo);

        /* ipt_REJECT uses ip_conntrack_attach to attach related
           ICMP/TCP RST packets in other direction.  Actual packet
           which created connection will be IP_CT_NEW or for an
           expected connection, IP_CT_RELATED. */
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return NF_ACCEPT;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        /* We're not in hash table, and we refuse to set up related
           connections for unconfirmed conns.  But packet copies and
           REJECT will give spurious warnings here. */
        /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

        /* No external references means no one else could have
           confirmed us. */
        IP_NF_ASSERT(!is_confirmed(ct));
        DEBUGP("Confirming conntrack %p\n", ct);

        WRITE_LOCK(&ip_conntrack_lock);
        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost race. */
        if (!LIST_FIND(&ip_conntrack_hash[hash],
                       conntrack_tuple_cmp,
                       struct ip_conntrack_tuple_hash *,
                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
            && !LIST_FIND(&ip_conntrack_hash[repl_hash],
                          conntrack_tuple_cmp,
                          struct ip_conntrack_tuple_hash *,
                          &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
                list_prepend(&ip_conntrack_hash[hash],
                             &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
                list_prepend(&ip_conntrack_hash[repl_hash],
                             &ct->tuplehash[IP_CT_DIR_REPLY]);
                /* Timer relative to confirmation time, not original
                   setting time, otherwise we'd get timer wrap in
                   weird delay cases. */
                ct->timeout.expires += jiffies;
                add_timer(&ct->timeout);
                atomic_inc(&ct->ct_general.use);
                set_bit(IPS_CONFIRMED_BIT, &ct->status);
                WRITE_UNLOCK(&ip_conntrack_lock);
                return NF_ACCEPT;
        }

        WRITE_UNLOCK(&ip_conntrack_lock);
        return NF_DROP;
}

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
                         const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        READ_LOCK(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        READ_UNLOCK(&ip_conntrack_lock);

        return h != NULL;
}

/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
struct ip_conntrack *
icmp_error_track(struct sk_buff *skb,
                 enum ip_conntrack_info *ctinfo,
                 unsigned int hooknum)
{
        struct ip_conntrack_tuple innertuple, origtuple;
        struct {
                struct icmphdr icmp;
                struct iphdr ip;
        } inside;
        struct ip_conntrack_protocol *innerproto;
        struct ip_conntrack_tuple_hash *h;
        int dataoff;

        IP_NF_ASSERT(skb->nfct == NULL);

        /* Not enough header? */
        if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0)
                return NULL;

        if (inside.icmp.type != ICMP_DEST_UNREACH
            && inside.icmp.type != ICMP_SOURCE_QUENCH
            && inside.icmp.type != ICMP_TIME_EXCEEDED
            && inside.icmp.type != ICMP_PARAMETERPROB
            && inside.icmp.type != ICMP_REDIRECT)
                return NULL;

        /* Ignore ICMP's containing fragments (shouldn't happen) */
        if (inside.ip.frag_off & htons(IP_OFFSET)) {
                DEBUGP("icmp_error_track: fragment of proto %u\n",
                       inside.ip.protocol);
                return NULL;
        }

        innerproto = ip_ct_find_proto(inside.ip.protocol);
        dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4;
        /* Are they talking about one of our connections? */
        if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) {
                DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol);
                return NULL;
        }

        /* Ordinarily, we'd expect the inverted tupleproto, but it's
           been preserved inside the ICMP. */
        if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
                DEBUGP("icmp_error_track: Can't invert tuple\n");
                return NULL;
        }

        *ctinfo = IP_CT_RELATED;

        h = ip_conntrack_find_get(&innertuple, NULL);
        if (!h) {
                /* Locally generated ICMPs will match inverted if they
                   haven't been SNAT'ed yet */
                /* FIXME: NAT code has to handle half-done double NAT --RR */
                if (hooknum == NF_IP_LOCAL_OUT)
                        h = ip_conntrack_find_get(&origtuple, NULL);

                if (!h) {
                        DEBUGP("icmp_error_track: no match\n");
                        return NULL;
                }
                /* Reverse direction from that found */
                if (DIRECTION(h) != IP_CT_DIR_REPLY)
                        *ctinfo += IP_CT_IS_REPLY;
        } else {
                if (DIRECTION(h) == IP_CT_DIR_REPLY)
                        *ctinfo += IP_CT_IS_REPLY;
        }

        /* Update skb to refer to this connection */
        skb->nfct = &h->ctrack->infos[*ctinfo];
        return h->ctrack;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
        return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
}

static int early_drop(struct list_head *chain)
{
        /* Traverse backwards: gives us oldest, which is roughly LRU */
        struct ip_conntrack_tuple_hash *h;
        int dropped = 0;

        READ_LOCK(&ip_conntrack_lock);
        h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        if (!h)
                return dropped;

        if (del_timer(&h->ctrack->timeout)) {
                death_by_timeout((unsigned long)h->ctrack);
                dropped = 1;
        }
        ip_conntrack_put(h->ctrack);
        return dropped;
}

#if defined(CONFIG_MIPS_BRCM)
static inline int regardless(const struct ip_conntrack_tuple_hash *i)
{
        return 1;
}

static int regardless_drop(struct list_head *chain)
{
        /* Traverse backwards: gives us oldest, which is roughly LRU */
        struct ip_conntrack_tuple_hash *h;
        int dropped = 0;

        READ_LOCK(&ip_conntrack_lock);
        h = LIST_FIND_B(chain, regardless, struct ip_conntrack_tuple_hash *);
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        if (!h)
                return dropped;

        if (del_timer(&h->ctrack->timeout)) {
                death_by_timeout((unsigned long)h->ctrack);
                dropped = 1;
        }
        ip_conntrack_put(h->ctrack);
        return dropped;
}
#endif
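
/* A note on the two eviction paths above: early_drop() only evicts
 * entries not yet marked IPS_ASSURED, so when the table fills up a
 * new packet is normally dropped rather than an established
 * connection.  The CONFIG_MIPS_BRCM variant adds regardless_drop(),
 * which evicts the oldest entry of a chain unconditionally so that
 * new flows are always admitted on these small-memory routers (see
 * its use in init_conntrack() below). */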

static inline int helper_cmp(const struct ip_conntrack_helper *i,
                             const struct ip_conntrack_tuple *rtuple)
{
        return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
{
        return LIST_FIND(&helpers, helper_cmp,
                         struct ip_conntrack_helper *,
                         tuple);
}

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct ip_conntrack_tuple_hash *
init_conntrack(const struct ip_conntrack_tuple *tuple,
               struct ip_conntrack_protocol *protocol,
               struct sk_buff *skb)
{
        struct ip_conntrack *conntrack;
        struct ip_conntrack_tuple repl_tuple;
        size_t hash;
        struct ip_conntrack_expect *expected;
        int i;
        static unsigned int drop_next;

        if (!ip_conntrack_hash_rnd_initted) {
                get_random_bytes(&ip_conntrack_hash_rnd, 4);
                ip_conntrack_hash_rnd_initted = 1;
        }

        hash = hash_conntrack(tuple);

        if (ip_conntrack_max &&
            atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
                /* Try dropping from a random chain, or else from the
                   chain we're about to insert into (in case they're
                   trying to bomb one hash chain). */
                unsigned int next = (drop_next++)%ip_conntrack_htable_size;

                if (!early_drop(&ip_conntrack_hash[next])
                    && !early_drop(&ip_conntrack_hash[hash])) {
#if defined(CONFIG_MIPS_BRCM)
                        /* Sorry, we have to kick one out regardless. */
                        while (!regardless_drop(&ip_conntrack_hash[next]))
                                next = (drop_next++)%ip_conntrack_htable_size;
#else
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: table full, dropping"
                                       " packet.\n");
                        return ERR_PTR(-ENOMEM);
#endif
                }
        }

        if (!invert_tuple(&repl_tuple, tuple, protocol)) {
                DEBUGP("Can't invert tuple.\n");
                return NULL;
        }

        conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
        if (!conntrack) {
                DEBUGP("Can't allocate conntrack.\n");
                return ERR_PTR(-ENOMEM);
        }

        memset(conntrack, 0, sizeof(*conntrack));
        atomic_set(&conntrack->ct_general.use, 1);
        conntrack->ct_general.destroy = destroy_conntrack;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
        conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
        for (i = 0; i < IP_CT_NUMBER; i++)
                conntrack->infos[i].master = &conntrack->ct_general;

        if (!protocol->new(conntrack, skb)) {
                kmem_cache_free(ip_conntrack_cachep, conntrack);
                return NULL;
        }
        /* Don't set timer yet: wait for confirmation */
        init_timer(&conntrack->timeout);
        conntrack->timeout.data = (unsigned long)conntrack;
        conntrack->timeout.function = death_by_timeout;

        INIT_LIST_HEAD(&conntrack->sibling_list);

        WRITE_LOCK(&ip_conntrack_lock);
        /* Need finding and deleting of expected ONLY if we win race */
        READ_LOCK(&ip_conntrack_expect_tuple_lock);
        expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
                             struct ip_conntrack_expect *, tuple);
        READ_UNLOCK(&ip_conntrack_expect_tuple_lock);

        /* If master is not in hash table yet (ie. packet hasn't left
           this machine yet), how can other end know about expected?
           Hence these are not the droids you are looking for (if
           master ct never got confirmed, we'd hold a reference to it
           and weird things would happen to future packets). */
        if (expected && !is_confirmed(expected->expectant))
                expected = NULL;

        /* Look up the conntrack helper for master connections only */
        if (!expected)
                conntrack->helper = ip_ct_find_helper(&repl_tuple);

        /* If the expectation is dying, then this is a loser. */
        if (expected
            && expected->expectant->helper->timeout
            && !del_timer(&expected->timeout))
                expected = NULL;

        if (expected) {
                DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
                        conntrack, expected);
                /* Welcome, Mr. Bond.  We've been expecting you... */
                __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
                conntrack->master = expected;
                expected->sibling = conntrack;
#ifdef CONFIG_IP_NF_CONNTRACK_MARK
                conntrack->mark = expected->expectant->mark;
#endif
                LIST_DELETE(&ip_conntrack_expect_list, expected);
                expected->expectant->expecting--;
                nf_conntrack_get(&master_ct(conntrack)->infos[0]);
        }
        atomic_inc(&ip_conntrack_count);
        WRITE_UNLOCK(&ip_conntrack_lock);

        if (expected && expected->expectfn)
                expected->expectfn(conntrack);
        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
                  struct ip_conntrack_protocol *proto,
                  int *set_reply,
                  unsigned int hooknum,
                  enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack_tuple tuple;
        struct ip_conntrack_tuple_hash *h;

        IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

        if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto))
                return NULL;

        /* look for tuple match */
        h = ip_conntrack_find_get(&tuple, NULL);
        if (!h) {
                h = init_conntrack(&tuple, proto, skb);
                if (!h)
                        return NULL;
                if (IS_ERR(h))
                        return (void *)h;
        }

        /* It exists; we have (non-exclusive) reference. */
        if (DIRECTION(h) == IP_CT_DIR_REPLY) {
                *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
                /* Please set reply bit if this packet is OK */
                *set_reply = 1;
        } else {
                /* Once we've had two way comms, always ESTABLISHED. */
                if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
                        DEBUGP("ip_conntrack_in: normal packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_ESTABLISHED;
                } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
                        DEBUGP("ip_conntrack_in: related packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_RELATED;
                } else {
                        DEBUGP("ip_conntrack_in: new packet for %p\n",
                               h->ctrack);
                        *ctinfo = IP_CT_NEW;
                }
                *set_reply = 0;
        }
        skb->nfct = &h->ctrack->infos[*ctinfo];
        return h->ctrack;
}

/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
                             struct sk_buff **pskb,
                             const struct net_device *in,
                             const struct net_device *out,
                             int (*okfn)(struct sk_buff *))
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
        struct ip_conntrack_protocol *proto;
        int set_reply;
        int ret;

        /* Should never happen */
        if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
                if (net_ratelimit()) {
                        printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
                               (*pskb)->nh.iph->protocol, hooknum);
                }
                return NF_DROP;
        }

        /* FIXME: Do this right please. --RR */
        (*pskb)->nfcache |= NFC_UNKNOWN;

/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
        /* Ignore broadcast: no `connection'. */
        if ((*pskb)->pkt_type == PACKET_BROADCAST) {
                printk("Broadcast packet!\n");
                return NF_ACCEPT;
        } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
                   == htonl(0x000000FF)) {
                printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
                       NIPQUAD((*pskb)->nh.iph->saddr),
                       NIPQUAD((*pskb)->nh.iph->daddr),
                       (*pskb)->sk, (*pskb)->pkt_type);
        }
#endif

        /* Previously seen (loopback or untracked)?  Ignore. */
        if ((*pskb)->nfct)
                return NF_ACCEPT;

        proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);

        /* It may be an icmp error... */
        if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
            && icmp_error_track(*pskb, &ctinfo, hooknum))
                return NF_ACCEPT;

        if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo)))
                /* Not valid part of a connection */
                return NF_ACCEPT;

        if (IS_ERR(ct))
                /* Too stressed to deal. */
                return NF_DROP;

        IP_NF_ASSERT((*pskb)->nfct);

        ret = proto->packet(ct, *pskb, ctinfo);
        if (ret == -1) {
                /* Invalid */
                nf_conntrack_put((*pskb)->nfct);
                (*pskb)->nfct = NULL;
                return NF_ACCEPT;
        }

        if (ret != NF_DROP && ct->helper) {
                ret = ct->helper->help(*pskb, ct, ctinfo);
                if (ret == -1) {
                        /* Invalid */
                        nf_conntrack_put((*pskb)->nfct);
                        (*pskb)->nfct = NULL;
                        return NF_ACCEPT;
                }
        }
        if (set_reply)
                set_bit(IPS_SEEN_REPLY_BIT, &ct->status);

        return ret;
}
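
/* The hook above is registered elsewhere (ip_conntrack_standalone.c
 * in mainline kernels).  A minimal sketch of such a registration,
 * for orientation only:
 *
 *     static struct nf_hook_ops ip_conntrack_in_ops = {
 *             .hook     = ip_conntrack_in,
 *             .owner    = THIS_MODULE,
 *             .pf       = PF_INET,
 *             .hooknum  = NF_IP_PRE_ROUTING,
 *             .priority = NF_IP_PRI_CONNTRACK,
 *     };
 *
 *     nf_register_hook(&ip_conntrack_in_ops);
 */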

int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig)
{
        return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
}

static inline int resent_expect(const struct ip_conntrack_expect *i,
                                const struct ip_conntrack_tuple *tuple,
                                const struct ip_conntrack_tuple *mask)
{
        DEBUGP("resent_expect\n");
        DEBUGP("   tuple:   "); DUMP_TUPLE(&i->tuple);
        DEBUGP("ct_tuple:   "); DUMP_TUPLE(&i->ct_tuple);
        DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
        return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
                 || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
                && ip_ct_tuple_equal(&i->mask, mask));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *i,
                               const struct ip_conntrack_tuple *tuple,
                               const struct ip_conntrack_tuple *mask)
{
        /* Part covered by intersection of masks must be unequal,
           otherwise they clash */
        struct ip_conntrack_tuple intersect_mask
                = { { i->mask.src.ip & mask->src.ip,
                      { i->mask.src.u.all & mask->src.u.all } },
                    { i->mask.dst.ip & mask->dst.ip,
                      { i->mask.dst.u.all & mask->dst.u.all },
                      i->mask.dst.protonum & mask->dst.protonum } };

        return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
}

inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
{
        WRITE_LOCK(&ip_conntrack_lock);
        unexpect_related(expect);
        WRITE_UNLOCK(&ip_conntrack_lock);
}

static void expectation_timed_out(unsigned long ul_expect)
{
        struct ip_conntrack_expect *expect = (void *) ul_expect;

        DEBUGP("expectation %p timed out\n", expect);
        WRITE_LOCK(&ip_conntrack_lock);
        __unexpect_related(expect);
        WRITE_UNLOCK(&ip_conntrack_lock);
}

struct ip_conntrack_expect *
ip_conntrack_expect_alloc(void)
{
        struct ip_conntrack_expect *new;

        new = (struct ip_conntrack_expect *)
                kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
        if (!new) {
                DEBUGP("expect_related: OOM allocating expect\n");
                return NULL;
        }

        /* tuple_cmp compares the whole union, so we have to initialize
           it cleanly */
        memset(new, 0, sizeof(struct ip_conntrack_expect));

        return new;
}

static void
ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
                           struct ip_conntrack *related_to)
{
        DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
        new->expectant = related_to;
        new->sibling = NULL;
        atomic_set(&new->use, 1);

        /* add to expected list for this connection */
        list_add_tail(&new->expected_list, &related_to->sibling_list);
        /* add to global list of expectations */
        list_prepend(&ip_conntrack_expect_list, &new->list);
        /* add and start timer if required */
        if (related_to->helper->timeout) {
                init_timer(&new->timeout);
                new->timeout.data = (unsigned long)new;
                new->timeout.function = expectation_timed_out;
                new->timeout.expires = jiffies +
                                        related_to->helper->timeout * HZ;
                add_timer(&new->timeout);
        }
        related_to->expecting++;
}

/* Add a related connection. */
int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
                                struct ip_conntrack *related_to)
{
        struct ip_conntrack_expect *old;
        int ret = 0;

        WRITE_LOCK(&ip_conntrack_lock);
        /* Because of the write lock, no reader can walk the lists,
         * so there is no need to use the tuple lock too */

        DEBUGP("ip_conntrack_expect_related %p\n", related_to);
        DEBUGP("tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
        DEBUGP("mask:  "); DUMP_TUPLE_RAW(&expect->mask);

        old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
                        struct ip_conntrack_expect *, &expect->tuple,
                        &expect->mask);
        if (old) {
                /* Helper private data may contain offsets but no pointers
                   pointing into the payload - otherwise we would have to copy
                   the data filled out by the helper over the old one */
                DEBUGP("expect_related: resent packet\n");
                if (related_to->helper->timeout) {
                        if (!del_timer(&old->timeout)) {
                                /* expectation is dying. Fall through */
                                goto out;
                        } else {
                                old->timeout.expires = jiffies +
                                        related_to->helper->timeout * HZ;
                                add_timer(&old->timeout);
                        }
                }

                WRITE_UNLOCK(&ip_conntrack_lock);
                kfree(expect);
                return -EEXIST;

        } else if (related_to->helper->max_expected &&
                   related_to->expecting >= related_to->helper->max_expected) {
                /* old == NULL */
                if (!(related_to->helper->flags &
                      IP_CT_HELPER_F_REUSE_EXPECT)) {
                        WRITE_UNLOCK(&ip_conntrack_lock);
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: max number of expected "
                                       "connections %i of %s reached for "
                                       "%u.%u.%u.%u->%u.%u.%u.%u\n",
                                       related_to->helper->max_expected,
                                       related_to->helper->name,
                                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
                                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
                        kfree(expect);
                        return -EPERM;
                }
                DEBUGP("ip_conntrack: max number of expected "
                       "connections %i of %s reached for "
                       "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
                       related_to->helper->max_expected,
                       related_to->helper->name,
                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
                       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));

                /* choose the oldest expectation to evict */
                list_for_each_entry(old, &related_to->sibling_list,
                                                      expected_list)
                        if (old->sibling == NULL)
                                break;

                /* We cannot fail since related_to->expecting is the number
                 * of unconfirmed expectations */
                IP_NF_ASSERT(old && old->sibling == NULL);

                /* newnat14 does not reuse the real allocated memory
                 * structures but rather unexpects the old and
                 * allocates a new.  unexpect_related will decrement
                 * related_to->expecting.
                 */
                unexpect_related(old);
                ret = -EPERM;
        } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
                             struct ip_conntrack_expect *, &expect->tuple,
                             &expect->mask)) {
                WRITE_UNLOCK(&ip_conntrack_lock);
                DEBUGP("expect_related: busy!\n");

                kfree(expect);
                return -EBUSY;
        }

out:    ip_conntrack_expect_insert(expect, related_to);

        WRITE_UNLOCK(&ip_conntrack_lock);

        return ret;
}
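
/* Usage sketch (not taken from this tree): a helper such as the FTP
 * tracker typically builds an expectation along these lines; the
 * field names are real, the values are placeholders:
 *
 *     struct ip_conntrack_expect *exp = ip_conntrack_expect_alloc();
 *     if (!exp)
 *             return 0;
 *     exp->tuple = tuple_of_expected_data_conn;    (hypothetical)
 *     exp->mask = mask_selecting_matching_fields;  (hypothetical)
 *     exp->expectfn = NULL;  (or a callback run on the new conntrack)
 *     ip_conntrack_expect_related(exp, master_ct);
 *
 * Note that ip_conntrack_expect_related() consumes the expect on
 * every path: it is either inserted or kfree()d before returning. */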

/* Change tuple in an existing expectation */
int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
                               struct ip_conntrack_tuple *newtuple)
{
        int ret;

        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        WRITE_LOCK(&ip_conntrack_expect_tuple_lock);

        DEBUGP("change_expect:\n");
        DEBUGP("exp tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
        DEBUGP("exp mask:  "); DUMP_TUPLE_RAW(&expect->mask);
        DEBUGP("newtuple:  "); DUMP_TUPLE_RAW(newtuple);
        if (expect->ct_tuple.dst.protonum == 0) {
                /* Never seen before */
                DEBUGP("change expect: never seen before\n");
                if (!ip_ct_tuple_mask_cmp(&expect->tuple, newtuple, &expect->mask)
                    && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
                                 struct ip_conntrack_expect *, newtuple, &expect->mask)) {
                        /* Force NAT to find an unused tuple */
                        ret = -1;
                } else {
                        memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
                        memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
                        ret = 0;
                }
        } else {
                /* Resent packet */
                DEBUGP("change expect: resent packet\n");
                if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
                        ret = 0;
                } else {
                        /* Force NAT to choose the same port again */
                        ret = -1;
                }
        }
        WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);

        return ret;
}

/* Alter reply tuple (maybe alter helper).  If it's already taken,
   return 0 and don't do alteration. */
int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
                             const struct ip_conntrack_tuple *newreply)
{
        WRITE_LOCK(&ip_conntrack_lock);
        if (__ip_conntrack_find(newreply, conntrack)) {
                WRITE_UNLOCK(&ip_conntrack_lock);
                return 0;
        }
        /* Should be unconfirmed, so not in hash table yet */
        IP_NF_ASSERT(!is_confirmed(conntrack));

        DEBUGP("Altering reply tuple of %p to ", conntrack);
        DUMP_TUPLE(newreply);

        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
        if (!conntrack->master && list_empty(&conntrack->sibling_list))
                conntrack->helper = ip_ct_find_helper(newreply);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return 1;
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
        WRITE_LOCK(&ip_conntrack_lock);
        list_prepend(&helpers, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        return 0;
}
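
/* Usage sketch (hedged, modelled on the 2.6 FTP helper): a helper is
 * matched against the *reply* tuple of master connections (see
 * ip_ct_find_helper() above), so it keys on the server-side source
 * port, then registers itself; my_helper and my_help_fn below are
 * hypothetical names:
 *
 *     my_helper.name = "example";
 *     my_helper.tuple.src.u.tcp.port = htons(2121);
 *     my_helper.tuple.dst.protonum = IPPROTO_TCP;
 *     my_helper.mask.src.u.tcp.port = 0xFFFF;
 *     my_helper.mask.dst.protonum = 0xFF;
 *     my_helper.max_expected = 1;
 *     my_helper.help = my_help_fn;
 *     ip_conntrack_helper_register(&my_helper);
 */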

static inline int unhelp(struct ip_conntrack_tuple_hash *i,
                         const struct ip_conntrack_helper *me)
{
        if (i->ctrack->helper == me) {
                /* Get rid of any expected. */
                remove_expectations(i->ctrack, 0);
                /* And *then* set helper to NULL */
                i->ctrack->helper = NULL;
        }
        return 0;
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
        unsigned int i;

        /* Need write lock here, to delete helper. */
        WRITE_LOCK(&ip_conntrack_lock);
        LIST_DELETE(&helpers, me);

        /* Get rid of expecteds, set helpers to NULL. */
        for (i = 0; i < ip_conntrack_htable_size; i++)
                LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
                            struct ip_conntrack_tuple_hash *, me);
        WRITE_UNLOCK(&ip_conntrack_lock);

        /* Someone could still be looking at the helper in a bh. */
        synchronize_net();
}

/* Refresh conntrack for this many jiffies. */
void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
{
        IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);

        /* If not in hash table, timer will not be active yet */
        if (!is_confirmed(ct))
                ct->timeout.expires = extra_jiffies;
        else {
                WRITE_LOCK(&ip_conntrack_lock);
                /* Need del_timer for race avoidance (may already be dying). */
                if (del_timer(&ct->timeout)) {
                        ct->timeout.expires = jiffies + extra_jiffies;
                        add_timer(&ct->timeout);
                }
                WRITE_UNLOCK(&ip_conntrack_lock);
        }
}
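
/* Protocol modules call this from their packet() handlers to push
 * the timeout forward on every valid packet, e.g. (sketch):
 *
 *     ip_ct_refresh(conntrack, 30 * HZ);
 *
 * so an idle connection expires 30 seconds after its last packet. */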

/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
#ifdef CONFIG_NETFILTER_DEBUG
        unsigned int olddebug = skb->nf_debug;
#endif
        if (sk) {
                sock_hold(sk);
                skb_orphan(skb);
        }

        local_bh_disable();
        skb = ip_defrag(skb);
        local_bh_enable();

        if (!skb) {
                if (sk)
                        sock_put(sk);
                return skb;
        }

        if (sk) {
                skb_set_owner_w(skb, sk);
                sock_put(sk);
        }

        ip_send_check(skb->nh.iph);
        skb->nfcache |= NFC_ALTERED;
#ifdef CONFIG_NETFILTER_DEBUG
        /* Packet path as if nothing had happened. */
        skb->nf_debug = olddebug;
#endif
        return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = __ip_conntrack_get(nfct, &ctinfo);

        /* This ICMP is in reverse direction to the packet which
           caused it */
        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach new skbuff, and increment count */
        nskb->nfct = &ct->infos[ctinfo];
        atomic_inc(&ct->ct_general.use);
}

static inline int
do_kill(const struct ip_conntrack_tuple_hash *i,
        int (*kill)(const struct ip_conntrack *i, void *data),
        void *data)
{
        return kill(i->ctrack, data);
}

/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
                void *data, unsigned int *bucket)
{
        struct ip_conntrack_tuple_hash *h = NULL;

        READ_LOCK(&ip_conntrack_lock);
        for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) {
                h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill,
                              struct ip_conntrack_tuple_hash *, kill, data);
        }
        if (h)
                atomic_inc(&h->ctrack->ct_general.use);
        READ_UNLOCK(&ip_conntrack_lock);

        return h;
}

void
ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
                        void *data)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int bucket = 0;

        while ((h = get_next_corpse(kill, data, &bucket)) != NULL) {
                /* Time to push up daisies... */
                if (del_timer(&h->ctrack->timeout))
                        death_by_timeout((unsigned long)h->ctrack);
                /* ... else the timer will get him soon. */

                ip_conntrack_put(h->ctrack);
        }
}
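
/* Callers hand in a predicate over conntracks: kill_all() below uses
 * this to flush the whole table on cleanup, and in mainline the
 * masquerading code uses it with a device-match predicate to drop
 * connections when an interface goes down. */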

/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
        struct inet_opt *inet = inet_sk(sk);
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_tuple tuple;

        IP_CT_TUPLE_U_BLANK(&tuple);
        tuple.src.ip = inet->rcv_saddr;
        tuple.src.u.tcp.port = inet->sport;
        tuple.dst.ip = inet->daddr;
        tuple.dst.u.tcp.port = inet->dport;
        tuple.dst.protonum = IPPROTO_TCP;

        /* We only do TCP at the moment: is there a better way? */
        if (strcmp(sk->sk_prot->name, "TCP")) {
                DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
                return -ENOPROTOOPT;
        }

        if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
                DEBUGP("SO_ORIGINAL_DST: len %u not %Zu\n",
                       *len, sizeof(struct sockaddr_in));
                return -EINVAL;
        }

        h = ip_conntrack_find_get(&tuple, NULL);
        if (h) {
                struct sockaddr_in sin;

                sin.sin_family = AF_INET;
                sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.u.tcp.port;
                sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.ip;

                DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
                       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
                ip_conntrack_put(h->ctrack);
                if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                        return -EFAULT;
                else
                        return 0;
        }
        DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
               NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
               NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
        return -ENOENT;
}
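
/* Userspace side of the above, as used by transparent proxies (a
 * sketch; fd is an accepted TCP socket that was REDIRECTed here, and
 * handle_upstream() is a hypothetical application function):
 *
 *     struct sockaddr_in orig;
 *     socklen_t olen = sizeof(orig);
 *     if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &orig, &olen) == 0)
 *             handle_upstream(orig.sin_addr, ntohs(orig.sin_port));
 */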

static struct nf_sockopt_ops so_getorigdst = {
        .pf             = PF_INET,
        .get_optmin     = SO_ORIGINAL_DST,
        .get_optmax     = SO_ORIGINAL_DST+1,
        .get            = &getorigdst,
};

static int kill_all(const struct ip_conntrack *i, void *data)
{
        return 1;
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
        ip_ct_attach = NULL;
        /* This makes sure all current packets have passed through
           netfilter framework.  Roll on, two-stage module
           delete... */
        synchronize_net();

 i_see_dead_people:
        ip_ct_selective_cleanup(kill_all, NULL);
        if (atomic_read(&ip_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }

        kmem_cache_destroy(ip_conntrack_cachep);
        vfree(ip_conntrack_hash);
        nf_unregister_sockopt(&so_getorigdst);
}

static int hashsize;
MODULE_PARM(hashsize, "i");

int __init ip_conntrack_init(void)
{
        unsigned int i;
        int ret;

        /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
         * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
        if (hashsize) {
                ip_conntrack_htable_size = hashsize;
        } else {
                ip_conntrack_htable_size
                        = (((num_physpages << PAGE_SHIFT) / 16384)
                           / sizeof(struct list_head));
                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        ip_conntrack_htable_size = 8192;
                if (ip_conntrack_htable_size < 16)
                        ip_conntrack_htable_size = 16;
        }
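        /* Worked example of the sizing above: a 32-bit box with 128MB
           of RAM (sizeof(struct list_head) == 8) gets
           128MB / 16384 / 8 = 1024 buckets and, below, a default
           ip_conntrack_max of 8 * 1024 = 8192 tracked connections. */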
        ip_conntrack_max = 8 * ip_conntrack_htable_size;

#ifdef CONFIG_MIPS_BRCM
        ip_conntrack_max = 0;
#endif
        printk("ip_conntrack version %s (%u buckets, %d max)"
               " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
               ip_conntrack_htable_size, ip_conntrack_max,
               sizeof(struct ip_conntrack));

        ret = nf_register_sockopt(&so_getorigdst);
        if (ret != 0) {
                printk(KERN_ERR "Unable to register netfilter socket option\n");
                return ret;
        }

        ip_conntrack_hash = vmalloc(sizeof(struct list_head)
                                    * ip_conntrack_htable_size);
        if (!ip_conntrack_hash) {
                printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
                goto err_unreg_sockopt;
        }

        ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
                                                sizeof(struct ip_conntrack), 0,
                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!ip_conntrack_cachep) {
                printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
                goto err_free_hash;
        }
        /* Don't NEED lock here, but good form anyway. */
        WRITE_LOCK(&ip_conntrack_lock);
        /* Sew in builtin protocols. */
        list_append(&protocol_list, &ip_conntrack_protocol_tcp);
        list_append(&protocol_list, &ip_conntrack_protocol_udp);
        list_append(&protocol_list, &ip_conntrack_protocol_icmp);
        list_append(&protocol_list, &ip_conntrack_protocol_esp);
        WRITE_UNLOCK(&ip_conntrack_lock);

        for (i = 0; i < ip_conntrack_htable_size; i++)
                INIT_LIST_HEAD(&ip_conntrack_hash[i]);

        /* For use by ipt_REJECT */
        ip_ct_attach = ip_conntrack_attach;

        /* Set up fake conntrack:
            - to never be deleted, not in any hashes */
        atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
        /*  - and make it look like a confirmed connection */
        set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
        /*  - and prepare the ctinfo field for REJECT & NAT. */
        ip_conntrack_untracked.infos[IP_CT_NEW].master =
        ip_conntrack_untracked.infos[IP_CT_RELATED].master =
        ip_conntrack_untracked.infos[IP_CT_RELATED + IP_CT_IS_REPLY].master =
                        &ip_conntrack_untracked.ct_general;

        return ret;

err_free_hash:
        vfree(ip_conntrack_hash);
err_unreg_sockopt:
        nf_unregister_sockopt(&so_getorigdst);

        return -ENOMEM;
}