1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell  
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
 18  */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 /* For ERR_PTR().  Yeah, I know... --RR */
38 #include <linux/fs.h>
39
 40 /* This rwlock protects the main hash table, protocol/helper/expected
 41    registrations and conntrack timers. */
42 #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
43 #define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
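/* Lock discipline used below: plain lookups take
   READ_LOCK(&ip_conntrack_lock); anything that links or unlinks entries
   (confirmation, helper registration/unregistration, expectation handling,
   cleanup) takes WRITE_LOCK. */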
44
45 #include <linux/netfilter_ipv4/ip_conntrack.h>
46 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
47 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
49 #include <linux/netfilter_ipv4/listhelp.h>
50
51 #define IP_CONNTRACK_VERSION    "2.1"
52
53 #if 0
54 #define DEBUGP printk
55 #else
56 #define DEBUGP(format, args...)
57 #endif
58
59 DECLARE_RWLOCK(ip_conntrack_lock);
60 DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);
61
62 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
63 LIST_HEAD(ip_conntrack_expect_list);
64 LIST_HEAD(protocol_list);
65 static LIST_HEAD(helpers);
66 unsigned int ip_conntrack_htable_size = 0;
67 #ifdef CONFIG_MIPS_BRCM
68 int ip_conntrack_max=0;
69 #else
70 static int ip_conntrack_max=0;
71 #endif
72 static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
73 struct list_head *ip_conntrack_hash;
74 static kmem_cache_t *ip_conntrack_cachep;
75 struct ip_conntrack ip_conntrack_untracked;
76
77 extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
78
79 static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
80                               u_int8_t protocol)
81 {
82         return protocol == curr->proto;
83 }
84
85 struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
86 {
87         struct ip_conntrack_protocol *p;
88
89         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
90         p = LIST_FIND(&protocol_list, proto_cmpfn,
91                       struct ip_conntrack_protocol *, protocol);
92         if (!p)
93                 p = &ip_conntrack_generic_protocol;
94
95         return p;
96 }
97
98 struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
99 {
100         struct ip_conntrack_protocol *p;
101
102         READ_LOCK(&ip_conntrack_lock);
103         p = __ip_ct_find_proto(protocol);
104         READ_UNLOCK(&ip_conntrack_lock);
105         return p;
106 }
107
108 inline void 
109 ip_conntrack_put(struct ip_conntrack *ct)
110 {
111         IP_NF_ASSERT(ct);
112         IP_NF_ASSERT(ct->infos[0].master);
113         /* nf_conntrack_put wants to go via an info struct, so feed it
114            one at random. */
115         nf_conntrack_put(&ct->infos[0]);
116 }
117
118 static int ip_conntrack_hash_rnd_initted;
119 static unsigned int ip_conntrack_hash_rnd;
120
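/* The hash below mixes the source address, the destination address xor'd
   with the protocol number, and both ports together with a boot-time
   random seed (jhash_3words), so remote hosts cannot deliberately collide
   their connections into a single chain; the result is then reduced
   modulo the table size. */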
121 static u_int32_t
122 hash_conntrack(const struct ip_conntrack_tuple *tuple)
123 {
124 #if 0
125         dump_tuple(tuple);
126 #endif
127         return (jhash_3words(tuple->src.ip,
128                              (tuple->dst.ip ^ tuple->dst.protonum),
129                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
130                              ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
131 }
132
133 int
134 get_tuple(const struct iphdr *iph,
135           const struct sk_buff *skb,
136           unsigned int dataoff,
137           struct ip_conntrack_tuple *tuple,
138           const struct ip_conntrack_protocol *protocol)
139 {
 140         /* Never happens: callers only pass reassembled, non-fragment headers */
141         if (iph->frag_off & htons(IP_OFFSET)) {
142                 printk("ip_conntrack_core: Frag of proto %u.\n",
143                        iph->protocol);
144                 return 0;
145         }
146
147         tuple->src.ip = iph->saddr;
148         tuple->dst.ip = iph->daddr;
149         tuple->dst.protonum = iph->protocol;
150         tuple->src.u.all = tuple->dst.u.all = 0;
151
152         return protocol->pkt_to_tuple(skb, dataoff, tuple);
153 }
154
155 static int
156 invert_tuple(struct ip_conntrack_tuple *inverse,
157              const struct ip_conntrack_tuple *orig,
158              const struct ip_conntrack_protocol *protocol)
159 {
160         inverse->src.ip = orig->dst.ip;
161         inverse->dst.ip = orig->src.ip;
162         inverse->dst.protonum = orig->dst.protonum;
163
164         inverse->src.u.all = inverse->dst.u.all = 0;
165
166         return protocol->invert_tuple(inverse, orig);
167 }
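/* Example: the original tuple 192.168.1.2:1024 -> 10.0.0.1:80/TCP inverts
   to 10.0.0.1:80 -> 192.168.1.2:1024/TCP, i.e. the reply direction that
   return traffic will be looked up under. */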
168
169
170 /* ip_conntrack_expect helper functions */
171
172 /* Compare tuple parts depending on mask. */
173 static inline int expect_cmp(const struct ip_conntrack_expect *i,
174                              const struct ip_conntrack_tuple *tuple)
175 {
176         MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
177         return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
178 }
179
180 static void
181 destroy_expect(struct ip_conntrack_expect *exp)
182 {
183         DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
184         IP_NF_ASSERT(atomic_read(&exp->use) == 0);
185         IP_NF_ASSERT(!timer_pending(&exp->timeout));
186
187         kfree(exp);
188 }
189
190 inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
191 {
192         IP_NF_ASSERT(exp);
193
194         if (atomic_dec_and_test(&exp->use)) {
195                 /* usage count dropped to zero */
196                 destroy_expect(exp);
197         }
198 }
199
200 static inline struct ip_conntrack_expect *
201 __ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
202 {
203         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
204         MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
205         return LIST_FIND(&ip_conntrack_expect_list, expect_cmp, 
206                          struct ip_conntrack_expect *, tuple);
207 }
208
209 /* Find a expectation corresponding to a tuple. */
210 struct ip_conntrack_expect *
211 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
212 {
213         struct ip_conntrack_expect *exp;
214
215         READ_LOCK(&ip_conntrack_lock);
216         READ_LOCK(&ip_conntrack_expect_tuple_lock);
217         exp = __ip_ct_expect_find(tuple);
218         if (exp)
219                 atomic_inc(&exp->use);
220         READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
221         READ_UNLOCK(&ip_conntrack_lock);
222
223         return exp;
224 }
225
226 /* remove one specific expectation from all lists and drop refcount,
227  * does _NOT_ delete the timer. */
228 static void __unexpect_related(struct ip_conntrack_expect *expect)
229 {
230         DEBUGP("unexpect_related(%p)\n", expect);
231         MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
232
233         /* we're not allowed to unexpect a confirmed expectation! */
234         IP_NF_ASSERT(!expect->sibling);
235
236         /* delete from global and local lists */
237         list_del(&expect->list);
238         list_del(&expect->expected_list);
239
240         /* decrement expect-count of master conntrack */
241         if (expect->expectant)
242                 expect->expectant->expecting--;
243
244         ip_conntrack_expect_put(expect);
245 }
246
 247 /* remove one specific expectation from all lists, drop refcount
248  * and expire timer. 
249  * This function can _NOT_ be called for confirmed expects! */
250 static void unexpect_related(struct ip_conntrack_expect *expect)
251 {
252         IP_NF_ASSERT(expect->expectant);
253         IP_NF_ASSERT(expect->expectant->helper);
254         /* if we are supposed to have a timer, but we can't delete
255          * it: race condition.  __unexpect_related will
 256          * be called by the timeout function */
257         if (expect->expectant->helper->timeout
258             && !del_timer(&expect->timeout))
259                 return;
260
261         __unexpect_related(expect);
262 }
263
264 /* delete all unconfirmed expectations for this conntrack */
265 static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
266 {
267         struct list_head *exp_entry, *next;
268         struct ip_conntrack_expect *exp;
269
270         DEBUGP("remove_expectations(%p)\n", ct);
271
272         list_for_each_safe(exp_entry, next, &ct->sibling_list) {
273                 exp = list_entry(exp_entry, struct ip_conntrack_expect,
274                                  expected_list);
275
276                 /* we skip established expectations, as we want to delete
277                  * the un-established ones only */
278                 if (exp->sibling) {
279                         DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
280                         if (drop_refcount) {
 281                                 /* Indicate that this expectation's parent is dead */
282                                 ip_conntrack_put(exp->expectant);
283                                 exp->expectant = NULL;
284                         }
285                         continue;
286                 }
287
288                 IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
289                 IP_NF_ASSERT(exp->expectant == ct);
290
291                 /* delete expectation from global and private lists */
292                 unexpect_related(exp);
293         }
294 }
295
296 static void
297 clean_from_lists(struct ip_conntrack *ct)
298 {
299         unsigned int ho, hr;
300         
301         DEBUGP("clean_from_lists(%p)\n", ct);
302         MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
303
304         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
305         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
306         LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
307         LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
308
309         /* Destroy all un-established, pending expectations */
310         remove_expectations(ct, 1);
311 }
312
313 static void
314 destroy_conntrack(struct nf_conntrack *nfct)
315 {
316         struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
317         struct ip_conntrack_protocol *proto;
318
319         DEBUGP("destroy_conntrack(%p)\n", ct);
320         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
321         IP_NF_ASSERT(!timer_pending(&ct->timeout));
322
323         /* To make sure we don't get any weird locking issues here:
324          * destroy_conntrack() MUST NOT be called with a write lock
325          * to ip_conntrack_lock!!! -HW */
326         proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
327         if (proto && proto->destroy)
328                 proto->destroy(ct);
329
330         if (ip_conntrack_destroyed)
331                 ip_conntrack_destroyed(ct);
332
333         WRITE_LOCK(&ip_conntrack_lock);
 334         /* Make sure we don't leave any orphaned expectations lying around */
335         if (ct->expecting)
336                 remove_expectations(ct, 1);
337
338         /* Delete our master expectation */
339         if (ct->master) {
340                 if (ct->master->expectant) {
341                         /* can't call __unexpect_related here,
342                          * since it would screw up expect_list */
343                         list_del(&ct->master->expected_list);
344                         master = ct->master->expectant;
345                 }
346                 kfree(ct->master);
347         }
348         WRITE_UNLOCK(&ip_conntrack_lock);
349
350         if (master)
351                 ip_conntrack_put(master);
352
353         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
354         kmem_cache_free(ip_conntrack_cachep, ct);
355         atomic_dec(&ip_conntrack_count);
356 }
357
358 static void death_by_timeout(unsigned long ul_conntrack)
359 {
360         struct ip_conntrack *ct = (void *)ul_conntrack;
361
362         WRITE_LOCK(&ip_conntrack_lock);
363         clean_from_lists(ct);
364         WRITE_UNLOCK(&ip_conntrack_lock);
365         ip_conntrack_put(ct);
366 }
367
368 static inline int
369 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
370                     const struct ip_conntrack_tuple *tuple,
371                     const struct ip_conntrack *ignored_conntrack)
372 {
373         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
374         return i->ctrack != ignored_conntrack
375                 && ip_ct_tuple_equal(tuple, &i->tuple);
376 }
377
378 static struct ip_conntrack_tuple_hash *
379 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
380                     const struct ip_conntrack *ignored_conntrack)
381 {
382         struct ip_conntrack_tuple_hash *h;
383         unsigned int hash = hash_conntrack(tuple);
384
385         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
386         h = LIST_FIND(&ip_conntrack_hash[hash],
387                       conntrack_tuple_cmp,
388                       struct ip_conntrack_tuple_hash *,
389                       tuple, ignored_conntrack);
390         return h;
391 }
392
393 /* Find a connection corresponding to a tuple. */
394 struct ip_conntrack_tuple_hash *
395 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
396                       const struct ip_conntrack *ignored_conntrack)
397 {
398         struct ip_conntrack_tuple_hash *h;
399
400         READ_LOCK(&ip_conntrack_lock);
401         h = __ip_conntrack_find(tuple, ignored_conntrack);
402         if (h)
403                 atomic_inc(&h->ctrack->ct_general.use);
404         READ_UNLOCK(&ip_conntrack_lock);
405
406         return h;
407 }
408
409 static inline struct ip_conntrack *
410 __ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
411 {
412         struct ip_conntrack *ct
413                 = (struct ip_conntrack *)nfct->master;
414
415         /* ctinfo is the index of the nfct inside the conntrack */
416         *ctinfo = nfct - ct->infos;
417         IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
418         return ct;
419 }
420
421 /* Return conntrack and conntrack_info given skb->nfct->master */
422 struct ip_conntrack *
423 ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
424 {
425         if (skb->nfct) 
426                 return __ip_conntrack_get(skb->nfct, ctinfo);
427         return NULL;
428 }
429
430 /* Confirm a connection given skb->nfct; places it in hash table */
431 int
432 __ip_conntrack_confirm(struct nf_ct_info *nfct)
433 {
434         unsigned int hash, repl_hash;
435         struct ip_conntrack *ct;
436         enum ip_conntrack_info ctinfo;
437
438         ct = __ip_conntrack_get(nfct, &ctinfo);
439
440         /* ipt_REJECT uses ip_conntrack_attach to attach related
441            ICMP/TCP RST packets in other direction.  Actual packet
442            which created connection will be IP_CT_NEW or for an
443            expected connection, IP_CT_RELATED. */
444         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
445                 return NF_ACCEPT;
446
447         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
448         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
449
450         /* We're not in hash table, and we refuse to set up related
451            connections for unconfirmed conns.  But packet copies and
452            REJECT will give spurious warnings here. */
453         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
454
 455         /* No external references means no one else could have
456            confirmed us. */
457         IP_NF_ASSERT(!is_confirmed(ct));
458         DEBUGP("Confirming conntrack %p\n", ct);
459
460         WRITE_LOCK(&ip_conntrack_lock);
461         /* See if there's one in the list already, including reverse:
462            NAT could have grabbed it without realizing, since we're
463            not in the hash.  If there is, we lost race. */
464         if (!LIST_FIND(&ip_conntrack_hash[hash],
465                        conntrack_tuple_cmp,
466                        struct ip_conntrack_tuple_hash *,
467                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
468             && !LIST_FIND(&ip_conntrack_hash[repl_hash],
469                           conntrack_tuple_cmp,
470                           struct ip_conntrack_tuple_hash *,
471                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
472                 list_prepend(&ip_conntrack_hash[hash],
473                              &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
474                 list_prepend(&ip_conntrack_hash[repl_hash],
475                              &ct->tuplehash[IP_CT_DIR_REPLY]);
476                 /* Timer relative to confirmation time, not original
477                    setting time, otherwise we'd get timer wrap in
478                    weird delay cases. */
479                 ct->timeout.expires += jiffies;
480                 add_timer(&ct->timeout);
481                 atomic_inc(&ct->ct_general.use);
482                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
483                 WRITE_UNLOCK(&ip_conntrack_lock);
484                 return NF_ACCEPT;
485         }
486
487         WRITE_UNLOCK(&ip_conntrack_lock);
488         return NF_DROP;
489 }
490
 491 /* Returns true if a connection corresponds to the tuple (required
492    for NAT). */
493 int
494 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
495                          const struct ip_conntrack *ignored_conntrack)
496 {
497         struct ip_conntrack_tuple_hash *h;
498
499         READ_LOCK(&ip_conntrack_lock);
500         h = __ip_conntrack_find(tuple, ignored_conntrack);
501         READ_UNLOCK(&ip_conntrack_lock);
502
503         return h != NULL;
504 }
505
506 /* Returns conntrack if it dealt with ICMP, and filled in skb fields */
507 struct ip_conntrack *
508 icmp_error_track(struct sk_buff *skb,
509                  enum ip_conntrack_info *ctinfo,
510                  unsigned int hooknum)
511 {
512         struct ip_conntrack_tuple innertuple, origtuple;
513         struct {
514                 struct icmphdr icmp;
515                 struct iphdr ip;
516         } inside;
517         struct ip_conntrack_protocol *innerproto;
518         struct ip_conntrack_tuple_hash *h;
519         int dataoff;
520
521         IP_NF_ASSERT(skb->nfct == NULL);
522
523         /* Not enough header? */
524         if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0)
525                 return NULL;
526
527         if (inside.icmp.type != ICMP_DEST_UNREACH
528             && inside.icmp.type != ICMP_SOURCE_QUENCH
529             && inside.icmp.type != ICMP_TIME_EXCEEDED
530             && inside.icmp.type != ICMP_PARAMETERPROB
531             && inside.icmp.type != ICMP_REDIRECT)
532                 return NULL;
533
 534         /* Ignore ICMPs containing fragments (shouldn't happen) */
535         if (inside.ip.frag_off & htons(IP_OFFSET)) {
536                 DEBUGP("icmp_error_track: fragment of proto %u\n",
537                        inside.ip.protocol);
538                 return NULL;
539         }
540
541         innerproto = ip_ct_find_proto(inside.ip.protocol);
542         dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4;
543         /* Are they talking about one of our connections? */
544         if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) {
545                 DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol);
546                 return NULL;
547         }
548
549         /* Ordinarily, we'd expect the inverted tupleproto, but it's
550            been preserved inside the ICMP. */
551         if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
552                 DEBUGP("icmp_error_track: Can't invert tuple\n");
553                 return NULL;
554         }
555
556         *ctinfo = IP_CT_RELATED;
557
558         h = ip_conntrack_find_get(&innertuple, NULL);
559         if (!h) {
560                 /* Locally generated ICMPs will match inverted if they
561                    haven't been SNAT'ed yet */
562                 /* FIXME: NAT code has to handle half-done double NAT --RR */
563                 if (hooknum == NF_IP_LOCAL_OUT)
564                         h = ip_conntrack_find_get(&origtuple, NULL);
565
566                 if (!h) {
567                         DEBUGP("icmp_error_track: no match\n");
568                         return NULL;
569                 }
570                 /* Reverse direction from that found */
571                 if (DIRECTION(h) != IP_CT_DIR_REPLY)
572                         *ctinfo += IP_CT_IS_REPLY;
573         } else {
574                 if (DIRECTION(h) == IP_CT_DIR_REPLY)
575                         *ctinfo += IP_CT_IS_REPLY;
576         }
577
578         /* Update skb to refer to this connection */
579         skb->nfct = &h->ctrack->infos[*ctinfo];
580         return h->ctrack;
581 }
582
583 /* There's a small race here where we may free a just-assured
584    connection.  Too bad: we're in trouble anyway. */
585 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
586 {
587         return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
588 }
589
590 static int early_drop(struct list_head *chain)
591 {
592         /* Traverse backwards: gives us oldest, which is roughly LRU */
593         struct ip_conntrack_tuple_hash *h;
594         int dropped = 0;
595
596         READ_LOCK(&ip_conntrack_lock);
597         h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
598         if (h)
599                 atomic_inc(&h->ctrack->ct_general.use);
600         READ_UNLOCK(&ip_conntrack_lock);
601
602         if (!h)
603                 return dropped;
604
605         if (del_timer(&h->ctrack->timeout)) {
606                 death_by_timeout((unsigned long)h->ctrack);
607                 dropped = 1;
608         }
609         ip_conntrack_put(h->ctrack);
610         return dropped;
611 }
612
613 #if defined(CONFIG_MIPS_BRCM)
614 static inline int regardless(const struct ip_conntrack_tuple_hash *i)
615 {
616         return 1;
617 }
618
619 static int regardless_drop(struct list_head *chain)
620 {
621         /* Traverse backwards: gives us oldest, which is roughly LRU */
622         struct ip_conntrack_tuple_hash *h;
623         int dropped = 0;
624
625         READ_LOCK(&ip_conntrack_lock);
626         h = LIST_FIND_B(chain, regardless, struct ip_conntrack_tuple_hash *);
627         if (h)
628                 atomic_inc(&h->ctrack->ct_general.use);
629         READ_UNLOCK(&ip_conntrack_lock);
630
631         if (!h)
632                 return dropped;
633
634         if (del_timer(&h->ctrack->timeout)) {
635                 death_by_timeout((unsigned long)h->ctrack);
636                 dropped = 1;
637         }
638         ip_conntrack_put(h->ctrack);
639         return dropped;
640 }
641 #endif
642
643 static inline int helper_cmp(const struct ip_conntrack_helper *i,
644                              const struct ip_conntrack_tuple *rtuple)
645 {
646         return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
647 }
648
649 struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
650 {
651         return LIST_FIND(&helpers, helper_cmp,
652                          struct ip_conntrack_helper *,
653                          tuple);
654 }
655
656 /* Allocate a new conntrack: we return -ENOMEM if classification
657    failed due to stress.  Otherwise it really is unclassifiable. */
658 static struct ip_conntrack_tuple_hash *
659 init_conntrack(const struct ip_conntrack_tuple *tuple,
660                struct ip_conntrack_protocol *protocol,
661                struct sk_buff *skb)
662 {
663         struct ip_conntrack *conntrack;
664         struct ip_conntrack_tuple repl_tuple;
665         size_t hash;
666         struct ip_conntrack_expect *expected;
667         int i;
668         static unsigned int drop_next;
669
670         if (!ip_conntrack_hash_rnd_initted) {
671                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
672                 ip_conntrack_hash_rnd_initted = 1;
673         }
674
675         hash = hash_conntrack(tuple);
676
677         if (ip_conntrack_max &&
678             atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
 679                 /* Try dropping from a random chain, or else from the
 680                    chain we are about to insert into (in case someone is
 681                    trying to bomb one hash chain). */
682                 unsigned int next = (drop_next++)%ip_conntrack_htable_size;
683                 
684                 if (!early_drop(&ip_conntrack_hash[next])
685                     && !early_drop(&ip_conntrack_hash[hash])) {
686 #if defined(CONFIG_MIPS_BRCM)   
687                         /* Sorry, we have to kick one out regardless. */
 688                         while (!regardless_drop(&ip_conntrack_hash[next]))
689                                 next = (drop_next++)%ip_conntrack_htable_size;
690 #else
691                         if (net_ratelimit())
692                                 printk(KERN_WARNING
693                                        "ip_conntrack: table full, dropping"
694                                        " packet.\n");
695                         return ERR_PTR(-ENOMEM);
696 #endif                  
697                 }
698         }
699
700         if (!invert_tuple(&repl_tuple, tuple, protocol)) {
701                 DEBUGP("Can't invert tuple.\n");
702                 return NULL;
703         }
704
705         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
706         if (!conntrack) {
707                 DEBUGP("Can't allocate conntrack.\n");
708                 return ERR_PTR(-ENOMEM);
709         }
710
711         memset(conntrack, 0, sizeof(*conntrack));
712         atomic_set(&conntrack->ct_general.use, 1);
713         conntrack->ct_general.destroy = destroy_conntrack;
714         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
715         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
716         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
717         conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
718         for (i=0; i < IP_CT_NUMBER; i++)
719                 conntrack->infos[i].master = &conntrack->ct_general;
720
721         if (!protocol->new(conntrack, skb)) {
722                 kmem_cache_free(ip_conntrack_cachep, conntrack);
723                 return NULL;
724         }
725         /* Don't set timer yet: wait for confirmation */
726         init_timer(&conntrack->timeout);
727         conntrack->timeout.data = (unsigned long)conntrack;
728         conntrack->timeout.function = death_by_timeout;
729
730         INIT_LIST_HEAD(&conntrack->sibling_list);
731
732         WRITE_LOCK(&ip_conntrack_lock);
 733         /* We need to find and delete the expectation ONLY if we win the race */
734         READ_LOCK(&ip_conntrack_expect_tuple_lock);
735         expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
736                              struct ip_conntrack_expect *, tuple);
737         READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
738
739         /* If master is not in hash table yet (ie. packet hasn't left
740            this machine yet), how can other end know about expected?
741            Hence these are not the droids you are looking for (if
742            master ct never got confirmed, we'd hold a reference to it
743            and weird things would happen to future packets). */
744         if (expected && !is_confirmed(expected->expectant))
745                 expected = NULL;
746
747         /* Look up the conntrack helper for master connections only */
748         if (!expected)
749                 conntrack->helper = ip_ct_find_helper(&repl_tuple);
750
751         /* If the expectation is dying, then this is a loser. */
752         if (expected
753             && expected->expectant->helper->timeout
754             && ! del_timer(&expected->timeout))
755                 expected = NULL;
756
757         if (expected) {
758                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
759                         conntrack, expected);
760                 /* Welcome, Mr. Bond.  We've been expecting you... */
761                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
762                 conntrack->master = expected;
763                 expected->sibling = conntrack;
 764 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
765                 conntrack->mark = expected->expectant->mark;
766 #endif
767                 LIST_DELETE(&ip_conntrack_expect_list, expected);
768                 expected->expectant->expecting--;
769                 nf_conntrack_get(&master_ct(conntrack)->infos[0]);
770         }
771         atomic_inc(&ip_conntrack_count);
772         WRITE_UNLOCK(&ip_conntrack_lock);
773
774         if (expected && expected->expectfn)
775                 expected->expectfn(conntrack);
776         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
777 }
778
779 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
780 static inline struct ip_conntrack *
781 resolve_normal_ct(struct sk_buff *skb,
782                   struct ip_conntrack_protocol *proto,
783                   int *set_reply,
784                   unsigned int hooknum,
785                   enum ip_conntrack_info *ctinfo)
786 {
787         struct ip_conntrack_tuple tuple;
788         struct ip_conntrack_tuple_hash *h;
789
790         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
791
792         if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto))
793                 return NULL;
794
795         /* look for tuple match */
796         h = ip_conntrack_find_get(&tuple, NULL);
797         if (!h) {
798                 h = init_conntrack(&tuple, proto, skb);
799                 if (!h)
800                         return NULL;
801                 if (IS_ERR(h))
802                         return (void *)h;
803         }
804
805         /* It exists; we have (non-exclusive) reference. */
806         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
807                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
808                 /* Please set reply bit if this packet OK */
809                 *set_reply = 1;
810         } else {
811                 /* Once we've had two way comms, always ESTABLISHED. */
812                 if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
813                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
814                                h->ctrack);
815                         *ctinfo = IP_CT_ESTABLISHED;
816                 } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
817                         DEBUGP("ip_conntrack_in: related packet for %p\n",
818                                h->ctrack);
819                         *ctinfo = IP_CT_RELATED;
820                 } else {
821                         DEBUGP("ip_conntrack_in: new packet for %p\n",
822                                h->ctrack);
823                         *ctinfo = IP_CT_NEW;
824                 }
825                 *set_reply = 0;
826         }
827         skb->nfct = &h->ctrack->infos[*ctinfo];
828         return h->ctrack;
829 }
830
831 /* Netfilter hook itself. */
832 unsigned int ip_conntrack_in(unsigned int hooknum,
833                              struct sk_buff **pskb,
834                              const struct net_device *in,
835                              const struct net_device *out,
836                              int (*okfn)(struct sk_buff *))
837 {
838         struct ip_conntrack *ct;
839         enum ip_conntrack_info ctinfo;
840         struct ip_conntrack_protocol *proto;
841         int set_reply;
842         int ret;
843
 844         /* Never happens: fragments are reassembled before conntrack
 845            sees the packet.  Accept (rather than drop) any stray
 846            fragment, but log it. */
 847         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
 848                 if (net_ratelimit())
 849                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
 850                                (*pskb)->nh.iph->protocol, hooknum);
 851                 return NF_ACCEPT;
 852         }
853
854         /* FIXME: Do this right please. --RR */
855         (*pskb)->nfcache |= NFC_UNKNOWN;
856
857 /* Doesn't cover locally-generated broadcast, so not worth it. */
858 #if 0
859         /* Ignore broadcast: no `connection'. */
860         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
861                 printk("Broadcast packet!\n");
862                 return NF_ACCEPT;
863         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
864                    == htonl(0x000000FF)) {
865                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
866                        NIPQUAD((*pskb)->nh.iph->saddr),
867                        NIPQUAD((*pskb)->nh.iph->daddr),
868                        (*pskb)->sk, (*pskb)->pkt_type);
869         }
870 #endif
871
872         /* Previously seen (loopback or untracked)?  Ignore. */
873         if ((*pskb)->nfct)
874                 return NF_ACCEPT;
875
876         proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
877
878         /* It may be an icmp error... */
879         if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP 
880             && icmp_error_track(*pskb, &ctinfo, hooknum))
881                 return NF_ACCEPT;
882
883         if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo)))
884                 /* Not valid part of a connection */
885                 return NF_ACCEPT;
886
887         if (IS_ERR(ct))
888                 /* Too stressed to deal. */
889                 return NF_DROP;
890
891         IP_NF_ASSERT((*pskb)->nfct);
892
893         ret = proto->packet(ct, *pskb, ctinfo);
894         if (ret == -1) {
895                 /* Invalid */
896                 nf_conntrack_put((*pskb)->nfct);
897                 (*pskb)->nfct = NULL;
898                 return NF_ACCEPT;
899         }
900
901         if (ret != NF_DROP && ct->helper) {
902                 ret = ct->helper->help(*pskb, ct, ctinfo);
903                 if (ret == -1) {
904                         /* Invalid */
905                         nf_conntrack_put((*pskb)->nfct);
906                         (*pskb)->nfct = NULL;
907                         return NF_ACCEPT;
908                 }
909         }
910         if (set_reply)
911                 set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
912
913         return ret;
914 }
915
916 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
917                    const struct ip_conntrack_tuple *orig)
918 {
919         return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
920 }
921
922 static inline int resent_expect(const struct ip_conntrack_expect *i,
923                                 const struct ip_conntrack_tuple *tuple,
924                                 const struct ip_conntrack_tuple *mask)
925 {
926         DEBUGP("resent_expect\n");
927         DEBUGP("   tuple:   "); DUMP_TUPLE(&i->tuple);
928         DEBUGP("ct_tuple:   "); DUMP_TUPLE(&i->ct_tuple);
929         DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
930         return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
931                  || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
932                 && ip_ct_tuple_equal(&i->mask, mask));
933 }
934
935 /* Would two expected things clash? */
936 static inline int expect_clash(const struct ip_conntrack_expect *i,
937                                const struct ip_conntrack_tuple *tuple,
938                                const struct ip_conntrack_tuple *mask)
939 {
940         /* Part covered by intersection of masks must be unequal,
941            otherwise they clash */
942         struct ip_conntrack_tuple intersect_mask
943                 = { { i->mask.src.ip & mask->src.ip,
944                       { i->mask.src.u.all & mask->src.u.all } },
945                     { i->mask.dst.ip & mask->dst.ip,
946                       { i->mask.dst.u.all & mask->dst.u.all },
947                       i->mask.dst.protonum & mask->dst.protonum } };
948
949         return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
950 }
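/* Example: an expectation for 10.0.0.1:any -> 10.0.0.2:2021 clashes with
   one for any:any -> 10.0.0.2:2021: within the intersection of the two
   masks (destination ip/port only) the tuples are identical. */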
951
952 inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
953 {
954         WRITE_LOCK(&ip_conntrack_lock);
955         unexpect_related(expect);
956         WRITE_UNLOCK(&ip_conntrack_lock);
957 }
958         
959 static void expectation_timed_out(unsigned long ul_expect)
960 {
961         struct ip_conntrack_expect *expect = (void *) ul_expect;
962
963         DEBUGP("expectation %p timed out\n", expect);   
964         WRITE_LOCK(&ip_conntrack_lock);
965         __unexpect_related(expect);
966         WRITE_UNLOCK(&ip_conntrack_lock);
967 }
968
969 struct ip_conntrack_expect *
970 ip_conntrack_expect_alloc(void)
971 {
972         struct ip_conntrack_expect *new;
973         
974         new = (struct ip_conntrack_expect *)
975                 kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
976         if (!new) {
977                 DEBUGP("expect_related: OOM allocating expect\n");
978                 return NULL;
979         }
980
 981         /* tuple_cmp compares the whole union, so we have to initialize it cleanly */
982         memset(new, 0, sizeof(struct ip_conntrack_expect));
983
984         return new;
985 }
986
987 static void
988 ip_conntrack_expect_insert(struct ip_conntrack_expect *new,
989                            struct ip_conntrack *related_to)
990 {
991         DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
992         new->expectant = related_to;
993         new->sibling = NULL;
994         atomic_set(&new->use, 1);
995
996         /* add to expected list for this connection */
997         list_add_tail(&new->expected_list, &related_to->sibling_list);
998         /* add to global list of expectations */
999         list_prepend(&ip_conntrack_expect_list, &new->list);
1000         /* add and start timer if required */
1001         if (related_to->helper->timeout) {
1002                 init_timer(&new->timeout);
1003                 new->timeout.data = (unsigned long)new;
1004                 new->timeout.function = expectation_timed_out;
1005                 new->timeout.expires = jiffies +
1006                                         related_to->helper->timeout * HZ;
1007                 add_timer(&new->timeout);
1008         }
1009         related_to->expecting++;
1010 }
1011
1012 /* Add a related connection. */
1013 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
1014                                 struct ip_conntrack *related_to)
1015 {
1016         struct ip_conntrack_expect *old;
1017         int ret = 0;
1018
1019         WRITE_LOCK(&ip_conntrack_lock);
1020         /* Because of the write lock, no reader can walk the lists,
1021          * so there is no need to use the tuple lock too */
1022
1023         DEBUGP("ip_conntrack_expect_related %p\n", related_to);
1024         DEBUGP("tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
1025         DEBUGP("mask:  "); DUMP_TUPLE_RAW(&expect->mask);
1026
1027         old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
1028                         struct ip_conntrack_expect *, &expect->tuple, 
1029                         &expect->mask);
1030         if (old) {
1031                 /* Helper private data may contain offsets but no pointers
 1032                    pointing into the payload - otherwise we would have to copy
1033                    the data filled out by the helper over the old one */
1034                 DEBUGP("expect_related: resent packet\n");
1035                 if (related_to->helper->timeout) {
1036                         if (!del_timer(&old->timeout)) {
1037                                 /* expectation is dying. Fall through */
1038                                 goto out;
1039                         } else {
1040                                 old->timeout.expires = jiffies + 
1041                                         related_to->helper->timeout * HZ;
1042                                 add_timer(&old->timeout);
1043                         }
1044                 }
1045
1046                 WRITE_UNLOCK(&ip_conntrack_lock);
1047                 kfree(expect);
1048                 return -EEXIST;
1049
1050         } else if (related_to->helper->max_expected && 
1051                    related_to->expecting >= related_to->helper->max_expected) {
1052                 /* old == NULL */
1053                 if (!(related_to->helper->flags & 
1054                       IP_CT_HELPER_F_REUSE_EXPECT)) {
1055                         WRITE_UNLOCK(&ip_conntrack_lock);
1056                         if (net_ratelimit())
1057                                 printk(KERN_WARNING
1058                                        "ip_conntrack: max number of expected "
1059                                        "connections %i of %s reached for "
1060                                        "%u.%u.%u.%u->%u.%u.%u.%u\n",
1061                                        related_to->helper->max_expected,
1062                                        related_to->helper->name,
1063                                        NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
1064                                        NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
1065                         kfree(expect);
1066                         return -EPERM;
1067                 }
1068                 DEBUGP("ip_conntrack: max number of expected "
1069                        "connections %i of %s reached for "
1070                        "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
1071                        related_to->helper->max_expected,
1072                        related_to->helper->name,
1073                        NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
1074                        NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
1075  
 1076                 /* choose the oldest expectation to evict */
1077                 list_for_each_entry(old, &related_to->sibling_list, 
1078                                                       expected_list)
1079                         if (old->sibling == NULL)
1080                                 break;
1081
1082                 /* We cannot fail since related_to->expecting is the number
1083                  * of unconfirmed expectations */
1084                 IP_NF_ASSERT(old && old->sibling == NULL);
1085
1086                 /* newnat14 does not reuse the real allocated memory
 1087                  * structures but rather unexpects the old one and
 1088                  * allocates a new one.  unexpect_related will decrement
1089                  * related_to->expecting. 
1090                  */
1091                 unexpect_related(old);
1092                 ret = -EPERM;
1093         } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
1094                              struct ip_conntrack_expect *, &expect->tuple, 
1095                              &expect->mask)) {
1096                 WRITE_UNLOCK(&ip_conntrack_lock);
1097                 DEBUGP("expect_related: busy!\n");
1098
1099                 kfree(expect);
1100                 return -EBUSY;
1101         }
1102
1103 out:    ip_conntrack_expect_insert(expect, related_to);
1104
1105         WRITE_UNLOCK(&ip_conntrack_lock);
1106
1107         return ret;
1108 }
1109
1110 /* Change tuple in an existing expectation */
1111 int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
1112                                struct ip_conntrack_tuple *newtuple)
1113 {
1114         int ret;
1115
1116         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
1117         WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
1118
1119         DEBUGP("change_expect:\n");
1120         DEBUGP("exp tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
1121         DEBUGP("exp mask:  "); DUMP_TUPLE_RAW(&expect->mask);
1122         DEBUGP("newtuple:  "); DUMP_TUPLE_RAW(newtuple);
1123         if (expect->ct_tuple.dst.protonum == 0) {
1124                 /* Never seen before */
1125                 DEBUGP("change expect: never seen before\n");
1126                 if (!ip_ct_tuple_mask_cmp(&expect->tuple, newtuple, &expect->mask)
1127                     && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
1128                                  struct ip_conntrack_expect *, newtuple, &expect->mask)) {
1129                         /* Force NAT to find an unused tuple */
1130                         ret = -1;
1131                 } else {
1132                         memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
1133                         memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
1134                         ret = 0;
1135                 }
1136         } else {
1137                 /* Resent packet */
1138                 DEBUGP("change expect: resent packet\n");
1139                 if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
1140                         ret = 0;
1141                 } else {
 1142                         /* Force NAT to choose the same port again */
1143                         ret = -1;
1144                 }
1145         }
1146         WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);
1147         
1148         return ret;
1149 }
1150
1151 /* Alter reply tuple (maybe alter helper).  If it's already taken,
1152    return 0 and don't do alteration. */
1153 int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1154                              const struct ip_conntrack_tuple *newreply)
1155 {
1156         WRITE_LOCK(&ip_conntrack_lock);
1157         if (__ip_conntrack_find(newreply, conntrack)) {
1158                 WRITE_UNLOCK(&ip_conntrack_lock);
1159                 return 0;
1160         }
1161         /* Should be unconfirmed, so not in hash table yet */
1162         IP_NF_ASSERT(!is_confirmed(conntrack));
1163
1164         DEBUGP("Altering reply tuple of %p to ", conntrack);
1165         DUMP_TUPLE(newreply);
1166
1167         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1168         if (!conntrack->master && list_empty(&conntrack->sibling_list))
1169                 conntrack->helper = ip_ct_find_helper(newreply);
1170         WRITE_UNLOCK(&ip_conntrack_lock);
1171
1172         return 1;
1173 }
1174
1175 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1176 {
1177         WRITE_LOCK(&ip_conntrack_lock);
1178         list_prepend(&helpers, me);
1179         WRITE_UNLOCK(&ip_conntrack_lock);
1180
1181         return 0;
1182 }
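/* Illustrative only, not part of this file: a minimal sketch of how a
   helper module might use the registration API above.  The "foo" helper,
   its port and its timeout are hypothetical. */
#if 0
static int foo_help(struct sk_buff *skb, struct ip_conntrack *ct,
                    enum ip_conntrack_info ctinfo)
{
        /* A real helper parses the payload and calls
           ip_conntrack_expect_related() for the data connection. */
        return NF_ACCEPT;
}

static struct ip_conntrack_helper foo;

static int __init foo_init(void)
{
        foo.name = "foo";
        foo.max_expected = 1;
        foo.timeout = 5 * 60;                           /* seconds */
        foo.tuple.src.u.tcp.port = __constant_htons(1234);
        foo.tuple.dst.protonum = IPPROTO_TCP;
        foo.mask.src.u.tcp.port = 0xFFFF;
        foo.mask.dst.protonum = 0xFF;
        foo.help = foo_help;
        return ip_conntrack_helper_register(&foo);
}

static void __exit foo_fini(void)
{
        ip_conntrack_helper_unregister(&foo);
}
#endif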
1183
1184 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1185                          const struct ip_conntrack_helper *me)
1186 {
1187         if (i->ctrack->helper == me) {
1188                 /* Get rid of any expected. */
1189                 remove_expectations(i->ctrack, 0);
1190                 /* And *then* set helper to NULL */
1191                 i->ctrack->helper = NULL;
1192         }
1193         return 0;
1194 }
1195
1196 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1197 {
1198         unsigned int i;
1199
1200         /* Need write lock here, to delete helper. */
1201         WRITE_LOCK(&ip_conntrack_lock);
1202         LIST_DELETE(&helpers, me);
1203
1204         /* Get rid of expecteds, set helpers to NULL. */
1205         for (i = 0; i < ip_conntrack_htable_size; i++)
1206                 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1207                             struct ip_conntrack_tuple_hash *, me);
1208         WRITE_UNLOCK(&ip_conntrack_lock);
1209
1210         /* Someone could be still looking at the helper in a bh. */
1211         synchronize_net();
1212 }
1213
1214 /* Refresh conntrack for this many jiffies. */
1215 void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
1216 {
1217         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1218
1219         /* If not in hash table, timer will not be active yet */
1220         if (!is_confirmed(ct))
1221                 ct->timeout.expires = extra_jiffies;
1222         else {
1223                 WRITE_LOCK(&ip_conntrack_lock);
1224                 /* Need del_timer for race avoidance (may already be dying). */
1225                 if (del_timer(&ct->timeout)) {
1226                         ct->timeout.expires = jiffies + extra_jiffies;
1227                         add_timer(&ct->timeout);
1228                 }
1229                 WRITE_UNLOCK(&ip_conntrack_lock);
1230         }
1231 }
1232
1233 /* Returns new sk_buff, or NULL */
1234 struct sk_buff *
1235 ip_ct_gather_frags(struct sk_buff *skb)
1236 {
1237         struct sock *sk = skb->sk;
1238 #ifdef CONFIG_NETFILTER_DEBUG
1239         unsigned int olddebug = skb->nf_debug;
1240 #endif
1241         if (sk) {
1242                 sock_hold(sk);
1243                 skb_orphan(skb);
1244         }
1245
1246         local_bh_disable(); 
1247         skb = ip_defrag(skb);
1248         local_bh_enable();
1249
1250         if (!skb) {
1251                 if (sk)
1252                         sock_put(sk);
1253                 return skb;
1254         }
1255
1256         if (sk) {
1257                 skb_set_owner_w(skb, sk);
1258                 sock_put(sk);
1259         }
1260
1261         ip_send_check(skb->nh.iph);
1262         skb->nfcache |= NFC_ALTERED;
1263 #ifdef CONFIG_NETFILTER_DEBUG
1264         /* Packet path as if nothing had happened. */
1265         skb->nf_debug = olddebug;
1266 #endif
1267         return skb;
1268 }
1269
1270 /* Used by ipt_REJECT. */
1271 static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
1272 {
1273         struct ip_conntrack *ct;
1274         enum ip_conntrack_info ctinfo;
1275
1276         ct = __ip_conntrack_get(nfct, &ctinfo);
1277
1278         /* This ICMP is in reverse direction to the packet which
1279            caused it */
1280         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1281                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1282         else
1283                 ctinfo = IP_CT_RELATED;
1284
1285         /* Attach new skbuff, and increment count */
1286         nskb->nfct = &ct->infos[ctinfo];
1287         atomic_inc(&ct->ct_general.use);
1288 }
1289
1290 static inline int
1291 do_kill(const struct ip_conntrack_tuple_hash *i,
1292         int (*kill)(const struct ip_conntrack *i, void *data),
1293         void *data)
1294 {
1295         return kill(i->ctrack, data);
1296 }
1297
1298 /* Bring out ya dead! */
1299 static struct ip_conntrack_tuple_hash *
1300 get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
1301                 void *data, unsigned int *bucket)
1302 {
1303         struct ip_conntrack_tuple_hash *h = NULL;
1304
1305         READ_LOCK(&ip_conntrack_lock);
1306         for (; !h && *bucket < ip_conntrack_htable_size; (*bucket)++) {
1307                 h = LIST_FIND(&ip_conntrack_hash[*bucket], do_kill,
1308                               struct ip_conntrack_tuple_hash *, kill, data);
1309         }
1310         if (h)
1311                 atomic_inc(&h->ctrack->ct_general.use);
1312         READ_UNLOCK(&ip_conntrack_lock);
1313
1314         return h;
1315 }
1316
1317 void
1318 ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
1319                         void *data)
1320 {
1321         struct ip_conntrack_tuple_hash *h;
1322         unsigned int bucket = 0;
1323
1324         while ((h = get_next_corpse(kill, data, &bucket)) != NULL) {
 1325                 /* Time to push up daisies... */
1326                 if (del_timer(&h->ctrack->timeout))
1327                         death_by_timeout((unsigned long)h->ctrack);
1328                 /* ... else the timer will get him soon. */
1329
1330                 ip_conntrack_put(h->ctrack);
1331         }
1332 }
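/* Illustrative only, not part of this file: a hypothetical predicate for
   the callback above, flushing every connection originating from one
   source address, e.g. ip_ct_selective_cleanup(kill_by_saddr, &addr). */
#if 0
static int kill_by_saddr(const struct ip_conntrack *i, void *data)
{
        u_int32_t *saddr = data;

        return i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip == *saddr;
}
#endif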
1333
1334 /* Fast function for those who don't want to parse /proc (and I don't
1335    blame them). */
1336 /* Reversing the socket's dst/src point of view gives us the reply
1337    mapping. */
1338 static int
1339 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1340 {
1341         struct inet_opt *inet = inet_sk(sk);
1342         struct ip_conntrack_tuple_hash *h;
1343         struct ip_conntrack_tuple tuple;
1344         
1345         IP_CT_TUPLE_U_BLANK(&tuple);
1346         tuple.src.ip = inet->rcv_saddr;
1347         tuple.src.u.tcp.port = inet->sport;
1348         tuple.dst.ip = inet->daddr;
1349         tuple.dst.u.tcp.port = inet->dport;
1350         tuple.dst.protonum = IPPROTO_TCP;
1351
1352         /* We only do TCP at the moment: is there a better way? */
1353         if (strcmp(sk->sk_prot->name, "TCP")) {
1354                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1355                 return -ENOPROTOOPT;
1356         }
1357
1358         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
 1359                 DEBUGP("SO_ORIGINAL_DST: len %u not %Zu\n",
1360                        *len, sizeof(struct sockaddr_in));
1361                 return -EINVAL;
1362         }
1363
1364         h = ip_conntrack_find_get(&tuple, NULL);
1365         if (h) {
1366                 struct sockaddr_in sin;
1367
1368                 sin.sin_family = AF_INET;
1369                 sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1370                         .tuple.dst.u.tcp.port;
1371                 sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1372                         .tuple.dst.ip;
1373
1374                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1375                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1376                 ip_conntrack_put(h->ctrack);
1377                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1378                         return -EFAULT;
1379                 else
1380                         return 0;
1381         }
1382         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1383                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1384                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1385         return -ENOENT;
1386 }
1387
1388 static struct nf_sockopt_ops so_getorigdst = {
1389         .pf             = PF_INET,
1390         .get_optmin     = SO_ORIGINAL_DST,
1391         .get_optmax     = SO_ORIGINAL_DST+1,
1392         .get            = &getorigdst,
1393 };
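/* Illustrative only: the userspace side of this sockopt, as a transparent
   proxy would use it after accept() on a REDIRECTed connection (a sketch;
   fd is the accepted socket, error handling elided): */
#if 0
        struct sockaddr_in dst;
        socklen_t dstlen = sizeof(dst);

        if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &dstlen) == 0)
                printf("original destination %s:%u\n",
                       inet_ntoa(dst.sin_addr), ntohs(dst.sin_port));
#endif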
1394
1395 static int kill_all(const struct ip_conntrack *i, void *data)
1396 {
1397         return 1;
1398 }
1399
1400 /* Mishearing the voices in his head, our hero wonders how he's
1401    supposed to kill the mall. */
1402 void ip_conntrack_cleanup(void)
1403 {
1404         ip_ct_attach = NULL;
1405         /* This makes sure all current packets have passed through
1406            netfilter framework.  Roll on, two-stage module
1407            delete... */
1408         synchronize_net();
1409  
1410  i_see_dead_people:
1411         ip_ct_selective_cleanup(kill_all, NULL);
1412         if (atomic_read(&ip_conntrack_count) != 0) {
1413                 schedule();
1414                 goto i_see_dead_people;
1415         }
1416
1417         kmem_cache_destroy(ip_conntrack_cachep);
1418         vfree(ip_conntrack_hash);
1419         nf_unregister_sockopt(&so_getorigdst);
1420 }
1421
1422 static int hashsize;
1423 MODULE_PARM(hashsize, "i");
1424
1425 int __init ip_conntrack_init(void)
1426 {
1427         unsigned int i;
1428         int ret;
1429
1430         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1431          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
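        /* Worked example: 32 MB = 2^25 bytes; 2^25 / 16384 = 2048, and
           with an 8-byte struct list_head on a 32-bit machine that is
           2048 / 8 = 256 buckets.  1 GB gives 65536 / 8 = 8192, which is
           also the cap applied below for larger machines. */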
1432         if (hashsize) {
1433                 ip_conntrack_htable_size = hashsize;
1434         } else {
1435                 ip_conntrack_htable_size
1436                         = (((num_physpages << PAGE_SHIFT) / 16384)
1437                            / sizeof(struct list_head));
1438                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1439                         ip_conntrack_htable_size = 8192;
1440                 if (ip_conntrack_htable_size < 16)
1441                         ip_conntrack_htable_size = 16;
1442         }
1443         ip_conntrack_max = 8 * ip_conntrack_htable_size;
1444
1445 #ifdef CONFIG_MIPS_BRCM
1446         ip_conntrack_max=0;
1447 #endif
1448         printk("ip_conntrack version %s (%u buckets, %d max)"
1449                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1450                ip_conntrack_htable_size, ip_conntrack_max,
1451                sizeof(struct ip_conntrack));
1452
1453         ret = nf_register_sockopt(&so_getorigdst);
1454         if (ret != 0) {
1455                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1456                 return ret;
1457         }
1458
1459         ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1460                                     * ip_conntrack_htable_size);
1461         if (!ip_conntrack_hash) {
1462                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1463                 goto err_unreg_sockopt;
1464         }
1465
1466         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1467                                                 sizeof(struct ip_conntrack), 0,
1468                                                 SLAB_HWCACHE_ALIGN, NULL, NULL);
1469         if (!ip_conntrack_cachep) {
1470                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1471                 goto err_free_hash;
1472         }
1473         /* Don't NEED lock here, but good form anyway. */
1474         WRITE_LOCK(&ip_conntrack_lock);
1475         /* Sew in builtin protocols. */
1476         list_append(&protocol_list, &ip_conntrack_protocol_tcp);
1477         list_append(&protocol_list, &ip_conntrack_protocol_udp);
1478         list_append(&protocol_list, &ip_conntrack_protocol_icmp);
1479         list_append(&protocol_list, &ip_conntrack_protocol_esp);
1480         WRITE_UNLOCK(&ip_conntrack_lock);
1481
1482         for (i = 0; i < ip_conntrack_htable_size; i++)
1483                 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1484
1485         /* For use by ipt_REJECT */
1486         ip_ct_attach = ip_conntrack_attach;
1487
1488         /* Set up fake conntrack:
1489             - to never be deleted, not in any hashes */
1490         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
 1491         /*  - and make it look like a confirmed connection */
1492         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1493         /*  - and prepare the ctinfo field for REJECT & NAT. */
1494         ip_conntrack_untracked.infos[IP_CT_NEW].master =
1495         ip_conntrack_untracked.infos[IP_CT_RELATED].master =
1496         ip_conntrack_untracked.infos[IP_CT_RELATED + IP_CT_IS_REPLY].master = 
1497                         &ip_conntrack_untracked.ct_general;
1498
1499         return ret;
1500
1501 err_free_hash:
1502         vfree(ip_conntrack_hash);
1503 err_unreg_sockopt:
1504         nf_unregister_sockopt(&so_getorigdst);
1505
1506         return -ENOMEM;
1507 }