linux-2.4: net/ipv4/ipvs/ip_vs_conn.c
/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the Netfilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Version:     $Id: ip_vs_conn.c,v 1.28.2.5 2003/08/09 13:27:08 wensong Exp $
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Peter Kese <peter.kese@ijs.si>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
 * and others. Much of the code here is taken from the IP MASQ code of
 * kernel 2.2.
 *
 * Changes:
 *
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/ip.h>
#include <linux/tcp.h>                  /* for tcphdr */
#include <linux/in.h>
#include <linux/proc_fs.h>              /* for proc_net_* */
#include <asm/softirq.h>                /* for local_bh_* */
#include <net/ip.h>
#include <net/tcp.h>                    /* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>                  /* for ip_route_output */
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/jhash.h>
#include <linux/random.h>

#include <net/ip_vs.h>


/*
 *  Connection hash table: for input and output packet lookups of IPVS
 */
static struct list_head *ip_vs_conn_tab;

/* SLAB cache for IPVS connections */
static kmem_cache_t *ip_vs_conn_cachep;

/* counter for current IPVS connections */
static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);

/* counter for no-client-port connections */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);

/* random value for IPVS connection hash */
static unsigned int ip_vs_conn_rnd;

/*
 *  Fine locking granularity for big connection hash table
 */
#define CT_LOCKARRAY_BITS  4
#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)

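/*
 * Bucket b of ip_vs_conn_tab is protected by rwlock
 * __ip_vs_conntbl_lock_array[b & CT_LOCKARRAY_MASK], so the whole table
 * shares only CT_LOCKARRAY_SIZE cache-line-aligned locks: unrelated
 * buckets rarely contend, while the lock array stays small and cache-hot.
 */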
struct ip_vs_aligned_lock
{
        rwlock_t        l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));

/* lock array for conn table */
struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;

static inline void ct_read_lock(unsigned key)
{
        read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_unlock(unsigned key)
{
        read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_lock(unsigned key)
{
        write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_unlock(unsigned key)
{
        write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_lock_bh(unsigned key)
{
        read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_unlock_bh(unsigned key)
{
        read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_lock_bh(unsigned key)
{
        write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_unlock_bh(unsigned key)
{
        write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}


/*
 *      Returns hash value for IPVS connection entry
 */
static unsigned
ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
{
        return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
                & IP_VS_CONN_TAB_MASK;
}

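/*
 * Note that every lookup keys on the client side of a connection:
 * ..._in_get hashes the packet source (caddr, cport) while
 * ..._out_get hashes the packet destination, so both directions of
 * the same connection land in the same bucket.
 */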

/*
 *      Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
 *      returns bool success.
 */
static int ip_vs_conn_hash(struct ip_vs_conn *cp)
{
        unsigned hash;
        int ret;

        /* Hash by protocol, client address and port */
        hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);

        ct_write_lock(hash);

        if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
                list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
                cp->flags |= IP_VS_CONN_F_HASHED;
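                /* the hash table itself now holds a reference */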
                atomic_inc(&cp->refcnt);
                ret = 1;
        } else {
                IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
                          "called from %p\n", __builtin_return_address(0));
                ret = 0;
        }

        ct_write_unlock(hash);

        return ret;
}


/*
 *      UNhashes ip_vs_conn from ip_vs_conn_tab.
 *      returns bool success.
 */
static int ip_vs_conn_unhash(struct ip_vs_conn *cp)
{
        unsigned hash;
        int ret;

        /* unhash it and decrease its reference counter */
        hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
        ct_write_lock(hash);

        if (cp->flags & IP_VS_CONN_F_HASHED) {
                list_del(&cp->c_list);
                cp->flags &= ~IP_VS_CONN_F_HASHED;
                atomic_dec(&cp->refcnt);
                ret = 1;
        } else
                ret = 0;

        ct_write_unlock(hash);

        return ret;
}


/*
 *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 *  Called for pkts coming from OUTside-to-INside.
 *      s_addr, s_port: pkt source address (foreign host)
 *      d_addr, d_port: pkt dest address (load balancer)
 */
static inline struct ip_vs_conn *__ip_vs_conn_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
        unsigned hash;
        struct ip_vs_conn *cp;
        struct list_head *l,*e;

        hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
        l = &ip_vs_conn_tab[hash];

        ct_read_lock(hash);

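        /*
         * The XOR term below lets wildcard-cport entries (those created
         * with IP_VS_CONN_F_NO_CPORT) match only lookups made with
         * s_port == 0, and fully specified entries match only lookups
         * with a non-zero s_port.
         */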
        for (e=l->next; e!=l; e=e->next) {
                cp = list_entry(e, struct ip_vs_conn, c_list);
                if (s_addr==cp->caddr && s_port==cp->cport &&
                    d_port==cp->vport && d_addr==cp->vaddr &&
                    ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
                    protocol==cp->protocol) {
                        /* HIT */
                        atomic_inc(&cp->refcnt);
                        ct_read_unlock(hash);
                        return cp;
                }
        }

        ct_read_unlock(hash);

        return NULL;
}

struct ip_vs_conn *ip_vs_conn_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
        struct ip_vs_conn *cp;

        cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
        if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
                cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);

        IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
                  ip_vs_proto_name(protocol),
                  NIPQUAD(s_addr), ntohs(s_port),
                  NIPQUAD(d_addr), ntohs(d_port),
                  cp?"hit":"not hit");

        return cp;
}

/* Get reference to connection template */
struct ip_vs_conn *ip_vs_ct_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
        unsigned hash;
        struct ip_vs_conn *cp;

        hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);

        ct_read_lock(hash);

        list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
                if (s_addr==cp->caddr && s_port==cp->cport &&
                    d_port==cp->vport && d_addr==cp->vaddr &&
                    cp->flags & IP_VS_CONN_F_TEMPLATE &&
                    protocol==cp->protocol) {
                        /* HIT */
                        atomic_inc(&cp->refcnt);
                        goto out;
                }
        }
        cp = NULL;

  out:
        ct_read_unlock(hash);

        IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
                  ip_vs_proto_name(protocol),
                  NIPQUAD(s_addr), ntohs(s_port),
                  NIPQUAD(d_addr), ntohs(d_port),
                  cp?"hit":"not hit");

        return cp;
}

/*
 *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 *  Called for pkts coming from inside-to-OUTside.
 *      s_addr, s_port: pkt source address (inside host)
 *      d_addr, d_port: pkt dest address (foreign host)
 */
struct ip_vs_conn *ip_vs_conn_out_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
        unsigned hash;
        struct ip_vs_conn *cp, *ret=NULL;
        struct list_head *l,*e;

        /*
         *      Check for "full" addressed entries
         */
        hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
        l = &ip_vs_conn_tab[hash];

        ct_read_lock(hash);

        for (e=l->next; e!=l; e=e->next) {
                cp = list_entry(e, struct ip_vs_conn, c_list);
                if (d_addr == cp->caddr && d_port == cp->cport &&
                    s_port == cp->dport && s_addr == cp->daddr &&
                    protocol == cp->protocol) {
                        /* HIT */
                        atomic_inc(&cp->refcnt);
                        ret = cp;
                        break;
                }
        }

        ct_read_unlock(hash);

        IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
                  ip_vs_proto_name(protocol),
                  NIPQUAD(s_addr), ntohs(s_port),
                  NIPQUAD(d_addr), ntohs(d_port),
                  ret?"hit":"not hit");

        return ret;
}


/*
 *      Put back the conn and restart its timer with its timeout
 */
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
        /* restart the timer so the connection expires after its timeout */
        mod_timer(&cp->timer, jiffies+cp->timeout);

        __ip_vs_conn_put(cp);
}


/*
 *      Timeout table[state]
 */
struct ip_vs_timeout_table vs_timeout_table = {
        ATOMIC_INIT(0), /* refcnt */
        0,              /* scale  */
        {
                [IP_VS_S_NONE]          =       30*60*HZ,
                [IP_VS_S_ESTABLISHED]   =       15*60*HZ,
                [IP_VS_S_SYN_SENT]      =       2*60*HZ,
                [IP_VS_S_SYN_RECV]      =       1*60*HZ,
                [IP_VS_S_FIN_WAIT]      =       2*60*HZ,
                [IP_VS_S_TIME_WAIT]     =       2*60*HZ,
                [IP_VS_S_CLOSE]         =       10*HZ,
                [IP_VS_S_CLOSE_WAIT]    =       60*HZ,
                [IP_VS_S_LAST_ACK]      =       30*HZ,
                [IP_VS_S_LISTEN]        =       2*60*HZ,
                [IP_VS_S_SYNACK]        =       120*HZ,
                [IP_VS_S_UDP]           =       5*60*HZ,
                [IP_VS_S_ICMP]          =       1*60*HZ,
                [IP_VS_S_LAST]          =       2*HZ,
        },      /* timeout */
};


struct ip_vs_timeout_table vs_timeout_table_dos = {
        ATOMIC_INIT(0), /* refcnt */
        0,              /* scale  */
        {
                [IP_VS_S_NONE]          =       15*60*HZ,
                [IP_VS_S_ESTABLISHED]   =       8*60*HZ,
                [IP_VS_S_SYN_SENT]      =       60*HZ,
                [IP_VS_S_SYN_RECV]      =       10*HZ,
                [IP_VS_S_FIN_WAIT]      =       60*HZ,
                [IP_VS_S_TIME_WAIT]     =       60*HZ,
                [IP_VS_S_CLOSE]         =       10*HZ,
                [IP_VS_S_CLOSE_WAIT]    =       60*HZ,
                [IP_VS_S_LAST_ACK]      =       30*HZ,
                [IP_VS_S_LISTEN]        =       2*60*HZ,
                [IP_VS_S_SYNACK]        =       100*HZ,
                [IP_VS_S_UDP]           =       3*60*HZ,
                [IP_VS_S_ICMP]          =       1*60*HZ,
                [IP_VS_S_LAST]          =       2*HZ,
        },      /* timeout */
};


/*
 *      Timeout table to use for the VS entries
 *      If NULL we use the default table (vs_timeout_table).
 *      Under flood attack we switch to vs_timeout_table_dos
 */

static struct ip_vs_timeout_table *ip_vs_timeout_table = &vs_timeout_table;

static const char * state_name_table[IP_VS_S_LAST+1] = {
        [IP_VS_S_NONE]          =       "NONE",
        [IP_VS_S_ESTABLISHED]   =       "ESTABLISHED",
        [IP_VS_S_SYN_SENT]      =       "SYN_SENT",
        [IP_VS_S_SYN_RECV]      =       "SYN_RECV",
        [IP_VS_S_FIN_WAIT]      =       "FIN_WAIT",
        [IP_VS_S_TIME_WAIT]     =       "TIME_WAIT",
        [IP_VS_S_CLOSE]         =       "CLOSE",
        [IP_VS_S_CLOSE_WAIT]    =       "CLOSE_WAIT",
        [IP_VS_S_LAST_ACK]      =       "LAST_ACK",
        [IP_VS_S_LISTEN]        =       "LISTEN",
        [IP_VS_S_SYNACK]        =       "SYNACK",
        [IP_VS_S_UDP]           =       "UDP",
        [IP_VS_S_ICMP]          =       "ICMP",
        [IP_VS_S_LAST]          =       "BUG!",
};

#define sNO IP_VS_S_NONE
#define sES IP_VS_S_ESTABLISHED
#define sSS IP_VS_S_SYN_SENT
#define sSR IP_VS_S_SYN_RECV
#define sFW IP_VS_S_FIN_WAIT
#define sTW IP_VS_S_TIME_WAIT
#define sCL IP_VS_S_CLOSE
#define sCW IP_VS_S_CLOSE_WAIT
#define sLA IP_VS_S_LAST_ACK
#define sLI IP_VS_S_LISTEN
#define sSA IP_VS_S_SYNACK

struct vs_tcp_states_t {
        int next_state[IP_VS_S_LAST];   /* should be _LAST_TCP */
};

const char * ip_vs_state_name(int state)
{
        if (state >= IP_VS_S_LAST)
                return "ERR!";
        return state_name_table[state] ? state_name_table[state] : "?";
}

static struct vs_tcp_states_t vs_tcp_states [] = {
/*      INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},

/*      OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*      INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};

static struct vs_tcp_states_t vs_tcp_states_dos [] = {
/*      INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},

/*      OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*      INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};

static struct vs_tcp_states_t *ip_vs_state_table = vs_tcp_states;

void ip_vs_secure_tcp_set(int on)
{
        if (on) {
                ip_vs_state_table = vs_tcp_states_dos;
                ip_vs_timeout_table = &vs_timeout_table_dos;
        } else {
                ip_vs_state_table = vs_tcp_states;
                ip_vs_timeout_table = &vs_timeout_table;
        }
}


static inline int vs_tcp_state_idx(struct tcphdr *th, int state_off)
{
        /*
         *      [0-3]: input states, [4-7]: output, [8-11] input only states.
         */
        if (th->rst)
                return state_off+3;
        if (th->syn)
                return state_off+0;
        if (th->fin)
                return state_off+1;
        if (th->ack)
                return state_off+2;
        return -1;
}


static inline int vs_set_state_timeout(struct ip_vs_conn *cp, int state)
{
        struct ip_vs_timeout_table *vstim = cp->timeout_table;

        /*
         *      Use the default timeout table if none is specific to this entry
         */
        if (!vstim)
                vstim = &vs_timeout_table;

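        /* note: the array index below also assigns the new state to cp->state */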
        cp->timeout = vstim->timeout[cp->state=state];

        if (vstim->scale) {
                int scale = vstim->scale;

                if (scale<0)
                        cp->timeout >>= -scale;
                else if (scale > 0)
                        cp->timeout <<= scale;
        }

        return state;
}


static inline int
vs_tcp_state(struct ip_vs_conn *cp, int state_off, struct tcphdr *th)
{
        int state_idx;
        int new_state = IP_VS_S_CLOSE;

        /*
         *    Update state offset to INPUT_ONLY if necessary
         *    or delete NO_OUTPUT flag if output packet detected
         */
        if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
                if (state_off == VS_STATE_OUTPUT)
                        cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
                else
                        state_off = VS_STATE_INPUT_ONLY;
        }

        if ((state_idx = vs_tcp_state_idx(th, state_off)) < 0) {
                IP_VS_DBG(8, "vs_tcp_state_idx(%d)=%d!!!\n",
                          state_off, state_idx);
                goto tcp_state_out;
        }

        new_state = ip_vs_state_table[state_idx].next_state[cp->state];

  tcp_state_out:
        if (new_state != cp->state) {
                struct ip_vs_dest *dest = cp->dest;

                IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
                          "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
                          ip_vs_proto_name(cp->protocol),
                          (state_off==VS_STATE_OUTPUT)?"output ":"input ",
                          th->syn? 'S' : '.',
                          th->fin? 'F' : '.',
                          th->ack? 'A' : '.',
                          th->rst? 'R' : '.',
                          NIPQUAD(cp->daddr), ntohs(cp->dport),
                          NIPQUAD(cp->caddr), ntohs(cp->cport),
                          ip_vs_state_name(cp->state),
                          ip_vs_state_name(new_state),
                          atomic_read(&cp->refcnt));
                if (dest) {
                        if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
                            (new_state != IP_VS_S_ESTABLISHED)) {
                                atomic_dec(&dest->activeconns);
                                atomic_inc(&dest->inactconns);
                                cp->flags |= IP_VS_CONN_F_INACTIVE;
                        } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
                                   (new_state == IP_VS_S_ESTABLISHED)) {
                                atomic_inc(&dest->activeconns);
                                atomic_dec(&dest->inactconns);
                                cp->flags &= ~IP_VS_CONN_F_INACTIVE;
                        }
                }
        }

        return vs_set_state_timeout(cp, new_state);
}


/*
 *      Handle state transitions
 */
int ip_vs_set_state(struct ip_vs_conn *cp,
                    int state_off, struct iphdr *iph, void *tp)
{
        int ret;

        spin_lock(&cp->lock);
        switch (iph->protocol) {
        case IPPROTO_TCP:
                ret = vs_tcp_state(cp, state_off, tp);
                break;
        case IPPROTO_UDP:
                ret = vs_set_state_timeout(cp, IP_VS_S_UDP);
                break;
        case IPPROTO_ICMP:
                ret = vs_set_state_timeout(cp, IP_VS_S_ICMP);
                break;
        default:
                ret = -1;
        }
        spin_unlock(&cp->lock);

        return ret;
}


/*
 *      Set LISTEN timeout. (ip_vs_conn_put will setup timer)
 */
int ip_vs_conn_listen(struct ip_vs_conn *cp)
{
        vs_set_state_timeout(cp, IP_VS_S_LISTEN);
        return cp->timeout;
}


/*
 *      Bypass transmitter
 *      Let packets bypass the destination when the destination is not
 *      available; it is intended only for use in a transparent cache cluster.
 */
static int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
        struct rtable *rt;                      /* Route to the other host */
        struct iphdr  *iph = skb->nh.iph;
        u8     tos = iph->tos;
        int    mtu;

        EnterFunction(10);

        if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(tos), 0)) {
                IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
                             "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
                goto tx_error_icmp;
        }

        /* MTU checking */
        mtu = rt->u.dst.pmtu;
        if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
                ip_rt_put(rt);
                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
                IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
                goto tx_error;
        }

        /* update checksum because skb might be defragmented */
        ip_send_check(iph);

        if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
                if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
                        ip_rt_put(rt);
                        IP_VS_ERR_RL("ip_vs_bypass_xmit(): no memory\n");
                        goto tx_error;
                }
        }

        /* drop old route */
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

#ifdef CONFIG_NETFILTER_DEBUG
        skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
        skb->nfcache |= NFC_IPVS_PROPERTY;
        ip_send(skb);

        LeaveFunction(10);
        return NF_STOLEN;

  tx_error_icmp:
        dst_link_failure(skb);
  tx_error:
        kfree_skb(skb);
        return NF_STOLEN;
}


/*
 *      NULL transmitter (do nothing except return NF_ACCEPT)
 */
static int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
        return NF_ACCEPT;
}


/*
 *      NAT transmitter (only for outside-to-inside nat forwarding)
 */
static int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
        struct rtable *rt;              /* Route to the other host */
        struct iphdr  *iph;
        union ip_vs_tphdr h;
        int ihl;
        unsigned short size;
        int mtu;

        EnterFunction(10);

        /*
         * If it has an ip_vs_app helper, the helper may change the payload,
         * so it needs full checksum checking and checksum calculation.
         * If not, only the header (such as the IP address and port number)
         * will be changed, so it is fast to do an incremental checksum
         * update, and the destination host does the final checksum check.
         */

        if (cp->app && skb_is_nonlinear(skb)
            && skb_linearize(skb, GFP_ATOMIC) != 0)
                return NF_DROP;

        iph = skb->nh.iph;
        ihl = iph->ihl << 2;
        h.raw = (char*) iph + ihl;
        size = ntohs(iph->tot_len) - ihl;

        /* do TCP/UDP checksum checking if it has application helper */
        if (cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) {
                switch (skb->ip_summed) {
                case CHECKSUM_NONE:
                        skb->csum = csum_partial(h.raw, size, 0);

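                        /* fall through: verify the checksum just computed */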
                case CHECKSUM_HW:
                        if (csum_tcpudp_magic(iph->saddr, iph->daddr, size,
                                              iph->protocol, skb->csum)) {
                                IP_VS_DBG_RL("Incoming failed %s checksum "
                                             "from %u.%u.%u.%u (size=%d)!\n",
                                             ip_vs_proto_name(iph->protocol),
                                             NIPQUAD(iph->saddr),
                                             size);
                                goto tx_error;
                        }
                        break;
                default:
                        /* CHECKSUM_UNNECESSARY */
                        break;
                }
        }

        /*
         *  Check if it is no_cport connection ...
         */
        if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
                if (ip_vs_conn_unhash(cp)) {
                        spin_lock(&cp->lock);
                        if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
                                atomic_dec(&ip_vs_conn_no_cport_cnt);
                                cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
                                cp->cport = h.portp[0];
                                IP_VS_DBG(10, "filled cport=%d\n", ntohs(cp->cport));
                        }
                        spin_unlock(&cp->lock);

                        /* hash on the now-filled cport */
                        ip_vs_conn_hash(cp);
                }
        }

        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
                goto tx_error_icmp;

        /* MTU checking */
        mtu = rt->u.dst.pmtu;
        if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
                ip_rt_put(rt);
                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
                IP_VS_DBG_RL("ip_vs_nat_xmit(): frag needed\n");
                goto tx_error;
        }

        /* drop old route */
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

        /* copy-on-write the packet before mangling it */
        if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, &iph, &h.raw))
                return NF_DROP;

        /* mangle the packet */
        iph->daddr = cp->daddr;
        h.portp[1] = cp->dport;

        /*
         *      Attempt ip_vs_app call.
         *      will fix ip_vs_conn and iph ack_seq stuff
         */
        if (ip_vs_app_pkt_in(cp, skb) != 0) {
                /* skb data has probably changed, update pointers */
                iph = skb->nh.iph;
                h.raw = (char*) iph + ihl;
                size = skb->len - ihl;
        }

        /*
         *      Adjust TCP/UDP checksums
         */
        if (!cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) {
                /* Only port and addr are changed, do fast csum update */
                ip_vs_fast_check_update(&h, cp->vaddr, cp->daddr,
                                        cp->vport, cp->dport, iph->protocol);
                if (skb->ip_summed == CHECKSUM_HW)
                        skb->ip_summed = CHECKSUM_NONE;
        } else {
                /* full checksum calculation */
                switch (iph->protocol) {
                case IPPROTO_TCP:
                        h.th->check = 0;
                        h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
                                                        size, iph->protocol,
                                                        csum_partial(h.raw, size, 0));
                        break;
                case IPPROTO_UDP:
                        h.uh->check = 0;
                        h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
                                                        size, iph->protocol,
                                                        csum_partial(h.raw, size, 0));
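                        /* RFC 768: a computed checksum of zero is sent as all ones */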
                        if (h.uh->check == 0)
                                h.uh->check = 0xFFFF;
                        break;
                }
                skb->ip_summed = CHECKSUM_UNNECESSARY;
        }
        ip_send_check(iph);

        IP_VS_DBG(10, "NAT to %u.%u.%u.%u:%d\n",
                  NIPQUAD(iph->daddr), ntohs(h.portp[1]));

        /* FIXME: when the application helper enlarges the packet and the
           length becomes larger than the MTU of the outgoing device, there
           will still be an MTU problem. */

#ifdef CONFIG_NETFILTER_DEBUG
        skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
        skb->nfcache |= NFC_IPVS_PROPERTY;
        ip_send(skb);

        LeaveFunction(10);
        return NF_STOLEN;

  tx_error_icmp:
        dst_link_failure(skb);
  tx_error:
        kfree_skb(skb);
        return NF_STOLEN;
}


/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet whose
 *   destination is set to cp->daddr. Most of the code in this function
 *   is taken from ipip.c.
 *
 *   It is used in a VS/TUN cluster. The load balancer selects a real
 *   server from the cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, process
 *   the request and return the response packets directly to the client
 *   without passing through the load balancer. This can greatly increase
 *   the scalability of the virtual server.
 */
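/*
 * Resulting packet layout (sketch):
 *
 *   [ outer IP: saddr = route source, daddr = real server, proto = IPIP ]
 *   [ inner IP: saddr = client,       daddr = VIP                       ]
 *   [ TCP/UDP header and payload ...                                    ]
 */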
static int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
        struct rtable *rt;                      /* Route to the other host */
        struct net_device *tdev;                /* Device to other host */
        struct iphdr  *old_iph = skb->nh.iph;
        u8     tos = old_iph->tos;
        u16    df = old_iph->frag_off;
        struct iphdr  *iph;                     /* Our new IP header */
        int    max_headroom;                    /* The extra header space needed */
        int    mtu;

        EnterFunction(10);

        if (skb->protocol != __constant_htons(ETH_P_IP)) {
                IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
                             "ETH_P_IP: %d, skb protocol: %d\n",
                             __constant_htons(ETH_P_IP), skb->protocol);
                goto tx_error;
        }

        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
                goto tx_error_icmp;

        tdev = rt->u.dst.dev;

        mtu = rt->u.dst.pmtu - sizeof(struct iphdr);
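        /* 68 is the minimum IPv4 MTU every link must support (RFC 791) */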
        if (mtu < 68) {
                ip_rt_put(rt);
                IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
                goto tx_error;
        }
        if (skb->dst && mtu < skb->dst->pmtu)
                skb->dst->pmtu = mtu;

        df |= (old_iph->frag_off&__constant_htons(IP_DF));

        if ((old_iph->frag_off&__constant_htons(IP_DF))
            && mtu < ntohs(old_iph->tot_len)) {
                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
                ip_rt_put(rt);
                IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
                goto tx_error;
        }

        /* update checksum because skb might be defragmented */
        ip_send_check(old_iph);

        /*
         * Okay, now see if we can stuff it in the buffer as-is.
         */
        max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr));

        if (skb_headroom(skb) < max_headroom
            || skb_cloned(skb) || skb_shared(skb)) {
                struct sk_buff *new_skb =
                        skb_realloc_headroom(skb, max_headroom);
                if (!new_skb) {
                        ip_rt_put(rt);
                        IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
                        return NF_DROP;
                }
                kfree_skb(skb);
                skb = new_skb;
                old_iph = skb->nh.iph;
        }

        skb->h.raw = skb->nh.raw;
        skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        /* drop old route */
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

        /*
         *      Push down and install the IPIP header.
         */
        iph                     =       skb->nh.iph;
        iph->version            =       4;
        iph->ihl                =       sizeof(struct iphdr)>>2;
        iph->frag_off           =       df;
        iph->protocol           =       IPPROTO_IPIP;
        iph->tos                =       tos;
        iph->daddr              =       rt->rt_dst;
        iph->saddr              =       rt->rt_src;
        iph->ttl                =       old_iph->ttl;
        iph->tot_len            =       htons(skb->len);
        ip_select_ident(iph, &rt->u.dst, NULL);
        ip_send_check(iph);

        skb->ip_summed = CHECKSUM_NONE;
#ifdef CONFIG_NETFILTER_DEBUG
        skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
        skb->nfcache |= NFC_IPVS_PROPERTY;
        ip_send(skb);

        LeaveFunction(10);

        return NF_STOLEN;

  tx_error_icmp:
        dst_link_failure(skb);
  tx_error:
        kfree_skb(skb);
        return NF_STOLEN;
}


/*
 *      Direct Routing transmitter
 */
static int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
        struct rtable *rt;                      /* Route to the other host */
        struct iphdr  *iph = skb->nh.iph;
        int    mtu;

        EnterFunction(10);

        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
                goto tx_error_icmp;

        /* MTU checking */
        mtu = rt->u.dst.pmtu;
        if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
                ip_rt_put(rt);
                IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
                goto tx_error;
        }

        /* update checksum because skb might be defragmented */
        ip_send_check(iph);

        if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
                if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
                        ip_rt_put(rt);
                        IP_VS_ERR_RL("ip_vs_dr_xmit(): no memory\n");
                        goto tx_error;
                }
        }

        /* drop old route */
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

#ifdef CONFIG_NETFILTER_DEBUG
        skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
        skb->nfcache |= NFC_IPVS_PROPERTY;
        ip_send(skb);

#if 0000
        NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                do_ip_send);
#endif
        LeaveFunction(10);
        return NF_STOLEN;

  tx_error_icmp:
        dst_link_failure(skb);
  tx_error:
        kfree_skb(skb);
        return NF_STOLEN;
}


/*
 *  Bind a connection entry with the corresponding packet_xmit.
 *  Called by ip_vs_conn_new.
 */
static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
        switch (IP_VS_FWD_METHOD(cp)) {
        case IP_VS_CONN_F_MASQ:
                cp->packet_xmit = ip_vs_nat_xmit;
                break;

        case IP_VS_CONN_F_TUNNEL:
                cp->packet_xmit = ip_vs_tunnel_xmit;
                break;

        case IP_VS_CONN_F_DROUTE:
                cp->packet_xmit = ip_vs_dr_xmit;
                break;

        case IP_VS_CONN_F_LOCALNODE:
                cp->packet_xmit = ip_vs_null_xmit;
                break;

        case IP_VS_CONN_F_BYPASS:
                cp->packet_xmit = ip_vs_bypass_xmit;
                break;
        }
}


/*
 *  Bind a connection entry with a virtual service destination.
 *  Called just after a new connection entry is created.
 */
static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
        /* if dest is NULL, then return directly */
        if (!dest)
                return;

        /* Increase the refcnt counter of the dest */
        atomic_inc(&dest->refcnt);

        /* Bind with the destination and its corresponding transmitter */
        cp->flags |= atomic_read(&dest->conn_flags);
        cp->dest = dest;

        IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
                  "d:%u.%u.%u.%u:%d fwd:%c s:%s flg:%X cnt:%d destcnt:%d\n",
                  ip_vs_proto_name(cp->protocol),
                  NIPQUAD(cp->caddr), ntohs(cp->cport),
                  NIPQUAD(cp->vaddr), ntohs(cp->vport),
                  NIPQUAD(cp->daddr), ntohs(cp->dport),
                  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
                  cp->flags, atomic_read(&cp->refcnt),
                  atomic_read(&dest->refcnt));
}


/*
 *  Unbind a connection entry with its VS destination.
 *  Called by the ip_vs_conn_expire function.
 */
static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
{
        struct ip_vs_dest *dest = cp->dest;

        /* if dest is NULL, then return directly */
        if (!dest)
                return;

        IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d "
                  "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d fwd:%c "
                  "s:%s flg:%X cnt:%d destcnt:%d\n",
                  ip_vs_proto_name(cp->protocol),
                  NIPQUAD(cp->caddr), ntohs(cp->cport),
                  NIPQUAD(cp->vaddr), ntohs(cp->vport),
                  NIPQUAD(cp->daddr), ntohs(cp->dport),
                  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
                  cp->flags, atomic_read(&cp->refcnt),
                  atomic_read(&dest->refcnt));

        /*
         * Decrease the inactconns or activeconns counter
         * if it is not a connection template
         */
        if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
                if (cp->flags & IP_VS_CONN_F_INACTIVE) {
                        atomic_dec(&dest->inactconns);
                } else {
                        atomic_dec(&dest->activeconns);
                }
        }

        /*
         * Simply decrease the refcnt of the dest, because the
         * dest will be either in the service's destination list
         * or in the trash.
         */
        atomic_dec(&dest->refcnt);
}


/*
 *  Checking if the destination of a connection template is available.
 *  If available, return 1, otherwise invalidate this connection
 *  template and return 0.
 */
int ip_vs_check_template(struct ip_vs_conn *ct)
{
        struct ip_vs_dest *dest = ct->dest;

        /*
         * Checking the dest server status.
         */
        if ((dest == NULL) ||
            !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
            (sysctl_ip_vs_expire_quiescent_template &&
             (atomic_read(&dest->weight) == 0))) {
                IP_VS_DBG(9, "check_template: dest not available for "
                          "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
                          "-> d:%u.%u.%u.%u:%d\n",
                          ip_vs_proto_name(ct->protocol),
                          NIPQUAD(ct->caddr), ntohs(ct->cport),
                          NIPQUAD(ct->vaddr), ntohs(ct->vport),
                          NIPQUAD(ct->daddr), ntohs(ct->dport));

                /*
                 * Invalidate the connection template
                 */
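                /* rehashing with unusable ports keeps the template in the
                   table until it expires, but prevents it from matching
                   any further lookups */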
                if (ct->vport != 65535) {
                        if (ip_vs_conn_unhash(ct)) {
                                ct->dport = 65535;
                                ct->vport = 65535;
                                ct->cport = 0;
                                ip_vs_conn_hash(ct);
                        }
                }

                /*
                 * Simply decrease the refcnt of the template,
                 * don't restart its timer.
                 */
                atomic_dec(&ct->refcnt);
                return 0;
        }
        return 1;
}


static inline void
ip_vs_timeout_attach(struct ip_vs_conn *cp, struct ip_vs_timeout_table *vstim)
{
        atomic_inc(&vstim->refcnt);
        cp->timeout_table = vstim;
}

static inline void ip_vs_timeout_detach(struct ip_vs_conn *cp)
{
        struct ip_vs_timeout_table *vstim = cp->timeout_table;

        if (!vstim)
                return;
        cp->timeout_table = NULL;
        atomic_dec(&vstim->refcnt);
}


static void ip_vs_conn_expire(unsigned long data)
{
        struct ip_vs_conn *cp = (struct ip_vs_conn *)data;

        if (cp->timeout_table)
                cp->timeout = cp->timeout_table->timeout[IP_VS_S_TIME_WAIT];
        else
                cp->timeout = vs_timeout_table.timeout[IP_VS_S_TIME_WAIT];

        /*
         *      hey, I'm using it
         */
        atomic_inc(&cp->refcnt);

        /*
         *      do I control anybody?
         */
        if (atomic_read(&cp->n_control))
                goto expire_later;

        /*
         *      unhash it if it is hashed in the conn table
         */
        if (!ip_vs_conn_unhash(cp))
                goto expire_later;

        /*
         *      refcnt==1 implies I'm the only referrer
         */
        if (likely(atomic_read(&cp->refcnt) == 1)) {
                /* make sure that there is no timer on it now */
                if (timer_pending(&cp->timer))
                        del_timer(&cp->timer);

                /* does anybody control me? */
                if (cp->control)
                        ip_vs_control_del(cp);

                ip_vs_unbind_dest(cp);
                ip_vs_unbind_app(cp);
                ip_vs_timeout_detach(cp);
                if (cp->flags & IP_VS_CONN_F_NO_CPORT)
                        atomic_dec(&ip_vs_conn_no_cport_cnt);
                atomic_dec(&ip_vs_conn_count);

                kmem_cache_free(ip_vs_conn_cachep, cp);
                return;
        }

        /* hash it back to the table */
        ip_vs_conn_hash(cp);

  expire_later:
        IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
                  atomic_read(&cp->refcnt)-1,
                  atomic_read(&cp->n_control));

        ip_vs_conn_put(cp);
}


void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
        cp->timeout = 0;
        mod_timer(&cp->timer, jiffies);
}

/*
 *  Create a new connection entry and hash it into the ip_vs_conn_tab.
 */
struct ip_vs_conn *
ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
               __u32 daddr, __u16 dport, unsigned flags,
               struct ip_vs_dest *dest)
{
        struct ip_vs_conn *cp;

        cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
        if (cp == NULL) {
                IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
                return NULL;
        }

        memset(cp, 0, sizeof(*cp));
        INIT_LIST_HEAD(&cp->c_list);
        init_timer(&cp->timer);
        cp->timer.data     = (unsigned long)cp;
        cp->timer.function = ip_vs_conn_expire;
        ip_vs_timeout_attach(cp, ip_vs_timeout_table);
        cp->protocol       = proto;
        cp->caddr          = caddr;
        cp->cport          = cport;
        cp->vaddr          = vaddr;
        cp->vport          = vport;
        cp->daddr          = daddr;
        cp->dport          = dport;
        cp->flags          = flags;
        cp->app_data       = NULL;
        cp->control        = NULL;
        cp->lock           = SPIN_LOCK_UNLOCKED;

        atomic_set(&cp->n_control, 0);
        atomic_set(&cp->in_pkts, 0);

        atomic_inc(&ip_vs_conn_count);
        if (flags & IP_VS_CONN_F_NO_CPORT)
                atomic_inc(&ip_vs_conn_no_cport_cnt);

        /* Bind its application helper (only for VS/NAT) if any */
        ip_vs_bind_app(cp);

        /* Bind the connection with a destination server */
        ip_vs_bind_dest(cp, dest);

        /* Set its state and timeout */
        vs_set_state_timeout(cp, IP_VS_S_NONE);

        /* Bind its packet transmitter */
        ip_vs_bind_xmit(cp);

        /*
         * Mark the entry as referenced by the current thread before
         * hashing it into the table, so that another thread running
         * ip_vs_random_dropentry cannot drop this entry.
         */
        atomic_set(&cp->refcnt, 1);

        /* Hash it in the ip_vs_conn_tab finally */
        ip_vs_conn_hash(cp);

        return cp;
}


/*
 *      /proc/net/ip_vs_conn entries
 */
static int
ip_vs_conn_getinfo(char *buffer, char **start, off_t offset, int length)
{
        off_t pos=0;
        int idx, len=0;
        char temp[70];
        struct ip_vs_conn *cp;
        struct list_head *l, *e;

        pos = 128;
        if (pos > offset) {
                len += sprintf(buffer+len, "%-127s\n",
                               "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires");
        }

        for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
                /*
                 *      The lock is actually needed only in the next loop;
                 *      we are called from user space, so we must stop
                 *      bottom halves.
                 */
                ct_read_lock_bh(idx);

                l = &ip_vs_conn_tab[idx];
                for (e=l->next; e!=l; e=e->next) {
                        cp = list_entry(e, struct ip_vs_conn, c_list);
                        pos += 128;
                        if (pos <= offset)
                                continue;
                        sprintf(temp,
                                "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu",
                                ip_vs_proto_name(cp->protocol),
                                ntohl(cp->caddr), ntohs(cp->cport),
                                ntohl(cp->vaddr), ntohs(cp->vport),
                                ntohl(cp->daddr), ntohs(cp->dport),
                                ip_vs_state_name(cp->state),
                                (cp->timer.expires-jiffies)/HZ);
                        len += sprintf(buffer+len, "%-127s\n", temp);
                        if (pos >= offset+length) {
                                ct_read_unlock_bh(idx);
                                goto done;
                        }
                }
                ct_read_unlock_bh(idx);
        }

  done:
        *start = buffer+len-(pos-offset);       /* Start of wanted data */
        len = pos-offset;
        if (len > length)
                len = length;
        if (len < 0)
                len = 0;
        return len;
}


/*
 *      Randomly drop connection entries before running out of memory
 */
static inline int todrop_entry(struct ip_vs_conn *cp)
{
        /*
         * The drop rate array needs tuning for real environments.
         * Called from timer bh only => no locking
         */
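        /*
         * todrop_rate[i] = i, indexed by the entry's incoming packet
         * count: an entry that has seen i packets (1 <= i <= 8) is
         * dropped on only one of every i passes; entries with zero or
         * more than eight packets are never dropped here.
         */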
        static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
        static char todrop_counter[9] = {0};
        int i;

        /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
           This will leave enough time for normal connections to get
           through. */
        if (cp->timeout+jiffies-cp->timer.expires < 60*HZ)
                return 0;

        /* Don't drop the entry if its number of incoming packets is not
           located in [0, 8] */
        i = atomic_read(&cp->in_pkts);
        if (i > 8 || i < 0) return 0;

        if (!todrop_rate[i]) return 0;
        if (--todrop_counter[i] > 0) return 0;

        todrop_counter[i] = todrop_rate[i];
        return 1;
}


void ip_vs_random_dropentry(void)
{
        int idx;
        struct ip_vs_conn *cp;
        struct list_head *l,*e;

        /*
         * Randomly scan 1/32 of the whole table every second
         */
        for (idx=0; idx<(IP_VS_CONN_TAB_SIZE>>5); idx++) {
                unsigned hash = net_random()&IP_VS_CONN_TAB_MASK;

                /*
                 *  Lock is actually needed in this loop.
                 */
                ct_write_lock(hash);

                l = &ip_vs_conn_tab[hash];
                for (e=l->next; e!=l; e=e->next) {
                        cp = list_entry(e, struct ip_vs_conn, c_list);
                        if (cp->flags & IP_VS_CONN_F_TEMPLATE)
                                /* connection template */
                                continue;
                        switch(cp->state) {
                        case IP_VS_S_SYN_RECV:
                        case IP_VS_S_SYNACK:
                                break;

                        case IP_VS_S_ESTABLISHED:
                        case IP_VS_S_UDP:
                                if (todrop_entry(cp))
                                        break;
                                continue;

                        default:
                                continue;
                        }

                        IP_VS_DBG(4, "del connection\n");
                        ip_vs_conn_expire_now(cp);
                        if (cp->control) {
                                IP_VS_DBG(4, "del conn template\n");
                                ip_vs_conn_expire_now(cp->control);
                        }
                }
                ct_write_unlock(hash);
        }
}


/*
 *      Flush all the connection entries in the ip_vs_conn_tab
 */
static void ip_vs_conn_flush(void)
{
        int idx;
        struct ip_vs_conn *cp;
        struct list_head *l,*e;

  flush_again:
        for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
                /*
                 *  Lock is actually needed in this loop.
                 */
                ct_write_lock_bh(idx);

                l = &ip_vs_conn_tab[idx];
                for (e=l->next; e!=l; e=e->next) {
                        cp = list_entry(e, struct ip_vs_conn, c_list);

                        IP_VS_DBG(4, "del connection\n");
                        ip_vs_conn_expire_now(cp);
                        if (cp->control) {
                                IP_VS_DBG(4, "del conn template\n");
                                ip_vs_conn_expire_now(cp->control);
                        }
                }
                ct_write_unlock_bh(idx);
        }

        /* the counter may not be zero, because some conn entries may still
           be run by the slow timer handler or be unhashed but still
           referenced */
        if (atomic_read(&ip_vs_conn_count) != 0) {
                schedule();
                goto flush_again;
        }
}


int ip_vs_conn_init(void)
{
        int idx;

        /*
         * Allocate the connection hash table and initialize its list heads
         */
        ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
        if (!ip_vs_conn_tab)
                return -ENOMEM;

        IP_VS_INFO("Connection hash table configured "
                   "(size=%d, memory=%ldKbytes)\n",
                   IP_VS_CONN_TAB_SIZE,
                   (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
        IP_VS_DBG(0, "Each connection entry needs %d bytes at least\n",
                  sizeof(struct ip_vs_conn));

        for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
                INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
        }

        for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
                __ip_vs_conntbl_lock_array[idx].l = RW_LOCK_UNLOCKED;
        }

        /* Allocate ip_vs_conn slab cache */
        ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
                                              sizeof(struct ip_vs_conn), 0,
                                              SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!ip_vs_conn_cachep) {
                vfree(ip_vs_conn_tab);
                return -ENOMEM;
        }

        proc_net_create("ip_vs_conn", 0, ip_vs_conn_getinfo);

        /* calculate the random value for connection hash */
        get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));

        return 0;
}

void ip_vs_conn_cleanup(void)
{
        /* flush all the connection entries first */
        ip_vs_conn_flush();

        /* Release the empty cache */
        kmem_cache_destroy(ip_vs_conn_cachep);
        proc_net_remove("ip_vs_conn");
        vfree(ip_vs_conn_tab);
}