[IPVS]: Bind connections on standby if the destination exists
[powerpc.git] / net/ipv4/ipvs/ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Version:     $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9  *
10  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
11  *              Peter Kese <peter.kese@ijs.si>
12  *              Julian Anastasov <ja@ssi.bg>
13  *
14  *              This program is free software; you can redistribute it and/or
15  *              modify it under the terms of the GNU General Public License
16  *              as published by the Free Software Foundation; either version
17  *              2 of the License, or (at your option) any later version.
18  *
19  * Changes:
20  *
21  */
22
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/types.h>
26 #include <linux/capability.h>
27 #include <linux/fs.h>
28 #include <linux/sysctl.h>
29 #include <linux/proc_fs.h>
30 #include <linux/workqueue.h>
31 #include <linux/swap.h>
32 #include <linux/seq_file.h>
33
34 #include <linux/netfilter.h>
35 #include <linux/netfilter_ipv4.h>
36 #include <linux/mutex.h>
37
38 #include <net/net_namespace.h>
39 #include <net/ip.h>
40 #include <net/route.h>
41 #include <net/sock.h>
42
43 #include <asm/uaccess.h>
44
45 #include <net/ip_vs.h>
46
47 /* mutex serializing IPVS sockopts; [gs]etsockopt may sleep */
48 static DEFINE_MUTEX(__ip_vs_mutex);
49
50 /* lock for service table */
51 static DEFINE_RWLOCK(__ip_vs_svc_lock);
52
53 /* lock for table with the real services */
54 static DEFINE_RWLOCK(__ip_vs_rs_lock);
55
56 /* lock for state and timeout tables */
57 static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
58
59 /* lock for drop entry handling */
60 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
61
62 /* lock for drop packet handling */
63 static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
64
65 /* 1/rate drop and drop-entry variables */
66 int ip_vs_drop_rate = 0;
67 int ip_vs_drop_counter = 0;
68 static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
69
70 /* number of virtual services */
71 static int ip_vs_num_services = 0;
72
73 /* sysctl variables */
74 static int sysctl_ip_vs_drop_entry = 0;
75 static int sysctl_ip_vs_drop_packet = 0;
76 static int sysctl_ip_vs_secure_tcp = 0;
77 static int sysctl_ip_vs_amemthresh = 1024;
78 static int sysctl_ip_vs_am_droprate = 10;
79 int sysctl_ip_vs_cache_bypass = 0;
80 int sysctl_ip_vs_expire_nodest_conn = 0;
81 int sysctl_ip_vs_expire_quiescent_template = 0;
82 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
83 int sysctl_ip_vs_nat_icmp_send = 0;
84
85
86 #ifdef CONFIG_IP_VS_DEBUG
87 static int sysctl_ip_vs_debug_level = 0;
88
89 int ip_vs_get_debug_level(void)
90 {
91         return sysctl_ip_vs_debug_level;
92 }
93 #endif
94
95 /*
96  *      update_defense_level is called from keventd and from sysctl,
97  *      so it needs to protect itself from softirqs
98  */
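/*
 *      Each defense sysctl (drop_entry, drop_packet, secure_tcp) takes a
 *      mode in 0..3: 0 = never, 1 = arm automatically when available memory
 *      falls below amemthresh (the value is bumped to 2 while the defense is
 *      active), 2 = currently active, dropped back to 1 once memory
 *      recovers, 3 = always on.
 */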
99 static void update_defense_level(void)
100 {
101         struct sysinfo i;
102         static int old_secure_tcp = 0;
103         int availmem;
104         int nomem;
105         int to_change = -1;
106
107         /* we only count free and buffered memory (in pages) */
108         si_meminfo(&i);
109         availmem = i.freeram + i.bufferram;
110         /* however, in Linux 2.5 i.bufferram is the total page cache size,
111            so we need to adjust it */
112         /* si_swapinfo(&i); */
113         /* availmem = availmem - (i.totalswap - i.freeswap); */
114
115         nomem = (availmem < sysctl_ip_vs_amemthresh);
116
117         local_bh_disable();
118
119         /* drop_entry */
120         spin_lock(&__ip_vs_dropentry_lock);
121         switch (sysctl_ip_vs_drop_entry) {
122         case 0:
123                 atomic_set(&ip_vs_dropentry, 0);
124                 break;
125         case 1:
126                 if (nomem) {
127                         atomic_set(&ip_vs_dropentry, 1);
128                         sysctl_ip_vs_drop_entry = 2;
129                 } else {
130                         atomic_set(&ip_vs_dropentry, 0);
131                 }
132                 break;
133         case 2:
134                 if (nomem) {
135                         atomic_set(&ip_vs_dropentry, 1);
136                 } else {
137                         atomic_set(&ip_vs_dropentry, 0);
138                         sysctl_ip_vs_drop_entry = 1;
139                 }
140                 break;
141         case 3:
142                 atomic_set(&ip_vs_dropentry, 1);
143                 break;
144         }
145         spin_unlock(&__ip_vs_dropentry_lock);
146
147         /* drop_packet */
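        /*
         * The rate set below works out to amemthresh / (amemthresh - availmem).
         * For example (illustrative numbers), with the default amemthresh of
         * 1024 pages and 768 pages available this gives 1024 / 256 = 4, i.e.
         * roughly one packet in four is dropped while memory stays low (the
         * "1/rate drop" noted above).
         */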
148         spin_lock(&__ip_vs_droppacket_lock);
149         switch (sysctl_ip_vs_drop_packet) {
150         case 0:
151                 ip_vs_drop_rate = 0;
152                 break;
153         case 1:
154                 if (nomem) {
155                         ip_vs_drop_rate = ip_vs_drop_counter
156                                 = sysctl_ip_vs_amemthresh /
157                                 (sysctl_ip_vs_amemthresh-availmem);
158                         sysctl_ip_vs_drop_packet = 2;
159                 } else {
160                         ip_vs_drop_rate = 0;
161                 }
162                 break;
163         case 2:
164                 if (nomem) {
165                         ip_vs_drop_rate = ip_vs_drop_counter
166                                 = sysctl_ip_vs_amemthresh /
167                                 (sysctl_ip_vs_amemthresh-availmem);
168                 } else {
169                         ip_vs_drop_rate = 0;
170                         sysctl_ip_vs_drop_packet = 1;
171                 }
172                 break;
173         case 3:
174                 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
175                 break;
176         }
177         spin_unlock(&__ip_vs_droppacket_lock);
178
179         /* secure_tcp */
180         write_lock(&__ip_vs_securetcp_lock);
181         switch (sysctl_ip_vs_secure_tcp) {
182         case 0:
183                 if (old_secure_tcp >= 2)
184                         to_change = 0;
185                 break;
186         case 1:
187                 if (nomem) {
188                         if (old_secure_tcp < 2)
189                                 to_change = 1;
190                         sysctl_ip_vs_secure_tcp = 2;
191                 } else {
192                         if (old_secure_tcp >= 2)
193                                 to_change = 0;
194                 }
195                 break;
196         case 2:
197                 if (nomem) {
198                         if (old_secure_tcp < 2)
199                                 to_change = 1;
200                 } else {
201                         if (old_secure_tcp >= 2)
202                                 to_change = 0;
203                         sysctl_ip_vs_secure_tcp = 1;
204                 }
205                 break;
206         case 3:
207                 if (old_secure_tcp < 2)
208                         to_change = 1;
209                 break;
210         }
211         old_secure_tcp = sysctl_ip_vs_secure_tcp;
212         if (to_change >= 0)
213                 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
214         write_unlock(&__ip_vs_securetcp_lock);
215
216         local_bh_enable();
217 }
218
219
220 /*
221  *      Timer for checking the defense
222  */
223 #define DEFENSE_TIMER_PERIOD    1*HZ
224 static void defense_work_handler(struct work_struct *work);
225 static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
226
227 static void defense_work_handler(struct work_struct *work)
228 {
229         update_defense_level();
230         if (atomic_read(&ip_vs_dropentry))
231                 ip_vs_random_dropentry();
232
233         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
234 }
235
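/*
 *      The following two helpers pin and release a reference on the IPVS
 *      module itself, so that it cannot be unloaded while virtual services
 *      still exist.  ip_vs_use_count_inc() returns non-zero on success.
 */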
236 int
237 ip_vs_use_count_inc(void)
238 {
239         return try_module_get(THIS_MODULE);
240 }
241
242 void
243 ip_vs_use_count_dec(void)
244 {
245         module_put(THIS_MODULE);
246 }
247
248
249 /*
250  *      Hash table: for virtual service lookups
251  */
252 #define IP_VS_SVC_TAB_BITS 8
253 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
254 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
255
256 /* the service table hashed by <protocol, addr, port> */
257 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
258 /* the service table hashed by fwmark */
259 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
260
261 /*
262  *      Hash table: for real service lookups
263  */
264 #define IP_VS_RTAB_BITS 4
265 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
266 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
267
268 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
269
270 /*
271  *      Trash for destinations
272  */
273 static LIST_HEAD(ip_vs_dest_trash);
274
275 /*
276  *      FTP & NULL virtual service counters
277  */
278 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
279 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
280
281
282 /*
283  *      Returns hash value for virtual service
284  */
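/*
 *      Folding (porth >> IP_VS_SVC_TAB_BITS) into porth mixes the high port
 *      byte into the bucket index, which would otherwise be discarded by the
 *      final IP_VS_SVC_TAB_MASK.
 */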
285 static __inline__ unsigned
286 ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port)
287 {
288         register unsigned porth = ntohs(port);
289
290         return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
291                 & IP_VS_SVC_TAB_MASK;
292 }
293
294 /*
295  *      Returns hash value of fwmark for virtual service lookup
296  */
297 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
298 {
299         return fwmark & IP_VS_SVC_TAB_MASK;
300 }
301
302 /*
303  *      Hashes a service in the ip_vs_svc_table by <proto,addr,port>
304  *      or in the ip_vs_svc_fwm_table by fwmark.
305  *      Should be called with locked tables.
306  */
307 static int ip_vs_svc_hash(struct ip_vs_service *svc)
308 {
309         unsigned hash;
310
311         if (svc->flags & IP_VS_SVC_F_HASHED) {
312                 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
313                           "called from %p\n", __builtin_return_address(0));
314                 return 0;
315         }
316
317         if (svc->fwmark == 0) {
318                 /*
319                  *  Hash it by <protocol,addr,port> in ip_vs_svc_table
320                  */
321                 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
322                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
323         } else {
324                 /*
325                  *  Hash it by fwmark in ip_vs_svc_fwm_table
326                  */
327                 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
328                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
329         }
330
331         svc->flags |= IP_VS_SVC_F_HASHED;
332         /* increase its refcnt because it is referenced by the svc table */
333         atomic_inc(&svc->refcnt);
334         return 1;
335 }
336
337
338 /*
339  *      Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
340  *      Should be called with locked tables.
341  */
342 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
343 {
344         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
345                 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
346                           "called from %p\n", __builtin_return_address(0));
347                 return 0;
348         }
349
350         if (svc->fwmark == 0) {
351                 /* Remove it from ip_vs_svc_table */
352                 list_del(&svc->s_list);
353         } else {
354                 /* Remove it from ip_vs_svc_fwm_table */
355                 list_del(&svc->f_list);
356         }
357
358         svc->flags &= ~IP_VS_SVC_F_HASHED;
359         atomic_dec(&svc->refcnt);
360         return 1;
361 }
362
363
364 /*
365  *      Get service by {proto,addr,port} in the service table.
366  */
367 static __inline__ struct ip_vs_service *
368 __ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport)
369 {
370         unsigned hash;
371         struct ip_vs_service *svc;
372
373         /* Check for "full" addressed entries */
374         hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
375
376         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
377                 if ((svc->addr == vaddr)
378                     && (svc->port == vport)
379                     && (svc->protocol == protocol)) {
380                         /* HIT */
381                         atomic_inc(&svc->usecnt);
382                         return svc;
383                 }
384         }
385
386         return NULL;
387 }
388
389
390 /*
391  *      Get service by {fwmark} in the service table.
392  */
393 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
394 {
395         unsigned hash;
396         struct ip_vs_service *svc;
397
398         /* Check for fwmark addressed entries */
399         hash = ip_vs_svc_fwm_hashkey(fwmark);
400
401         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
402                 if (svc->fwmark == fwmark) {
403                         /* HIT */
404                         atomic_inc(&svc->usecnt);
405                         return svc;
406                 }
407         }
408
409         return NULL;
410 }
411
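/*
 *      Full lookup used by the packet path: try the fwmark table first (when
 *      a firewall mark is set), then an exact <protocol, vaddr, vport> match,
 *      then the FTP control service for traffic that may be FTP data, and
 *      finally the port-zero catch-all service if one is configured.  The
 *      returned service holds an extra usecnt reference; release it with
 *      ip_vs_service_put().
 */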
412 struct ip_vs_service *
413 ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
414 {
415         struct ip_vs_service *svc;
416
417         read_lock(&__ip_vs_svc_lock);
418
419         /*
420          *      Check the table hashed by fwmark first
421          */
422         if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
423                 goto out;
424
425         /*
426          *      Check the table hashed by <protocol,addr,port>
427          *      for "full" addressed entries
428          */
429         svc = __ip_vs_service_get(protocol, vaddr, vport);
430
431         if (svc == NULL
432             && protocol == IPPROTO_TCP
433             && atomic_read(&ip_vs_ftpsvc_counter)
434             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
435                 /*
436                  * Check if ftp service entry exists, the packet
437                  * might belong to FTP data connections.
438                  */
439                 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
440         }
441
442         if (svc == NULL
443             && atomic_read(&ip_vs_nullsvc_counter)) {
444                 /*
445                  * Check if the catch-all port (port zero) exists
446                  */
447                 svc = __ip_vs_service_get(protocol, vaddr, 0);
448         }
449
450   out:
451         read_unlock(&__ip_vs_svc_lock);
452
453         IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
454                   fwmark, ip_vs_proto_name(protocol),
455                   NIPQUAD(vaddr), ntohs(vport),
456                   svc?"hit":"not hit");
457
458         return svc;
459 }
460
461
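/*
 *      svc->refcnt accounting: the service keeps one reference for its slot
 *      in the service table plus one per bound destination; the structure is
 *      freed only when no table entry or destination refers to it any more.
 */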
462 static inline void
463 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
464 {
465         atomic_inc(&svc->refcnt);
466         dest->svc = svc;
467 }
468
469 static inline void
470 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
471 {
472         struct ip_vs_service *svc = dest->svc;
473
474         dest->svc = NULL;
475         if (atomic_dec_and_test(&svc->refcnt))
476                 kfree(svc);
477 }
478
479
480 /*
481  *      Returns hash value for real service
482  */
483 static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port)
484 {
485         register unsigned porth = ntohs(port);
486
487         return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
488                 & IP_VS_RTAB_MASK;
489 }
490
491 /*
492  *      Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
493  *      should be called with locked tables.
494  */
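/*
 *      An empty d_list marks a dest that is not hashed in ip_vs_rtable,
 *      which is why ip_vs_rs_hash() refuses a non-empty list and
 *      ip_vs_rs_unhash() re-initializes the list head after unlinking.
 */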
495 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
496 {
497         unsigned hash;
498
499         if (!list_empty(&dest->d_list)) {
500                 return 0;
501         }
502
503         /*
504          *      Hash by proto,addr,port,
505          *      which are the parameters of the real service.
506          */
507         hash = ip_vs_rs_hashkey(dest->addr, dest->port);
508         list_add(&dest->d_list, &ip_vs_rtable[hash]);
509
510         return 1;
511 }
512
513 /*
514  *      Unhashes ip_vs_dest from ip_vs_rtable.
515  *      should be called with locked tables.
516  */
517 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
518 {
519         /*
520          * Remove it from the ip_vs_rtable hash table.
521          */
522         if (!list_empty(&dest->d_list)) {
523                 list_del(&dest->d_list);
524                 INIT_LIST_HEAD(&dest->d_list);
525         }
526
527         return 1;
528 }
529
530 /*
531  *      Lookup real service by <proto,addr,port> in the real service table.
532  */
533 struct ip_vs_dest *
534 ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
535 {
536         unsigned hash;
537         struct ip_vs_dest *dest;
538
539         /*
540          *      Check for "full" addressed entries
541          *      Return the first found entry
542          */
543         hash = ip_vs_rs_hashkey(daddr, dport);
544
545         read_lock(&__ip_vs_rs_lock);
546         list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
547                 if ((dest->addr == daddr)
548                     && (dest->port == dport)
549                     && ((dest->protocol == protocol) ||
550                         dest->vfwmark)) {
551                         /* HIT */
552                         read_unlock(&__ip_vs_rs_lock);
553                         return dest;
554                 }
555         }
556         read_unlock(&__ip_vs_rs_lock);
557
558         return NULL;
559 }
560
561 /*
562  *      Lookup destination by {addr,port} in the given service
563  */
564 static struct ip_vs_dest *
565 ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
566 {
567         struct ip_vs_dest *dest;
568
569         /*
570          * Find the destination for the given service
571          */
572         list_for_each_entry(dest, &svc->destinations, n_list) {
573                 if ((dest->addr == daddr) && (dest->port == dport)) {
574                         /* HIT */
575                         return dest;
576                 }
577         }
578
579         return NULL;
580 }
581
582 /*
583  * Find destination by {daddr,dport,vaddr,protocol}
584  * Created to be used in ip_vs_process_message() in
585  * the backup synchronization daemon. It finds the
586  * destination to be bound to the received connection
587  * on the backup.
588  *
589  * ip_vs_lookup_real_service() looked promising, but
590  * does not seem to work as expected.
591  */
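/*
 * Note: the dest returned here carries an extra refcnt taken on behalf of
 * the caller, who owns that reference and must release it when the
 * destination is no longer needed.
 */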
592 struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport,
593                                     __be32 vaddr, __be16 vport, __u16 protocol)
594 {
595         struct ip_vs_dest *dest;
596         struct ip_vs_service *svc;
597
598         svc = ip_vs_service_get(0, protocol, vaddr, vport);
599         if (!svc)
600                 return NULL;
601         dest = ip_vs_lookup_dest(svc, daddr, dport);
602         if (dest)
603                 atomic_inc(&dest->refcnt);
604         ip_vs_service_put(svc);
605         return dest;
606 }
607 EXPORT_SYMBOL(ip_vs_find_dest);
608
609 /*
610  *  Lookup dest by {svc,addr,port} in the destination trash.
611  *  The destination trash is used to hold the destinations that are removed
612  *  from the service table but are still referenced by some conn entries.
613  *  The reason for the destination trash is that when a dest is temporarily
614  *  down (taken down either by the administrator or by a monitor program),
615  *  it can be picked back from the trash, the remaining connections to it
616  *  can continue, and its counting information also remains useful for
617  *  scheduling.
618  */
619 static struct ip_vs_dest *
620 ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
621 {
622         struct ip_vs_dest *dest, *nxt;
623
624         /*
625          * Find the destination in trash
626          */
627         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
628                 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
629                           "dest->refcnt=%d\n",
630                           dest->vfwmark,
631                           NIPQUAD(dest->addr), ntohs(dest->port),
632                           atomic_read(&dest->refcnt));
633                 if (dest->addr == daddr &&
634                     dest->port == dport &&
635                     dest->vfwmark == svc->fwmark &&
636                     dest->protocol == svc->protocol &&
637                     (svc->fwmark ||
638                      (dest->vaddr == svc->addr &&
639                       dest->vport == svc->port))) {
640                         /* HIT */
641                         return dest;
642                 }
643
644                 /*
645                  * Try to purge the destination from trash if not referenced
646                  */
647                 if (atomic_read(&dest->refcnt) == 1) {
648                         IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
649                                   "from trash\n",
650                                   dest->vfwmark,
651                                   NIPQUAD(dest->addr), ntohs(dest->port));
652                         list_del(&dest->n_list);
653                         ip_vs_dst_reset(dest);
654                         __ip_vs_unbind_svc(dest);
655                         kfree(dest);
656                 }
657         }
658
659         return NULL;
660 }
661
662
663 /*
664  *  Clean up all the destinations in the trash
665  *  Called by the ip_vs_control_cleanup()
666  *
667  *  When ip_vs_control_cleanup() is invoked at IPVS module exit,
668  *  the service tables must have been flushed, all connections have
669  *  expired, and the refcnt of each destination in the trash must
670  *  be 1, so we simply release them here.
671  */
672 static void ip_vs_trash_cleanup(void)
673 {
674         struct ip_vs_dest *dest, *nxt;
675
676         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
677                 list_del(&dest->n_list);
678                 ip_vs_dst_reset(dest);
679                 __ip_vs_unbind_svc(dest);
680                 kfree(dest);
681         }
682 }
683
684
685 static void
686 ip_vs_zero_stats(struct ip_vs_stats *stats)
687 {
688         spin_lock_bh(&stats->lock);
689         memset(stats, 0, (char *)&stats->lock - (char *)stats);
690         spin_unlock_bh(&stats->lock);
691         ip_vs_zero_estimator(stats);
692 }
693
694 /*
695  *      Update a destination in the given service
696  */
697 static void
698 __ip_vs_update_dest(struct ip_vs_service *svc,
699                     struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
700 {
701         int conn_flags;
702
703         /* set the weight and the flags */
704         atomic_set(&dest->weight, udest->weight);
705         conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
706
707         /* check if local node and update the flags */
708         if (inet_addr_type(udest->addr) == RTN_LOCAL) {
709                 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
710                         | IP_VS_CONN_F_LOCALNODE;
711         }
712
713         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
714         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
715                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
716         } else {
717                 /*
718                  *    Put the real service in ip_vs_rtable if not present.
719                  *    For now only for NAT!
720                  */
721                 write_lock_bh(&__ip_vs_rs_lock);
722                 ip_vs_rs_hash(dest);
723                 write_unlock_bh(&__ip_vs_rs_lock);
724         }
725         atomic_set(&dest->conn_flags, conn_flags);
726
727         /* bind the service */
728         if (!dest->svc) {
729                 __ip_vs_bind_svc(dest, svc);
730         } else {
731                 if (dest->svc != svc) {
732                         __ip_vs_unbind_svc(dest);
733                         ip_vs_zero_stats(&dest->stats);
734                         __ip_vs_bind_svc(dest, svc);
735                 }
736         }
737
738         /* set the dest status flags */
739         dest->flags |= IP_VS_DEST_F_AVAILABLE;
740
741         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
742                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
743         dest->u_threshold = udest->u_threshold;
744         dest->l_threshold = udest->l_threshold;
745 }
746
747
748 /*
749  *      Create a destination for the given service
750  */
751 static int
752 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
753                struct ip_vs_dest **dest_p)
754 {
755         struct ip_vs_dest *dest;
756         unsigned atype;
757
758         EnterFunction(2);
759
760         atype = inet_addr_type(udest->addr);
761         if (atype != RTN_LOCAL && atype != RTN_UNICAST)
762                 return -EINVAL;
763
764         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
765         if (dest == NULL) {
766                 IP_VS_ERR("ip_vs_new_dest: kzalloc failed.\n");
767                 return -ENOMEM;
768         }
769
770         dest->protocol = svc->protocol;
771         dest->vaddr = svc->addr;
772         dest->vport = svc->port;
773         dest->vfwmark = svc->fwmark;
774         dest->addr = udest->addr;
775         dest->port = udest->port;
776
777         atomic_set(&dest->activeconns, 0);
778         atomic_set(&dest->inactconns, 0);
779         atomic_set(&dest->persistconns, 0);
780         atomic_set(&dest->refcnt, 0);
781
782         INIT_LIST_HEAD(&dest->d_list);
783         spin_lock_init(&dest->dst_lock);
784         spin_lock_init(&dest->stats.lock);
785         __ip_vs_update_dest(svc, dest, udest);
786         ip_vs_new_estimator(&dest->stats);
787
788         *dest_p = dest;
789
790         LeaveFunction(2);
791         return 0;
792 }
793
794
795 /*
796  *      Add a destination into an existing service
797  */
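/*
 *      Note on refcounting: for a brand-new dest, ip_vs_new_dest() leaves
 *      dest->refcnt at zero and the reference held by the service's
 *      destination list is taken below with atomic_inc(); a dest resurrected
 *      from the trash keeps the reference the trash was already holding.
 */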
798 static int
799 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
800 {
801         struct ip_vs_dest *dest;
802         __be32 daddr = udest->addr;
803         __be16 dport = udest->port;
804         int ret;
805
806         EnterFunction(2);
807
808         if (udest->weight < 0) {
809                 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
810                 return -ERANGE;
811         }
812
813         if (udest->l_threshold > udest->u_threshold) {
814                 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
815                           "upper threshold\n");
816                 return -ERANGE;
817         }
818
819         /*
820          * Check if the dest already exists in the list
821          */
822         dest = ip_vs_lookup_dest(svc, daddr, dport);
823         if (dest != NULL) {
824                 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
825                 return -EEXIST;
826         }
827
828         /*
829          * Check if the dest already exists in the trash and
830          * is from the same service
831          */
832         dest = ip_vs_trash_get_dest(svc, daddr, dport);
833         if (dest != NULL) {
834                 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
835                           "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
836                           NIPQUAD(daddr), ntohs(dport),
837                           atomic_read(&dest->refcnt),
838                           dest->vfwmark,
839                           NIPQUAD(dest->vaddr),
840                           ntohs(dest->vport));
841                 __ip_vs_update_dest(svc, dest, udest);
842
843                 /*
844                  * Get the destination from the trash
845                  */
846                 list_del(&dest->n_list);
847
848                 ip_vs_new_estimator(&dest->stats);
849
850                 write_lock_bh(&__ip_vs_svc_lock);
851
852                 /*
853                  * Wait until all other svc users go away.
854                  */
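                /*
                 * IP_VS_WAIT_WHILE spins (under the write lock) until only
                 * our own usecnt reference on the service remains, so no
                 * scheduler is walking the destination list while we
                 * change it.
                 */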
855                 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
856
857                 list_add(&dest->n_list, &svc->destinations);
858                 svc->num_dests++;
859
860                 /* call the update_service function of its scheduler */
861                 svc->scheduler->update_service(svc);
862
863                 write_unlock_bh(&__ip_vs_svc_lock);
864                 return 0;
865         }
866
867         /*
868          * Allocate and initialize the dest structure
869          */
870         ret = ip_vs_new_dest(svc, udest, &dest);
871         if (ret) {
872                 return ret;
873         }
874
875         /*
876          * Add the dest entry into the list
877          */
878         atomic_inc(&dest->refcnt);
879
880         write_lock_bh(&__ip_vs_svc_lock);
881
882         /*
883          * Wait until all other svc users go away.
884          */
885         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
886
887         list_add(&dest->n_list, &svc->destinations);
888         svc->num_dests++;
889
890         /* call the update_service function of its scheduler */
891         svc->scheduler->update_service(svc);
892
893         write_unlock_bh(&__ip_vs_svc_lock);
894
895         LeaveFunction(2);
896
897         return 0;
898 }
899
900
901 /*
902  *      Edit a destination in the given service
903  */
904 static int
905 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
906 {
907         struct ip_vs_dest *dest;
908         __be32 daddr = udest->addr;
909         __be16 dport = udest->port;
910
911         EnterFunction(2);
912
913         if (udest->weight < 0) {
914                 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
915                 return -ERANGE;
916         }
917
918         if (udest->l_threshold > udest->u_threshold) {
919                 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
920                           "upper threshold\n");
921                 return -ERANGE;
922         }
923
924         /*
925          *  Lookup the destination list
926          */
927         dest = ip_vs_lookup_dest(svc, daddr, dport);
928         if (dest == NULL) {
929                 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
930                 return -ENOENT;
931         }
932
933         __ip_vs_update_dest(svc, dest, udest);
934
935         write_lock_bh(&__ip_vs_svc_lock);
936
937         /* Wait until all other svc users go away */
938         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
939
940         /* call the update_service, because server weight may be changed */
941         svc->scheduler->update_service(svc);
942
943         write_unlock_bh(&__ip_vs_svc_lock);
944
945         LeaveFunction(2);
946
947         return 0;
948 }
949
950
951 /*
952  *      Delete a destination (must be already unlinked from the service)
953  */
954 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
955 {
956         ip_vs_kill_estimator(&dest->stats);
957
958         /*
959          *  Remove it from the d-linked list with the real services.
960  *  Remove it from the d-linked list of real services.
961         write_lock_bh(&__ip_vs_rs_lock);
962         ip_vs_rs_unhash(dest);
963         write_unlock_bh(&__ip_vs_rs_lock);
964
965         /*
966          *  Decrease the refcnt of the dest, and free the dest
967          *  if nobody refers to it (refcnt=0). Otherwise, throw
968          *  the destination into the trash.
969          */
970         if (atomic_dec_and_test(&dest->refcnt)) {
971                 ip_vs_dst_reset(dest);
972                 /* simply decrease svc->refcnt here, let the caller check
973                    and release the service if nobody refers to it.
974                    Only user context can release destination and service,
975                    and only one user context can update virtual service at a
976                    time, so the operation here is OK */
977                 atomic_dec(&dest->svc->refcnt);
978                 kfree(dest);
979         } else {
980                 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
981                           "dest->refcnt=%d\n",
982                           NIPQUAD(dest->addr), ntohs(dest->port),
983                           atomic_read(&dest->refcnt));
984                 list_add(&dest->n_list, &ip_vs_dest_trash);
985                 atomic_inc(&dest->refcnt);
986         }
987 }
988
989
990 /*
991  *      Unlink a destination from the given service
992  */
993 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
994                                 struct ip_vs_dest *dest,
995                                 int svcupd)
996 {
997         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
998
999         /*
1000          *  Remove it from the d-linked destination list.
1001          */
1002         list_del(&dest->n_list);
1003         svc->num_dests--;
1004         if (svcupd) {
1005                 /*
1006                  *  Call the update_service function of its scheduler
1007                  */
1008                 svc->scheduler->update_service(svc);
1009         }
1010 }
1011
1012
1013 /*
1014  *      Delete a destination server in the given service
1015  */
1016 static int
1017 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
1018 {
1019         struct ip_vs_dest *dest;
1020         __be32 daddr = udest->addr;
1021         __be16 dport = udest->port;
1022
1023         EnterFunction(2);
1024
1025         dest = ip_vs_lookup_dest(svc, daddr, dport);
1026         if (dest == NULL) {
1027                 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1028                 return -ENOENT;
1029         }
1030
1031         write_lock_bh(&__ip_vs_svc_lock);
1032
1033         /*
1034          *      Wait until all other svc users go away.
1035          */
1036         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1037
1038         /*
1039          *      Unlink dest from the service
1040          */
1041         __ip_vs_unlink_dest(svc, dest, 1);
1042
1043         write_unlock_bh(&__ip_vs_svc_lock);
1044
1045         /*
1046          *      Delete the destination
1047          */
1048         __ip_vs_del_dest(dest);
1049
1050         LeaveFunction(2);
1051
1052         return 0;
1053 }
1054
1055
1056 /*
1057  *      Add a service into the service hash table
1058  */
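/*
 *      The new service starts with usecnt == 1 (the calling control path)
 *      and refcnt == 0; ip_vs_svc_hash() takes the first refcnt reference
 *      when the service is linked into the table.
 */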
1059 static int
1060 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1061 {
1062         int ret = 0;
1063         struct ip_vs_scheduler *sched = NULL;
1064         struct ip_vs_service *svc = NULL;
1065
1066         /* increase the module use count */
1067         ip_vs_use_count_inc();
1068
1069         /* Lookup the scheduler by 'u->sched_name' */
1070         sched = ip_vs_scheduler_get(u->sched_name);
1071         if (sched == NULL) {
1072                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1073                            u->sched_name);
1074                 ret = -ENOENT;
1075                 goto out_mod_dec;
1076         }
1077
1078         svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1079         if (svc == NULL) {
1080                 IP_VS_DBG(1, "ip_vs_add_service: kzalloc failed.\n");
1081                 ret = -ENOMEM;
1082                 goto out_err;
1083         }
1084
1085         /* I'm the first user of the service */
1086         atomic_set(&svc->usecnt, 1);
1087         atomic_set(&svc->refcnt, 0);
1088
1089         svc->protocol = u->protocol;
1090         svc->addr = u->addr;
1091         svc->port = u->port;
1092         svc->fwmark = u->fwmark;
1093         svc->flags = u->flags;
1094         svc->timeout = u->timeout * HZ;
1095         svc->netmask = u->netmask;
1096
1097         INIT_LIST_HEAD(&svc->destinations);
1098         rwlock_init(&svc->sched_lock);
1099         spin_lock_init(&svc->stats.lock);
1100
1101         /* Bind the scheduler */
1102         ret = ip_vs_bind_scheduler(svc, sched);
1103         if (ret)
1104                 goto out_err;
1105         sched = NULL;
1106
1107         /* Update the virtual service counters */
1108         if (svc->port == FTPPORT)
1109                 atomic_inc(&ip_vs_ftpsvc_counter);
1110         else if (svc->port == 0)
1111                 atomic_inc(&ip_vs_nullsvc_counter);
1112
1113         ip_vs_new_estimator(&svc->stats);
1114         ip_vs_num_services++;
1115
1116         /* Hash the service into the service table */
1117         write_lock_bh(&__ip_vs_svc_lock);
1118         ip_vs_svc_hash(svc);
1119         write_unlock_bh(&__ip_vs_svc_lock);
1120
1121         *svc_p = svc;
1122         return 0;
1123
1124   out_err:
1125         if (svc != NULL) {
1126                 if (svc->scheduler)
1127                         ip_vs_unbind_scheduler(svc);
1128                 if (svc->inc) {
1129                         local_bh_disable();
1130                         ip_vs_app_inc_put(svc->inc);
1131                         local_bh_enable();
1132                 }
1133                 kfree(svc);
1134         }
1135         ip_vs_scheduler_put(sched);
1136
1137   out_mod_dec:
1138         /* decrease the module use count */
1139         ip_vs_use_count_dec();
1140
1141         return ret;
1142 }
1143
1144
1145 /*
1146  *      Edit a service and bind it with a new scheduler
1147  */
1148 static int
1149 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1150 {
1151         struct ip_vs_scheduler *sched, *old_sched;
1152         int ret = 0;
1153
1154         /*
1155          * Lookup the scheduler by 'u->sched_name'
1156          */
1157         sched = ip_vs_scheduler_get(u->sched_name);
1158         if (sched == NULL) {
1159                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1160                            u->sched_name);
1161                 return -ENOENT;
1162         }
1163         old_sched = sched;
1164
1165         write_lock_bh(&__ip_vs_svc_lock);
1166
1167         /*
1168          * Wait until all other svc users go away.
1169          */
1170         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1171
1172         /*
1173          * Set the flags and timeout value
1174          */
1175         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1176         svc->timeout = u->timeout * HZ;
1177         svc->netmask = u->netmask;
1178
1179         old_sched = svc->scheduler;
1180         if (sched != old_sched) {
1181                 /*
1182                  * Unbind the old scheduler
1183                  */
1184                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1185                         old_sched = sched;
1186                         goto out;
1187                 }
1188
1189                 /*
1190                  * Bind the new scheduler
1191                  */
1192                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1193                         /*
1194                          * If ip_vs_bind_scheduler fails, restore the old
1195                          * scheduler.
1196                          * The most likely reason for failure is lack of memory.
1197                          *
1198                          * The open question is whether the old scheduler can
1199                          * always be restored. TODO: if at some point it cannot
1200                          * be restored, we must delete the service, otherwise
1201                          * the system may crash.
1202                          */
1203                         ip_vs_bind_scheduler(svc, old_sched);
1204                         old_sched = sched;
1205                         goto out;
1206                 }
1207         }
1208
1209   out:
1210         write_unlock_bh(&__ip_vs_svc_lock);
1211
1212         if (old_sched)
1213                 ip_vs_scheduler_put(old_sched);
1214
1215         return ret;
1216 }
1217
1218
1219 /*
1220  *      Delete a service from the service list
1221  *      - The service must be unlinked, unlocked and not referenced!
1222  *      - We are called under _bh lock
1223  */
1224 static void __ip_vs_del_service(struct ip_vs_service *svc)
1225 {
1226         struct ip_vs_dest *dest, *nxt;
1227         struct ip_vs_scheduler *old_sched;
1228
1229         ip_vs_num_services--;
1230         ip_vs_kill_estimator(&svc->stats);
1231
1232         /* Unbind scheduler */
1233         old_sched = svc->scheduler;
1234         ip_vs_unbind_scheduler(svc);
1235         if (old_sched)
1236                 ip_vs_scheduler_put(old_sched);
1237
1238         /* Unbind app inc */
1239         if (svc->inc) {
1240                 ip_vs_app_inc_put(svc->inc);
1241                 svc->inc = NULL;
1242         }
1243
1244         /*
1245          *    Unlink the whole destination list
1246          */
1247         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1248                 __ip_vs_unlink_dest(svc, dest, 0);
1249                 __ip_vs_del_dest(dest);
1250         }
1251
1252         /*
1253          *    Update the virtual service counters
1254          */
1255         if (svc->port == FTPPORT)
1256                 atomic_dec(&ip_vs_ftpsvc_counter);
1257         else if (svc->port == 0)
1258                 atomic_dec(&ip_vs_nullsvc_counter);
1259
1260         /*
1261          *    Free the service if nobody refers to it
1262          */
1263         if (atomic_read(&svc->refcnt) == 0)
1264                 kfree(svc);
1265
1266         /* decrease the module use count */
1267         ip_vs_use_count_dec();
1268 }
1269
1270 /*
1271  *      Delete a service from the service list
1272  */
1273 static int ip_vs_del_service(struct ip_vs_service *svc)
1274 {
1275         if (svc == NULL)
1276                 return -EEXIST;
1277
1278         /*
1279          * Unhash it from the service table
1280          */
1281         write_lock_bh(&__ip_vs_svc_lock);
1282
1283         ip_vs_svc_unhash(svc);
1284
1285         /*
1286          * Wait until all the svc users go away.
1287          */
1288         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1289
1290         __ip_vs_del_service(svc);
1291
1292         write_unlock_bh(&__ip_vs_svc_lock);
1293
1294         return 0;
1295 }
1296
1297
1298 /*
1299  *      Flush all the virtual services
1300  */
1301 static int ip_vs_flush(void)
1302 {
1303         int idx;
1304         struct ip_vs_service *svc, *nxt;
1305
1306         /*
1307          * Flush the service table hashed by <protocol,addr,port>
1308          */
1309         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1310                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1311                         write_lock_bh(&__ip_vs_svc_lock);
1312                         ip_vs_svc_unhash(svc);
1313                         /*
1314                          * Wait until all the svc users go away.
1315                          */
1316                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1317                         __ip_vs_del_service(svc);
1318                         write_unlock_bh(&__ip_vs_svc_lock);
1319                 }
1320         }
1321
1322         /*
1323          * Flush the service table hashed by fwmark
1324          */
1325         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1326                 list_for_each_entry_safe(svc, nxt,
1327                                          &ip_vs_svc_fwm_table[idx], f_list) {
1328                         write_lock_bh(&__ip_vs_svc_lock);
1329                         ip_vs_svc_unhash(svc);
1330                         /*
1331                          * Wait until all the svc users go away.
1332                          */
1333                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1334                         __ip_vs_del_service(svc);
1335                         write_unlock_bh(&__ip_vs_svc_lock);
1336                 }
1337         }
1338
1339         return 0;
1340 }
1341
1342
1343 /*
1344  *      Zero counters in a service or all services
1345  */
1346 static int ip_vs_zero_service(struct ip_vs_service *svc)
1347 {
1348         struct ip_vs_dest *dest;
1349
1350         write_lock_bh(&__ip_vs_svc_lock);
1351         list_for_each_entry(dest, &svc->destinations, n_list) {
1352                 ip_vs_zero_stats(&dest->stats);
1353         }
1354         ip_vs_zero_stats(&svc->stats);
1355         write_unlock_bh(&__ip_vs_svc_lock);
1356         return 0;
1357 }
1358
1359 static int ip_vs_zero_all(void)
1360 {
1361         int idx;
1362         struct ip_vs_service *svc;
1363
1364         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1365                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1366                         ip_vs_zero_service(svc);
1367                 }
1368         }
1369
1370         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1371                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1372                         ip_vs_zero_service(svc);
1373                 }
1374         }
1375
1376         ip_vs_zero_stats(&ip_vs_stats);
1377         return 0;
1378 }
1379
1380
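/*
 *      sysctl handler for the defense-mode knobs (drop_entry, drop_packet,
 *      secure_tcp): values outside 0..3 are reverted, and any accepted
 *      change triggers an immediate update_defense_level().
 */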
1381 static int
1382 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1383                      void __user *buffer, size_t *lenp, loff_t *ppos)
1384 {
1385         int *valp = table->data;
1386         int val = *valp;
1387         int rc;
1388
1389         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1390         if (write && (*valp != val)) {
1391                 if ((*valp < 0) || (*valp > 3)) {
1392                         /* Restore the correct value */
1393                         *valp = val;
1394                 } else {
1395                         update_defense_level();
1396                 }
1397         }
1398         return rc;
1399 }
1400
1401
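/*
 *      sync_threshold holds two values (3 and 50 by default); a write is
 *      rejected (the old values are restored) if either entry is negative
 *      or the first is not strictly smaller than the second.
 */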
1402 static int
1403 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1404                        void __user *buffer, size_t *lenp, loff_t *ppos)
1405 {
1406         int *valp = table->data;
1407         int val[2];
1408         int rc;
1409
1410         /* backup the value first */
1411         memcpy(val, valp, sizeof(val));
1412
1413         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1414         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1415                 /* Restore the correct value */
1416                 memcpy(valp, val, sizeof(val));
1417         }
1418         return rc;
1419 }
1420
1421
1422 /*
1423  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1424  */
1425
1426 static struct ctl_table vs_vars[] = {
1427         {
1428                 .ctl_name       = NET_IPV4_VS_AMEMTHRESH,
1429                 .procname       = "amemthresh",
1430                 .data           = &sysctl_ip_vs_amemthresh,
1431                 .maxlen         = sizeof(int),
1432                 .mode           = 0644,
1433                 .proc_handler   = &proc_dointvec,
1434         },
1435 #ifdef CONFIG_IP_VS_DEBUG
1436         {
1437                 .ctl_name       = NET_IPV4_VS_DEBUG_LEVEL,
1438                 .procname       = "debug_level",
1439                 .data           = &sysctl_ip_vs_debug_level,
1440                 .maxlen         = sizeof(int),
1441                 .mode           = 0644,
1442                 .proc_handler   = &proc_dointvec,
1443         },
1444 #endif
1445         {
1446                 .ctl_name       = NET_IPV4_VS_AMDROPRATE,
1447                 .procname       = "am_droprate",
1448                 .data           = &sysctl_ip_vs_am_droprate,
1449                 .maxlen         = sizeof(int),
1450                 .mode           = 0644,
1451                 .proc_handler   = &proc_dointvec,
1452         },
1453         {
1454                 .ctl_name       = NET_IPV4_VS_DROP_ENTRY,
1455                 .procname       = "drop_entry",
1456                 .data           = &sysctl_ip_vs_drop_entry,
1457                 .maxlen         = sizeof(int),
1458                 .mode           = 0644,
1459                 .proc_handler   = &proc_do_defense_mode,
1460         },
1461         {
1462                 .ctl_name       = NET_IPV4_VS_DROP_PACKET,
1463                 .procname       = "drop_packet",
1464                 .data           = &sysctl_ip_vs_drop_packet,
1465                 .maxlen         = sizeof(int),
1466                 .mode           = 0644,
1467                 .proc_handler   = &proc_do_defense_mode,
1468         },
1469         {
1470                 .ctl_name       = NET_IPV4_VS_SECURE_TCP,
1471                 .procname       = "secure_tcp",
1472                 .data           = &sysctl_ip_vs_secure_tcp,
1473                 .maxlen         = sizeof(int),
1474                 .mode           = 0644,
1475                 .proc_handler   = &proc_do_defense_mode,
1476         },
1477 #if 0
1478         {
1479                 .ctl_name       = NET_IPV4_VS_TO_ES,
1480                 .procname       = "timeout_established",
1481                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1482                 .maxlen         = sizeof(int),
1483                 .mode           = 0644,
1484                 .proc_handler   = &proc_dointvec_jiffies,
1485         },
1486         {
1487                 .ctl_name       = NET_IPV4_VS_TO_SS,
1488                 .procname       = "timeout_synsent",
1489                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1490                 .maxlen         = sizeof(int),
1491                 .mode           = 0644,
1492                 .proc_handler   = &proc_dointvec_jiffies,
1493         },
1494         {
1495                 .ctl_name       = NET_IPV4_VS_TO_SR,
1496                 .procname       = "timeout_synrecv",
1497                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1498                 .maxlen         = sizeof(int),
1499                 .mode           = 0644,
1500                 .proc_handler   = &proc_dointvec_jiffies,
1501         },
1502         {
1503                 .ctl_name       = NET_IPV4_VS_TO_FW,
1504                 .procname       = "timeout_finwait",
1505                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1506                 .maxlen         = sizeof(int),
1507                 .mode           = 0644,
1508                 .proc_handler   = &proc_dointvec_jiffies,
1509         },
1510         {
1511                 .ctl_name       = NET_IPV4_VS_TO_TW,
1512                 .procname       = "timeout_timewait",
1513                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1514                 .maxlen         = sizeof(int),
1515                 .mode           = 0644,
1516                 .proc_handler   = &proc_dointvec_jiffies,
1517         },
1518         {
1519                 .ctl_name       = NET_IPV4_VS_TO_CL,
1520                 .procname       = "timeout_close",
1521                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1522                 .maxlen         = sizeof(int),
1523                 .mode           = 0644,
1524                 .proc_handler   = &proc_dointvec_jiffies,
1525         },
1526         {
1527                 .ctl_name       = NET_IPV4_VS_TO_CW,
1528                 .procname       = "timeout_closewait",
1529                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1530                 .maxlen         = sizeof(int),
1531                 .mode           = 0644,
1532                 .proc_handler   = &proc_dointvec_jiffies,
1533         },
1534         {
1535                 .ctl_name       = NET_IPV4_VS_TO_LA,
1536                 .procname       = "timeout_lastack",
1537                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1538                 .maxlen         = sizeof(int),
1539                 .mode           = 0644,
1540                 .proc_handler   = &proc_dointvec_jiffies,
1541         },
1542         {
1543                 .ctl_name       = NET_IPV4_VS_TO_LI,
1544                 .procname       = "timeout_listen",
1545                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1546                 .maxlen         = sizeof(int),
1547                 .mode           = 0644,
1548                 .proc_handler   = &proc_dointvec_jiffies,
1549         },
1550         {
1551                 .ctl_name       = NET_IPV4_VS_TO_SA,
1552                 .procname       = "timeout_synack",
1553                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1554                 .maxlen         = sizeof(int),
1555                 .mode           = 0644,
1556                 .proc_handler   = &proc_dointvec_jiffies,
1557         },
1558         {
1559                 .ctl_name       = NET_IPV4_VS_TO_UDP,
1560                 .procname       = "timeout_udp",
1561                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1562                 .maxlen         = sizeof(int),
1563                 .mode           = 0644,
1564                 .proc_handler   = &proc_dointvec_jiffies,
1565         },
1566         {
1567                 .ctl_name       = NET_IPV4_VS_TO_ICMP,
1568                 .procname       = "timeout_icmp",
1569                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1570                 .maxlen         = sizeof(int),
1571                 .mode           = 0644,
1572                 .proc_handler   = &proc_dointvec_jiffies,
1573         },
1574 #endif
1575         {
1576                 .ctl_name       = NET_IPV4_VS_CACHE_BYPASS,
1577                 .procname       = "cache_bypass",
1578                 .data           = &sysctl_ip_vs_cache_bypass,
1579                 .maxlen         = sizeof(int),
1580                 .mode           = 0644,
1581                 .proc_handler   = &proc_dointvec,
1582         },
1583         {
1584                 .ctl_name       = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1585                 .procname       = "expire_nodest_conn",
1586                 .data           = &sysctl_ip_vs_expire_nodest_conn,
1587                 .maxlen         = sizeof(int),
1588                 .mode           = 0644,
1589                 .proc_handler   = &proc_dointvec,
1590         },
1591         {
1592                 .ctl_name       = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1593                 .procname       = "expire_quiescent_template",
1594                 .data           = &sysctl_ip_vs_expire_quiescent_template,
1595                 .maxlen         = sizeof(int),
1596                 .mode           = 0644,
1597                 .proc_handler   = &proc_dointvec,
1598         },
1599         {
1600                 .ctl_name       = NET_IPV4_VS_SYNC_THRESHOLD,
1601                 .procname       = "sync_threshold",
1602                 .data           = &sysctl_ip_vs_sync_threshold,
1603                 .maxlen         = sizeof(sysctl_ip_vs_sync_threshold),
1604                 .mode           = 0644,
1605                 .proc_handler   = &proc_do_sync_threshold,
1606         },
1607         {
1608                 .ctl_name       = NET_IPV4_VS_NAT_ICMP_SEND,
1609                 .procname       = "nat_icmp_send",
1610                 .data           = &sysctl_ip_vs_nat_icmp_send,
1611                 .maxlen         = sizeof(int),
1612                 .mode           = 0644,
1613                 .proc_handler   = &proc_dointvec,
1614         },
1615         { .ctl_name = 0 }
1616 };
1617
1618 static ctl_table vs_table[] = {
1619         {
1620                 .ctl_name       = NET_IPV4_VS,
1621                 .procname       = "vs",
1622                 .mode           = 0555,
1623                 .child          = vs_vars
1624         },
1625         { .ctl_name = 0 }
1626 };
1627
1628 static ctl_table ipvs_ipv4_table[] = {
1629         {
1630                 .ctl_name       = NET_IPV4,
1631                 .procname       = "ipv4",
1632                 .mode           = 0555,
1633                 .child          = vs_table,
1634         },
1635         { .ctl_name = 0 }
1636 };
1637
1638 static ctl_table vs_root_table[] = {
1639         {
1640                 .ctl_name       = CTL_NET,
1641                 .procname       = "net",
1642                 .mode           = 0555,
1643                 .child          = ipvs_ipv4_table,
1644         },
1645         { .ctl_name = 0 }
1646 };
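
/*
 * The nested tables above hang the IPVS sysctls off net -> ipv4 -> vs,
 * so they appear under /proc/sys/net/ipv4/vs/.  For example (the value
 * written is illustrative):
 *
 *      echo 1 > /proc/sys/net/ipv4/vs/expire_nodest_conn
 *
 * updates sysctl_ip_vs_expire_nodest_conn through proc_dointvec, while
 * sync_threshold goes through the custom proc_do_sync_threshold
 * handler registered above.
 */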
1647
1648 static struct ctl_table_header *sysctl_header;
1649
1650 #ifdef CONFIG_PROC_FS
1651
1652 struct ip_vs_iter {
1653         struct list_head *table;
1654         int bucket;
1655 };
1656
1657 /*
1658  *      Write the contents of the VS rule table to a /proc file.
1659  *      (Kept only for backward compatibility.)
1660  */
1661 static inline const char *ip_vs_fwd_name(unsigned flags)
1662 {
1663         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1664         case IP_VS_CONN_F_LOCALNODE:
1665                 return "Local";
1666         case IP_VS_CONN_F_TUNNEL:
1667                 return "Tunnel";
1668         case IP_VS_CONN_F_DROUTE:
1669                 return "Route";
1670         default:
1671                 return "Masq";
1672         }
1673 }
1674
1675
1676 /* Get the Nth entry in the two lists */
1677 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1678 {
1679         struct ip_vs_iter *iter = seq->private;
1680         int idx;
1681         struct ip_vs_service *svc;
1682
1683         /* look in hash by protocol */
1684         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1685                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1686                         if (pos-- == 0) {
1687                                 iter->table = ip_vs_svc_table;
1688                                 iter->bucket = idx;
1689                                 return svc;
1690                         }
1691                 }
1692         }
1693
1694         /* keep looking in fwmark */
1695         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1696                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1697                         if (pos-- == 0) {
1698                                 iter->table = ip_vs_svc_fwm_table;
1699                                 iter->bucket = idx;
1700                                 return svc;
1701                         }
1702                 }
1703         }
1704
1705         return NULL;
1706 }
1707
1708 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1709 {
1711         read_lock_bh(&__ip_vs_svc_lock);
1712         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1713 }
1714
1715
1716 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1717 {
1718         struct list_head *e;
1719         struct ip_vs_iter *iter;
1720         struct ip_vs_service *svc;
1721
1722         ++*pos;
1723         if (v == SEQ_START_TOKEN)
1724                 return ip_vs_info_array(seq, 0);
1725
1726         svc = v;
1727         iter = seq->private;
1728
1729         if (iter->table == ip_vs_svc_table) {
1730                 /* next service in table hashed by protocol */
1731                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1732                         return list_entry(e, struct ip_vs_service, s_list);
1733
1735                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1736                         list_for_each_entry(svc, &ip_vs_svc_table[iter->bucket],
1737                                             s_list) {
1738                                 return svc;
1739                         }
1740                 }
1741
1742                 iter->table = ip_vs_svc_fwm_table;
1743                 iter->bucket = -1;
1744                 goto scan_fwmark;
1745         }
1746
1747         /* next service in the table hashed by fwmark */
1748         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1749                 return list_entry(e, struct ip_vs_service, f_list);
1750
1751  scan_fwmark:
1752         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1753                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1754                                     f_list)
1755                         return svc;
1756         }
1757
1758         return NULL;
1759 }
1760
1761 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1762 {
1763         read_unlock_bh(&__ip_vs_svc_lock);
1764 }
1765
1766
1767 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1768 {
1769         if (v == SEQ_START_TOKEN) {
1770                 seq_printf(seq,
1771                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1772                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1773                 seq_puts(seq,
1774                          "Prot LocalAddress:Port Scheduler Flags\n");
1775                 seq_puts(seq,
1776                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1777         } else {
1778                 const struct ip_vs_service *svc = v;
1779                 const struct ip_vs_iter *iter = seq->private;
1780                 const struct ip_vs_dest *dest;
1781
1782                 if (iter->table == ip_vs_svc_table)
1783                         seq_printf(seq, "%s  %08X:%04X %s ",
1784                                    ip_vs_proto_name(svc->protocol),
1785                                    ntohl(svc->addr),
1786                                    ntohs(svc->port),
1787                                    svc->scheduler->name);
1788                 else
1789                         seq_printf(seq, "FWM  %08X %s ",
1790                                    svc->fwmark, svc->scheduler->name);
1791
1792                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1793                         seq_printf(seq, "persistent %d %08X\n",
1794                                 svc->timeout,
1795                                 ntohl(svc->netmask));
1796                 else
1797                         seq_putc(seq, '\n');
1798
1799                 list_for_each_entry(dest, &svc->destinations, n_list) {
1800                         seq_printf(seq,
1801                                    "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
1802                                    ntohl(dest->addr), ntohs(dest->port),
1803                                    ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1804                                    atomic_read(&dest->weight),
1805                                    atomic_read(&dest->activeconns),
1806                                    atomic_read(&dest->inactconns));
1807                 }
1808         }
1809         return 0;
1810 }
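
/*
 * For illustration only: a /proc/net/ip_vs listing produced by the
 * show handler above looks roughly like this (version, table size,
 * addresses and counters depend on the build and configuration):
 *
 *      IP Virtual Server version 1.2.1 (size=4096)
 *      Prot LocalAddress:Port Scheduler Flags
 *        -> RemoteAddress:Port Forward Weight ActiveConn InActConn
 *      TCP  C0A80001:0050 rr
 *        -> C0A80101:0050      Masq    1      0          0
 */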
1811
1812 static const struct seq_operations ip_vs_info_seq_ops = {
1813         .start = ip_vs_info_seq_start,
1814         .next  = ip_vs_info_seq_next,
1815         .stop  = ip_vs_info_seq_stop,
1816         .show  = ip_vs_info_seq_show,
1817 };
1818
1819 static int ip_vs_info_open(struct inode *inode, struct file *file)
1820 {
1821         return seq_open_private(file, &ip_vs_info_seq_ops,
1822                         sizeof(struct ip_vs_iter));
1823 }
1824
1825 static const struct file_operations ip_vs_info_fops = {
1826         .owner   = THIS_MODULE,
1827         .open    = ip_vs_info_open,
1828         .read    = seq_read,
1829         .llseek  = seq_lseek,
1830         .release = seq_release_private,
1831 };
1832
1833 #endif
1834
1835 struct ip_vs_stats ip_vs_stats;
1836
1837 #ifdef CONFIG_PROC_FS
1838 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1839 {
1841 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1842         seq_puts(seq,
1843                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1844         seq_puts(seq,
1845                  "   Conns  Packets  Packets            Bytes            Bytes\n");
1846
1847         spin_lock_bh(&ip_vs_stats.lock);
1848         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1849                    ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1850                    (unsigned long long) ip_vs_stats.inbytes,
1851                    (unsigned long long) ip_vs_stats.outbytes);
1852
1853 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1854         seq_puts(seq,
1855                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1856         seq_printf(seq, "%8X %8X %8X %16X %16X\n",
1857                         ip_vs_stats.cps,
1858                         ip_vs_stats.inpps,
1859                         ip_vs_stats.outpps,
1860                         ip_vs_stats.inbps,
1861                         ip_vs_stats.outbps);
1862         spin_unlock_bh(&ip_vs_stats.lock);
1863
1864         return 0;
1865 }
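
/*
 * Sample /proc/net/ip_vs_stats output for illustration; every value is
 * printed in hexadecimal (note the %X conversions above) and the
 * numbers below are invented:
 *
 *         Total Incoming Outgoing         Incoming         Outgoing
 *         Conns  Packets  Packets            Bytes            Bytes
 *            1C      94A      813           57B10C           4A8F22
 *
 *       Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s
 *             0        3        2             1A2C             188E
 */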
1866
1867 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1868 {
1869         return single_open(file, ip_vs_stats_show, NULL);
1870 }
1871
1872 static const struct file_operations ip_vs_stats_fops = {
1873         .owner = THIS_MODULE,
1874         .open = ip_vs_stats_seq_open,
1875         .read = seq_read,
1876         .llseek = seq_lseek,
1877         .release = single_release,
1878 };
1879
1880 #endif
1881
1882 /*
1883  *      Set timeout values for tcp tcpfin udp in the timeout_table.
1884  */
1885 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1886 {
1887         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1888                   u->tcp_timeout,
1889                   u->tcp_fin_timeout,
1890                   u->udp_timeout);
1891
1892 #ifdef CONFIG_IP_VS_PROTO_TCP
1893         if (u->tcp_timeout) {
1894                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1895                         = u->tcp_timeout * HZ;
1896         }
1897
1898         if (u->tcp_fin_timeout) {
1899                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1900                         = u->tcp_fin_timeout * HZ;
1901         }
1902 #endif
1903
1904 #ifdef CONFIG_IP_VS_PROTO_UDP
1905         if (u->udp_timeout) {
1906                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1907                         = u->udp_timeout * HZ;
1908         }
1909 #endif
1910         return 0;
1911 }
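
/*
 * Userspace sketch (illustrative, not part of this file): these
 * timeouts are normally changed through the IP_VS_SO_SET_TIMEOUT
 * sockopt handled by do_ip_vs_set_ctl() below, e.g. from an
 * ipvsadm-style tool holding CAP_NET_ADMIN:
 *
 *      struct ip_vs_timeout_user to = {
 *              .tcp_timeout     = 900,  // seconds; example values only
 *              .tcp_fin_timeout = 120,
 *              .udp_timeout     = 0,    // 0 leaves the current value
 *      };
 *      int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *      setsockopt(fd, IPPROTO_IP, IP_VS_SO_SET_TIMEOUT, &to, sizeof(to));
 *
 * A zero field is skipped by the "if (u->...)" checks above.
 */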
1912
1913
1914 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
1915 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
1916 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
1917                                  sizeof(struct ip_vs_dest_user))
1918 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
1919 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
1920 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
1921
1922 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1923         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
1924         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
1925         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
1926         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
1927         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
1928         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
1929         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
1930         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
1931         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
1932         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
1933         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
1934 };
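
/*
 * Each IP_VS_SO_SET_* command is looked up in set_arglen[] by
 * SET_CMDID(cmd); do_ip_vs_set_ctl() below rejects any request whose
 * optlen does not match the table entry exactly, so every command
 * payload is a fixed-size structure copied in one go.
 */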
1935
1936 static int
1937 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1938 {
1939         int ret;
1940         unsigned char arg[MAX_ARG_LEN];
1941         struct ip_vs_service_user *usvc;
1942         struct ip_vs_service *svc;
1943         struct ip_vs_dest_user *udest;
1944
1945         if (!capable(CAP_NET_ADMIN))
1946                 return -EPERM;
1947
1948         if (len != set_arglen[SET_CMDID(cmd)]) {
1949                 IP_VS_ERR("set_ctl: len %u != %u\n",
1950                           len, set_arglen[SET_CMDID(cmd)]);
1951                 return -EINVAL;
1952         }
1953
1954         if (copy_from_user(arg, user, len) != 0)
1955                 return -EFAULT;
1956
1957         /* increase the module use count */
1958         ip_vs_use_count_inc();
1959
1960         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
1961                 ret = -ERESTARTSYS;
1962                 goto out_dec;
1963         }
1964
1965         if (cmd == IP_VS_SO_SET_FLUSH) {
1966                 /* Flush the virtual service */
1967                 ret = ip_vs_flush();
1968                 goto out_unlock;
1969         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1970                 /* Set timeout values for (tcp tcpfin udp) */
1971                 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1972                 goto out_unlock;
1973         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1974                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1975                 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1976                 goto out_unlock;
1977         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1978                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1979                 ret = stop_sync_thread(dm->state);
1980                 goto out_unlock;
1981         }
1982
1983         usvc = (struct ip_vs_service_user *)arg;
1984         udest = (struct ip_vs_dest_user *)(usvc + 1);
1985
1986         if (cmd == IP_VS_SO_SET_ZERO) {
1987                 /* if no service address is set, zero counters in all */
1988                 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1989                         ret = ip_vs_zero_all();
1990                         goto out_unlock;
1991                 }
1992         }
1993
1994         /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1995         if (usvc->protocol != IPPROTO_TCP && usvc->protocol != IPPROTO_UDP) {
1996                 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1997                           usvc->protocol, NIPQUAD(usvc->addr),
1998                           ntohs(usvc->port), usvc->sched_name);
1999                 ret = -EFAULT;
2000                 goto out_unlock;
2001         }
2002
2003         /* Lookup the exact service by <protocol, addr, port> or fwmark */
2004         if (usvc->fwmark == 0)
2005                 svc = __ip_vs_service_get(usvc->protocol,
2006                                           usvc->addr, usvc->port);
2007         else
2008                 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
2009
2010         if (cmd != IP_VS_SO_SET_ADD
2011             && (svc == NULL || svc->protocol != usvc->protocol)) {
2012                 ret = -ESRCH;
2013                 goto out_unlock;
2014         }
2015
2016         switch (cmd) {
2017         case IP_VS_SO_SET_ADD:
2018                 if (svc != NULL)
2019                         ret = -EEXIST;
2020                 else
2021                         ret = ip_vs_add_service(usvc, &svc);
2022                 break;
2023         case IP_VS_SO_SET_EDIT:
2024                 ret = ip_vs_edit_service(svc, usvc);
2025                 break;
2026         case IP_VS_SO_SET_DEL:
2027                 ret = ip_vs_del_service(svc);
2028                 if (!ret)
2029                         goto out_unlock;
2030                 break;
2031         case IP_VS_SO_SET_ZERO:
2032                 ret = ip_vs_zero_service(svc);
2033                 break;
2034         case IP_VS_SO_SET_ADDDEST:
2035                 ret = ip_vs_add_dest(svc, udest);
2036                 break;
2037         case IP_VS_SO_SET_EDITDEST:
2038                 ret = ip_vs_edit_dest(svc, udest);
2039                 break;
2040         case IP_VS_SO_SET_DELDEST:
2041                 ret = ip_vs_del_dest(svc, udest);
2042                 break;
2043         default:
2044                 ret = -EINVAL;
2045         }
2046
2047         if (svc)
2048                 ip_vs_service_put(svc);
2049
2050   out_unlock:
2051         mutex_unlock(&__ip_vs_mutex);
2052   out_dec:
2053         /* decrease the module use count */
2054         ip_vs_use_count_dec();
2055
2056         return ret;
2057 }
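
/*
 * Userspace sketch of the set path (illustrative only): adding a
 * virtual service sends a bare struct ip_vs_service_user, while the
 * *DEST commands append a struct ip_vs_dest_user right behind it,
 * matching SVCDEST_ARG_LEN and the "udest = (...)(usvc + 1)" parsing
 * above:
 *
 *      unsigned char buf[sizeof(struct ip_vs_service_user) +
 *                        sizeof(struct ip_vs_dest_user)];
 *      // fill the service part, then the dest part, then:
 *      setsockopt(fd, IPPROTO_IP, IP_VS_SO_SET_ADD,
 *                 buf, sizeof(struct ip_vs_service_user));
 *      setsockopt(fd, IPPROTO_IP, IP_VS_SO_SET_ADDDEST, buf, sizeof(buf));
 */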
2058
2059
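/*
 * Snapshot only the counter portion of struct ip_vs_stats: everything
 * that precedes the embedded 'lock' member.  This relies on the
 * counters being laid out before the spinlock in the structure.
 */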
2060 static void
2061 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2062 {
2063         spin_lock_bh(&src->lock);
2064         memcpy(dst, src, (char*)&src->lock - (char*)src);
2065         spin_unlock_bh(&src->lock);
2066 }
2067
2068 static void
2069 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2070 {
2071         dst->protocol = src->protocol;
2072         dst->addr = src->addr;
2073         dst->port = src->port;
2074         dst->fwmark = src->fwmark;
2075         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2076         dst->flags = src->flags;
2077         dst->timeout = src->timeout / HZ;
2078         dst->netmask = src->netmask;
2079         dst->num_dests = src->num_dests;
2080         ip_vs_copy_stats(&dst->stats, &src->stats);
2081 }
2082
2083 static inline int
2084 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2085                             struct ip_vs_get_services __user *uptr)
2086 {
2087         int idx, count=0;
2088         struct ip_vs_service *svc;
2089         struct ip_vs_service_entry entry;
2090         int ret = 0;
2091
2092         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2093                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2094                         if (count >= get->num_services)
2095                                 goto out;
2096                         memset(&entry, 0, sizeof(entry));
2097                         ip_vs_copy_service(&entry, svc);
2098                         if (copy_to_user(&uptr->entrytable[count],
2099                                          &entry, sizeof(entry))) {
2100                                 ret = -EFAULT;
2101                                 goto out;
2102                         }
2103                         count++;
2104                 }
2105         }
2106
2107         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2108                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2109                         if (count >= get->num_services)
2110                                 goto out;
2111                         memset(&entry, 0, sizeof(entry));
2112                         ip_vs_copy_service(&entry, svc);
2113                         if (copy_to_user(&uptr->entrytable[count],
2114                                          &entry, sizeof(entry))) {
2115                                 ret = -EFAULT;
2116                                 goto out;
2117                         }
2118                         count++;
2119                 }
2120         }
2121   out:
2122         return ret;
2123 }
2124
2125 static inline int
2126 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2127                          struct ip_vs_get_dests __user *uptr)
2128 {
2129         struct ip_vs_service *svc;
2130         int ret = 0;
2131
2132         if (get->fwmark)
2133                 svc = __ip_vs_svc_fwm_get(get->fwmark);
2134         else
2135                 svc = __ip_vs_service_get(get->protocol,
2136                                           get->addr, get->port);
2137         if (svc) {
2138                 int count = 0;
2139                 struct ip_vs_dest *dest;
2140                 struct ip_vs_dest_entry entry;
2141
2142                 list_for_each_entry(dest, &svc->destinations, n_list) {
2143                         if (count >= get->num_dests)
2144                                 break;
2145
                        /* zero the entry first so structure padding is
                         * never copied back to user space */
                        memset(&entry, 0, sizeof(entry));
2146                         entry.addr = dest->addr;
2147                         entry.port = dest->port;
2148                         entry.conn_flags = atomic_read(&dest->conn_flags);
2149                         entry.weight = atomic_read(&dest->weight);
2150                         entry.u_threshold = dest->u_threshold;
2151                         entry.l_threshold = dest->l_threshold;
2152                         entry.activeconns = atomic_read(&dest->activeconns);
2153                         entry.inactconns = atomic_read(&dest->inactconns);
2154                         entry.persistconns = atomic_read(&dest->persistconns);
2155                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2156                         if (copy_to_user(&uptr->entrytable[count],
2157                                          &entry, sizeof(entry))) {
2158                                 ret = -EFAULT;
2159                                 break;
2160                         }
2161                         count++;
2162                 }
2163                 ip_vs_service_put(svc);
2164         } else
2165                 ret = -ESRCH;
2166         return ret;
2167 }
2168
2169 static inline void
2170 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2171 {
2172 #ifdef CONFIG_IP_VS_PROTO_TCP
2173         u->tcp_timeout =
2174                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2175         u->tcp_fin_timeout =
2176                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2177 #endif
2178 #ifdef CONFIG_IP_VS_PROTO_UDP
2179         u->udp_timeout =
2180                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2181 #endif
2182 }
2183
2184
2185 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2186 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2187 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2188 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2189 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2190 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2191 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2192
2193 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2194         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2195         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2196         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2197         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2198         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2199         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2200         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2201 };
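
/*
 * Unlike the set path, get_arglen[] holds only the fixed header size:
 * do_ip_vs_get_ctl() below requires *len to be at least that value,
 * and the SERVICES and DESTS handlers then re-check *len against the
 * header plus the caller-sized entrytable[].
 */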
2202
2203 static int
2204 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2205 {
2206         unsigned char arg[128];
2207         int ret = 0;
2208
2209         if (!capable(CAP_NET_ADMIN))
2210                 return -EPERM;
2211
2212         if (*len < get_arglen[GET_CMDID(cmd)]) {
2213                 IP_VS_ERR("get_ctl: len %u < %u\n",
2214                           *len, get_arglen[GET_CMDID(cmd)]);
2215                 return -EINVAL;
2216         }
2217
2218         if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2219                 return -EFAULT;
2220
2221         if (mutex_lock_interruptible(&__ip_vs_mutex))
2222                 return -ERESTARTSYS;
2223
2224         switch (cmd) {
2225         case IP_VS_SO_GET_VERSION:
2226         {
2227                 char buf[64];
2228
2229                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2230                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2231                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2232                         ret = -EFAULT;
2233                         goto out;
2234                 }
2235                 *len = strlen(buf)+1;
2236         }
2237         break;
2238
2239         case IP_VS_SO_GET_INFO:
2240         {
2241                 struct ip_vs_getinfo info;
2242                 info.version = IP_VS_VERSION_CODE;
2243                 info.size = IP_VS_CONN_TAB_SIZE;
2244                 info.num_services = ip_vs_num_services;
2245                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2246                         ret = -EFAULT;
2247         }
2248         break;
2249
2250         case IP_VS_SO_GET_SERVICES:
2251         {
2252                 struct ip_vs_get_services *get;
2253                 int size;
2254
2255                 get = (struct ip_vs_get_services *)arg;
2256                 size = sizeof(*get) +
2257                         sizeof(struct ip_vs_service_entry) * get->num_services;
2258                 if (*len != size) {
2259                         IP_VS_ERR("length: %u != %u\n", *len, size);
2260                         ret = -EINVAL;
2261                         goto out;
2262                 }
2263                 ret = __ip_vs_get_service_entries(get, user);
2264         }
2265         break;
2266
2267         case IP_VS_SO_GET_SERVICE:
2268         {
2269                 struct ip_vs_service_entry *entry;
2270                 struct ip_vs_service *svc;
2271
2272                 entry = (struct ip_vs_service_entry *)arg;
2273                 if (entry->fwmark)
2274                         svc = __ip_vs_svc_fwm_get(entry->fwmark);
2275                 else
2276                         svc = __ip_vs_service_get(entry->protocol,
2277                                                   entry->addr, entry->port);
2278                 if (svc) {
2279                         ip_vs_copy_service(entry, svc);
2280                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2281                                 ret = -EFAULT;
2282                         ip_vs_service_put(svc);
2283                 } else
2284                         ret = -ESRCH;
2285         }
2286         break;
2287
2288         case IP_VS_SO_GET_DESTS:
2289         {
2290                 struct ip_vs_get_dests *get;
2291                 int size;
2292
2293                 get = (struct ip_vs_get_dests *)arg;
2294                 size = sizeof(*get) +
2295                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2296                 if (*len != size) {
2297                         IP_VS_ERR("length: %u != %u\n", *len, size);
2298                         ret = -EINVAL;
2299                         goto out;
2300                 }
2301                 ret = __ip_vs_get_dest_entries(get, user);
2302         }
2303         break;
2304
2305         case IP_VS_SO_GET_TIMEOUT:
2306         {
2307                 struct ip_vs_timeout_user t;
2308
                /* fields for protocols compiled out stay zero instead
                 * of leaking uninitialized stack memory to user space */
                memset(&t, 0, sizeof(t));
2309                 __ip_vs_get_timeouts(&t);
2310                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2311                         ret = -EFAULT;
2312         }
2313         break;
2314
2315         case IP_VS_SO_GET_DAEMON:
2316         {
2317                 struct ip_vs_daemon_user d[2];
2318
2319                 memset(&d, 0, sizeof(d));
2320                 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2321                         d[0].state = IP_VS_STATE_MASTER;
2322                         strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2323                         d[0].syncid = ip_vs_master_syncid;
2324                 }
2325                 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2326                         d[1].state = IP_VS_STATE_BACKUP;
2327                         strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2328                         d[1].syncid = ip_vs_backup_syncid;
2329                 }
2330                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2331                         ret = -EFAULT;
2332         }
2333         break;
2334
2335         default:
2336                 ret = -EINVAL;
2337         }
2338
2339   out:
2340         mutex_unlock(&__ip_vs_mutex);
2341         return ret;
2342 }
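
/*
 * Userspace sketch of the get path (illustrative only): the reply
 * length is negotiated through the getsockopt() optlen argument, e.g.
 *
 *      struct ip_vs_getinfo info;
 *      socklen_t len = sizeof(info);
 *      getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_INFO, &info, &len);
 *      // info.num_services now mirrors ip_vs_num_services and can be
 *      // used to size an IP_VS_SO_GET_SERVICES request.
 */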
2343
2344
2345 static struct nf_sockopt_ops ip_vs_sockopts = {
2346         .pf             = PF_INET,
2347         .set_optmin     = IP_VS_BASE_CTL,
2348         .set_optmax     = IP_VS_SO_SET_MAX+1,
2349         .set            = do_ip_vs_set_ctl,
2350         .get_optmin     = IP_VS_BASE_CTL,
2351         .get_optmax     = IP_VS_SO_GET_MAX+1,
2352         .get            = do_ip_vs_get_ctl,
2353         .owner          = THIS_MODULE,
2354 };
2355
2356
2357 int ip_vs_control_init(void)
2358 {
2359         int ret;
2360         int idx;
2361
2362         EnterFunction(2);
2363
2364         ret = nf_register_sockopt(&ip_vs_sockopts);
2365         if (ret) {
2366                 IP_VS_ERR("cannot register sockopt.\n");
2367                 return ret;
2368         }
2369
2370         proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
2371         proc_net_fops_create(&init_net, "ip_vs_stats", 0, &ip_vs_stats_fops);
2372
2373         sysctl_header = register_sysctl_table(vs_root_table);
2374
2375         /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2376         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
2377                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2378                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2379         }
2380         for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
2381                 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2382         }
2383
2384         memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2385         spin_lock_init(&ip_vs_stats.lock);
2386         ip_vs_new_estimator(&ip_vs_stats);
2387
2388         /* Hook the defense timer */
2389         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2390
2391         LeaveFunction(2);
2392         return 0;
2393 }
2394
2395
2396 void ip_vs_control_cleanup(void)
2397 {
2398         EnterFunction(2);
2399         ip_vs_trash_cleanup();
2400         cancel_rearming_delayed_work(&defense_work);
2401         cancel_work_sync(&defense_work.work);
2402         ip_vs_kill_estimator(&ip_vs_stats);
2403         unregister_sysctl_table(sysctl_header);
2404         proc_net_remove(&init_net, "ip_vs_stats");
2405         proc_net_remove(&init_net, "ip_vs");
2406         nf_unregister_sockopt(&ip_vs_sockopts);
2407         LeaveFunction(2);
2408 }