2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
8 * Version: $Id: ip_vs_core.c,v 1.31.2.5 2003/07/29 14:37:12 wensong Exp $
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
19 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
20 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
27 #include <linux/config.h>
28 #include <linux/module.h>
29 #include <linux/types.h>
30 #include <linux/kernel.h>
31 #include <linux/errno.h>
33 #include <linux/tcp.h>
34 #include <linux/icmp.h>
39 #include <net/icmp.h> /* for icmp_send */
40 #include <net/route.h>
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv4.h>
45 #include <net/ip_vs.h>
/*
 * Symbols made available to other modules through EXPORT_SYMBOL.
 * NOTE(review): the #ifdef CONFIG_IP_VS_DEBUG below has no matching #endif
 * among the visible lines; confirm that it is closed right after
 * ip_vs_get_debug_level so that check_for_ip_vs_out is exported
 * unconditionally.
 */
48 EXPORT_SYMBOL(register_ip_vs_scheduler);
49 EXPORT_SYMBOL(unregister_ip_vs_scheduler);
50 EXPORT_SYMBOL(ip_vs_skb_replace);
51 EXPORT_SYMBOL(ip_vs_proto_name);
52 EXPORT_SYMBOL(ip_vs_conn_new);
53 EXPORT_SYMBOL(ip_vs_conn_in_get);
54 EXPORT_SYMBOL(ip_vs_conn_out_get);
55 EXPORT_SYMBOL(ip_vs_conn_listen);
56 EXPORT_SYMBOL(ip_vs_conn_put);
57 #ifdef CONFIG_IP_VS_DEBUG
58 EXPORT_SYMBOL(ip_vs_get_debug_level);
60 EXPORT_SYMBOL(check_for_ip_vs_out);
/*
 * ID used in ICMP lookups: evaluates to the echo identifier (un.echo.id)
 * of the ICMP header that icmph points to.
 * The argument is wrapped in parentheses so that pointer expressions such
 * as hdr + 1 or &hdrs[i] bind to the -> operator as a whole.
 */
#define icmp_id(icmph) (((icmph)->un).echo.id)
/*
 * ip_vs_proto_name - map an IP protocol number to a string.
 *
 * proto: protocol number to describe
 *
 * The only statement visible here formats the number as IP_<n> into buf.
 * NOTE(review): buf is not declared among the visible lines; if it is a
 * shared static buffer, the text is overwritten by the next call that
 * reaches this statement - confirm.
 * NOTE(review): proto is unsigned but is printed with %d; %u would match
 * the argument type.
 */
66 const char *ip_vs_proto_name(unsigned proto)
80 sprintf(buf, "IP_%d", proto);
/*
 * ip_vs_in_stats - account one incoming packet against connection cp.
 *
 * cp:  connection whose destination (cp->dest) is charged
 * skb: the packet; skb->len is added to the inbytes counters
 *
 * The visible updates touch three sets of counters in turn - the
 * destination, its service (dest->svc) and the global ip_vs_stats -
 * each one under its own stats.lock spinlock.
 */
87 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
89 struct ip_vs_dest *dest = cp->dest;
/* The block opened here runs only for a destination that exists and is
   flagged IP_VS_DEST_F_AVAILABLE. */
90 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
91 spin_lock(&dest->stats.lock);
93 dest->stats.inbytes += skb->len;
94 spin_unlock(&dest->stats.lock);
/* Service-level counters of the service that owns the destination. */
96 spin_lock(&dest->svc->stats.lock);
97 dest->svc->stats.inpkts++;
98 dest->svc->stats.inbytes += skb->len;
99 spin_unlock(&dest->svc->stats.lock);
/* Global IPVS counters. */
101 spin_lock(&ip_vs_stats.lock);
102 ip_vs_stats.inpkts++;
103 ip_vs_stats.inbytes += skb->len;
104 spin_unlock(&ip_vs_stats.lock);
/*
 * ip_vs_out_stats - account one outgoing packet against connection cp.
 *
 * cp:  connection whose destination (cp->dest) is charged
 * skb: the packet; skb->len is added to the outbytes counters
 *
 * Mirrors ip_vs_in_stats for the outpkts/outbytes counters: destination,
 * owning service (dest->svc) and global ip_vs_stats are updated in turn,
 * each under its own stats.lock spinlock.
 */
110 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
112 struct ip_vs_dest *dest = cp->dest;
/* The block opened here runs only for a destination that exists and is
   flagged IP_VS_DEST_F_AVAILABLE. */
113 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
114 spin_lock(&dest->stats.lock);
115 dest->stats.outpkts++;
116 dest->stats.outbytes += skb->len;
117 spin_unlock(&dest->stats.lock);
/* Service-level counters of the service that owns the destination. */
119 spin_lock(&dest->svc->stats.lock);
120 dest->svc->stats.outpkts++;
121 dest->svc->stats.outbytes += skb->len;
122 spin_unlock(&dest->svc->stats.lock);
/* Global IPVS counters. */
124 spin_lock(&ip_vs_stats.lock);
125 ip_vs_stats.outpkts++;
126 ip_vs_stats.outbytes += skb->len;
127 spin_unlock(&ip_vs_stats.lock);
/*
 * ip_vs_conn_stats - account a connection against its destination, the
 * service svc and the global statistics.
 *
 * cp:  the connection; cp->dest->stats.conns is incremented
 * svc: the service whose stats.lock is taken next
 *
 * The statements executed under the service lock and the global lock are
 * not among the visible lines (presumably the matching conns counters -
 * confirm).
 * NOTE(review): cp->dest is dereferenced without the NULL and availability
 * check that ip_vs_in_stats and ip_vs_out_stats perform; callers presumably
 * guarantee a bound destination - confirm.
 */
133 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
135 spin_lock(&cp->dest->stats.lock);
136 cp->dest->stats.conns++;
137 spin_unlock(&cp->dest->stats.lock);
139 spin_lock(&svc->stats.lock);
141 spin_unlock(&svc->stats.lock);
143 spin_lock(&ip_vs_stats.lock);
145 spin_unlock(&ip_vs_stats.lock);
149 * IPVS persistent scheduling function
150 * It creates a connection entry according to its template if one exists,
151 * or selects a server and creates a connection entry plus a template.
152 * Locking: we are svc user (svc->refcnt), so we hold all dests too
/*
 * ip_vs_sched_persist - schedule a packet for a persistent service.
 *
 * svc: the virtual service the packet matched
 * iph: IP header of the packet; the port pair is read right behind it
 *
 * Looks up a connection template keyed on the masked client address
 * (snet). When no template is found, or ip_vs_check_template() rejects
 * it, svc->scheduler->schedule() is asked for a destination and a new
 * template is created with ip_vs_conn_new() and given svc->timeout.
 * Afterwards the real connection entry is created and attached to the
 * template with ip_vs_control_add().
 * cp starts out as NULL; the return statements are not among the visible
 * lines.
 */
154 static struct ip_vs_conn *
155 ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
157 struct ip_vs_conn *cp = NULL;
158 struct ip_vs_dest *dest;
160 struct ip_vs_conn *ct;
161 __u16 dport; /* destination port to forward */
162 __u32 snet; /* source network of the client, after masking */
/* The port pair sits directly behind the IP header (ihl*4 bytes in);
   portp[0] is logged with the source address and portp[1] with the
   destination address below. */
164 portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
166 /* Mask saddr with the netmask to adjust template granularity */
167 snet = iph->saddr & svc->netmask;
169 IP_VS_DBG(6, "P-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
170 "mnet %u.%u.%u.%u\n",
171 NIPQUAD(iph->saddr), ntohs(portp[0]),
172 NIPQUAD(iph->daddr), ntohs(portp[1]),
176 * As far as we know, FTP is a very complicated network protocol, and
177 * it uses control connection and data connections. For active FTP,
178 * FTP server initialize data connection to the client, its source port
179 * is often 20. For passive FTP, FTP server tells the clients the port
180 * that it passively listens to, and the client issues the data
181 * connection. In the tunneling or direct routing mode, the load
182 * balancer is on the client-to-server half of connection, the port
183 * number is unknown to the load balancer. So, a conn template like
184 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
185 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
186 * is created for other persistent services.
/* First case: the packet is addressed to the port of the service. */
188 if (portp[1] == svc->port) {
189 /* Check if a template already exists */
190 if (svc->port != FTPPORT)
191 ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
192 iph->daddr, portp[1]);
194 ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
/* A missing template and a template rejected by ip_vs_check_template()
   are handled the same way: schedule a destination and build a template. */
197 if (!ct || !ip_vs_check_template(ct)) {
199 * No template found or the dest of the connection
200 * template is not available.
202 dest = svc->scheduler->schedule(svc, iph);
204 IP_VS_DBG(1, "P-schedule: no dest found.\n");
209 * Create a template like <protocol,caddr,0,
210 * vaddr,vport,daddr,dport> for non-ftp service,
211 * and <protocol,caddr,0,vaddr,0,daddr,0>
214 if (svc->port != FTPPORT)
215 ct = ip_vs_conn_new(iph->protocol,
217 iph->daddr, portp[1],
218 dest->addr, dest->port,
219 IP_VS_CONN_F_TEMPLATE,
222 ct = ip_vs_conn_new(iph->protocol,
226 IP_VS_CONN_F_TEMPLATE,
/* The template lives for the persistence timeout of the service. */
231 ct->timeout = svc->timeout;
233 /* set destination with the found template */
239 * Note: persistent fwmark-based services and persistent
240 * port zero service are handled here.
241 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
242 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
/* Template lookup for the fwmark and port-zero cases: keyed either on
   IPPROTO_IP plus the fwmark or on the protocol of the packet. */
245 ct = ip_vs_ct_in_get(IPPROTO_IP, snet, 0,
246 htonl(svc->fwmark), 0);
248 ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
251 if (!ct || !ip_vs_check_template(ct)) {
253 * If it is not persistent port zero, return NULL,
254 * otherwise create a connection template.
259 dest = svc->scheduler->schedule(svc, iph);
261 IP_VS_DBG(1, "P-schedule: no dest found.\n");
266 * Create a template according to the service
269 ct = ip_vs_conn_new(IPPROTO_IP,
271 htonl(svc->fwmark), 0,
273 IP_VS_CONN_F_TEMPLATE,
276 ct = ip_vs_conn_new(iph->protocol,
280 IP_VS_CONN_F_TEMPLATE,
285 ct->timeout = svc->timeout;
287 /* set destination with the found template */
294 * Create a new connection according to the template
296 cp = ip_vs_conn_new(iph->protocol,
297 iph->saddr, portp[0],
298 iph->daddr, portp[1],
308 * Increase the inactive connection counter
309 * because it is in Syn-Received
310 * state (inactive) when the connection is created.
312 atomic_inc(&dest->inactconns);
/* Attach the new connection to its template. */
317 ip_vs_control_add(cp, ct);
325 * IPVS main scheduling function
326 * It selects a server according to the virtual service, and
327 * creates a connection entry.
/*
 * ip_vs_schedule - pick a destination for a new connection and create the
 * connection entry.
 *
 * svc: the virtual service the packet matched
 * iph: IP header of the packet; the port pair is read right behind it
 *
 * Persistent services (IP_VS_SVC_F_PERSISTENT) are delegated to
 * ip_vs_sched_persist(). Otherwise svc->scheduler->schedule() selects the
 * destination and ip_vs_conn_new() builds the entry. cp starts out as
 * NULL; the remaining return statements are not among the visible lines.
 */
329 static struct ip_vs_conn *
330 ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph)
332 struct ip_vs_conn *cp = NULL;
333 struct ip_vs_dest *dest;
339 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
340 return ip_vs_sched_persist(svc, iph);
343 * Non-persistent service
345 portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
/* A service without fwmark must be hit on its own port; anything else is
   reported as a configuration error. */
346 if (!svc->fwmark && portp[1] != svc->port) {
348 IP_VS_ERR("Schedule: port zero only supported "
349 "in persistent services, "
350 "check your ipvs configuration\n");
354 dest = svc->scheduler->schedule(svc, iph);
356 IP_VS_DBG(1, "Schedule: no dest found.\n");
361 * Create a connection entry.
363 cp = ip_vs_conn_new(iph->protocol,
364 iph->saddr, portp[0],
365 iph->daddr, portp[1],
/* A destination port of zero means: keep the port the client used. */
366 dest->addr, dest->port?dest->port:portp[1],
373 * Increase the inactive connection counter because it is in
374 * Syn-Received state (inactive) when the connection is created.
376 atomic_inc(&dest->inactconns);
378 IP_VS_DBG(6, "Schedule fwd:%c s:%s c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
379 "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
380 ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
381 NIPQUAD(cp->caddr), ntohs(cp->cport),
382 NIPQUAD(cp->vaddr), ntohs(cp->vport),
383 NIPQUAD(cp->daddr), ntohs(cp->dport),
384 cp->flags, atomic_read(&cp->refcnt));
391 * Pass or drop the packet.
392 * Called by ip_vs_in, when the virtual service is available but
393 * no destination is available for a new connection.
/*
 * ip_vs_leave - handle a packet for which a service was found but no
 * connection could be scheduled.
 *
 * svc: the matched service; every visible path releases it with
 *      ip_vs_service_put()
 * skb: the packet
 *
 * Three outcomes are visible:
 *  - cache bypass: for a fwmark service with sysctl_ip_vs_cache_bypass set
 *    and a unicast destination, a connection entry is created and the
 *    packet is sent through cp->packet_xmit();
 *  - FTP service hit on a non-FTP port: only the service is released here;
 *  - otherwise an ICMP port-unreachable is sent for the packet.
 * The values returned on each path are not among the visible lines.
 */
395 static int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb)
397 struct iphdr *iph = skb->nh.iph;
398 __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
400 /* if it is fwmark-based service, the cache_bypass sysctl is up
401 and the destination is RTN_UNICAST (and not local), then create
402 a cache_bypass connection entry */
403 if (sysctl_ip_vs_cache_bypass && svc->fwmark
404 && (inet_addr_type(iph->daddr) == RTN_UNICAST)) {
406 struct ip_vs_conn *cp;
408 ip_vs_service_put(svc);
410 /* create a new connection entry */
411 IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
412 cp = ip_vs_conn_new(iph->protocol,
413 iph->saddr, portp[0],
414 iph->daddr, portp[1],
/* Account the packet and advance the connection state before sending. */
424 ip_vs_in_stats(cp, skb);
427 ip_vs_set_state(cp, VS_STATE_INPUT, iph, portp);
429 /* transmit the first SYN packet */
430 ret = cp->packet_xmit(skb, cp);
432 atomic_inc(&cp->in_pkts);
438 * When the virtual ftp service is presented, packets destined
439 * for other services on the VIP may get here (except services
440 * listed in the ipvs table), pass the packets, because it is
441 * not ipvs job to decide to drop the packets.
443 if ((svc->port == FTPPORT) && (portp[1] != FTPPORT)) {
444 ip_vs_service_put(svc);
448 ip_vs_service_put(svc);
451 * Notify the client that the destination is unreachable, and
452 * release the socket buffer.
453 * Since it is in IP layer, the TCP socket is not actually
454 * created, the TCP RST packet cannot be sent, instead that
455 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
457 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
464 * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
465 * chain, and is used for VS/NAT.
466 * It detects packets for VS/NAT connections and sends the packets
467 * immediately. This can avoid that iptable_nat mangles the packets
/*
 * ip_vs_post_routing - netfilter hook function with the usual hook
 * signature (hooknum, skb_p, in, out, okfn).
 *
 * Tests the NFC_IPVS_PROPERTY bit in skb->nfcache, which ip_vs_out and
 * ip_vs_out_icmp set on packets they have rewritten. The verdicts returned
 * for marked and unmarked packets are not among the visible lines.
 */
470 static unsigned int ip_vs_post_routing(unsigned int hooknum,
471 struct sk_buff **skb_p,
472 const struct net_device *in,
473 const struct net_device *out,
474 int (*okfn)(struct sk_buff *))
476 struct sk_buff *skb = *skb_p;
478 if (!(skb->nfcache & NFC_IPVS_PROPERTY))
481 /* The packet was sent from IPVS, exit this chain */
489 * Handle ICMP messages in the inside-to-outside direction (outgoing).
490 * Find any that might be relevant, check against existing connections,
491 * forward to the right destination host if relevant.
492 * Currently handles error types - unreachable, quench, ttl exceeded.
493 * (Only used in VS/NAT)
/*
 * ip_vs_out_icmp - rewrite an outgoing ICMP error that refers to an IPVS
 * connection.
 *
 * skb_p: address of the packet pointer handed over by ip_vs_out
 *
 * Visible steps: defragment and linearize the packet if needed, accept
 * only the three ICMP error types tested below, validate the embedded IP
 * header (TCP or UDP, first fragment, ports present), verify the ICMP
 * checksum, look the connection up with ip_vs_conn_out_get(), then
 * replace the outer source address and the embedded destination address
 * with cp->vaddr, recompute the ICMP checksum, update the outgoing
 * statistics and mark the packet with NFC_IPVS_PROPERTY.
 * The values returned on each path are not among the visible lines.
 */
495 static int ip_vs_out_icmp(struct sk_buff **skb_p)
497 struct sk_buff *skb = *skb_p;
499 struct icmphdr *icmph;
500 struct iphdr *ciph; /* The ip header contained within the ICMP */
501 __u16 *pptr; /* port numbers from TCP/UDP contained header */
504 unsigned short clen, csize;
505 struct ip_vs_conn *cp;
507 /* reassemble IP fragments, but will it happen in ICMP packets?? */
508 if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
509 skb = ip_defrag(skb, IP_DEFRAG_VS_OUT);
/* The headers are accessed through plain pointers below, so a non-linear
   packet is linearized first. */
515 if (skb_is_nonlinear(skb)) {
516 if (skb_linearize(skb, GFP_ATOMIC) != 0)
518 ip_send_check(skb->nh.iph);
/* len is the ICMP length: total IP length minus the IP header length. */
523 icmph = (struct icmphdr *)((char *)iph + ihl);
524 len = ntohs(iph->tot_len) - ihl;
525 if (len < sizeof(struct icmphdr))
528 IP_VS_DBG(12, "outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
529 icmph->type, ntohs(icmp_id(icmph)),
530 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
533 * Work through seeing if this is for us.
534 * These checks are supposed to be in an order that means easy
535 * things are checked first to speed up processing.... however
536 * this means that some packets will manage to get a long way
537 * down this stack and then be rejected, but that's life.
539 if ((icmph->type != ICMP_DEST_UNREACH) &&
540 (icmph->type != ICMP_SOURCE_QUENCH) &&
541 (icmph->type != ICMP_TIME_EXCEEDED))
544 /* Now find the contained IP header */
545 clen = len - sizeof(struct icmphdr);
546 if (clen < sizeof(struct iphdr))
548 ciph = (struct iphdr *) (icmph + 1);
549 csize = ciph->ihl << 2;
553 /* We are only interested ICMPs generated from TCP or UDP packets */
554 if (ciph->protocol != IPPROTO_UDP && ciph->protocol != IPPROTO_TCP)
557 /* Skip non-first embedded TCP/UDP fragments */
558 if (ciph->frag_off & __constant_htons(IP_OFFSET))
561 /* We need at least TCP/UDP ports here */
562 if (clen < csize + sizeof(struct udphdr))
566 * Find the ports involved - this packet was
567 * incoming so the ports are right way round
568 * (but reversed relative to outer IP header!)
570 pptr = (__u16 *)&(((char *)ciph)[csize]);
572 /* Ensure the checksum is correct */
573 if (ip_compute_csum((unsigned char *) icmph, len)) {
574 /* Failed checksum! */
575 IP_VS_DBG(1, "forward ICMP: failed checksum from %d.%d.%d.%d!\n",
576 NIPQUAD(iph->saddr));
580 IP_VS_DBG(11, "Handling outgoing ICMP for "
581 "%u.%u.%u.%u:%d -> %u.%u.%u.%u:%d\n",
582 NIPQUAD(ciph->saddr), ntohs(pptr[0]),
583 NIPQUAD(ciph->daddr), ntohs(pptr[1]));
585 /* ciph content is actually <protocol, caddr, cport, daddr, dport> */
586 cp = ip_vs_conn_out_get(ciph->protocol, ciph->daddr, pptr[1],
587 ciph->saddr, pptr[0]);
/* A forwarding method other than 0 is treated as unexpected here.
   NOTE(review): the two string pieces below are joined without a space,
   so the logged text reads ...on thehalf connection... */
591 if (IP_VS_FWD_METHOD(cp) != 0) {
592 IP_VS_ERR("shouldn't reach here, because the box is on the"
593 "half connection in the tun/dr module.\n");
596 /* Now we do real damage to this packet...! */
597 /* First change the source IP address, and recalc checksum */
598 iph->saddr = cp->vaddr;
601 /* Now change the *dest* address in the contained IP */
602 ciph->daddr = cp->vaddr;
605 /* the TCP/UDP dest port - cannot redo check */
608 /* And finally the ICMP checksum */
610 icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
611 skb->ip_summed = CHECKSUM_UNNECESSARY;
613 /* do the statistics and put it back */
614 ip_vs_out_stats(cp, skb);
617 IP_VS_DBG(11, "Forwarding correct outgoing ICMP to "
618 "%u.%u.%u.%u:%d -> %u.%u.%u.%u:%d\n",
619 NIPQUAD(ciph->saddr), ntohs(pptr[0]),
620 NIPQUAD(ciph->daddr), ntohs(pptr[1]));
/* Mark the packet as processed by IPVS; ip_vs_out and ip_vs_post_routing
   test this bit. */
622 skb->nfcache |= NFC_IPVS_PROPERTY;
629 * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
630 * Check if outgoing packet belongs to the established ip_vs_conn,
631 * rewrite addresses of the packet and send it on its way...
/*
 * ip_vs_out - netfilter hook function that rewrites packets travelling
 * from a real server back to the client.
 *
 * hooknum, in, out, okfn: standard hook arguments, not used in the
 *                         visible lines
 * skb_p: address of the packet pointer
 *
 * Visible steps: skip packets already marked NFC_IPVS_PROPERTY, hand ICMP
 * to ip_vs_out_icmp(), ignore protocols other than TCP and UDP,
 * defragment, look the connection up with ip_vs_conn_out_get(), replace
 * the source address and port with cp->vaddr and cp->vport, let an
 * application helper adjust the payload, fix the TCP or UDP checksum,
 * update statistics and connection state and mark the packet.
 * The verdicts returned on each path are not among the visible lines.
 */
633 static unsigned int ip_vs_out(unsigned int hooknum,
634 struct sk_buff **skb_p,
635 const struct net_device *in,
636 const struct net_device *out,
637 int (*okfn)(struct sk_buff *))
639 struct sk_buff *skb = *skb_p;
642 struct ip_vs_conn *cp;
648 if (skb->nfcache & NFC_IPVS_PROPERTY)
652 if (iph->protocol == IPPROTO_ICMP)
653 return ip_vs_out_icmp(skb_p);
655 /* let it go if other IP protocols */
656 if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP)
659 /* reassemble IP fragments */
660 if (iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
661 skb = ip_defrag(skb, IP_DEFRAG_VS_OUT);
668 /* make sure that protocol header available in skb data area,
669 note that skb data area may be reallocated. */
671 if (ip_vs_header_check(skb, iph->protocol, ihl) == -1)
/* h overlays the transport header that follows the IP header. */
675 h.raw = (char*) iph + ihl;
678 * Check if the packet belongs to an old entry
680 cp = ip_vs_conn_out_get(iph->protocol, iph->saddr, h.portp[0],
681 iph->daddr, h.portp[1]);
683 if (sysctl_ip_vs_nat_icmp_send &&
684 ip_vs_lookup_real_service(iph->protocol,
685 iph->saddr, h.portp[0])) {
687 * Notify the real server: there is no existing
688 * entry if it is not RST packet or not TCP packet.
/* NOTE(review): h.th->rst is evaluated before the protocol test, so for a
   UDP packet a TCP flag is read through the TCP view of a non-TCP header.
   Consider testing the protocol first so the flag is only read for TCP. */
690 if (!h.th->rst || iph->protocol != IPPROTO_TCP) {
691 icmp_send(skb, ICMP_DEST_UNREACH,
692 ICMP_PORT_UNREACH, 0);
697 IP_VS_DBG(12, "packet for %s %d.%d.%d.%d:%d "
698 "continue traversal as normal.\n",
699 ip_vs_proto_name(iph->protocol),
702 if (skb_is_nonlinear(skb))
708 * If it has ip_vs_app helper, the helper may change the payload,
709 * so it needs full checksum checking and checksum calculation.
710 * If not, only the header (addr/port) is changed, so it is fast
711 * to do incremental checksum update, and let the destination host
712 * do final checksum checking.
715 if (cp->app && skb_is_nonlinear(skb)) {
716 if (skb_linearize(skb, GFP_ATOMIC) != 0) {
721 h.raw = (char*) iph + ihl;
/* size is the length of the transport header plus payload. */
724 size = skb->len - ihl;
725 IP_VS_DBG(11, "O-pkt: %s size=%d\n",
726 ip_vs_proto_name(iph->protocol), size);
728 /* do TCP/UDP checksum checking if it has application helper */
729 if (cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) {
730 switch (skb->ip_summed) {
732 skb->csum = csum_partial(h.raw, size, 0);
734 if (csum_tcpudp_magic(iph->saddr, iph->daddr, size,
735 iph->protocol, skb->csum)) {
737 IP_VS_DBG_RL("Outgoing failed %s checksum "
738 "from %d.%d.%d.%d (size=%d)!\n",
739 ip_vs_proto_name(iph->protocol),
746 /* CHECKSUM_UNNECESSARY */
751 IP_VS_DBG(11, "Outgoing %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d\n",
752 ip_vs_proto_name(iph->protocol),
753 NIPQUAD(iph->saddr), ntohs(h.portp[0]),
754 NIPQUAD(iph->daddr), ntohs(h.portp[1]));
756 /* mangle the packet */
757 iph->saddr = cp->vaddr;
758 h.portp[0] = cp->vport;
761 * Call application helper if needed
763 if (ip_vs_app_pkt_out(cp, skb) != 0) {
764 /* skb data has probably changed, update pointers */
766 h.raw = (char*)iph + ihl;
767 size = skb->len - ihl;
771 * Adjust TCP/UDP checksums
/* Without a helper only addresses and ports changed, so the checksum is
   patched incrementally; a UDP checksum field of zero is left untouched. */
773 if (!cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) {
774 /* Only port and addr are changed, do fast csum update */
775 ip_vs_fast_check_update(&h, cp->daddr, cp->vaddr,
776 cp->dport, cp->vport, iph->protocol);
777 if (skb->ip_summed == CHECKSUM_HW)
778 skb->ip_summed = CHECKSUM_NONE;
780 /* full checksum calculation */
781 switch (iph->protocol) {
784 skb->csum = csum_partial(h.raw, size, 0);
785 h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
788 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%d)\n",
789 ip_vs_proto_name(iph->protocol), h.th->check,
790 (char*)&(h.th->check) - (char*)h.raw);
794 skb->csum = csum_partial(h.raw, size, 0);
795 h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
/* A computed UDP checksum of zero is stored as 0xFFFF; presumably because
   a zero field is treated as no checksum, as the h.uh->check != 0 tests
   above suggest. */
798 if (h.uh->check == 0)
799 h.uh->check = 0xFFFF;
800 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%d)\n",
801 ip_vs_proto_name(iph->protocol), h.uh->check,
802 (char*)&(h.uh->check) - (char*)h.raw);
808 ip_vs_out_stats(cp, skb);
809 ip_vs_set_state(cp, VS_STATE_OUTPUT, iph, h.portp);
/* Mark the packet as rewritten by IPVS. */
812 skb->nfcache |= NFC_IPVS_PROPERTY;
820 * Check if the packet is for VS/NAT connections, then send it
822 * Called by ip_fw_compact to detect packets for VS/NAT before
823 * they are changed by ipchains masquerading code.
/*
 * check_for_ip_vs_out - run ip_vs_out() on a packet outside the normal
 * hook traversal.
 *
 * skb_p: address of the packet pointer, passed on to ip_vs_out()
 * okfn:  continuation function; how it is used is not among the visible
 *        lines
 *
 * ip_vs_out() is called as for the NF_IP_FORWARD hook with the in, out and
 * okfn arguments set to NULL. A verdict other than NF_ACCEPT is handled
 * first; afterwards the NFC_IPVS_PROPERTY bit, which ip_vs_out sets on
 * packets it rewrote, is tested.
 */
825 unsigned int check_for_ip_vs_out(struct sk_buff **skb_p,
826 int (*okfn)(struct sk_buff *))
830 ret = ip_vs_out(NF_IP_FORWARD, skb_p, NULL, NULL, NULL);
831 if (ret != NF_ACCEPT) {
834 /* send the packet immediately if it is already mangled
836 if ((*skb_p)->nfcache & NFC_IPVS_PROPERTY) {
846 * Handle ICMP messages in the outside-to-inside direction (incoming)
847 * and sometimes in outgoing direction from ip_vs_forward_icmp.
848 * Find any that might be relevant, check against existing connections,
849 * forward to the right destination host if relevant.
850 * Currently handles error types - unreachable, quench, ttl exceeded.
852 static int ip_vs_in_icmp(struct sk_buff **skb_p)
854 struct sk_buff *skb = *skb_p;
856 struct icmphdr *icmph;
857 struct iphdr *ciph; /* The ip header contained within the ICMP */
858 __u16 *pptr; /* port numbers from TCP/UDP contained header */
860 unsigned short clen, csize;
861 struct ip_vs_conn *cp;
862 struct rtable *rt; /* Route to the other host */
/*
 * Summary of ip_vs_in_icmp (skb_p is the address of the packet pointer):
 * linearize the packet if needed, accept only the three ICMP error types
 * tested below, validate the embedded IP header (TCP or UDP, first
 * fragment, ports present), verify the ICMP checksum and look the
 * connection up with ip_vs_conn_in_get() using the embedded addresses in
 * swapped order. Connections whose forwarding method is not
 * IP_VS_CONN_F_MASQ are sent unchanged through cp->packet_xmit(). For
 * masquerading connections a route is obtained, the MTU is checked, the
 * outer destination and the embedded source address are replaced with
 * cp->daddr and the ICMP checksum is recomputed.
 * The values returned on each path are not among the visible lines.
 */
865 if (skb_is_nonlinear(skb)) {
866 if (skb_linearize(skb, GFP_ATOMIC) != 0)
/* len is the ICMP length: total IP length minus the IP header length. */
872 icmph = (struct icmphdr *)((char *)iph + (iph->ihl << 2));
873 len = ntohs(iph->tot_len) - (iph->ihl<<2);
874 if (len < sizeof(struct icmphdr))
877 IP_VS_DBG(12, "icmp in (%d,%d) %u.%u.%u.%u -> %u.%u.%u.%u\n",
878 icmph->type, ntohs(icmp_id(icmph)),
879 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
881 if ((icmph->type != ICMP_DEST_UNREACH) &&
882 (icmph->type != ICMP_SOURCE_QUENCH) &&
883 (icmph->type != ICMP_TIME_EXCEEDED))
887 * If we get here we have an ICMP error of one of the above 3 types
888 * Now find the contained IP header
890 clen = len - sizeof(struct icmphdr);
891 if (clen < sizeof(struct iphdr))
893 ciph = (struct iphdr *) (icmph + 1);
894 csize = ciph->ihl << 2;
898 /* We are only interested ICMPs generated from TCP or UDP packets */
899 if (ciph->protocol != IPPROTO_UDP && ciph->protocol != IPPROTO_TCP)
902 /* Skip non-first embedded TCP/UDP fragments */
903 if (ciph->frag_off & __constant_htons(IP_OFFSET))
906 /* We need at least TCP/UDP ports here */
907 if (clen < csize + sizeof(struct udphdr))
910 /* Ensure the checksum is correct */
911 if (ip_compute_csum((unsigned char *) icmph, len)) {
912 /* Failed checksum! */
913 IP_VS_ERR_RL("incoming ICMP: failed checksum from "
914 "%d.%d.%d.%d!\n", NIPQUAD(iph->saddr));
918 pptr = (__u16 *)&(((char *)ciph)[csize]);
920 IP_VS_DBG(11, "Handling incoming ICMP for "
921 "%u.%u.%u.%u:%d -> %u.%u.%u.%u:%d\n",
922 NIPQUAD(ciph->saddr), ntohs(pptr[0]),
923 NIPQUAD(ciph->daddr), ntohs(pptr[1]));
925 /* This is pretty much what ip_vs_conn_in_get() does,
926 except parameters are in the reverse order */
927 cp = ip_vs_conn_in_get(ciph->protocol,
928 ciph->daddr, pptr[1],
929 ciph->saddr, pptr[0]);
933 ip_vs_in_stats(cp, skb);
935 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
936 forwarded directly here, because there is no need to
937 translate address/port back */
938 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
941 ret = cp->packet_xmit(skb, cp);
944 atomic_inc(&cp->in_pkts);
950 * mangle and send the packet here
952 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
/* A packet larger than the path MTU with the DF bit set cannot be
   forwarded; the sender is told so with ICMP_FRAG_NEEDED. */
956 mtu = rt->u.dst.pmtu;
957 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
959 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
960 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
/* Swap the old route of the packet for the route to the real server. */
965 dst_release(skb->dst);
966 skb->dst = &rt->u.dst;
968 /* copy-on-write the packet before mangling it */
969 if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len,
970 &iph, (unsigned char**)&icmph)) {
/* ciph and pptr are derived again from icmph, whose address was handed to
   ip_vs_skb_cow above; presumably that call may move the headers. */
974 ciph = (struct iphdr *) (icmph + 1);
975 pptr = (__u16 *)&(((char *)ciph)[csize]);
977 /* The ICMP packet for VS/NAT must be written to correct addresses
978 before being forwarded to the right server */
980 /* First change the dest IP address, and recalc checksum */
981 iph->daddr = cp->daddr;
984 /* Now change the *source* address in the contained IP */
985 ciph->saddr = cp->daddr;
988 /* the TCP/UDP source port - cannot redo check */
991 /* And finally the ICMP checksum */
993 icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
994 skb->ip_summed = CHECKSUM_UNNECESSARY;
996 IP_VS_DBG(11, "Forwarding incoming ICMP to "
997 "%u.%u.%u.%u:%d -> %u.%u.%u.%u:%d\n",
998 NIPQUAD(ciph->saddr), ntohs(pptr[0]),
999 NIPQUAD(ciph->daddr), ntohs(pptr[1]));
1001 #ifdef CONFIG_NETFILTER_DEBUG
1002 skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
1003 #endif /* CONFIG_NETFILTER_DEBUG */
1009 dst_link_failure(skb);
1018 * Check if it's for virtual services, look it up,
1019 * and send it on its way...
/*
 * ip_vs_in - netfilter hook function for packets addressed to a virtual
 * service.
 *
 * hooknum, in, out, okfn: standard hook arguments, not used in the
 *                         visible lines
 * skb_p: address of the packet pointer
 *
 * Visible steps: ignore packets that are not PACKET_HOST or that arrive on
 * the loopback device, hand ICMP to ip_vs_in_icmp(), ignore protocols
 * other than TCP and UDP, look up an existing connection with
 * ip_vs_conn_in_get(), otherwise find the service and schedule a new
 * connection with ip_vs_schedule() (falling back to ip_vs_leave()), check
 * that the destination is still available, update statistics and state,
 * transmit through cp->packet_xmit() and finally decide whether the
 * connection is passed to ip_vs_sync_conn().
 * The verdicts returned on most paths are not among the visible lines.
 */
1021 static unsigned int ip_vs_in(unsigned int hooknum,
1022 struct sk_buff **skb_p,
1023 const struct net_device *in,
1024 const struct net_device *out,
1025 int (*okfn)(struct sk_buff *))
1027 struct sk_buff *skb = *skb_p;
1028 struct iphdr *iph = skb->nh.iph;
1029 union ip_vs_tphdr h;
1030 struct ip_vs_conn *cp;
1031 struct ip_vs_service *svc;
1036 * Big tappo: only PACKET_HOST (nor loopback neither mcasts)
1037 * ... don't know why 1st test DOES NOT include 2nd (?)
1039 if (skb->pkt_type != PACKET_HOST || skb->dev == &loopback_dev) {
1040 IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
1043 NIPQUAD(iph->daddr));
1047 if (iph->protocol == IPPROTO_ICMP)
1048 return ip_vs_in_icmp(skb_p);
1050 /* let it go if other IP protocols */
1051 if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP)
1054 /* make sure that protocol header available in skb data area,
1055 note that skb data area may be reallocated. */
1056 ihl = iph->ihl << 2;
1057 if (ip_vs_header_check(skb, iph->protocol, ihl) == -1)
/* h overlays the transport header that follows the IP header. */
1060 h.raw = (char*) iph + ihl;
1063 * Check if the packet belongs to an existing connection entry
1065 cp = ip_vs_conn_in_get(iph->protocol, iph->saddr, h.portp[0],
1066 iph->daddr, h.portp[1]);
/* NOTE(review): h.th->syn is evaluated before the protocol test, so for a
   UDP packet a TCP flag is read through the TCP view of a non-TCP header
   even though the outcome is true for UDP either way. Consider testing
   the protocol first. */
1069 (h.th->syn || (iph->protocol!=IPPROTO_TCP)) &&
1070 (svc = ip_vs_service_get(skb->nfmark, iph->protocol,
1071 iph->daddr, h.portp[1]))) {
1072 if (ip_vs_todrop()) {
1074 * It seems that we are very loaded.
1075 * We have to drop this packet :(
1077 ip_vs_service_put(svc);
1082 * Let the virtual server select a real server for the
1083 * incoming connection, and create a connection entry.
1085 cp = ip_vs_schedule(svc, iph);
/* ip_vs_leave() releases the service reference itself; on the success
   path it is released right after the connection statistics update. */
1087 return ip_vs_leave(svc, skb);
1088 ip_vs_conn_stats(cp, svc);
1089 ip_vs_service_put(svc);
1093 /* sorry, all this trouble for a no-hit :) */
1094 IP_VS_DBG(12, "packet for %s %d.%d.%d.%d:%d continue "
1095 "traversal as normal.\n",
1096 ip_vs_proto_name(iph->protocol),
1097 NIPQUAD(iph->daddr),
1102 IP_VS_DBG(11, "Incoming %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d\n",
1103 ip_vs_proto_name(iph->protocol),
1104 NIPQUAD(iph->saddr), ntohs(h.portp[0]),
1105 NIPQUAD(iph->daddr), ntohs(h.portp[1]));
1107 /* Check the server status */
1108 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1109 /* the destination server is not available */
1111 if (sysctl_ip_vs_expire_nodest_conn) {
1112 /* try to expire the connection immediately */
1113 ip_vs_conn_expire_now(cp);
1115 /* don't restart its timer, and silently
1117 __ip_vs_conn_put(cp);
1121 ip_vs_in_stats(cp, skb);
1122 ip_vs_set_state(cp, VS_STATE_INPUT, iph, h.portp);
1123 if (cp->packet_xmit)
1124 ret = cp->packet_xmit(skb, cp);
1126 IP_VS_DBG_RL("warning: packet_xmit is null");
1130 /* increase its packet counter and check if it is needed
1131 to be synchronized */
1132 atomic_inc(&cp->in_pkts);
/* The connection is synchronized when this host is sync master, the
   connection is either non-TCP or in IP_VS_S_ESTABLISHED, and the packet
   counter modulo 50 equals sysctl_ip_vs_sync_threshold.
   NOTE(review): 50 is a bare constant; a threshold of 50 or more can
   never match the remainder, so no sync would be triggered. */
1133 if (ip_vs_sync_state & IP_VS_STATE_MASTER &&
1134 (cp->protocol != IPPROTO_TCP ||
1135 cp->state == IP_VS_S_ESTABLISHED) &&
1136 (atomic_read(&cp->in_pkts) % 50 == sysctl_ip_vs_sync_threshold))
1137 ip_vs_sync_conn(cp);
1145 * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
1146 * packets destined for 0.0.0.0/0.
1147 * When fwmark-based virtual service is used, such as transparent
1148 * cache cluster, TCP packets can be marked and routed to ip_vs_in,
1149 * but ICMP destined for 0.0.0.0/0 cannot be easily marked and
1150 * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain
1151 * and send them to ip_vs_in_icmp.
/*
 * ip_vs_forward_icmp - netfilter hook function that feeds forwarded ICMP
 * packets to ip_vs_in_icmp().
 *
 * hooknum, in, out, okfn: standard hook arguments, not used in the
 *                         visible lines
 * skb_p: address of the packet pointer
 *
 * Packets of other protocols are filtered out first; the verdict returned
 * for them is not among the visible lines. Fragmented packets go through
 * ip_defrag() with IP_DEFRAG_VS_FWD.
 * NOTE(review): the defragmented packet is stored in the local skb while
 * ip_vs_in_icmp() receives skb_p; presumably *skb_p is updated on a line
 * that is not visible here - confirm.
 */
1153 static unsigned int ip_vs_forward_icmp(unsigned int hooknum,
1154 struct sk_buff **skb_p,
1155 const struct net_device *in,
1156 const struct net_device *out,
1157 int (*okfn)(struct sk_buff *))
1159 struct sk_buff *skb = *skb_p;
1160 struct iphdr *iph = skb->nh.iph;
1162 if (iph->protocol != IPPROTO_ICMP)
1165 if (iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
1166 skb = ip_defrag(skb, IP_DEFRAG_VS_FWD);
1172 return ip_vs_in_icmp(skb_p);
1176 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1177 or VS/NAT(change destination), so that filtering rules can be
1179 static struct nf_hook_ops ip_vs_in_ops = {
1181 ip_vs_in, PF_INET, NF_IP_LOCAL_IN, 100
1184 /* After packet filtering, change source only for VS/NAT */
/* Hook descriptors for ip_vs_in (above, at NF_IP_LOCAL_IN) and ip_vs_out
   (below, at NF_IP_FORWARD), both for PF_INET. The initializers are
   positional and their leading members are not among the visible lines.
   The trailing 100 is presumably the hook priority - confirm against the
   member order of struct nf_hook_ops. */
1185 static struct nf_hook_ops ip_vs_out_ops = {
1187 ip_vs_out, PF_INET, NF_IP_FORWARD, 100
1190 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1191 destined for 0.0.0.0/0, which is for incoming IPVS connections */
/* Hook descriptor for ip_vs_forward_icmp at NF_IP_FORWARD for PF_INET.
   The trailing 99 is presumably the hook priority, one below the 100 used
   by ip_vs_out_ops on the same hook - confirm against the member order of
   struct nf_hook_ops. */
1192 static struct nf_hook_ops ip_vs_forward_icmp_ops = {
1194 ip_vs_forward_icmp, PF_INET, NF_IP_FORWARD, 99
1197 /* Before the netfilter connection tracking, exit from POST_ROUTING */
/* Hook descriptor for ip_vs_post_routing at NF_IP_POST_ROUTING for
   PF_INET. The last value is NF_IP_PRI_NAT_SRC-1, presumably a priority
   that places this hook directly ahead of source NAT - confirm against
   the member order of struct nf_hook_ops. */
1198 static struct nf_hook_ops ip_vs_post_routing_ops = {
1200 ip_vs_post_routing, PF_INET, NF_IP_POST_ROUTING, NF_IP_PRI_NAT_SRC-1
1205 * Initialize IP Virtual Server
/*
 * ip_vs_init - module initialization.
 *
 * Sets up the control, connection-table and application-helper parts in
 * that order and then registers the four netfilter hooks (in, out,
 * post_routing, forward_icmp). Each step stores its result in ret; a
 * failure is logged with IP_VS_ERR and jumps to a cleanup label. The
 * cleanup code at the end undoes the completed steps in reverse order.
 * Several goto statements, the cleanup labels other than
 * cleanup_postroutingops and the return statements are not among the
 * visible lines.
 */
1207 static int __init ip_vs_init(void)
1211 ret = ip_vs_control_init();
1213 IP_VS_ERR("can't setup control.\n");
1214 goto cleanup_nothing;
1217 ret = ip_vs_conn_init();
1219 IP_VS_ERR("can't setup connection table.\n");
1220 goto cleanup_control;
1223 ret = ip_vs_app_init();
1225 IP_VS_ERR("can't setup application helper.\n");
/* Hook registration; the order here is mirrored by the unwind code. */
1229 ret = nf_register_hook(&ip_vs_in_ops);
1231 IP_VS_ERR("can't register in hook.\n");
1234 ret = nf_register_hook(&ip_vs_out_ops);
1236 IP_VS_ERR("can't register out hook.\n");
1239 ret = nf_register_hook(&ip_vs_post_routing_ops);
1241 IP_VS_ERR("can't register post_routing hook.\n");
1242 goto cleanup_outops;
1244 ret = nf_register_hook(&ip_vs_forward_icmp_ops);
1246 IP_VS_ERR("can't register forward_icmp hook.\n");
1247 goto cleanup_postroutingops;
1250 IP_VS_INFO("ipvs loaded.\n");
/* Error unwind: undo the completed steps, most recent first. */
1253 cleanup_postroutingops:
1254 nf_unregister_hook(&ip_vs_post_routing_ops);
1256 nf_unregister_hook(&ip_vs_out_ops);
1258 nf_unregister_hook(&ip_vs_in_ops);
1260 ip_vs_app_cleanup();
1262 ip_vs_conn_cleanup();
1264 ip_vs_control_cleanup();
/*
 * ip_vs_cleanup - module exit.
 *
 * Unregisters the four netfilter hooks and then tears down the
 * application-helper, connection-table and control parts, i.e. the
 * reverse of the order used by ip_vs_init.
 */
1269 static void __exit ip_vs_cleanup(void)
1271 nf_unregister_hook(&ip_vs_forward_icmp_ops);
1272 nf_unregister_hook(&ip_vs_post_routing_ops);
1273 nf_unregister_hook(&ip_vs_out_ops);
1274 nf_unregister_hook(&ip_vs_in_ops);
1275 ip_vs_app_cleanup();
1276 ip_vs_conn_cleanup();
1277 ip_vs_control_cleanup();
1278 IP_VS_INFO("ipvs unloaded.\n");
/* Module entry and exit points plus the license declaration. */
1281 module_init(ip_vs_init);
1282 module_exit(ip_vs_cleanup);
1283 MODULE_LICENSE("GPL");