/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;

void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
        .lhash_lock  = RW_LOCK_UNLOCKED,
        .lhash_users = ATOMIC_INIT(0),
        .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
        return inet_csk_get_port(&tcp_hashinfo, sk, snum,
                                 inet_csk_bind_conflict);
}

static void tcp_v4_hash(struct sock *sk)
{
        inet_hash(&tcp_hashinfo, sk);
}

void tcp_unhash(struct sock *sk)
{
        inet_unhash(&tcp_hashinfo, sk);
}
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
        return secure_tcp_sequence_number(skb->nh.iph->daddr,
                                          skb->nh.iph->saddr,
                                          skb->h.th->dest,
                                          skb->h.th->source);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap, i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's: only the timestamp cache is
           held not per host, but per port pair, and the TW bucket is used
           as the state holder.

           If the TW bucket has already been destroyed we fall back to VJ's
           scheme and use the initial timestamp retrieved from the peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (twp == NULL || (sysctl_tcp_tw_reuse &&
                             xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);
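/*
 * A worked example of the sequence-space jump above (illustrative
 * numbers): if the old connection died with tw_snd_nxt == 1000000, the
 * reused socket starts at write_seq = 1000000 + 65535 + 2 = 1065537,
 * i.e. one maximum unscaled window plus two beyond anything the peer
 * can still hold, so stray old segments cannot alias new data.
 */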
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct rtable *rt;
        u32 daddr, nexthop;
        int tmp;
        int err;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        if (inet->opt && inet->opt->srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet->opt->faddr;
        }

        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                               IPPROTO_TCP,
                               inet->sport, usin->sin_port, sk);
        if (tmp < 0)
                return tmp;

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet->opt || !inet->opt->srr)
                daddr = rt->rt_dst;

        if (!inet->saddr)
                inet->saddr = rt->rt_src;
        inet->rcv_saddr = inet->saddr;

        if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                tp->write_seq              = 0;
        }

        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);

                /* VJ's idea: save the last timestamp seen from the
                 * destination in the peer table when entering TIME-WAIT,
                 * and initialize rx_opt.ts_recent from it when trying a
                 * new connection.
                 */
                if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->rx_opt.ts_recent = peer->tcp_ts;
                }
        }

        inet->dport = usin->sin_port;
        inet->daddr = daddr;

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet->opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

        tp->rx_opt.mss_clamp = 536;

        /* Socket identity is still unknown (sport may be zero).
         * However, we set the state to SYN-SENT and, without releasing
         * the socket lock, select a source port, enter ourselves into
         * the hash tables and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;

        err = ip_route_newports(&rt, IPPROTO_TCP, inet->sport, inet->dport, sk);
        if (err)
                goto failure;

        /* OK, now commit destination to socket. */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->u.dst);

        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
                                                           inet->daddr,
                                                           inet->sport,
                                                           usin->sin_port);

        inet->id = tp->write_seq ^ jiffies;

        err = tcp_connect(sk);
        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /* This unhashes the socket and releases the local port, if necessary. */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->dport = 0;
        return err;
}
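/*
 * Userspace entry point for the function above, as a minimal sketch:
 * connect(2) on an AF_INET stream socket reaches tcp_v4_connect() via
 * inet_stream_connect().  The loopback address and port 8080 are
 * arbitrary examples; error handling is reduced to perror().
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *		struct sockaddr_in sin = {
 *			.sin_family = AF_INET,
 *			.sin_port   = htons(8080),
 *			.sin_addr   = { .s_addr = htonl(INADDR_LOOPBACK) },
 *		};
 *		if (connect(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0)
 *			perror("connect");
 *		return 0;
 *	}
 */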
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);

        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
         * sent out by Linux are always < 576 bytes, so they should go
         * through unfragmented).
         */
        if (sk->sk_state == TCP_LISTEN)
                return;

        /* We don't check in the destentry if pmtu discovery is forbidden
         * on this route. We just assume that no packet-too-big packets
         * are sent back when pmtu discovery is not active.
         * There is a small race when the user changes this flag in the
         * route, but I think that's acceptable.
         */
        if ((dst = __sk_dst_check(sk, 0)) == NULL)
                return;

        dst->ops->update_pmtu(dst, mtu);

        /* Something is about to go wrong... Remember the soft error
         * for the case that this connection will not be able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
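/*
 * The pmtudisc check above honours a per-socket opt-out: with
 * IP_PMTUDISC_DONT the kernel does not set DF, and the MSS-shrinking
 * path above is skipped.  A minimal userspace sketch (fd is an
 * existing TCP socket):
 *
 *	int val = IP_PMTUDISC_DONT;
 *	if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val)) < 0)
 *		perror("setsockopt");
 */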
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
void tcp_v4_err(struct sk_buff *skb, u32 info)
{
        struct iphdr *iph = (struct iphdr *)skb->data;
        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
        struct tcp_sock *tp;
        struct inet_sock *inet;
        int type = skb->h.icmph->type;
        int code = skb->h.icmph->code;
        struct sock *sk;
        __u32 seq;
        int err;

        if (skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }

        sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
                         th->source, inet_iif(skb));
        if (!sk) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put((struct inet_timewait_sock *)sk);
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sock_owned_by_user(sk))
                NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (!sock_owned_by_user(sk))
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
                if (sock_owned_by_user(sk))
                        goto out;

                req = inet_csk_search_req(sk, &prev, th->dest,
                                          iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                 * an established socket here.
                 */
                BUG_TRAP(!req->sk);

                if (seq != tcp_rsk(req)->snt_isn) {
                        NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen normally, but it can,
                             * e.g., if SYNs crossed.
                             */
                if (!sock_owned_by_user(sk)) {
                        TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows us to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and broken firewalls sit in every dark corner sending random
         * errors ordered by their masters, even these two messages have
         * finally lost their original sense (even Linux sends invalid
         * PORT_UNREACHs).
         *
         * Now we are in compliance with the RFCs.
         *                                                  --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}
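/*
 * The inet->recverr test above is the kernel side of IP_RECVERR; a
 * minimal userspace sketch (fd is an existing TCP socket):
 *
 *	int on = 1;
 *	if (setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on)) < 0)
 *		perror("setsockopt");
 *
 * With the flag set, ICMP errors are reported to the application
 * immediately (they can be drained via recvmsg() with MSG_ERRQUEUE)
 * instead of only surfacing as a soft error on timeout.
 */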
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcphdr *th = skb->h.th;

        if (skb->ip_summed == CHECKSUM_HW) {
                th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
                skb->csum = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
                                         csum_partial((char *)th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
 *		      for the reset?
 *	Answer: if a packet caused the RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP. So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
static void tcp_v4_send_reset(struct sk_buff *skb)
{
        struct tcphdr *th = skb->h.th;
        struct tcphdr rth;
        struct ip_reply_arg arg;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rth, 0, sizeof(struct tcphdr));
        rth.dest   = th->source;
        rth.source = th->dest;
        rth.doff   = sizeof(struct tcphdr) / 4;
        rth.rst    = 1;

        if (th->ack) {
                rth.seq = th->ack_seq;
        } else {
                rth.ack = 1;
                rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                    skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof arg);
        arg.iov[0].iov_base = (unsigned char *)&rth;
        arg.iov[0].iov_len  = sizeof rth;
        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
                                      skb->nh.iph->saddr, /*XXX*/
                                      sizeof(struct tcphdr), IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
        TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
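/*
 * Worked example for the ack_seq computation above (illustrative
 * numbers): an offending SYN with seq 5000 and no payload gives
 * ack_seq = 5000 + 1 (syn) + 0 (fin) + 0 (payload) = 5001, so the RST
 * acknowledges exactly the sequence space the segment consumed.
 */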
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside of socket context, is certainly ugly. What can I do?
 */
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 ts)
{
        struct tcphdr *th = skb->h.th;
        struct {
                struct tcphdr th;
                u32 tsopt[3];
        } rep;
        struct ip_reply_arg arg;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof arg);

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (ts) {
                rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                     (TCPOPT_TIMESTAMP << 8) |
                                     TCPOLEN_TIMESTAMP);
                rep.tsopt[1] = htonl(tcp_time_stamp);
                rep.tsopt[2] = htonl(ts);
                arg.iov[0].iov_len = sizeof(rep);
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

        arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
                                      skb->nh.iph->saddr, /*XXX*/
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcptw->tw_ts_recent);

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
{
        tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
                        tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
                        req->ts_recent);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
                              struct dst_entry *dst)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
                goto out;

        skb = tcp_make_synack(sk, dst, req);
        if (skb) {
                struct tcphdr *th = skb->h.th;

                th->check = tcp_v4_check(th, skb->len,
                                         ireq->loc_addr,
                                         ireq->rmt_addr,
                                         csum_partial((char *)th, skb->len,
                                                      skb->csum));

                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
                                            ireq->rmt_addr,
                                            ireq->opt);
                if (err == NET_XMIT_CN)
                        err = 0;
        }

out:
        dst_release(dst);
        return err;
}
/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(inet_rsk(req)->opt);
}
#ifdef CONFIG_SYN_COOKIES
static void syn_flood_warning(struct sk_buff *skb)
{
        static unsigned long warntime;

        if (time_after(jiffies, (warntime + HZ * 60))) {
                warntime = jiffies;
                printk(KERN_INFO
                       "possible SYN flooding on port %d. Sending cookies.\n",
                       ntohs(skb->h.th->dest));
        }
}
#endif
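/*
 * Cookies are only generated when the sysctl is enabled; a minimal
 * sketch of flipping the standard procfs knob from userspace:
 *
 *	FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "w");
 *	if (f) {
 *		fputs("1", f);
 *		fclose(f);
 *	}
 */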
/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options *tcp_v4_save_options(struct sock *sk,
                                              struct sk_buff *skb)
{
        struct ip_options *opt = &(IPCB(skb)->opt);
        struct ip_options *dopt = NULL;

        if (opt && opt->optlen) {
                int opt_size = optlength(opt);
                dopt = kmalloc(opt_size, GFP_ATOMIC);
                if (dopt) {
                        if (ip_options_echo(dopt, skb)) {
                                kfree(dopt);
                                dopt = NULL;
                        }
                }
        }
        return dopt;
}
struct request_sock_ops tcp_request_sock_ops = {
        .family		= PF_INET,
        .obj_size	= sizeof(struct tcp_request_sock),
        .rtx_syn_ack	= tcp_v4_send_synack,
        .send_ack	= tcp_v4_reqsk_send_ack,
        .destructor	= tcp_v4_reqsk_destructor,
        .send_reset	= tcp_v4_send_reset,
};

static struct timewait_sock_ops tcp_timewait_sock_ops = {
        .twsk_obj_size	= sizeof(struct tcp_timewait_sock),
        .twsk_unique	= tcp_twsk_unique,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct inet_request_sock *ireq;
        struct tcp_options_received tmp_opt;
        struct request_sock *req;
        __u32 saddr = skb->nh.iph->saddr;
        __u32 daddr = skb->nh.iph->daddr;
        __u32 isn = TCP_SKB_CB(skb)->when;
        struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

        /* Never answer SYNs sent to broadcast or multicast addresses. */
        if (((struct rtable *)skb->dst)->rt_flags &
            (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitations: they conserve resources and the peer is
         * evidently a real one.
         */
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
                if (sysctl_tcp_syncookies) {
                        want_cookie = 1;
                } else
#endif
                goto drop;
        }

        /* The accept backlog is full. If we have already queued enough
         * warm entries in the syn queue, drop the request. That is better
         * than clogging the syn queue with openreqs with exponentially
         * increasing timeout.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
                goto drop;

        req = reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = 536;
        tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

        tcp_parse_options(skb, &tmp_opt, 0);

        if (want_cookie) {
                tcp_clear_options(&tmp_opt);
                tmp_opt.saw_tstamp = 0;
        }

        if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
                /* Some OSes (unknown ones, but I see them on web servers,
                 * which contain information interesting only for windows
                 * users) do not send their stamp in the SYN. It is an easy
                 * case: we simply do not advertise TS support.
                 */
                tmp_opt.saw_tstamp = 0;
                tmp_opt.tstamp_ok  = 0;
        }
        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

        tcp_openreq_init(req, &tmp_opt, skb);

        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->opt = tcp_v4_save_options(sk, skb);
        if (!want_cookie)
                TCP_ECN_create_request(req, skb->h.th);

        if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
                syn_flood_warning(skb);
#endif
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
        } else if (!isn) {
                struct inet_peer *peer = NULL;

                /* VJ's idea: we save the last timestamp seen
                 * from the destination in the peer table, when entering
                 * state TIME-WAIT, and check against it before
                 * accepting a new connection request.
                 *
                 * If "isn" is not zero, this request hit an alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
                    (dst = inet_csk_route_req(sk, req)) != NULL &&
                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
                    peer->v4daddr == saddr) {
                        if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
                                dst_release(dst);
                                goto drop_and_free;
                        }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         (!peer || !peer->tcp_ts_stamp) &&
                         (!dst || !dst_metric(dst, RTAX_RTT))) {
                        /* Without syncookies the last quarter of the
                         * backlog is filled with destinations proven to
                         * be alive. It means that we continue to
                         * communicate with destinations already
                         * remembered at the moment of the synflood.
                         */
                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
                                       "request from %u.%u.%u.%u/%u\n",
                                       NIPQUAD(saddr),
                                       ntohs(skb->h.th->source));
                        dst_release(dst);
                        goto drop_and_free;
                }

                isn = tcp_v4_init_sequence(sk, skb);
        }
        tcp_rsk(req)->snt_isn = isn;

        if (tcp_v4_send_synack(sk, req, dst))
                goto drop_and_free;

        if (want_cookie) {
                reqsk_free(req);
        } else {
                inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        }
        return 0;

drop_and_free:
        reqsk_free(req);
drop:
        TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
        return 0;
}
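/*
 * The two queue-pressure checks above are sized by the application; a
 * minimal userspace sketch (the backlog value 128 is an arbitrary
 * example):
 *
 *	listen(fd, 128);
 *
 * The accept backlog bounds sk_acceptq_is_full(), while the SYN-queue
 * limit tested by inet_csk_reqsk_queue_is_full() is derived from the
 * same backlog, capped by sysctl_max_syn_backlog.
 */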
/*
 * The three way handshake has completed - we got a valid ACK -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst)
{
        struct inet_request_sock *ireq;
        struct inet_sock *newinet;
        struct tcp_sock *newtp;
        struct sock *newsk;

        if (sk_acceptq_is_full(sk))
                goto exit_overflow;

        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
                goto exit;

        newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk)
                goto exit;

        newsk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(newsk, dst);

        newtp                 = tcp_sk(newsk);
        newinet               = inet_sk(newsk);
        ireq                  = inet_rsk(req);
        newinet->daddr        = ireq->rmt_addr;
        newinet->rcv_saddr    = ireq->loc_addr;
        newinet->saddr        = ireq->loc_addr;
        newinet->opt          = ireq->opt;
        ireq->opt             = NULL;
        newinet->mc_index     = inet_iif(skb);
        newinet->mc_ttl       = skb->nh.iph->ttl;
        inet_csk(newsk)->icsk_ext_hdr_len = 0;
        if (newinet->opt)
                inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
        newinet->id           = newtp->write_seq ^ jiffies;

        tcp_mtup_init(newsk);
        tcp_sync_mss(newsk, dst_mtu(dst));
        newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
        tcp_initialize_rcv_mss(newsk);

        __inet_hash(&tcp_hashinfo, newsk, 0);
        __inet_inherit_port(&tcp_hashinfo, sk, newsk);

        return newsk;

exit_overflow:
        NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
        NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
        dst_release(dst);
        return NULL;
}
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
        struct tcphdr *th = skb->h.th;
        struct iphdr *iph = skb->nh.iph;
        struct sock *nsk;
        struct request_sock **prev;
        /* Find possible connection requests. */
        struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
                                                       iph->saddr, iph->daddr);
        if (req)
                return tcp_check_req(sk, skb, req, prev);

        nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
                                        th->source, skb->nh.iph->daddr,
                                        ntohs(th->dest), inet_iif(skb));

        if (nsk) {
                if (nsk->sk_state != TCP_TIME_WAIT) {
                        bh_lock_sock(nsk);
                        return nsk;
                }
                inet_twsk_put((struct inet_timewait_sock *)nsk);
                return NULL;
        }

#ifdef CONFIG_SYN_COOKIES
        if (!th->rst && !th->syn && th->ack)
                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
        return sk;
}
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_HW) {
                if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
                                  skb->nh.iph->daddr, skb->csum)) {
                        skb->ip_summed = CHECKSUM_UNNECESSARY;
                        return 0;
                }
        }

        skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
                                       skb->len, IPPROTO_TCP, 0);

        if (skb->len <= 76) {
                return __skb_checksum_complete(skb);
        }
        return 0;
}
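/*
 * For reference, the 16-bit one's-complement checksum that
 * csum_tcpudp_nofold() and friends compute incrementally; a minimal,
 * self-contained sketch (not the kernel's optimized implementation):
 *
 *	static unsigned short csum16(const unsigned char *data, int len)
 *	{
 *		unsigned long sum = 0;
 *		while (len > 1) {
 *			sum  += (data[0] << 8) | data[1];
 *			data += 2;
 *			len  -= 2;
 *		}
 *		if (len)
 *			sum += data[0] << 8;
 *		while (sum >> 16)
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return (unsigned short)~sum;
 *	}
 */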
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
                TCP_CHECK_TIMER(sk);
                if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
                        goto reset;
                TCP_CHECK_TIMER(sk);
                return 0;
        }

        if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
                goto csum_err;

        if (sk->sk_state == TCP_LISTEN) {
                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
                if (!nsk)
                        goto discard;

                if (nsk != sk) {
                        if (tcp_child_process(sk, nsk, skb))
                                goto reset;
                        return 0;
                }
        }

        TCP_CHECK_TIMER(sk);
        if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
                goto reset;
        TCP_CHECK_TIMER(sk);
        return 0;

reset:
        tcp_v4_send_reset(skb);
discard:
        kfree_skb(skb);
        /* Be careful here. If this function gets more complicated and
         * gcc suffers from register pressure on the x86, sk (in %ebx)
         * might be destroyed here. This current version compiles correctly,
         * but you have been warned.
         */
        return 0;

csum_err:
        TCP_INC_STATS_BH(TCP_MIB_INERRS);
        goto discard;
}
int tcp_v4_rcv(struct sk_buff *skb)
{
        struct tcphdr *th;
        struct sock *sk;
        int ret;

        if (skb->pkt_type != PACKET_HOST)
                goto discard_it;

        /* Count it even if it's bad */
        TCP_INC_STATS_BH(TCP_MIB_INSEGS);

        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
                goto discard_it;

        th = skb->h.th;

        if (th->doff < sizeof(struct tcphdr) / 4)
                goto bad_packet;
        if (!pskb_may_pull(skb, th->doff * 4))
                goto discard_it;

        /* An explanation is required here, I think.
         * Packet length and doff are validated by header prediction,
         * provided the case of th->doff == 0 is eliminated.
         * So, we defer the checks. */
        if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
             tcp_v4_checksum_init(skb)))
                goto bad_packet;

        th = skb->h.th;
        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                    skb->len - th->doff * 4);
        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
        TCP_SKB_CB(skb)->when    = 0;
        TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
        TCP_SKB_CB(skb)->sacked  = 0;

        sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
                           skb->nh.iph->daddr, ntohs(th->dest),
                           inet_iif(skb));
        if (!sk)
                goto no_tcp_socket;

process:
        if (sk->sk_state == TCP_TIME_WAIT)
                goto do_time_wait;

        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
                goto discard_and_relse;
        nf_reset(skb);

        if (sk_filter(sk, skb, 0))
                goto discard_and_relse;

        skb->dev = NULL;

        bh_lock_sock(sk);
        ret = 0;
        if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
                struct tcp_sock *tp = tcp_sk(sk);
                if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
                        tp->ucopy.dma_chan = get_softnet_dma();
                if (tp->ucopy.dma_chan)
                        ret = tcp_v4_do_rcv(sk, skb);
                else
#endif
                {
                        if (!tcp_prequeue(sk, skb))
                                ret = tcp_v4_do_rcv(sk, skb);
                }
        } else
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);

        sock_put(sk);

        return ret;

no_tcp_socket:
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto discard_it;

        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
                TCP_INC_STATS_BH(TCP_MIB_INERRS);
        } else {
                tcp_v4_send_reset(skb);
        }

discard_it:
        /* Discard frame. */
        kfree_skb(skb);
        return 0;

discard_and_relse:
        sock_put(sk);
        goto discard_it;

do_time_wait:
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                inet_twsk_put((struct inet_timewait_sock *) sk);
                goto discard_it;
        }

        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
                TCP_INC_STATS_BH(TCP_MIB_INERRS);
                inet_twsk_put((struct inet_timewait_sock *) sk);
                goto discard_it;
        }
        switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
                                           skb, th)) {
        case TCP_TW_SYN: {
                struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
                                                        skb->nh.iph->daddr,
                                                        th->dest,
                                                        inet_iif(skb));
                if (sk2) {
                        inet_twsk_deschedule((struct inet_timewait_sock *)sk,
                                             &tcp_death_row);
                        inet_twsk_put((struct inet_timewait_sock *)sk);
                        sk = sk2;
                        goto process;
                }
                /* Fall through to ACK */
        }
        case TCP_TW_ACK:
                tcp_v4_timewait_ack(sk, skb);
                break;
        case TCP_TW_RST:
                goto no_tcp_socket;
        case TCP_TW_SUCCESS:;
        }
        goto discard_it;
}
/* VJ's idea: save the last timestamp seen from this destination
 * and hold it at least for the normal timewait interval, to use for
 * duplicate segment detection in subsequent connections, before they
 * enter the synchronized state.
 */
int tcp_v4_remember_stamp(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
        struct inet_peer *peer = NULL;
        int release_it = 0;

        if (!rt || rt->rt_dst != inet->daddr) {
                peer = inet_getpeer(inet->daddr, 1);
                release_it = 1;
        } else {
                if (!rt->peer)
                        rt_bind_peer(rt, 1);
                peer = rt->peer;
        }

        if (peer) {
                if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
                     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
                        peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
                        peer->tcp_ts = tp->rx_opt.ts_recent;
                }
                if (release_it)
                        inet_putpeer(peer);
                return 1;
        }

        return 0;
}

int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
        struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);

        if (peer) {
                const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

                if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
                     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
                        peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
                        peer->tcp_ts       = tcptw->tw_ts_recent;
                }
                inet_putpeer(peer);
                return 1;
        }

        return 0;
}
struct inet_connection_sock_af_ops ipv4_specific = {
        .queue_xmit	   = ip_queue_xmit,
        .send_check	   = tcp_v4_send_check,
        .rebuild_header	   = inet_sk_rebuild_header,
        .conn_request	   = tcp_v4_conn_request,
        .syn_recv_sock	   = tcp_v4_syn_recv_sock,
        .remember_stamp	   = tcp_v4_remember_stamp,
        .net_header_len	   = sizeof(struct iphdr),
        .setsockopt	   = ip_setsockopt,
        .getsockopt	   = ip_getsockopt,
        .addr2sockaddr	   = inet_csk_addr2sockaddr,
        .sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
        .compat_setsockopt = compat_ip_setsockopt,
        .compat_getsockopt = compat_ip_getsockopt,
#endif
};
/* NOTE: A lot of things are set to zero explicitly by the call to
 *       sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);

        skb_queue_head_init(&tp->out_of_order_queue);
        tcp_init_xmit_timers(sk);
        tcp_prequeue_init(tp);

        icsk->icsk_rto = TCP_TIMEOUT_INIT;
        tp->mdev = TCP_TIMEOUT_INIT;

        /* So many TCP implementations out there (incorrectly) count the
         * initial SYN frame in their delayed-ACK and congestion control
         * algorithms that we must have the following bandaid to talk
         * efficiently to them.  -DaveM
         */
        tp->snd_cwnd = 2;

        /* See draft-stevens-tcpca-spec-01 for discussion of the
         * initialization of these values.
         */
        tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
        tp->snd_cwnd_clamp = ~0;
        tp->mss_cache = 536;

        tp->reordering = sysctl_tcp_reordering;
        icsk->icsk_ca_ops = &tcp_init_congestion_ops;

        sk->sk_state = TCP_CLOSE;

        sk->sk_write_space = sk_stream_write_space;
        sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

        icsk->icsk_af_ops = &ipv4_specific;
        icsk->icsk_sync_mss = tcp_sync_mss;

        sk->sk_sndbuf = sysctl_tcp_wmem[1];
        sk->sk_rcvbuf = sysctl_tcp_rmem[1];

        atomic_inc(&tcp_sockets_allocated);

        return 0;
}
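/*
 * Worked example for the initial values above: with snd_cwnd == 2 and
 * mss_cache == 536, the first flight after the handshake is at most
 * 2 * 536 = 1072 bytes; snd_ssthresh at "infinity" means slow start
 * keeps growing the window until loss or the receiver window limits it.
 */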
int tcp_v4_destroy_sock(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        tcp_clear_xmit_timers(sk);

        tcp_cleanup_congestion_control(sk);

        /* Clean up the write buffer. */
        sk_stream_writequeue_purge(sk);

        /* Cleans up our, hopefully empty, out_of_order_queue. */
        __skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_NET_DMA
        /* Cleans up our sk_async_wait_queue */
        __skb_queue_purge(&sk->sk_async_wait_queue);
#endif

        /* Clean the prequeue; it really must be empty. */
        __skb_queue_purge(&tp->ucopy.prequeue);

        /* Clean up a referenced TCP bind bucket. */
        if (inet_csk(sk)->icsk_bind_hash)
                inet_put_port(&tcp_hashinfo, sk);

        /*
         * If a sendmsg cached page exists, toss it.
         */
        if (sk->sk_sndmsg_page) {
                __free_page(sk->sk_sndmsg_page);
                sk->sk_sndmsg_page = NULL;
        }

        atomic_dec(&tcp_sockets_allocated);

        return 0;
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
{
        return hlist_empty(head) ? NULL :
                list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
        return tw->tw_node.next ?
                hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
static void *listening_get_next(struct seq_file *seq, void *cur)
{
        struct inet_connection_sock *icsk;
        struct hlist_node *node;
        struct sock *sk = cur;
        struct tcp_iter_state* st = seq->private;

        if (!sk) {
                st->bucket = 0;
                sk = sk_head(&tcp_hashinfo.listening_hash[0]);
                goto get_sk;
        }

        ++st->num;

        if (st->state == TCP_SEQ_STATE_OPENREQ) {
                struct request_sock *req = cur;

                icsk = inet_csk(st->syn_wait_sk);
                req = req->dl_next;
                while (1) {
                        while (req) {
                                if (req->rsk_ops->family == st->family) {
                                        cur = req;
                                        goto out;
                                }
                                req = req->dl_next;
                        }
                        if (++st->sbucket >= TCP_SYNQ_HSIZE)
                                break;
get_req:
                        req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
                }
                sk        = sk_next(st->syn_wait_sk);
                st->state = TCP_SEQ_STATE_LISTENING;
                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
        } else {
                icsk = inet_csk(sk);
                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
                if (reqsk_queue_len(&icsk->icsk_accept_queue))
                        goto start_req;
                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
                sk = sk_next(sk);
        }
get_sk:
        sk_for_each_from(sk, node) {
                if (sk->sk_family == st->family) {
                        cur = sk;
                        goto out;
                }
                icsk = inet_csk(sk);
                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
                if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
                        st->uid         = sock_i_uid(sk);
                        st->syn_wait_sk = sk;
                        st->state       = TCP_SEQ_STATE_OPENREQ;
                        st->sbucket     = 0;
                        goto get_req;
                }
                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
        }
        if (++st->bucket < INET_LHTABLE_SIZE) {
                sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
                goto get_sk;
        }
        cur = NULL;
out:
        return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
        void *rc = listening_get_next(seq, NULL);

        while (rc && *pos) {
                rc = listening_get_next(seq, rc);
                --*pos;
        }
        return rc;
}
static void *established_get_first(struct seq_file *seq)
{
        struct tcp_iter_state* st = seq->private;
        void *rc = NULL;

        for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
                struct sock *sk;
                struct hlist_node *node;
                struct inet_timewait_sock *tw;

                /* We can reschedule _before_ having picked the target: */
                cond_resched_softirq();

                read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
                sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
                        if (sk->sk_family != st->family) {
                                continue;
                        }
                        rc = sk;
                        goto out;
                }
                st->state = TCP_SEQ_STATE_TIME_WAIT;
                inet_twsk_for_each(tw, node,
                                   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
                        if (tw->tw_family != st->family) {
                                continue;
                        }
                        rc = tw;
                        goto out;
                }
                read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
                st->state = TCP_SEQ_STATE_ESTABLISHED;
        }
out:
        return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
        struct sock *sk = cur;
        struct inet_timewait_sock *tw;
        struct hlist_node *node;
        struct tcp_iter_state* st = seq->private;

        ++st->num;

        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
                tw = cur;
                tw = tw_next(tw);
get_tw:
                while (tw && tw->tw_family != st->family) {
                        tw = tw_next(tw);
                }
                if (tw) {
                        cur = tw;
                        goto out;
                }
                read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
                st->state = TCP_SEQ_STATE_ESTABLISHED;

                /* We can reschedule between buckets: */
                cond_resched_softirq();

                if (++st->bucket < tcp_hashinfo.ehash_size) {
                        read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
                        sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
                } else {
                        cur = NULL;
                        goto out;
                }
        } else
                sk = sk_next(sk);

        sk_for_each_from(sk, node) {
                if (sk->sk_family == st->family)
                        goto found;
        }

        st->state = TCP_SEQ_STATE_TIME_WAIT;
        tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
        goto get_tw;
found:
        cur = sk;
out:
        return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
        void *rc = established_get_first(seq);

        while (rc && pos) {
                rc = established_get_next(seq, rc);
                --pos;
        }
        return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
        void *rc;
        struct tcp_iter_state* st = seq->private;

        inet_listen_lock(&tcp_hashinfo);
        st->state = TCP_SEQ_STATE_LISTENING;
        rc        = listening_get_idx(seq, &pos);

        if (!rc) {
                inet_listen_unlock(&tcp_hashinfo);
                local_bh_disable();
                st->state = TCP_SEQ_STATE_ESTABLISHED;
                rc        = established_get_idx(seq, pos);
        }

        return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct tcp_iter_state* st = seq->private;
        st->state = TCP_SEQ_STATE_LISTENING;
        st->num   = 0;
        return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        void *rc = NULL;
        struct tcp_iter_state* st;

        if (v == SEQ_START_TOKEN) {
                rc = tcp_get_idx(seq, 0);
                goto out;
        }
        st = seq->private;

        switch (st->state) {
        case TCP_SEQ_STATE_OPENREQ:
        case TCP_SEQ_STATE_LISTENING:
                rc = listening_get_next(seq, v);
                if (!rc) {
                        inet_listen_unlock(&tcp_hashinfo);
                        local_bh_disable();
                        st->state = TCP_SEQ_STATE_ESTABLISHED;
                        rc        = established_get_first(seq);
                }
                break;
        case TCP_SEQ_STATE_ESTABLISHED:
        case TCP_SEQ_STATE_TIME_WAIT:
                rc = established_get_next(seq, v);
                break;
        }
out:
        ++*pos;
        return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
        struct tcp_iter_state* st = seq->private;

        switch (st->state) {
        case TCP_SEQ_STATE_OPENREQ:
                if (v) {
                        struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
                        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
                }
        case TCP_SEQ_STATE_LISTENING:
                if (v != SEQ_START_TOKEN)
                        inet_listen_unlock(&tcp_hashinfo);
                break;
        case TCP_SEQ_STATE_TIME_WAIT:
        case TCP_SEQ_STATE_ESTABLISHED:
                if (v)
                        read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
                local_bh_enable();
                break;
        }
}
static int tcp_seq_open(struct inode *inode, struct file *file)
{
        struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
        struct seq_file *seq;
        struct tcp_iter_state *s;
        int rc;

        if (unlikely(afinfo == NULL))
                return -EINVAL;

        s = kmalloc(sizeof(*s), GFP_KERNEL);
        if (!s)
                return -ENOMEM;
        memset(s, 0, sizeof(*s));
        s->family            = afinfo->family;
        s->seq_ops.start     = tcp_seq_start;
        s->seq_ops.next      = tcp_seq_next;
        s->seq_ops.show      = afinfo->seq_show;
        s->seq_ops.stop      = tcp_seq_stop;

        rc = seq_open(file, &s->seq_ops);
        if (rc)
                goto out_kfree;
        seq          = file->private_data;
        seq->private = s;
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}
int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
        int rc = 0;
        struct proc_dir_entry *p;

        if (!afinfo)
                return -EINVAL;
        afinfo->seq_fops->owner   = afinfo->owner;
        afinfo->seq_fops->open    = tcp_seq_open;
        afinfo->seq_fops->read    = seq_read;
        afinfo->seq_fops->llseek  = seq_lseek;
        afinfo->seq_fops->release = seq_release_private;

        p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
        if (p)
                p->data = afinfo;
        else
                rc = -ENOMEM;
        return rc;
}

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
        if (!afinfo)
                return;
        proc_net_remove(afinfo->name);
        memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}
static void get_openreq4(struct sock *sk, struct request_sock *req,
                         char *tmpbuf, int i, int uid)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        int ttd = req->expires - jiffies;

        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
                i,
                ireq->loc_addr,
                ntohs(inet_sk(sk)->sport),
                ireq->rmt_addr,
                ntohs(ireq->rmt_port),
                TCP_SYN_RECV,
                0, 0, /* could print option size, but that is af dependent. */
                1,    /* timers active (only the expire timer) */
                jiffies_to_clock_t(ttd),
                req->retrans,
                uid,
                0,  /* non standard timer */
                0, /* open_requests have no inode */
                atomic_read(&sk->sk_refcnt),
                req);
}
static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
        int timer_active;
        unsigned long timer_expires;
        struct tcp_sock *tp = tcp_sk(sp);
        const struct inet_connection_sock *icsk = inet_csk(sp);
        struct inet_sock *inet = inet_sk(sp);
        unsigned int dest = inet->daddr;
        unsigned int src = inet->rcv_saddr;
        __u16 destp = ntohs(inet->dport);
        __u16 srcp = ntohs(inet->sport);

        if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
                timer_active  = 1;
                timer_expires = icsk->icsk_timeout;
        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
                timer_active  = 4;
                timer_expires = icsk->icsk_timeout;
        } else if (timer_pending(&sp->sk_timer)) {
                timer_active  = 2;
                timer_expires = sp->sk_timer.expires;
        } else {
                timer_active  = 0;
                timer_expires = jiffies;
        }

        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
                        "%08X %5d %8d %lu %d %p %u %u %u %u %d",
                i, src, srcp, dest, destp, sp->sk_state,
                tp->write_seq - tp->snd_una,
                (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog :
                                               (tp->rcv_nxt - tp->copied_seq),
                timer_active,
                jiffies_to_clock_t(timer_expires - jiffies),
                icsk->icsk_retransmits,
                sock_i_uid(sp),
                icsk->icsk_probes_out,
                sock_i_ino(sp),
                atomic_read(&sp->sk_refcnt), sp,
                icsk->icsk_rto,
                icsk->icsk_ack.ato,
                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
                tp->snd_cwnd,
                tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}
static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
{
        unsigned int dest, src;
        __u16 destp, srcp;
        int ttd = tw->tw_ttd - jiffies;

        if (ttd < 0)
                ttd = 0;

        dest  = tw->tw_daddr;
        src   = tw->tw_rcv_saddr;
        destp = ntohs(tw->tw_dport);
        srcp  = ntohs(tw->tw_sport);

        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
                3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
                atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
        struct tcp_iter_state* st;
        char tmpbuf[TMPSZ + 1];

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "%-*s\n", TMPSZ - 1,
                           "  sl  local_address rem_address   st tx_queue "
                           "rx_queue tr tm->when retrnsmt   uid  timeout "
                           "inode");
                goto out;
        }
        st = seq->private;

        switch (st->state) {
        case TCP_SEQ_STATE_LISTENING:
        case TCP_SEQ_STATE_ESTABLISHED:
                get_tcp4_sock(v, tmpbuf, st->num);
                break;
        case TCP_SEQ_STATE_OPENREQ:
                get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
                break;
        case TCP_SEQ_STATE_TIME_WAIT:
                get_timewait4_sock(v, tmpbuf, st->num);
                break;
        }
        seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
        return 0;
}
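/*
 * For orientation, a sample row as emitted above (illustrative values):
 *
 *	0: 0100007F:1F90 00000000:0000 0A 00000000:00000000 00:00000000 ...
 *
 * i.e. slot 0, local address 127.0.0.1 port 0x1F90 (8080) in hex,
 * remote address unset, state 0x0A (TCP_LISTEN).
 */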
static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
        .owner    = THIS_MODULE,
        .name     = "tcp",
        .family   = AF_INET,
        .seq_show = tcp4_seq_show,
        .seq_fops = &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
        return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
        tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
        .name			= "TCP",
        .owner			= THIS_MODULE,
        .close			= tcp_close,
        .connect		= tcp_v4_connect,
        .disconnect		= tcp_disconnect,
        .accept			= inet_csk_accept,
        .ioctl			= tcp_ioctl,
        .init			= tcp_v4_init_sock,
        .destroy		= tcp_v4_destroy_sock,
        .shutdown		= tcp_shutdown,
        .setsockopt		= tcp_setsockopt,
        .getsockopt		= tcp_getsockopt,
        .sendmsg		= tcp_sendmsg,
        .recvmsg		= tcp_recvmsg,
        .backlog_rcv		= tcp_v4_do_rcv,
        .hash			= tcp_v4_hash,
        .unhash			= tcp_unhash,
        .get_port		= tcp_v4_get_port,
        .enter_memory_pressure	= tcp_enter_memory_pressure,
        .sockets_allocated	= &tcp_sockets_allocated,
        .orphan_count		= &tcp_orphan_count,
        .memory_allocated	= &tcp_memory_allocated,
        .memory_pressure	= &tcp_memory_pressure,
        .sysctl_mem		= sysctl_tcp_mem,
        .sysctl_wmem		= sysctl_tcp_wmem,
        .sysctl_rmem		= sysctl_tcp_rmem,
        .max_header		= MAX_TCP_HEADER,
        .obj_size		= sizeof(struct tcp_sock),
        .twsk_prot		= &tcp_timewait_sock_ops,
        .rsk_prot		= &tcp_request_sock_ops,
#ifdef CONFIG_COMPAT
        .compat_setsockopt	= compat_tcp_setsockopt,
        .compat_getsockopt	= compat_tcp_getsockopt,
#endif
};
void __init tcp_v4_init(struct net_proto_family *ops)
{
        if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
                                     IPPROTO_TCP) < 0)
                panic("Failed to create the TCP control socket.\n");
}
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);