/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Generic socket support routines. Memory allocators, socket lock/release
 *              handler for protocols to use and generic option handler.
 *
 *
 * Version:     $Id: sock.c,v 1.116 2001/11/08 04:20:06 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *              Alan Cox        :       Numerous verify_area() problems
 *              Alan Cox        :       Connecting on a connecting socket
 *                                      now returns an error for tcp.
 *              Alan Cox        :       sock->protocol is set correctly.
 *                                      and is not sometimes left as 0.
 *              Alan Cox        :       connect handles icmp errors on a
 *                                      connect properly. Unfortunately there
 *                                      is a restart syscall nasty there. I
 *                                      can't match BSD without hacking the C
 *                                      library. Ideas urgently sought!
 *              Alan Cox        :       Disallow bind() to addresses that are
 *                                      not ours - especially broadcast ones!!
 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
 *                                      instead they leave that for the DESTROY timer.
 *              Alan Cox        :       Clean up error flag in accept
 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
 *                                      was buggy. Put a remove_sock() in the handler
 *                                      for memory when we hit 0. Also altered the timer
 *                                      code. The ACK stuff can wait and needs major
 *                                      TCP layer surgery.
 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
 *                                      and fixed timer/inet_bh race.
 *              Alan Cox        :       Added zapped flag for TCP
 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
 *      Pauline Middelink       :       identd support
 *              Alan Cox        :       Fixed connect() taking signals I think.
 *              Alan Cox        :       SO_LINGER supported
 *              Alan Cox        :       Error reporting fixes
 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
 *              Alan Cox        :       inet sockets don't set sk->type!
 *              Alan Cox        :       Split socket option code
 *              Alan Cox        :       Callbacks
 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
 *              Alex            :       Removed restriction on inet fioctl
 *              Alan Cox        :       Splitting INET from NET core
 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
 *              Alan Cox        :       Split IP from generic code
 *              Alan Cox        :       New kfree_skbmem()
 *              Alan Cox        :       Make SO_DEBUG superuser only.
 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
 *                                      (compatibility fix)
 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
 *              Alan Cox        :       Allocator for a socket is settable.
 *              Alan Cox        :       SO_ERROR includes soft errors.
 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
 *              Alan Cox        :       Generic socket allocation to make hooks
 *                                      easier (suggested by Craig Metz).
 *              Michael Pall    :       SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
 *              Andi Kleen      :       Fix write_space callback
 *              Chris Evans     :       Security fixes - signedness again
 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/ipsec.h>

#ifdef CONFIG_FILTER
#include <linux/filter.h>
#endif

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS         256
#define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

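/* For a concrete sense of scale (illustrative only; the real value is
 * computed from sizeof(struct sk_buff) at build time): assuming a 32-bit
 * build where sizeof(struct sk_buff) is 160 bytes, _SK_MEM_OVERHEAD is
 * 160 + 256 = 416 bytes, and the limits above come to 416 * 256 = 106496
 * bytes, i.e. roughly 104 KiB per direction.
 */
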
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
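/* With UIO_MAXIOV == 1024 and 4-byte longs (a 32-bit build), this comes
 * to 4 * (2048 + 512) = 10240 bytes; with 8-byte longs it doubles to 20480.
 */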

static int sock_set_timeout(long *timeo_p, char *optval, int optlen)
{
        struct timeval tv;

        if (optlen < sizeof(tv))
                return -EINVAL;
        if (copy_from_user(&tv, optval, sizeof(tv)))
                return -EFAULT;

        *timeo_p = MAX_SCHEDULE_TIMEOUT;
        if (tv.tv_sec == 0 && tv.tv_usec == 0)
                return 0;
        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
        return 0;
}
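
/* The conversion above rounds any fractional tick up. For example, with
 * HZ == 100 (10 ms ticks), a timeout of { .tv_sec = 2, .tv_usec = 500000 }
 * becomes 2*100 + (500000 + 9999)/10000 = 250 jiffies, and the smallest
 * non-zero timeout, { 0, 1 }, still yields one full jiffy rather than zero.
 */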

/*
 *      This is meant for all protocols to use and covers goings on
 *      at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
                    char *optval, int optlen)
{
        struct sock *sk=sock->sk;
#ifdef CONFIG_FILTER
        struct sk_filter *filter;
#endif
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *      Options without arguments
         */

#ifdef SO_DONTLINGER            /* Compatibility item... */
        switch(optname)
        {
                case SO_DONTLINGER:
                        sk->linger=0;
                        return 0;
        }
#endif

        if(optlen<sizeof(int))
                return(-EINVAL);

        if (get_user(val, (int *)optval))
                return -EFAULT;

        valbool = val?1:0;

        lock_sock(sk);

        switch(optname)
        {
                case SO_DEBUG:
                        if(val && !capable(CAP_NET_ADMIN))
                        {
                                ret = -EACCES;
                        }
                        else
                                sk->debug=valbool;
                        break;
                case SO_REUSEADDR:
                        sk->reuse = valbool;
                        break;
                case SO_TYPE:
                case SO_ERROR:
                        ret = -ENOPROTOOPT;
                        break;
                case SO_DONTROUTE:
                        sk->localroute=valbool;
                        break;
                case SO_BROADCAST:
                        sk->broadcast=valbool;
                        break;
                case SO_SNDBUF:
                        /* Don't error on this. BSD doesn't, and if you
                           think about it, this is right. Otherwise apps
                           have to play 'guess the biggest size' games.
                           RCVBUF/SNDBUF are treated in BSD as hints */

                        if (val > sysctl_wmem_max)
                                val = sysctl_wmem_max;

                        sk->userlocks |= SOCK_SNDBUF_LOCK;
                        if ((val * 2) < SOCK_MIN_SNDBUF)
                                sk->sndbuf = SOCK_MIN_SNDBUF;
                        else
                                sk->sndbuf = (val * 2);

                        /*
                         *      Wake up sending tasks if we
                         *      upped the value.
                         */
                        sk->write_space(sk);
                        break;

                case SO_RCVBUF:
                        /* Don't error on this. BSD doesn't, and if you
                           think about it, this is right. Otherwise apps
                           have to play 'guess the biggest size' games.
                           RCVBUF/SNDBUF are treated in BSD as hints */

                        if (val > sysctl_rmem_max)
                                val = sysctl_rmem_max;

                        sk->userlocks |= SOCK_RCVBUF_LOCK;
                        /* FIXME: is this lower bound the right one? */
                        if ((val * 2) < SOCK_MIN_RCVBUF)
                                sk->rcvbuf = SOCK_MIN_RCVBUF;
                        else
                                sk->rcvbuf = (val * 2);
                        break;

                case SO_KEEPALIVE:
#ifdef CONFIG_INET
                        if (sk->protocol == IPPROTO_TCP)
                        {
                                tcp_set_keepalive(sk, valbool);
                        }
#endif
                        sk->keepopen = valbool;
                        break;

                case SO_OOBINLINE:
                        sk->urginline = valbool;
                        break;

                case SO_NO_CHECK:
                        sk->no_check = valbool;
                        break;

                case SO_PRIORITY:
                        if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
                                sk->priority = val;
                        else
                                ret = -EPERM;
                        break;

                case SO_LINGER:
                        if(optlen<sizeof(ling)) {
                                ret = -EINVAL;  /* 1003.1g */
                                break;
                        }
                        if (copy_from_user(&ling,optval,sizeof(ling))) {
                                ret = -EFAULT;
                                break;
                        }
                        if(ling.l_onoff==0) {
                                sk->linger=0;
                        } else {
#if (BITS_PER_LONG == 32)
                                if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
                                        sk->lingertime=MAX_SCHEDULE_TIMEOUT;
                                else
#endif
                                        sk->lingertime=ling.l_linger*HZ;
                                sk->linger=1;
                        }
                        break;

                case SO_BSDCOMPAT:
                        sk->bsdism = valbool;
                        break;

                case SO_PASSCRED:
                        sock->passcred = valbool;
                        break;

                case SO_TIMESTAMP:
                        sk->rcvtstamp = valbool;
                        break;

                case SO_RCVLOWAT:
                        if (val < 0)
                                val = INT_MAX;
                        sk->rcvlowat = val ? : 1;
                        break;

                case SO_RCVTIMEO:
                        ret = sock_set_timeout(&sk->rcvtimeo, optval, optlen);
                        break;

                case SO_SNDTIMEO:
                        ret = sock_set_timeout(&sk->sndtimeo, optval, optlen);
                        break;

#ifdef CONFIG_NETDEVICES
                case SO_BINDTODEVICE:
                {
                        char devname[IFNAMSIZ];

                        /* Sorry... */
                        if (!capable(CAP_NET_RAW)) {
                                ret = -EPERM;
                                break;
                        }

                        /* Bind this socket to a particular device like "eth0",
                         * as specified in the passed interface name. If the
                         * name is "" or the option length is zero the socket
                         * is not bound.
                         */

                        if (!valbool) {
                                sk->bound_dev_if = 0;
                        } else {
                                if (optlen > IFNAMSIZ)
                                        optlen = IFNAMSIZ;
                                if (copy_from_user(devname, optval, optlen)) {
                                        ret = -EFAULT;
                                        break;
                                }

                                /* Remove any cached route for this socket. */
                                sk_dst_reset(sk);

                                if (devname[0] == '\0') {
                                        sk->bound_dev_if = 0;
                                } else {
                                        struct net_device *dev = dev_get_by_name(devname);
                                        if (!dev) {
                                                ret = -ENODEV;
                                                break;
                                        }
                                        sk->bound_dev_if = dev->ifindex;
                                        dev_put(dev);
                                }
                        }
                        break;
                }
#endif


#ifdef CONFIG_FILTER
                case SO_ATTACH_FILTER:
                        ret = -EINVAL;
                        if (optlen == sizeof(struct sock_fprog)) {
                                struct sock_fprog fprog;

                                ret = -EFAULT;
                                if (copy_from_user(&fprog, optval, sizeof(fprog)))
                                        break;

                                ret = sk_attach_filter(&fprog, sk);
                        }
                        break;

                case SO_DETACH_FILTER:
                        spin_lock_bh(&sk->lock.slock);
                        filter = sk->filter;
                        if (filter) {
                                sk->filter = NULL;
                                spin_unlock_bh(&sk->lock.slock);
                                sk_filter_release(sk, filter);
                                break;
                        }
                        spin_unlock_bh(&sk->lock.slock);
                        ret = -ENONET;
                        break;
#endif
                /* We implement the SO_SNDLOWAT etc to
                   not be settable (1003.1g 5.3) */
                default:
                        ret = -ENOPROTOOPT;
                        break;
        }
        release_sock(sk);
        return ret;
}
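
/* Userspace view (illustrative, not part of this file): note that the
 * kernel stores twice the requested buffer size to account for sk_buff
 * overhead, so reading the option back returns the doubled value.
 *
 *      int val = 65536;
 *      socklen_t len = sizeof(val);
 *      setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val));
 *      getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, &len);
 *      // val is now 131072 (or less if clamped by sysctl_wmem_max first)
 */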

int sock_getsockopt(struct socket *sock, int level, int optname,
                    char *optval, int *optlen)
{
        struct sock *sk = sock->sk;

        union
        {
                int val;
                struct linger ling;
                struct timeval tm;
        } v;

        int lv = sizeof(int);
        int len;        /* signed, so the len < 0 check below is meaningful */

        if(get_user(len,optlen))
                return -EFAULT;
        if(len < 0)
                return -EINVAL;

        switch(optname)
        {
                case SO_DEBUG:
                        v.val = sk->debug;
                        break;

                case SO_DONTROUTE:
                        v.val = sk->localroute;
                        break;

                case SO_BROADCAST:
                        v.val = sk->broadcast;
                        break;

                case SO_SNDBUF:
                        v.val = sk->sndbuf;
                        break;

                case SO_RCVBUF:
                        v.val = sk->rcvbuf;
                        break;

                case SO_REUSEADDR:
                        v.val = sk->reuse;
                        break;

                case SO_KEEPALIVE:
                        v.val = sk->keepopen;
                        break;

                case SO_TYPE:
                        v.val = sk->type;
                        break;

                case SO_ERROR:
                        v.val = -sock_error(sk);
                        if(v.val==0)
                                v.val=xchg(&sk->err_soft,0);
                        break;

                case SO_OOBINLINE:
                        v.val = sk->urginline;
                        break;

                case SO_NO_CHECK:
                        v.val = sk->no_check;
                        break;

                case SO_PRIORITY:
                        v.val = sk->priority;
                        break;

                case SO_LINGER:
                        lv=sizeof(v.ling);
                        v.ling.l_onoff=sk->linger;
                        v.ling.l_linger=sk->lingertime/HZ;
                        break;

                case SO_BSDCOMPAT:
                        v.val = sk->bsdism;
                        break;

                case SO_TIMESTAMP:
                        v.val = sk->rcvtstamp;
                        break;

                case SO_RCVTIMEO:
                        lv=sizeof(struct timeval);
                        if (sk->rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
                                v.tm.tv_sec = 0;
                                v.tm.tv_usec = 0;
                        } else {
                                v.tm.tv_sec = sk->rcvtimeo/HZ;
                                v.tm.tv_usec = ((sk->rcvtimeo%HZ)*1000000)/HZ;
                        }
                        break;

                case SO_SNDTIMEO:
                        lv=sizeof(struct timeval);
                        if (sk->sndtimeo == MAX_SCHEDULE_TIMEOUT) {
                                v.tm.tv_sec = 0;
                                v.tm.tv_usec = 0;
                        } else {
                                v.tm.tv_sec = sk->sndtimeo/HZ;
                                v.tm.tv_usec = ((sk->sndtimeo%HZ)*1000000)/HZ;
                        }
                        break;

                case SO_RCVLOWAT:
                        v.val = sk->rcvlowat;
                        break;

                case SO_SNDLOWAT:
                        v.val=1;
                        break;

                case SO_PASSCRED:
                        v.val = sock->passcred;
                        break;

                case SO_PEERCRED:
                        if (len > sizeof(sk->peercred))
                                len = sizeof(sk->peercred);
                        if (copy_to_user(optval, &sk->peercred, len))
                                return -EFAULT;
                        goto lenout;

                case SO_PEERNAME:
                {
                        char address[128];

                        if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
                                return -ENOTCONN;
                        if (lv < len)
                                return -EINVAL;
                        if(copy_to_user((void*)optval, address, len))
                                return -EFAULT;
                        goto lenout;
                }

                /* Dubious BSD thing... Probably nobody even uses it, but
                 * the UNIX standard wants it for whatever reason... -DaveM
                 */
                case SO_ACCEPTCONN:
                        v.val = (sk->state == TCP_LISTEN);
                        break;

                default:
                        return(-ENOPROTOOPT);
        }
        if (len > lv)
                len = lv;
        if (copy_to_user(optval, &v, len))
                return -EFAULT;
lenout:
        if (put_user(len, optlen))
                return -EFAULT;
        return 0;
}
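
/* A common userspace use of the SO_ERROR branch above (illustrative):
 * after a non-blocking connect(), wait for writability and then fetch
 * the deferred error status, which also clears it.
 *
 *      int err = 0;
 *      socklen_t elen = sizeof(err);
 *      getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &elen);
 *      // err == 0 means the connect succeeded; otherwise it holds the errno
 */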

static kmem_cache_t *sk_cachep;

/*
 *      All socket objects are allocated here. This is for future
 *      usage.
 */

struct sock *sk_alloc(int family, int priority, int zero_it)
{
        struct sock *sk = kmem_cache_alloc(sk_cachep, priority);

        if(sk && zero_it) {
                memset(sk, 0, sizeof(struct sock));
                sk->family = family;
                sock_lock_init(sk);
        }

        return sk;
}

void sk_free(struct sock *sk)
{
#ifdef CONFIG_FILTER
        struct sk_filter *filter;
#endif

        if (sk->destruct)
                sk->destruct(sk);

#ifdef CONFIG_FILTER
        filter = sk->filter;
        if (filter) {
                sk_filter_release(sk, filter);
                sk->filter = NULL;
        }
#endif

        if (atomic_read(&sk->omem_alloc))
                printk(KERN_DEBUG "sk_free: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc));

        kmem_cache_free(sk_cachep, sk);
}

void __init sk_init(void)
{
        sk_cachep = kmem_cache_create("sock", sizeof(struct sock), 0,
                                      SLAB_HWCACHE_ALIGN, 0, 0);
        if (!sk_cachep)
                printk(KERN_CRIT "sk_init: Cannot create sock SLAB cache!\n");

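        /* Scale the defaults to available memory: with 4 KB pages (an
         * assumption; page size is architecture dependent), 4096 pages
         * is a 16 MB machine and 131072 pages is 512 MB or more.
         */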
        if (num_physpages <= 4096) {
                sysctl_wmem_max = 32767;
                sysctl_rmem_max = 32767;
                sysctl_wmem_default = 32767;
                sysctl_rmem_default = 32767;
        } else if (num_physpages >= 131072) {
                sysctl_wmem_max = 131071;
                sysctl_rmem_max = 131071;
        }
}

/*
 *      Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        /* In case it might be waiting for more memory. */
        atomic_sub(skb->truesize, &sk->wmem_alloc);
        if (!sk->use_write_queue)
                sk->write_space(sk);
        sock_put(sk);
}
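
/* The truesize charge and the socket reference released here were both
 * taken in skb_set_owner_w(), so every buffer accounted against
 * sk->wmem_alloc also pins the sock until its destructor has run.
 */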

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        atomic_sub(skb->truesize, &sk->rmem_alloc);
}

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority)
{
        if (force || atomic_read(&sk->wmem_alloc) < sk->sndbuf) {
                struct sk_buff * skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_w(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority)
{
        if (force || atomic_read(&sk->rmem_alloc) < sk->rcvbuf) {
                struct sk_buff *skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_r(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, int priority)
{
        if ((unsigned)size <= sysctl_optmem_max &&
            atomic_read(&sk->omem_alloc)+size < sysctl_optmem_max) {
                void *mem;
                /* First do the add, to avoid the race if kmalloc
                 * might sleep.
                 */
                atomic_add(size, &sk->omem_alloc);
                mem = kmalloc(size, priority);
                if (mem)
                        return mem;
                atomic_sub(size, &sk->omem_alloc);
        }
        return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        kfree(mem);
        atomic_sub(size, &sk->omem_alloc);
}
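
/* Typical pairing (a sketch; struct foo is hypothetical, and callers must
 * remember the size, since the allocator stores no header of its own):
 *
 *      struct foo *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *      if (!opt)
 *              return -ENOBUFS;
 *      ...
 *      sock_kfree_s(sk, opt, sizeof(*opt));
 */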

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock * sk, long timeo)
{
        DECLARE_WAITQUEUE(wait, current);

        clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
        add_wait_queue(sk->sleep, &wait);
        for (;;) {
                if (!timeo)
                        break;
                if (signal_pending(current))
                        break;
                set_bit(SOCK_NOSPACE, &sk->socket->flags);
                set_current_state(TASK_INTERRUPTIBLE);
                if (atomic_read(&sk->wmem_alloc) < sk->sndbuf)
                        break;
                if (sk->shutdown & SEND_SHUTDOWN)
                        break;
                if (sk->err)
                        break;
                timeo = schedule_timeout(timeo);
        }
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(sk->sleep, &wait);
        return timeo;
}


/*
 *      Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                                     unsigned long data_len, int noblock, int *errcode)
{
        struct sk_buff *skb;
        long timeo;
        int err;

        timeo = sock_sndtimeo(sk, noblock);
        while (1) {
                err = sock_error(sk);
                if (err != 0)
                        goto failure;

                err = -EPIPE;
                if (sk->shutdown & SEND_SHUTDOWN)
                        goto failure;

                if (atomic_read(&sk->wmem_alloc) < sk->sndbuf) {
                        skb = alloc_skb(header_len, sk->allocation);
                        if (skb) {
                                int npages;
                                int i;

                                /* No pages, we're done... */
                                if (!data_len)
                                        break;

                                npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
                                skb->truesize += data_len;
                                skb_shinfo(skb)->nr_frags = npages;
                                for (i = 0; i < npages; i++) {
                                        struct page *page;
                                        skb_frag_t *frag;

                                        page = alloc_pages(sk->allocation, 0);
                                        if (!page) {
                                                err = -ENOBUFS;
                                                skb_shinfo(skb)->nr_frags = i;
                                                kfree_skb(skb);
                                                goto failure;
                                        }

                                        frag = &skb_shinfo(skb)->frags[i];
                                        frag->page = page;
                                        frag->page_offset = 0;
                                        frag->size = (data_len >= PAGE_SIZE ?
                                                      PAGE_SIZE :
                                                      data_len);
                                        data_len -= PAGE_SIZE;
                                }

                                /* Full success... */
                                break;
                        }
                        err = -ENOBUFS;
                        goto failure;
                }
                set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
                set_bit(SOCK_NOSPACE, &sk->socket->flags);
                err = -EAGAIN;
                if (!timeo)
                        goto failure;
                if (signal_pending(current))
                        goto interrupted;
                timeo = sock_wait_for_wmem(sk, timeo);
        }

        skb_set_owner_w(skb, sk);
        return skb;

interrupted:
        err = sock_intr_errno(timeo);
failure:
        *errcode = err;
        return NULL;
}

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}

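/* A note on the socket lock used below (a summary of behaviour defined
 * in net/sock.h, not new mechanism): lock_sock() marks the socket as
 * owned by a process; while it is owned, softirq input handlers must not
 * touch sk and instead queue packets on sk->backlog. __lock_sock() is
 * the slow path that sleeps until the current owner lets go, and
 * __release_sock() drains the backlog accumulated in the meantime.
 */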
void __lock_sock(struct sock *sk)
{
        DECLARE_WAITQUEUE(wait, current);

        add_wait_queue_exclusive(&sk->lock.wq, &wait);
        for(;;) {
                current->state = TASK_UNINTERRUPTIBLE;
                spin_unlock_bh(&sk->lock.slock);
                schedule();
                spin_lock_bh(&sk->lock.slock);
                if(!sk->lock.users)
                        break;
        }
        current->state = TASK_RUNNING;
        remove_wait_queue(&sk->lock.wq, &wait);
}

void __release_sock(struct sock *sk)
{
        struct sk_buff *skb = sk->backlog.head;

        do {
                sk->backlog.head = sk->backlog.tail = NULL;
                bh_unlock_sock(sk);

                do {
                        struct sk_buff *next = skb->next;

                        skb->next = NULL;
                        sk->backlog_rcv(sk, skb);
                        skb = next;
                } while (skb != NULL);

                bh_lock_sock(sk);
        } while((skb = sk->backlog.head) != NULL);
}

/*
 *      Generic socket manager library. Most of the simpler socket families
 *      use this to manage their socket lists. At some point we should
 *      hash these. By making this generic we get the lot hashed for free.
 *
 *      It is broken by design. All the protocols using it must be fixed. --ANK
 */

rwlock_t net_big_sklist_lock = RW_LOCK_UNLOCKED;

void sklist_remove_socket(struct sock **list, struct sock *sk)
{
        struct sock *s;

        write_lock_bh(&net_big_sklist_lock);

        while ((s = *list) != NULL) {
                if (s == sk) {
                        *list = s->next;
                        break;
                }
                list = &s->next;
        }

        write_unlock_bh(&net_big_sklist_lock);
        if (s)
                sock_put(s);
}

void sklist_insert_socket(struct sock **list, struct sock *sk)
{
        write_lock_bh(&net_big_sklist_lock);
        sk->next= *list;
        *list=sk;
        sock_hold(sk);
        write_unlock_bh(&net_big_sklist_lock);
}

/*
 *      This is only called from user mode. Thus it protects itself against
 *      interrupt users but doesn't worry about being called during work.
 *      Once it is removed from the queue no interrupt or bottom half will
 *      touch it and we are (fairly 8-) ) safe.
 */

void sklist_destroy_socket(struct sock **list, struct sock *sk);

/*
 *      Handler for deferred kills.
 */

static void sklist_destroy_timer(unsigned long data)
{
        struct sock *sk=(struct sock *)data;
        sklist_destroy_socket(NULL,sk);
}

/*
 *      Destroy a socket. We pass NULL for a list if we know the
 *      socket is not on a list.
 */

void sklist_destroy_socket(struct sock **list,struct sock *sk)
{
        if(list)
                sklist_remove_socket(list, sk);

        skb_queue_purge(&sk->receive_queue);

        if(atomic_read(&sk->wmem_alloc) == 0 &&
           atomic_read(&sk->rmem_alloc) == 0 &&
           sk->dead)
        {
                sock_put(sk);
        }
        else
        {
                /*
                 *      Someone is using our buffers still.. defer
                 */
                init_timer(&sk->timer);
                sk->timer.expires=jiffies+SOCK_DESTROY_TIME;
                sk->timer.function=sklist_destroy_timer;
                sk->timer.data = (unsigned long)sk;
                add_timer(&sk->timer);
        }
}

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_release(struct socket *sock)
{
        return 0;
}

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int *len, int peer)
{
        return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
{
        return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
                    char *optval, int optlen)
{
        return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
                    char *optval, int *optlen)
{
        return -EOPNOTSUPP;
}

/*
 * Note: if you add something that sleeps here then change sock_fcntl()
 *       to do proper fd locking.
 */
int sock_no_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        struct sock *sk = sock->sk;

        switch(cmd)
        {
                case F_SETOWN:
                        /*
                         * This is a little restrictive, but it's the only
                         * way to make sure that you can't send a sigurg to
                         * another process.
                         */
                        if (current->pgrp != -arg &&
                                current->pid != arg &&
                                !capable(CAP_KILL)) return(-EPERM);
                        sk->proc = arg;
                        return(0);
                case F_GETOWN:
                        return(sk->proc);
                default:
                        return(-EINVAL);
        }
}

int sock_no_sendmsg(struct socket *sock, struct msghdr *m, int flags,
                    struct scm_cookie *scm)
{
        return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct socket *sock, struct msghdr *m, int len, int flags,
                    struct scm_cookie *scm)
{
        return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}

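/* sock_no_sendpage() emulates the sendpage operation with an ordinary
 * sock_sendmsg() on a kmap()ed buffer. The set_fs(KERNEL_DS) dance is
 * needed because the iovec points at kernel memory, while sendmsg
 * normally expects user addresses.
 */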
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
        ssize_t res;
        struct msghdr msg;
        struct iovec iov;
        mm_segment_t old_fs;
        char *kaddr;

        kaddr = kmap(page);

        msg.msg_name = NULL;
        msg.msg_namelen = 0;
        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
        msg.msg_control = NULL;
        msg.msg_controllen = 0;
        msg.msg_flags = flags;

        iov.iov_base = kaddr + offset;
        iov.iov_len = size;

        old_fs = get_fs();
        set_fs(KERNEL_DS);
        res = sock_sendmsg(sock, &msg, size);
        set_fs(old_fs);

        kunmap(page);
        return res;
}

/*
 *      Default Socket Callbacks
 */

void sock_def_wakeup(struct sock *sk)
{
        read_lock(&sk->callback_lock);
        if (sk->sleep && waitqueue_active(sk->sleep))
                wake_up_interruptible_all(sk->sleep);
        read_unlock(&sk->callback_lock);
}

void sock_def_error_report(struct sock *sk)
{
        read_lock(&sk->callback_lock);
        if (sk->sleep && waitqueue_active(sk->sleep))
                wake_up_interruptible(sk->sleep);
        sk_wake_async(sk,0,POLL_ERR);
        read_unlock(&sk->callback_lock);
}

void sock_def_readable(struct sock *sk, int len)
{
        read_lock(&sk->callback_lock);
        if (sk->sleep && waitqueue_active(sk->sleep))
                wake_up_interruptible(sk->sleep);
        sk_wake_async(sk,1,POLL_IN);
        read_unlock(&sk->callback_lock);
}

void sock_def_write_space(struct sock *sk)
{
        read_lock(&sk->callback_lock);

        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if((atomic_read(&sk->wmem_alloc) << 1) <= sk->sndbuf) {
                if (sk->sleep && waitqueue_active(sk->sleep))
                        wake_up_interruptible(sk->sleep);

                /* Should agree with poll, otherwise some programs break */
                if (sock_writeable(sk))
                        sk_wake_async(sk, 2, POLL_OUT);
        }

        read_unlock(&sk->callback_lock);
}

void sock_def_destruct(struct sock *sk)
{
        if (sk->protinfo.destruct_hook)
                kfree(sk->protinfo.destruct_hook);
}

void sock_init_data(struct socket *sock, struct sock *sk)
{
        skb_queue_head_init(&sk->receive_queue);
        skb_queue_head_init(&sk->write_queue);
        skb_queue_head_init(&sk->error_queue);

        init_timer(&sk->timer);

        sk->allocation  =       GFP_KERNEL;
        sk->rcvbuf      =       sysctl_rmem_default;
        sk->sndbuf      =       sysctl_wmem_default;
        sk->state       =       TCP_CLOSE;
        sk->zapped      =       1;
        sk->socket      =       sock;

        if(sock)
        {
                sk->type        =       sock->type;
                sk->sleep       =       &sock->wait;
                sock->sk        =       sk;
        } else
                sk->sleep       =       NULL;

        sk->dst_lock            =       RW_LOCK_UNLOCKED;
        sk->callback_lock       =       RW_LOCK_UNLOCKED;

        sk->state_change        =       sock_def_wakeup;
        sk->data_ready          =       sock_def_readable;
        sk->write_space         =       sock_def_write_space;
        sk->error_report        =       sock_def_error_report;
        sk->destruct            =       sock_def_destruct;

        sk->peercred.pid        =       0;
        sk->peercred.uid        =       -1;
        sk->peercred.gid        =       -1;
        sk->rcvlowat            =       1;
        sk->rcvtimeo            =       MAX_SCHEDULE_TIMEOUT;
        sk->sndtimeo            =       MAX_SCHEDULE_TIMEOUT;

        atomic_set(&sk->refcnt, 1);
}
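
/* How a protocol typically consumes the helpers above (a sketch modelled
 * on the inet/netlink create paths, not code from this file; the names
 * my_proto_create, PF_MYPROTO and my_proto_backlog_rcv are hypothetical):
 *
 *      static int my_proto_create(struct socket *sock, int protocol)
 *      {
 *              struct sock *sk = sk_alloc(PF_MYPROTO, GFP_KERNEL, 1);
 *              if (!sk)
 *                      return -ENOMEM;
 *              sock_init_data(sock, sk);       // queues, timer, callbacks
 *              sk->protocol = protocol;
 *              sk->backlog_rcv = my_proto_backlog_rcv;
 *              return 0;
 *      }
 */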