1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Generic socket support routines. Memory allocators, socket lock/release
7  *              handler for protocols to use and generic option handler.
8  *
9  *
10  * Version:     $Id: sock.c,v 1.1.1.1 2005/04/11 02:51:12 jack Exp $
11  *
12  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
13  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Alan Cox, <A.Cox@swansea.ac.uk>
16  *
17  * Fixes:
18  *              Alan Cox        :       Numerous verify_area() problems
19  *              Alan Cox        :       Connecting on a connecting socket
20  *                                      now returns an error for tcp.
21  *              Alan Cox        :       sock->protocol is set correctly.
22  *                                      and is not sometimes left as 0.
23  *              Alan Cox        :       connect handles icmp errors on a
24  *                                      connect properly. Unfortunately there
25  *                                      is a restart syscall nasty there. I
26  *                                      can't match BSD without hacking the C
27  *                                      library. Ideas urgently sought!
28  *              Alan Cox        :       Disallow bind() to addresses that are
29  *                                      not ours - especially broadcast ones!!
30  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
31  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
32  *                                      instead they leave that for the DESTROY timer.
33  *              Alan Cox        :       Clean up error flag in accept
34  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
35  *                                      was buggy. Put a remove_sock() in the handler
36  *                                      for memory when we hit 0. Also altered the timer
37  *                                      code. The ACK stuff can wait and needs major 
38  *                                      TCP layer surgery.
39  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
40  *                                      and fixed timer/inet_bh race.
41  *              Alan Cox        :       Added zapped flag for TCP
42  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
43  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
45  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
46  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
48  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
49  *      Pauline Middelink       :       identd support
50  *              Alan Cox        :       Fixed connect() taking signals I think.
51  *              Alan Cox        :       SO_LINGER supported
52  *              Alan Cox        :       Error reporting fixes
53  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
54  *              Alan Cox        :       inet sockets don't set sk->type!
55  *              Alan Cox        :       Split socket option code
56  *              Alan Cox        :       Callbacks
57  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
58  *              Alex            :       Removed restriction on inet fioctl
59  *              Alan Cox        :       Splitting INET from NET core
60  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
61  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
62  *              Alan Cox        :       Split IP from generic code
63  *              Alan Cox        :       New kfree_skbmem()
64  *              Alan Cox        :       Make SO_DEBUG superuser only.
65  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
66  *                                      (compatibility fix)
67  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
68  *              Alan Cox        :       Allocator for a socket is settable.
69  *              Alan Cox        :       SO_ERROR includes soft errors.
70  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
71  *              Alan Cox        :       Generic socket allocation to make hooks
72  *                                      easier (suggested by Craig Metz).
73  *              Michael Pall    :       SO_ERROR returns positive errno again
74  *              Steve Whitehouse:       Added default destructor to free
75  *                                      protocol private data.
76  *              Steve Whitehouse:       Added various other default routines
77  *                                      common to several socket families.
78  *              Chris Evans     :       Call suser() check last on F_SETOWN
79  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
81  *              Andi Kleen      :       Fix write_space callback
82  *              Chris Evans     :       Security fixes - signedness again
83  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
84  *
85  * To Fix:
86  *
87  *
88  *              This program is free software; you can redistribute it and/or
89  *              modify it under the terms of the GNU General Public License
90  *              as published by the Free Software Foundation; either version
91  *              2 of the License, or (at your option) any later version.
92  */
93
94 #include <linux/config.h>
95 #include <linux/errno.h>
96 #include <linux/types.h>
97 #include <linux/socket.h>
98 #include <linux/in.h>
99 #include <linux/kernel.h>
100 #include <linux/major.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/fcntl.h>
107 #include <linux/mm.h>
108 #include <linux/slab.h>
109 #include <linux/interrupt.h>
110 #include <linux/poll.h>
111 #include <linux/tcp.h>
112 #include <linux/init.h>
113
114 #include <asm/uaccess.h>
115 #include <asm/system.h>
116
117 #include <linux/netdevice.h>
118 #include <net/protocol.h>
119 #include <linux/skbuff.h>
120 #include <net/sock.h>
121 #include <linux/ipsec.h>
122
123 #ifdef CONFIG_FILTER
124 #include <linux/filter.h>
125 #endif
126
127 #ifdef CONFIG_INET
128 #include <net/tcp.h>
129 #endif
130
131 /* Run time adjustable parameters. */
132 __u32 sysctl_wmem_max = SK_WMEM_MAX;
133 __u32 sysctl_rmem_max = SK_RMEM_MAX;
134 __u32 sysctl_wmem_default = SK_WMEM_MAX;
135 __u32 sysctl_rmem_default = SK_RMEM_MAX;
136
137 /* Maximal space eaten by iovec or ancillary data plus some space */
138 int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
139
140 static int sock_set_timeout(long *timeo_p, char *optval, int optlen)
141 {
142         struct timeval tv;
143
144         if (optlen < sizeof(tv))
145                 return -EINVAL;
146         if (copy_from_user(&tv, optval, sizeof(tv)))
147                 return -EFAULT;
148
149         *timeo_p = MAX_SCHEDULE_TIMEOUT;
150         if (tv.tv_sec == 0 && tv.tv_usec == 0)
151                 return 0;
152         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
153                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
154         return 0;
155 }
156
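/*
 * Editor's note: a worked example of the conversion performed above (not
 * part of the original file). Assuming HZ == 100, a caller passing
 * tv_sec = 2 and tv_usec = 500000 gets
 *
 *      *timeo_p = 2*100 + (500000 + (1000000/100 - 1)) / (1000000/100)
 *               = 200 + 50
 *               = 250 jiffies,
 *
 * i.e. the microsecond part is rounded up to the next whole tick. A zero
 * timeval leaves *timeo_p at MAX_SCHEDULE_TIMEOUT, which the rest of the
 * code treats as "block forever".
 */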
157 /*
158  *      This is meant for all protocols to use and covers goings on
159  *      at the socket level. Everything here is generic.
160  */
161
162 int sock_setsockopt(struct socket *sock, int level, int optname,
163                     char *optval, int optlen)
164 {
165         struct sock *sk=sock->sk;
166 #ifdef CONFIG_FILTER
167         struct sk_filter *filter;
168 #endif
169         int val;
170         int valbool;
171         struct linger ling;
172         int ret = 0;
173         
174         /*
175          *      Options without arguments
176          */
177
178 #ifdef SO_DONTLINGER            /* Compatibility item... */
179         switch(optname)
180         {
181                 case SO_DONTLINGER:
182                         sk->linger=0;
183                         return 0;
184         }
185 #endif  
186                 
187         if(optlen<sizeof(int))
188                 return(-EINVAL);
189         
190         if (get_user(val, (int *)optval))
191                 return -EFAULT;
192         
193         valbool = val?1:0;
194
195         lock_sock(sk);
196
197         switch(optname) 
198         {
199                 case SO_DEBUG:  
200                         if(val && !capable(CAP_NET_ADMIN))
201                         {
202                                 ret = -EACCES;
203                         }
204                         else
205                                 sk->debug=valbool;
206                         break;
207                 case SO_REUSEADDR:
208                         sk->reuse = valbool;
209                         break;
210                 case SO_TYPE:
211                 case SO_ERROR:
212                         ret = -ENOPROTOOPT;
213                         break;
214                 case SO_DONTROUTE:
215                         sk->localroute=valbool;
216                         break;
217                 case SO_BROADCAST:
218                         sk->broadcast=valbool;
219                         break;
220                 case SO_SNDBUF:
221                         /* Don't error on this; BSD doesn't, and if you
222                            think about it this is right. Otherwise apps have
223                            to play 'guess the biggest size' games.
224                            RCVBUF/SNDBUF are treated in BSD as hints */
225                            
226                         if (val > sysctl_wmem_max)
227                                 val = sysctl_wmem_max;
228
229                         sk->userlocks |= SOCK_SNDBUF_LOCK;
230                         if ((val * 2) < SOCK_MIN_SNDBUF)
231                                 sk->sndbuf = SOCK_MIN_SNDBUF;
232                         else
233                                 sk->sndbuf = (val * 2);
234
235                         /*
236                          *      Wake up sending tasks if we
237                          *      upped the value.
238                          */
239                         sk->write_space(sk);
240                         break;
241
242                 case SO_RCVBUF:
243                         /* Don't error on this; BSD doesn't, and if you
244                            think about it this is right. Otherwise apps have
245                            to play 'guess the biggest size' games.
246                            RCVBUF/SNDBUF are treated in BSD as hints */
247                           
248                         if (val > sysctl_rmem_max)
249                                 val = sysctl_rmem_max;
250
251                         sk->userlocks |= SOCK_RCVBUF_LOCK;
252                         /* FIXME: is this lower bound the right one? */
253                         if ((val * 2) < SOCK_MIN_RCVBUF)
254                                 sk->rcvbuf = SOCK_MIN_RCVBUF;
255                         else
256                                 sk->rcvbuf = (val * 2);
257                         break;
258
259                 case SO_KEEPALIVE:
260 #ifdef CONFIG_INET
261                         if (sk->protocol == IPPROTO_TCP)
262                         {
263                                 tcp_set_keepalive(sk, valbool);
264                         }
265 #endif
266                         sk->keepopen = valbool;
267                         break;
268
269                 case SO_OOBINLINE:
270                         sk->urginline = valbool;
271                         break;
272
273                 case SO_NO_CHECK:
274                         sk->no_check = valbool;
275                         break;
276
277                 case SO_PRIORITY:
278                         if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) 
279                                 sk->priority = val;
280                         else
281                                 ret = -EPERM;
282                         break;
283
284                 case SO_LINGER:
285                         if(optlen<sizeof(ling)) {
286                                 ret = -EINVAL;  /* 1003.1g */
287                                 break;
288                         }
289                         if (copy_from_user(&ling,optval,sizeof(ling))) {
290                                 ret = -EFAULT;
291                                 break;
292                         }
293                         if(ling.l_onoff==0) {
294                                 sk->linger=0;
295                         } else {
296 #if (BITS_PER_LONG == 32)
297                                 if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
298                                         sk->lingertime=MAX_SCHEDULE_TIMEOUT;
299                                 else
300 #endif
301                                         sk->lingertime=ling.l_linger*HZ;
302                                 sk->linger=1;
303                         }
304                         break;
305
306                 case SO_BSDCOMPAT:
307                         sk->bsdism = valbool;
308                         break;
309
310                 case SO_PASSCRED:
311                         sock->passcred = valbool;
312                         break;
313
314                 case SO_TIMESTAMP:
315                         sk->rcvtstamp = valbool;
316                         break;
317
318                 case SO_RCVLOWAT:
319                         if (val < 0)
320                                 val = INT_MAX;
321                         sk->rcvlowat = val ? : 1;
322                         break;
323
324                 case SO_RCVTIMEO:
325                         ret = sock_set_timeout(&sk->rcvtimeo, optval, optlen);
326                         break;
327
328                 case SO_SNDTIMEO:
329                         ret = sock_set_timeout(&sk->sndtimeo, optval, optlen);
330                         break;
331
332 #ifdef CONFIG_NETDEVICES
333                 case SO_BINDTODEVICE:
334                 {
335                         char devname[IFNAMSIZ]; 
336
337                         /* Sorry... */ 
338                         if (!capable(CAP_NET_RAW)) {
339                                 ret = -EPERM;
340                                 break;
341                         }
342
343                         /* Bind this socket to a particular device like "eth0",
344                          * as specified in the passed interface name. If the
345                          * name is "" or the option length is zero the socket 
346                          * is not bound. 
347                          */ 
348
349                         if (!valbool) {
350                                 sk->bound_dev_if = 0;
351                         } else {
352                                 if (optlen > IFNAMSIZ) 
353                                         optlen = IFNAMSIZ; 
354                                 if (copy_from_user(devname, optval, optlen)) {
355                                         ret = -EFAULT;
356                                         break;
357                                 }
358
359                                 /* Remove any cached route for this socket. */
360                                 sk_dst_reset(sk);
361
362                                 if (devname[0] == '\0') {
363                                         sk->bound_dev_if = 0;
364                                 } else {
365                                         struct net_device *dev = dev_get_by_name(devname);
366                                         if (!dev) {
367                                                 ret = -ENODEV;
368                                                 break;
369                                         }
370                                         sk->bound_dev_if = dev->ifindex;
371                                         dev_put(dev);
372                                 }
373                         }
374                         break;
375                 }
376 #endif
377
378
379 #ifdef CONFIG_FILTER
380                 case SO_ATTACH_FILTER:
381                         ret = -EINVAL;
382                         if (optlen == sizeof(struct sock_fprog)) {
383                                 struct sock_fprog fprog;
384
385                                 ret = -EFAULT;
386                                 if (copy_from_user(&fprog, optval, sizeof(fprog)))
387                                         break;
388
389                                 ret = sk_attach_filter(&fprog, sk);
390                         }
391                         break;
392
393                 case SO_DETACH_FILTER:
394                         spin_lock_bh(&sk->lock.slock);
395                         filter = sk->filter;
396                         if (filter) {
397                                 sk->filter = NULL;
398                                 spin_unlock_bh(&sk->lock.slock);
399                                 sk_filter_release(sk, filter);
400                                 break;
401                         }
402                         spin_unlock_bh(&sk->lock.slock);
403                         ret = -ENONET;
404                         break;
405 #endif
406                 /* We implement the SO_SNDLOWAT etc to
407                    not be settable (1003.1g 5.3) */
408                 default:
409                         ret = -ENOPROTOOPT;
410                         break;
411         }
412         release_sock(sk);
413         return ret;
414 }
415
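/*
 * Editor's note: a minimal userspace sketch (not from the original file)
 * illustrating the SO_SNDBUF handling in sock_setsockopt() above. The
 * descriptor and requested size are hypothetical.
 *
 *      int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *      int req = 32768, got;
 *      socklen_t len = sizeof(got);
 *
 *      setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &req, sizeof(req));
 *      getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &got, &len);
 *
 * got typically reads back as twice the requested size (after clamping to
 * sysctl_wmem_max and the SOCK_MIN_SNDBUF floor), because sk->sndbuf also
 * has to cover the sk_buff bookkeeping overhead charged against it.
 */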
416
417 int sock_getsockopt(struct socket *sock, int level, int optname,
418                     char *optval, int *optlen)
419 {
420         struct sock *sk = sock->sk;
421         
422         union
423         {
424                 int val;
425                 struct linger ling;
426                 struct timeval tm;
427         } v;
428         
429         unsigned int lv=sizeof(int),len;
430         
431         if(get_user(len,optlen))
432                 return -EFAULT;
433         if(len < 0)
434                 return -EINVAL;
435                 
436         switch(optname) 
437         {
438                 case SO_DEBUG:          
439                         v.val = sk->debug;
440                         break;
441                 
442                 case SO_DONTROUTE:
443                         v.val = sk->localroute;
444                         break;
445                 
446                 case SO_BROADCAST:
447                         v.val= sk->broadcast;
448                         break;
449
450                 case SO_SNDBUF:
451                         v.val=sk->sndbuf;
452                         break;
453                 
454                 case SO_RCVBUF:
455                         v.val =sk->rcvbuf;
456                         break;
457
458                 case SO_REUSEADDR:
459                         v.val = sk->reuse;
460                         break;
461
462                 case SO_KEEPALIVE:
463                         v.val = sk->keepopen;
464                         break;
465
466                 case SO_TYPE:
467                         v.val = sk->type;                               
468                         break;
469
470                 case SO_ERROR:
471                         v.val = -sock_error(sk);
472                         if(v.val==0)
473                                 v.val=xchg(&sk->err_soft,0);
474                         break;
475
476                 case SO_OOBINLINE:
477                         v.val = sk->urginline;
478                         break;
479         
480                 case SO_NO_CHECK:
481                         v.val = sk->no_check;
482                         break;
483
484                 case SO_PRIORITY:
485                         v.val = sk->priority;
486                         break;
487                 
488                 case SO_LINGER: 
489                         lv=sizeof(v.ling);
490                         v.ling.l_onoff=sk->linger;
491                         v.ling.l_linger=sk->lingertime/HZ;
492                         break;
493                                         
494                 case SO_BSDCOMPAT:
495                         v.val = sk->bsdism;
496                         break;
497
498                 case SO_TIMESTAMP:
499                         v.val = sk->rcvtstamp;
500                         break;
501
502                 case SO_RCVTIMEO:
503                         lv=sizeof(struct timeval);
504                         if (sk->rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
505                                 v.tm.tv_sec = 0;
506                                 v.tm.tv_usec = 0;
507                         } else {
508                                 v.tm.tv_sec = sk->rcvtimeo/HZ;
509                                 v.tm.tv_usec = ((sk->rcvtimeo%HZ)*1000)/HZ;
510                         }
511                         break;
512
513                 case SO_SNDTIMEO:
514                         lv=sizeof(struct timeval);
515                         if (sk->sndtimeo == MAX_SCHEDULE_TIMEOUT) {
516                                 v.tm.tv_sec = 0;
517                                 v.tm.tv_usec = 0;
518                         } else {
519                                 v.tm.tv_sec = sk->sndtimeo/HZ;
520                                 v.tm.tv_usec = ((sk->sndtimeo%HZ)*1000)/HZ;
521                         }
522                         break;
523
524                 case SO_RCVLOWAT:
525                         v.val = sk->rcvlowat;
526                         break;
527
528                 case SO_SNDLOWAT:
529                         v.val=1;
530                         break; 
531
532                 case SO_PASSCRED:
533                         v.val = sock->passcred;
534                         break;
535
536                 case SO_PEERCRED:
537                         if (len > sizeof(sk->peercred))
538                                 len = sizeof(sk->peercred);
539                         if (copy_to_user(optval, &sk->peercred, len))
540                                 return -EFAULT;
541                         goto lenout;
542
543                 case SO_PEERNAME:
544                 {
545                         char address[128];
546
547                         if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
548                                 return -ENOTCONN;
549                         if (lv < len)
550                                 return -EINVAL;
551                         if(copy_to_user((void*)optval, address, len))
552                                 return -EFAULT;
553                         goto lenout;
554                 }
555
556                 /* Dubious BSD thing... Probably nobody even uses it, but
557                  * the UNIX standard wants it for whatever reason... -DaveM
558                  */
559                 case SO_ACCEPTCONN:
560                         v.val = (sk->state == TCP_LISTEN);
561                         break;
562
563                 default:
564                         return(-ENOPROTOOPT);
565         }
566         if (len > lv)
567                 len = lv;
568         if (copy_to_user(optval, &v, len))
569                 return -EFAULT;
570 lenout:
571         if (put_user(len, optlen))
572                 return -EFAULT;
573         return 0;
574 }
575
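/*
 * Editor's note: an illustrative userspace sketch (not from this file) of
 * the SO_RCVTIMEO round trip handled above; the stored jiffy count is
 * converted back to a struct timeval, with MAX_SCHEDULE_TIMEOUT reported
 * as an all-zero timeval meaning "no timeout". The fd is assumed to be an
 * already created socket.
 *
 *      struct timeval tv = { 5, 0 };            five second receive timeout
 *      socklen_t len = sizeof(tv);
 *
 *      setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *      getsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, &len);
 *
 * After this, a blocking recv() that sees no data for five seconds returns
 * -1 with errno set to EAGAIN/EWOULDBLOCK instead of sleeping forever.
 */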
576 static kmem_cache_t *sk_cachep;
577
578 /*
579  *      All socket objects are allocated here. This is for future
580  *      usage.
581  */
582  
583 struct sock *sk_alloc(int family, int priority, int zero_it)
584 {
585         struct sock *sk = kmem_cache_alloc(sk_cachep, priority);
586
587         if(sk && zero_it) {
588                 memset(sk, 0, sizeof(struct sock));
589                 sk->family = family;
590                 sock_lock_init(sk);
591         }
592
593         return sk;
594 }
595
596 void sk_free(struct sock *sk)
597 {
598 #ifdef CONFIG_FILTER
599         struct sk_filter *filter;
600 #endif
601
602         if (sk->destruct)
603                 sk->destruct(sk);
604
605 #ifdef CONFIG_FILTER
606         filter = sk->filter;
607         if (filter) {
608                 sk_filter_release(sk, filter);
609                 sk->filter = NULL;
610         }
611 #endif
612
613         if (atomic_read(&sk->omem_alloc))
614                 printk(KERN_DEBUG "sk_free: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc));
615
616         kmem_cache_free(sk_cachep, sk);
617 }
618
619 void __init sk_init(void)
620 {
621         sk_cachep = kmem_cache_create("sock", sizeof(struct sock), 0,
622                                       SLAB_HWCACHE_ALIGN, 0, 0);
623         if (!sk_cachep)
624                 printk(KERN_CRIT "sk_init: Cannot create sock SLAB cache!");
625
626         if (num_physpages <= 4096) {
627                 sysctl_wmem_max = 32767;
628                 sysctl_rmem_max = 32767;
629                 sysctl_wmem_default = 32767;
630                 sysctl_rmem_default = 32767;
631         } else if (num_physpages >= 131072) {
632                 sysctl_wmem_max = 131071;
633                 sysctl_rmem_max = 131071;
634         }
635 }
636
637 /*
638  *      Simple resource managers for sockets.
639  */
640
641
642 /* 
643  * Write buffer destructor automatically called from kfree_skb. 
644  */
645 void sock_wfree(struct sk_buff *skb)
646 {
647         struct sock *sk = skb->sk;
648
649         /* In case it might be waiting for more memory. */
650         atomic_sub(skb->truesize, &sk->wmem_alloc);
651         if (!sk->use_write_queue)
652                 sk->write_space(sk);
653         sock_put(sk);
654 }
655
656 /* 
657  * Read buffer destructor automatically called from kfree_skb. 
658  */
659 void sock_rfree(struct sk_buff *skb)
660 {
661         struct sock *sk = skb->sk;
662
663         atomic_sub(skb->truesize, &sk->rmem_alloc);
664 }
665
666 /*
667  * Allocate a skb from the socket's send buffer.
668  */
669 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority)
670 {
671         if (force || atomic_read(&sk->wmem_alloc) < sk->sndbuf) {
672                 struct sk_buff * skb = alloc_skb(size, priority);
673                 if (skb) {
674                         skb_set_owner_w(skb, sk);
675                         return skb;
676                 }
677         }
678         return NULL;
679 }
680
681 /*
682  * Allocate a skb from the socket's receive buffer.
683  */ 
684 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority)
685 {
686         if (force || atomic_read(&sk->rmem_alloc) < sk->rcvbuf) {
687                 struct sk_buff *skb = alloc_skb(size, priority);
688                 if (skb) {
689                         skb_set_owner_r(skb, sk);
690                         return skb;
691                 }
692         }
693         return NULL;
694 }
695
696 /* 
697  * Allocate a memory block from the socket's option memory buffer.
698  */ 
699 void *sock_kmalloc(struct sock *sk, int size, int priority)
700 {
701         if ((unsigned)size <= sysctl_optmem_max &&
702             atomic_read(&sk->omem_alloc)+size < sysctl_optmem_max) {
703                 void *mem;
704                 /* First do the add, to avoid the race if kmalloc
705                  * might sleep.
706                  */
707                 atomic_add(size, &sk->omem_alloc);
708                 mem = kmalloc(size, priority);
709                 if (mem)
710                         return mem;
711                 atomic_sub(size, &sk->omem_alloc);
712         }
713         return NULL;
714 }
715
716 /*
717  * Free an option memory block.
718  */
719 void sock_kfree_s(struct sock *sk, void *mem, int size)
720 {
721         kfree(mem);
722         atomic_sub(size, &sk->omem_alloc);
723 }
724
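/*
 * Editor's note: an illustrative (hypothetical) pairing of the two helpers
 * above, as protocol code might use them for per-socket option state. The
 * structure name is invented for the example.
 *
 *      struct my_opt_state {
 *              int flags;
 *      };
 *
 *      struct my_opt_state *st = sock_kmalloc(sk, sizeof(*st), GFP_KERNEL);
 *      if (st == NULL)
 *              return -ENOBUFS;
 *      ...
 *      sock_kfree_s(sk, st, sizeof(*st));
 *
 * Both calls are charged against sk->omem_alloc and bounded by
 * sysctl_optmem_max, so option handling cannot pin unbounded kernel memory
 * on behalf of a single socket.
 */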
725 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
726    I think these locks should be removed for datagram sockets.
727  */
728 static long sock_wait_for_wmem(struct sock * sk, long timeo)
729 {
730         DECLARE_WAITQUEUE(wait, current);
731
732         clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
733         add_wait_queue(sk->sleep, &wait);
734         for (;;) {
735                 if (!timeo)
736                         break;
737                 if (signal_pending(current))
738                         break;
739                 set_bit(SOCK_NOSPACE, &sk->socket->flags);
740                 set_current_state(TASK_INTERRUPTIBLE);
741                 if (atomic_read(&sk->wmem_alloc) < sk->sndbuf)
742                         break;
743                 if (sk->shutdown & SEND_SHUTDOWN)
744                         break;
745                 if (sk->err)
746                         break;
747                 timeo = schedule_timeout(timeo);
748         }
749         __set_current_state(TASK_RUNNING);
750         remove_wait_queue(sk->sleep, &wait);
751         return timeo;
752 }
753
754
755 /*
756  *      Generic send/receive buffer handlers
757  */
758
759 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
760                                      unsigned long data_len, int noblock, int *errcode)
761 {
762         struct sk_buff *skb;
763         long timeo;
764         int err;
765
766         timeo = sock_sndtimeo(sk, noblock);
767         while (1) {
768                 err = sock_error(sk);
769                 if (err != 0)
770                         goto failure;
771
772                 err = -EPIPE;
773                 if (sk->shutdown & SEND_SHUTDOWN)
774                         goto failure;
775
776                 if (atomic_read(&sk->wmem_alloc) < sk->sndbuf) {
777                         skb = alloc_skb(header_len, sk->allocation);
778                         if (skb) {
779                                 int npages;
780                                 int i;
781
782                                 /* No pages, we're done... */
783                                 if (!data_len)
784                                         break;
785
786                                 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
787                                 skb->truesize += data_len;
788                                 skb_shinfo(skb)->nr_frags = npages;
789                                 for (i = 0; i < npages; i++) {
790                                         struct page *page;
791                                         skb_frag_t *frag;
792
793                                         page = alloc_pages(sk->allocation, 0);
794                                         if (!page) {
795                                                 err = -ENOBUFS;
796                                                 kfree_skb(skb);
797                                                 goto failure;
798                                         }
799
800                                         frag = &skb_shinfo(skb)->frags[i];
801                                         frag->page = page;
802                                         frag->page_offset = 0;
803                                         frag->size = (data_len >= PAGE_SIZE ?
804                                                       PAGE_SIZE :
805                                                       data_len);
806                                         data_len -= PAGE_SIZE;
807                                 }
808
809                                 /* Full success... */
810                                 break;
811                         }
812                         err = -ENOBUFS;
813                         goto failure;
814                 }
815                 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
816                 set_bit(SOCK_NOSPACE, &sk->socket->flags);
817                 err = -EAGAIN;
818                 if (!timeo)
819                         goto failure;
820                 if (signal_pending(current))
821                         goto interrupted;
822                 timeo = sock_wait_for_wmem(sk, timeo);
823         }
824
825         skb_set_owner_w(skb, sk);
826         return skb;
827
828 interrupted:
829         err = sock_intr_errno(timeo);
830 failure:
831         *errcode = err;
832         return NULL;
833 }
834
835 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 
836                                     int noblock, int *errcode)
837 {
838         return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
839 }
840
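/*
 * Editor's note: a simplified sketch (not from this file) of how a datagram
 * protocol's sendmsg path typically uses sock_alloc_send_skb(); the headroom
 * value and the copy details are illustrative only.
 *
 *      int err;
 *      struct sk_buff *skb;
 *
 *      skb = sock_alloc_send_skb(sk, len + headroom,
 *                                msg->msg_flags & MSG_DONTWAIT, &err);
 *      if (skb == NULL)
 *              return err;
 *      skb_reserve(skb, headroom);
 *      memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
 *      ...queue or transmit skb...
 *
 * Blocking on send-buffer space, signal handling and socket errors are all
 * dealt with inside sock_alloc_send_pskb() above, so the caller only has to
 * check for a NULL return and propagate err.
 */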
841 void __lock_sock(struct sock *sk)
842 {
843         DECLARE_WAITQUEUE(wait, current);
844
845         add_wait_queue_exclusive(&sk->lock.wq, &wait);
846         for(;;) {
847                 current->state = TASK_UNINTERRUPTIBLE;
848                 spin_unlock_bh(&sk->lock.slock);
849                 schedule();
850                 spin_lock_bh(&sk->lock.slock);
851                 if(!sk->lock.users)
852                         break;
853         }
854         current->state = TASK_RUNNING;
855         remove_wait_queue(&sk->lock.wq, &wait);
856 }
857
858 void __release_sock(struct sock *sk)
859 {
860         struct sk_buff *skb = sk->backlog.head;
861
862         do {
863                 sk->backlog.head = sk->backlog.tail = NULL;
864                 bh_unlock_sock(sk);
865
866                 do {
867                         struct sk_buff *next = skb->next;
868
869                         skb->next = NULL;
870                         sk->backlog_rcv(sk, skb);
871                         skb = next;
872                 } while (skb != NULL);
873
874                 bh_lock_sock(sk);
875         } while((skb = sk->backlog.head) != NULL);
876 }
877
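/*
 * Editor's note: a condensed sketch (assumed, not from this file) of the
 * locking pattern __lock_sock() and __release_sock() support. Softirq input
 * code defers packets to the backlog while a process owns the socket;
 * release_sock() then replays the backlog through __release_sock() above.
 * my_proto_rcv() is a hypothetical per-protocol handler.
 *
 *      In softirq context:
 *              bh_lock_sock(sk);
 *              if (!sk->lock.users)
 *                      rc = my_proto_rcv(sk, skb);
 *              else
 *                      sk_add_backlog(sk, skb);
 *              bh_unlock_sock(sk);
 *
 *      In process context:
 *              lock_sock(sk);
 *              ...manipulate socket state...
 *              release_sock(sk);
 */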
878 /*
879  *      Generic socket manager library. Most of the simpler socket families
880  *      use this to manage their socket lists. At some point we should
881  *      hash these. By making this generic we get the lot hashed for free.
882  *
883  *      It is broken by design. All the protocols using it must be fixed. --ANK
884  */
885
886 rwlock_t net_big_sklist_lock = RW_LOCK_UNLOCKED;
887  
888 void sklist_remove_socket(struct sock **list, struct sock *sk)
889 {
890         struct sock *s;
891
892         write_lock_bh(&net_big_sklist_lock);
893
894         while ((s = *list) != NULL) {
895                 if (s == sk) {
896                         *list = s->next;
897                         break;
898                 }
899                 list = &s->next;
900         }
901
902         write_unlock_bh(&net_big_sklist_lock);
903         if (s)
904                 sock_put(s);
905 }
906
907 void sklist_insert_socket(struct sock **list, struct sock *sk)
908 {
909         write_lock_bh(&net_big_sklist_lock);
910         sk->next= *list;
911         *list=sk;
912         sock_hold(sk);
913         write_unlock_bh(&net_big_sklist_lock);
914 }
915
916 /*
917  *      This is only called from user mode. Thus it protects itself against
918  *      interrupt users but doesn't worry about being called during work.
919  *      Once it is removed from the queue no interrupt or bottom half will
920  *      touch it and we are (fairly 8-) ) safe.
921  */
922
923 void sklist_destroy_socket(struct sock **list, struct sock *sk);
924
925 /*
926  *      Handler for deferred kills.
927  */
928
929 static void sklist_destroy_timer(unsigned long data)
930 {
931         struct sock *sk=(struct sock *)data;
932         sklist_destroy_socket(NULL,sk);
933 }
934
935 /*
936  *      Destroy a socket. We pass NULL for a list if we know the
937  *      socket is not on a list.
938  */
939  
940 void sklist_destroy_socket(struct sock **list,struct sock *sk)
941 {
942         if(list)
943                 sklist_remove_socket(list, sk);
944
945         skb_queue_purge(&sk->receive_queue);
946
947         if(atomic_read(&sk->wmem_alloc) == 0 &&
948            atomic_read(&sk->rmem_alloc) == 0 &&
949            sk->dead)
950         {
951                 sock_put(sk);
952         }
953         else
954         {
955                 /*
956                  *      Someone is using our buffers still.. defer
957                  */
958                 init_timer(&sk->timer);
959                 sk->timer.expires=jiffies+SOCK_DESTROY_TIME;
960                 sk->timer.function=sklist_destroy_timer;
961                 sk->timer.data = (unsigned long)sk;
962                 add_timer(&sk->timer);
963         }
964 }
965
966 /*
967  * Set of default routines for initialising struct proto_ops when
968  * the protocol does not support a particular function. In certain
969  * cases where it makes no sense for a protocol to have a "do nothing"
970  * function, some default processing is provided.
971  */
972
973 int sock_no_release(struct socket *sock)
974 {
975         return 0;
976 }
977
978 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
979 {
980         return -EOPNOTSUPP;
981 }
982
983 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 
984                     int len, int flags)
985 {
986         return -EOPNOTSUPP;
987 }
988
989 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
990 {
991         return -EOPNOTSUPP;
992 }
993
994 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
995 {
996         return -EOPNOTSUPP;
997 }
998
999 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 
1000                     int *len, int peer)
1001 {
1002         return -EOPNOTSUPP;
1003 }
1004
1005 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1006 {
1007         return 0;
1008 }
1009
1010 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1011 {
1012         return -EOPNOTSUPP;
1013 }
1014
1015 int sock_no_listen(struct socket *sock, int backlog)
1016 {
1017         return -EOPNOTSUPP;
1018 }
1019
1020 int sock_no_shutdown(struct socket *sock, int how)
1021 {
1022         return -EOPNOTSUPP;
1023 }
1024
1025 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1026                     char *optval, int optlen)
1027 {
1028         return -EOPNOTSUPP;
1029 }
1030
1031 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1032                     char *optval, int *optlen)
1033 {
1034         return -EOPNOTSUPP;
1035 }
1036
1037 /* 
1038  * Note: if you add something that sleeps here then change sock_fcntl()
1039  *       to do proper fd locking.
1040  */
1041 int sock_no_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg)
1042 {
1043         struct sock *sk = sock->sk;
1044
1045         switch(cmd)
1046         {
1047                 case F_SETOWN:
1048                         /*
1049                          * This is a little restrictive, but it's the only
1050                          * way to make sure that you can't send a sigurg to
1051                          * another process.
1052                          */
1053                         if (current->pgrp != -arg &&
1054                                 current->pid != arg &&
1055                                 !capable(CAP_KILL)) return(-EPERM);
1056                         sk->proc = arg;
1057                         return(0);
1058                 case F_GETOWN:
1059                         return(sk->proc);
1060                 default:
1061                         return(-EINVAL);
1062         }
1063 }
1064
1065 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, int flags,
1066                     struct scm_cookie *scm)
1067 {
1068         return -EOPNOTSUPP;
1069 }
1070
1071 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, int len, int flags,
1072                     struct scm_cookie *scm)
1073 {
1074         return -EOPNOTSUPP;
1075 }
1076
1077 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1078 {
1079         /* Mirror missing mmap method error code */
1080         return -ENODEV;
1081 }
1082
1083 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1084 {
1085         ssize_t res;
1086         struct msghdr msg;
1087         struct iovec iov;
1088         mm_segment_t old_fs;
1089         char *kaddr;
1090
1091         kaddr = kmap(page);
1092
1093         msg.msg_name = NULL;
1094         msg.msg_namelen = 0;
1095         msg.msg_iov = &iov;
1096         msg.msg_iovlen = 1;
1097         msg.msg_control = NULL;
1098         msg.msg_controllen = 0;
1099         msg.msg_flags = flags;
1100
1101         iov.iov_base = kaddr + offset;
1102         iov.iov_len = size;
1103
1104         old_fs = get_fs();
1105         set_fs(KERNEL_DS);
1106         res = sock_sendmsg(sock, &msg, size);
1107         set_fs(old_fs);
1108
1109         kunmap(page);
1110         return res;
1111 }
1112
1113 /*
1114  *      Default Socket Callbacks
1115  */
1116
1117 void sock_def_wakeup(struct sock *sk)
1118 {
1119         read_lock(&sk->callback_lock);
1120         if (sk->sleep && waitqueue_active(sk->sleep))
1121                 wake_up_interruptible_all(sk->sleep);
1122         read_unlock(&sk->callback_lock);
1123 }
1124
1125 void sock_def_error_report(struct sock *sk)
1126 {
1127         read_lock(&sk->callback_lock);
1128         if (sk->sleep && waitqueue_active(sk->sleep))
1129                 wake_up_interruptible(sk->sleep);
1130         sk_wake_async(sk,0,POLL_ERR); 
1131         read_unlock(&sk->callback_lock);
1132 }
1133
1134 void sock_def_readable(struct sock *sk, int len)
1135 {
1136         read_lock(&sk->callback_lock);
1137         if (sk->sleep && waitqueue_active(sk->sleep))
1138                 wake_up_interruptible(sk->sleep);
1139         sk_wake_async(sk,1,POLL_IN);
1140         read_unlock(&sk->callback_lock);
1141 }
1142
1143 void sock_def_write_space(struct sock *sk)
1144 {
1145         read_lock(&sk->callback_lock);
1146
1147         /* Do not wake up a writer until he can make "significant"
1148          * progress.  --DaveM
1149          */
1150         if((atomic_read(&sk->wmem_alloc) << 1) <= sk->sndbuf) {
1151                 if (sk->sleep && waitqueue_active(sk->sleep))
1152                         wake_up_interruptible(sk->sleep);
1153
1154                 /* Should agree with poll, otherwise some programs break */
1155                 if (sock_writeable(sk))
1156                         sk_wake_async(sk, 2, POLL_OUT);
1157         }
1158
1159         read_unlock(&sk->callback_lock);
1160 }
1161
1162 void sock_def_destruct(struct sock *sk)
1163 {
1164         if (sk->protinfo.destruct_hook)
1165                 kfree(sk->protinfo.destruct_hook);
1166 }
1167
1168 void sock_init_data(struct socket *sock, struct sock *sk)
1169 {
1170         skb_queue_head_init(&sk->receive_queue);
1171         skb_queue_head_init(&sk->write_queue);
1172         skb_queue_head_init(&sk->error_queue);
1173
1174         init_timer(&sk->timer);
1175         
1176         sk->allocation  =       GFP_KERNEL;
1177         sk->rcvbuf      =       sysctl_rmem_default;
1178         sk->sndbuf      =       sysctl_wmem_default;
1179         sk->state       =       TCP_CLOSE;
1180         sk->zapped      =       1;
1181         sk->socket      =       sock;
1182
1183         if(sock)
1184         {
1185                 sk->type        =       sock->type;
1186                 sk->sleep       =       &sock->wait;
1187                 sock->sk        =       sk;
1188         } else
1189                 sk->sleep       =       NULL;
1190
1191         sk->dst_lock            =       RW_LOCK_UNLOCKED;
1192         sk->callback_lock       =       RW_LOCK_UNLOCKED;
1193
1194         sk->state_change        =       sock_def_wakeup;
1195         sk->data_ready          =       sock_def_readable;
1196         sk->write_space         =       sock_def_write_space;
1197         sk->error_report        =       sock_def_error_report;
1198         sk->destruct            =       sock_def_destruct;
1199
1200         sk->peercred.pid        =       0;
1201         sk->peercred.uid        =       -1;
1202         sk->peercred.gid        =       -1;
1203         sk->rcvlowat            =       1;
1204         sk->rcvtimeo            =       MAX_SCHEDULE_TIMEOUT;
1205         sk->sndtimeo            =       MAX_SCHEDULE_TIMEOUT;
1206
1207         atomic_set(&sk->refcnt, 1);
1208 }