diff options
author | Arnaldo Carvalho de Melo <acme@redhat.com> | 2009-10-13 02:40:10 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2009-10-13 02:40:10 -0400 |
commit | a2e2725541fad72416326798c2d7fa4dafb7d337 (patch) | |
tree | 6174be11da607e83eb8efb3775114ad4d6e0ca3a /net/socket.c | |
parent | c05e85a06e376f6b6d59e71e5333d707e956d78b (diff) |
net: Introduce recvmmsg socket syscall
Meaning receive multiple messages, reducing the number of syscalls and
net stack entry/exit operations.
Next patches will introduce mechanisms where protocols that want to
optimize this operation will provide an unlocked_recvmsg operation.
This takes into account comments made by:
. Paul Moore: sock_recvmsg is called only for the first datagram,
sock_recvmsg_nosec is used for the rest.
. Caitlin Bestler: recvmmsg now has a struct timespec timeout, that
works in the same fashion as the ppoll one.
If the underlying protocol returns a datagram with MSG_OOB set, this
will make recvmmsg return right away with as many datagrams (+ the OOB
one) as it has received so far.
. Rémi Denis-Courmont & Steven Whitehouse: If we receive N < vlen
datagrams and then recvmsg returns an error, recvmmsg will return
the successfully received datagrams, store the error and return it
in the next call.
This paves the way for a subsequent optimization, sk_prot->unlocked_recvmsg,
where we will be able to acquire the lock only at batch start and end, not at
every underlying recvmsg call.
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/socket.c')
-rw-r--r-- | net/socket.c | 225 |
1 files changed, 183 insertions, 42 deletions
diff --git a/net/socket.c b/net/socket.c index 807935693846..9dff31c9b799 100644 --- a/net/socket.c +++ b/net/socket.c | |||
@@ -683,10 +683,9 @@ void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, | |||
683 | } | 683 | } |
684 | EXPORT_SYMBOL_GPL(sock_recv_ts_and_drops); | 684 | EXPORT_SYMBOL_GPL(sock_recv_ts_and_drops); |
685 | 685 | ||
686 | static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, | 686 | static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock, |
687 | struct msghdr *msg, size_t size, int flags) | 687 | struct msghdr *msg, size_t size, int flags) |
688 | { | 688 | { |
689 | int err; | ||
690 | struct sock_iocb *si = kiocb_to_siocb(iocb); | 689 | struct sock_iocb *si = kiocb_to_siocb(iocb); |
691 | 690 | ||
692 | si->sock = sock; | 691 | si->sock = sock; |
@@ -695,13 +694,17 @@ static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, | |||
695 | si->size = size; | 694 | si->size = size; |
696 | si->flags = flags; | 695 | si->flags = flags; |
697 | 696 | ||
698 | err = security_socket_recvmsg(sock, msg, size, flags); | ||
699 | if (err) | ||
700 | return err; | ||
701 | |||
702 | return sock->ops->recvmsg(iocb, sock, msg, size, flags); | 697 | return sock->ops->recvmsg(iocb, sock, msg, size, flags); |
703 | } | 698 | } |
704 | 699 | ||
700 | static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, | ||
701 | struct msghdr *msg, size_t size, int flags) | ||
702 | { | ||
703 | int err = security_socket_recvmsg(sock, msg, size, flags); | ||
704 | |||
705 | return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags); | ||
706 | } | ||
707 | |||
705 | int sock_recvmsg(struct socket *sock, struct msghdr *msg, | 708 | int sock_recvmsg(struct socket *sock, struct msghdr *msg, |
706 | size_t size, int flags) | 709 | size_t size, int flags) |
707 | { | 710 | { |
@@ -717,6 +720,21 @@ int sock_recvmsg(struct socket *sock, struct msghdr *msg, | |||
717 | return ret; | 720 | return ret; |
718 | } | 721 | } |
719 | 722 | ||
723 | static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, | ||
724 | size_t size, int flags) | ||
725 | { | ||
726 | struct kiocb iocb; | ||
727 | struct sock_iocb siocb; | ||
728 | int ret; | ||
729 | |||
730 | init_sync_kiocb(&iocb, NULL); | ||
731 | iocb.private = &siocb; | ||
732 | ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags); | ||
733 | if (-EIOCBQUEUED == ret) | ||
734 | ret = wait_on_sync_kiocb(&iocb); | ||
735 | return ret; | ||
736 | } | ||
737 | |||
720 | int kernel_recvmsg(struct socket *sock, struct msghdr *msg, | 738 | int kernel_recvmsg(struct socket *sock, struct msghdr *msg, |
721 | struct kvec *vec, size_t num, size_t size, int flags) | 739 | struct kvec *vec, size_t num, size_t size, int flags) |
722 | { | 740 | { |
@@ -1983,22 +2001,15 @@ out: | |||
1983 | return err; | 2001 | return err; |
1984 | } | 2002 | } |
1985 | 2003 | ||
1986 | /* | 2004 | static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg, |
1987 | * BSD recvmsg interface | 2005 | struct msghdr *msg_sys, unsigned flags, int nosec) |
1988 | */ | ||
1989 | |||
1990 | SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg, | ||
1991 | unsigned int, flags) | ||
1992 | { | 2006 | { |
1993 | struct compat_msghdr __user *msg_compat = | 2007 | struct compat_msghdr __user *msg_compat = |
1994 | (struct compat_msghdr __user *)msg; | 2008 | (struct compat_msghdr __user *)msg; |
1995 | struct socket *sock; | ||
1996 | struct iovec iovstack[UIO_FASTIOV]; | 2009 | struct iovec iovstack[UIO_FASTIOV]; |
1997 | struct iovec *iov = iovstack; | 2010 | struct iovec *iov = iovstack; |
1998 | struct msghdr msg_sys; | ||
1999 | unsigned long cmsg_ptr; | 2011 | unsigned long cmsg_ptr; |
2000 | int err, iov_size, total_len, len; | 2012 | int err, iov_size, total_len, len; |
2001 | int fput_needed; | ||
2002 | 2013 | ||
2003 | /* kernel mode address */ | 2014 | /* kernel mode address */ |
2004 | struct sockaddr_storage addr; | 2015 | struct sockaddr_storage addr; |
@@ -2008,27 +2019,23 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg, | |||
2008 | int __user *uaddr_len; | 2019 | int __user *uaddr_len; |
2009 | 2020 | ||
2010 | if (MSG_CMSG_COMPAT & flags) { | 2021 | if (MSG_CMSG_COMPAT & flags) { |
2011 | if (get_compat_msghdr(&msg_sys, msg_compat)) | 2022 | if (get_compat_msghdr(msg_sys, msg_compat)) |
2012 | return -EFAULT; | 2023 | return -EFAULT; |
2013 | } | 2024 | } |
2014 | else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr))) | 2025 | else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr))) |
2015 | return -EFAULT; | 2026 | return -EFAULT; |
2016 | 2027 | ||
2017 | sock = sockfd_lookup_light(fd, &err, &fput_needed); | ||
2018 | if (!sock) | ||
2019 | goto out; | ||
2020 | |||
2021 | err = -EMSGSIZE; | 2028 | err = -EMSGSIZE; |
2022 | if (msg_sys.msg_iovlen > UIO_MAXIOV) | 2029 | if (msg_sys->msg_iovlen > UIO_MAXIOV) |
2023 | goto out_put; | 2030 | goto out; |
2024 | 2031 | ||
2025 | /* Check whether to allocate the iovec area */ | 2032 | /* Check whether to allocate the iovec area */ |
2026 | err = -ENOMEM; | 2033 | err = -ENOMEM; |
2027 | iov_size = msg_sys.msg_iovlen * sizeof(struct iovec); | 2034 | iov_size = msg_sys->msg_iovlen * sizeof(struct iovec); |
2028 | if (msg_sys.msg_iovlen > UIO_FASTIOV) { | 2035 | if (msg_sys->msg_iovlen > UIO_FASTIOV) { |
2029 | iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL); | 2036 | iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL); |
2030 | if (!iov) | 2037 | if (!iov) |
2031 | goto out_put; | 2038 | goto out; |
2032 | } | 2039 | } |
2033 | 2040 | ||
2034 | /* | 2041 | /* |
@@ -2036,46 +2043,47 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg, | |||
2036 | * kernel msghdr to use the kernel address space) | 2043 | * kernel msghdr to use the kernel address space) |
2037 | */ | 2044 | */ |
2038 | 2045 | ||
2039 | uaddr = (__force void __user *)msg_sys.msg_name; | 2046 | uaddr = (__force void __user *)msg_sys->msg_name; |
2040 | uaddr_len = COMPAT_NAMELEN(msg); | 2047 | uaddr_len = COMPAT_NAMELEN(msg); |
2041 | if (MSG_CMSG_COMPAT & flags) { | 2048 | if (MSG_CMSG_COMPAT & flags) { |
2042 | err = verify_compat_iovec(&msg_sys, iov, | 2049 | err = verify_compat_iovec(msg_sys, iov, |
2043 | (struct sockaddr *)&addr, | 2050 | (struct sockaddr *)&addr, |
2044 | VERIFY_WRITE); | 2051 | VERIFY_WRITE); |
2045 | } else | 2052 | } else |
2046 | err = verify_iovec(&msg_sys, iov, | 2053 | err = verify_iovec(msg_sys, iov, |
2047 | (struct sockaddr *)&addr, | 2054 | (struct sockaddr *)&addr, |
2048 | VERIFY_WRITE); | 2055 | VERIFY_WRITE); |
2049 | if (err < 0) | 2056 | if (err < 0) |
2050 | goto out_freeiov; | 2057 | goto out_freeiov; |
2051 | total_len = err; | 2058 | total_len = err; |
2052 | 2059 | ||
2053 | cmsg_ptr = (unsigned long)msg_sys.msg_control; | 2060 | cmsg_ptr = (unsigned long)msg_sys->msg_control; |
2054 | msg_sys.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); | 2061 | msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); |
2055 | 2062 | ||
2056 | if (sock->file->f_flags & O_NONBLOCK) | 2063 | if (sock->file->f_flags & O_NONBLOCK) |
2057 | flags |= MSG_DONTWAIT; | 2064 | flags |= MSG_DONTWAIT; |
2058 | err = sock_recvmsg(sock, &msg_sys, total_len, flags); | 2065 | err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys, |
2066 | total_len, flags); | ||
2059 | if (err < 0) | 2067 | if (err < 0) |
2060 | goto out_freeiov; | 2068 | goto out_freeiov; |
2061 | len = err; | 2069 | len = err; |
2062 | 2070 | ||
2063 | if (uaddr != NULL) { | 2071 | if (uaddr != NULL) { |
2064 | err = move_addr_to_user((struct sockaddr *)&addr, | 2072 | err = move_addr_to_user((struct sockaddr *)&addr, |
2065 | msg_sys.msg_namelen, uaddr, | 2073 | msg_sys->msg_namelen, uaddr, |
2066 | uaddr_len); | 2074 | uaddr_len); |
2067 | if (err < 0) | 2075 | if (err < 0) |
2068 | goto out_freeiov; | 2076 | goto out_freeiov; |
2069 | } | 2077 | } |
2070 | err = __put_user((msg_sys.msg_flags & ~MSG_CMSG_COMPAT), | 2078 | err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT), |
2071 | COMPAT_FLAGS(msg)); | 2079 | COMPAT_FLAGS(msg)); |
2072 | if (err) | 2080 | if (err) |
2073 | goto out_freeiov; | 2081 | goto out_freeiov; |
2074 | if (MSG_CMSG_COMPAT & flags) | 2082 | if (MSG_CMSG_COMPAT & flags) |
2075 | err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr, | 2083 | err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, |
2076 | &msg_compat->msg_controllen); | 2084 | &msg_compat->msg_controllen); |
2077 | else | 2085 | else |
2078 | err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr, | 2086 | err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, |
2079 | &msg->msg_controllen); | 2087 | &msg->msg_controllen); |
2080 | if (err) | 2088 | if (err) |
2081 | goto out_freeiov; | 2089 | goto out_freeiov; |
@@ -2084,21 +2092,150 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg, | |||
2084 | out_freeiov: | 2092 | out_freeiov: |
2085 | if (iov != iovstack) | 2093 | if (iov != iovstack) |
2086 | sock_kfree_s(sock->sk, iov, iov_size); | 2094 | sock_kfree_s(sock->sk, iov, iov_size); |
2087 | out_put: | 2095 | out: |
2096 | return err; | ||
2097 | } | ||
2098 | |||
2099 | /* | ||
2100 | * BSD recvmsg interface | ||
2101 | */ | ||
2102 | |||
2103 | SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg, | ||
2104 | unsigned int, flags) | ||
2105 | { | ||
2106 | int fput_needed, err; | ||
2107 | struct msghdr msg_sys; | ||
2108 | struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed); | ||
2109 | |||
2110 | if (!sock) | ||
2111 | goto out; | ||
2112 | |||
2113 | err = __sys_recvmsg(sock, msg, &msg_sys, flags, 0); | ||
2114 | |||
2088 | fput_light(sock->file, fput_needed); | 2115 | fput_light(sock->file, fput_needed); |
2089 | out: | 2116 | out: |
2090 | return err; | 2117 | return err; |
2091 | } | 2118 | } |
2092 | 2119 | ||
2093 | #ifdef __ARCH_WANT_SYS_SOCKETCALL | 2120 | /* |
2121 | * Linux recvmmsg interface | ||
2122 | */ | ||
2123 | |||
2124 | int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, | ||
2125 | unsigned int flags, struct timespec *timeout) | ||
2126 | { | ||
2127 | int fput_needed, err, datagrams; | ||
2128 | struct socket *sock; | ||
2129 | struct mmsghdr __user *entry; | ||
2130 | struct msghdr msg_sys; | ||
2131 | struct timespec end_time; | ||
2132 | |||
2133 | if (timeout && | ||
2134 | poll_select_set_timeout(&end_time, timeout->tv_sec, | ||
2135 | timeout->tv_nsec)) | ||
2136 | return -EINVAL; | ||
2137 | |||
2138 | datagrams = 0; | ||
2139 | |||
2140 | sock = sockfd_lookup_light(fd, &err, &fput_needed); | ||
2141 | if (!sock) | ||
2142 | return err; | ||
2143 | |||
2144 | err = sock_error(sock->sk); | ||
2145 | if (err) | ||
2146 | goto out_put; | ||
2147 | |||
2148 | entry = mmsg; | ||
2149 | |||
2150 | while (datagrams < vlen) { | ||
2151 | /* | ||
2152 | * No need to ask LSM for more than the first datagram. | ||
2153 | */ | ||
2154 | err = __sys_recvmsg(sock, (struct msghdr __user *)entry, | ||
2155 | &msg_sys, flags, datagrams); | ||
2156 | if (err < 0) | ||
2157 | break; | ||
2158 | err = put_user(err, &entry->msg_len); | ||
2159 | if (err) | ||
2160 | break; | ||
2161 | ++entry; | ||
2162 | ++datagrams; | ||
2163 | |||
2164 | if (timeout) { | ||
2165 | ktime_get_ts(timeout); | ||
2166 | *timeout = timespec_sub(end_time, *timeout); | ||
2167 | if (timeout->tv_sec < 0) { | ||
2168 | timeout->tv_sec = timeout->tv_nsec = 0; | ||
2169 | break; | ||
2170 | } | ||
2171 | |||
2172 | /* Timeout, return less than vlen datagrams */ | ||
2173 | if (timeout->tv_nsec == 0 && timeout->tv_sec == 0) | ||
2174 | break; | ||
2175 | } | ||
2176 | |||
2177 | /* Out of band data, return right away */ | ||
2178 | if (msg_sys.msg_flags & MSG_OOB) | ||
2179 | break; | ||
2180 | } | ||
2181 | |||
2182 | out_put: | ||
2183 | fput_light(sock->file, fput_needed); | ||
2094 | 2184 | ||
2185 | if (err == 0) | ||
2186 | return datagrams; | ||
2187 | |||
2188 | if (datagrams != 0) { | ||
2189 | /* | ||
2190 | * We may return less entries than requested (vlen) if the | ||
2191 | * sock is non block and there aren't enough datagrams... | ||
2192 | */ | ||
2193 | if (err != -EAGAIN) { | ||
2194 | /* | ||
2195 | * ... or if recvmsg returns an error after we | ||
2196 | * received some datagrams, where we record the | ||
2197 | * error to return on the next call or if the | ||
2198 | * app asks about it using getsockopt(SO_ERROR). | ||
2199 | */ | ||
2200 | sock->sk->sk_err = -err; | ||
2201 | } | ||
2202 | |||
2203 | return datagrams; | ||
2204 | } | ||
2205 | |||
2206 | return err; | ||
2207 | } | ||
2208 | |||
2209 | SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, | ||
2210 | unsigned int, vlen, unsigned int, flags, | ||
2211 | struct timespec __user *, timeout) | ||
2212 | { | ||
2213 | int datagrams; | ||
2214 | struct timespec timeout_sys; | ||
2215 | |||
2216 | if (!timeout) | ||
2217 | return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL); | ||
2218 | |||
2219 | if (copy_from_user(&timeout_sys, timeout, sizeof(timeout_sys))) | ||
2220 | return -EFAULT; | ||
2221 | |||
2222 | datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys); | ||
2223 | |||
2224 | if (datagrams > 0 && | ||
2225 | copy_to_user(timeout, &timeout_sys, sizeof(timeout_sys))) | ||
2226 | datagrams = -EFAULT; | ||
2227 | |||
2228 | return datagrams; | ||
2229 | } | ||
2230 | |||
2231 | #ifdef __ARCH_WANT_SYS_SOCKETCALL | ||
2095 | /* Argument list sizes for sys_socketcall */ | 2232 | /* Argument list sizes for sys_socketcall */ |
2096 | #define AL(x) ((x) * sizeof(unsigned long)) | 2233 | #define AL(x) ((x) * sizeof(unsigned long)) |
2097 | static const unsigned char nargs[19]={ | 2234 | static const unsigned char nargs[20] = { |
2098 | AL(0),AL(3),AL(3),AL(3),AL(2),AL(3), | 2235 | AL(0),AL(3),AL(3),AL(3),AL(2),AL(3), |
2099 | AL(3),AL(3),AL(4),AL(4),AL(4),AL(6), | 2236 | AL(3),AL(3),AL(4),AL(4),AL(4),AL(6), |
2100 | AL(6),AL(2),AL(5),AL(5),AL(3),AL(3), | 2237 | AL(6),AL(2),AL(5),AL(5),AL(3),AL(3), |
2101 | AL(4) | 2238 | AL(4),AL(5) |
2102 | }; | 2239 | }; |
2103 | 2240 | ||
2104 | #undef AL | 2241 | #undef AL |
@@ -2118,7 +2255,7 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) | |||
2118 | int err; | 2255 | int err; |
2119 | unsigned int len; | 2256 | unsigned int len; |
2120 | 2257 | ||
2121 | if (call < 1 || call > SYS_ACCEPT4) | 2258 | if (call < 1 || call > SYS_RECVMMSG) |
2122 | return -EINVAL; | 2259 | return -EINVAL; |
2123 | 2260 | ||
2124 | len = nargs[call]; | 2261 | len = nargs[call]; |
@@ -2196,6 +2333,10 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) | |||
2196 | case SYS_RECVMSG: | 2333 | case SYS_RECVMSG: |
2197 | err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]); | 2334 | err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]); |
2198 | break; | 2335 | break; |
2336 | case SYS_RECVMMSG: | ||
2337 | err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], | ||
2338 | (struct timespec __user *)a[4]); | ||
2339 | break; | ||
2199 | case SYS_ACCEPT4: | 2340 | case SYS_ACCEPT4: |
2200 | err = sys_accept4(a0, (struct sockaddr __user *)a1, | 2341 | err = sys_accept4(a0, (struct sockaddr __user *)a1, |
2201 | (int __user *)a[2], a[3]); | 2342 | (int __user *)a[2], a[3]); |