UDP发送过程,数据从用户空间复制到内核空间的copy_from_user调用路径跟踪
系统调用:sendto() ------》 __sys_sendto() ------》 __sock_sendmsg() ------》 sock_sendmsg_nosec() ------》 inet_sendmsg() ------》 udp_sendmsg() ------》 ip_make_skb() ------》 __ip_append_data() ------》 ip_generic_getfrag() ------》 csum_and_copy_from_iter_full() ------》 copy_from_user_iter_csum() ------》 csum_and_copy_from_user() ------》 copy_from_user()
net/socket.c:
/*
 * sendto(2) system-call entry point: a thin wrapper that forwards all six
 * user-supplied arguments (socket fd, user data buffer, length, flags,
 * destination address, address length) straight to __sys_sendto().
 */
SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
		unsigned int, flags, struct sockaddr __user *, addr,
		int, addr_len)
{
	return __sys_sendto(fd, buff, len, flags, addr, addr_len);
}
------》 __sys_sendto()
net/socket.c:
/*
 * Send a datagram to a given address. We move the address into kernel
 * space and check the user space data area is readable before invoking
 * the protocol.
 */
int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
		 struct sockaddr __user *addr, int addr_len)
{
	struct socket *sock;
	struct sockaddr_storage address;
	int err;
	struct msghdr msg;
	int fput_needed;

	/*
	 * Wrap the user buffer (buff/len) in msg.msg_iter as an
	 * ITER_SOURCE iterator; the payload itself is copied later,
	 * deep in the protocol's getfrag path.
	 */
	err = import_ubuf(ITER_SOURCE, buff, len, &msg.msg_iter);
	if (unlikely(err))
		return err;
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;

	msg.msg_name = NULL;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_namelen = 0;
	msg.msg_ubuf = NULL;
	if (addr) {
		/* Copy the destination sockaddr from user space. */
		err = move_addr_to_kernel(addr, addr_len, &address);
		if (err < 0)
			goto out_put;
		msg.msg_name = (struct sockaddr *)&address;
		msg.msg_namelen = addr_len;
	}
	/* Drop kernel-internal flag bits userspace must not set. */
	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;
	msg.msg_flags = flags;
	err = __sock_sendmsg(sock, &msg);

out_put:
	fput_light(sock->file, fput_needed);
out:
	return err;
}
其中:
import_ubuf()-->iov_iter_ubuf()函数将用户数据地址(buff)保存在msg.msg_iter.ubuf中,并设置msg.msg_iter.iter_type = ITER_UBUF,后续通过struct msghdr msg往下传递给__sock_sendmsg()。
------》 __sock_sendmsg() ------》 sock_sendmsg_nosec()
net/socket.c:
/*
 * Hand the message to the socket-level sendmsg operation without
 * re-running the LSM security check (the caller already did it).
 */
static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
{
	/* Devirtualized call: inet_sendmsg()/inet6_sendmsg() for INET. */
	int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->sendmsg,
				     inet6_sendmsg, inet_sendmsg,
				     sock, msg, msg_data_left(msg));
	BUG_ON(ret == -EIOCBQUEUED);

	if (trace_sock_send_length_enabled())
		call_trace_sock_send_length(sock->sk, ret, 0);
	return ret;
}

/* Run the LSM sendmsg hook, then forward to sock_sendmsg_nosec(). */
static int __sock_sendmsg(struct socket *sock, struct msghdr *msg)
{
	int err = security_socket_sendmsg(sock, msg, msg_data_left(msg));

	return err ?: sock_sendmsg_nosec(sock, msg);
}
------》 inet_sendmsg()
IPv4:
net/ipv4/af_inet.c:
/*
 * AF_INET socket-level sendmsg: after inet_send_prepare() succeeds,
 * dispatch to the transport protocol's sendmsg — tcp_sendmsg() or
 * udp_sendmsg() — via sk->sk_prot.
 */
int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
	struct sock *sk = sock->sk;

	if (unlikely(inet_send_prepare(sk)))
		return -EAGAIN;

	return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udp_sendmsg,
			       sk, msg, size);
}
EXPORT_SYMBOL(inet_sendmsg);
------》 udp_sendmsg()
net/ipv4/udp.c:
/*
 * UDP transport sendmsg. Validates the destination (msg_name or the
 * connected address), resolves the route, then either builds a single
 * skb via ip_make_skb() on the lockless fast path (no corking), or
 * appends to the corked queue via ip_append_data(). In both cases the
 * user payload is copied later by the getfrag callback — here
 * ip_generic_getfrag() for plain UDP, udplite_getfrag() for UDP-Lite.
 * Quoted verbatim from net/ipv4/udp.c; formatting follows the source
 * extraction.
 */
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); struct flowi4 fl4_stack; struct flowi4 *fl4; int ulen = len; struct ipcm_cookie ipc; struct rtable *rt = NULL; int free = 0; int connected = 0; __be32 daddr, faddr, saddr; u8 tos, scope; __be16 dport; int err, is_udplite = IS_UDPLITE(sk); int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; struct ip_options_data opt_copy; int uc_index; if (len > 0xFFFF) return -EMSGSIZE; /* * Check the flags. */ if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; fl4 = &inet->cork.fl.u.ip4; if (READ_ONCE(up->pending)) { /* * There are pending frames. * The socket lock must be held while it's corked. */ lock_sock(sk); if (likely(up->pending)) { if (unlikely(up->pending != AF_INET)) { release_sock(sk); return -EINVAL; } goto do_append_data; } release_sock(sk); } ulen += sizeof(struct udphdr); /* * Get and verify the address. */ if (usin) { if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { if (usin->sin_family != AF_UNSPEC) return -EAFNOSUPPORT; } daddr = usin->sin_addr.s_addr; dport = usin->sin_port; if (dport == 0) return -EINVAL; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = inet->inet_daddr; dport = inet->inet_dport; /* Open fast path for connected socket. Route will not be used, if at least one option is set. 
*/ connected = 1; } ipcm_init_sk(&ipc, inet); ipc.gso_size = READ_ONCE(up->gso_size); if (msg->msg_controllen) { err = udp_cmsg_send(sk, msg, &ipc.gso_size); if (err > 0) { err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); connected = 0; } if (unlikely(err < 0)) { kfree(ipc.opt); return err; } if (ipc.opt) free = 1; } if (!ipc.opt) { struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { memcpy(&opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); ipc.opt = &opt_copy.opt; } rcu_read_unlock(); } if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &msg->msg_namelen, &ipc.addr); if (err) goto out_free; if (usin) { if (usin->sin_port == 0) { /* BPF program set invalid port. Reject it. */ err = -EINVAL; goto out_free; } daddr = usin->sin_addr.s_addr; dport = usin->sin_port; } } saddr = ipc.addr; ipc.addr = faddr = daddr; if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) { err = -EINVAL; goto out_free; } faddr = ipc.opt->opt.faddr; connected = 0; } tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (scope == RT_SCOPE_LINK) connected = 0; uc_index = READ_ONCE(inet->uc_index); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = READ_ONCE(inet->mc_index); if (!saddr) saddr = READ_ONCE(inet->mc_addr); connected = 0; } else if (!ipc.oif) { ipc.oif = uc_index; } else if (ipv4_is_lbcast(daddr) && uc_index) { /* oif is set, packet is to local broadcast and * uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index. 
*/ if (ipc.oif != uc_index && ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk), uc_index)) { ipc.oif = uc_index; } } if (connected) rt = dst_rtable(sk_dst_check(sk, 0)); if (!rt) { struct net *net = sock_net(sk); __u8 flow_flags = inet_sk_flowi_flags(sk); fl4 = &fl4_stack; flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, scope, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport, sk->sk_uid); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; if (err == -ENETUNREACH) IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); goto out; } err = -EACCES; if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) goto out; if (connected) sk_dst_set(sk, dst_clone(&rt->dst)); } if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; back_from_confirm: saddr = fl4->saddr; if (!ipc.addr) daddr = ipc.addr = fl4->daddr; /* Lockless fast path for the non-corking case. */ if (!corkreq) { struct inet_cork cork; skb = ip_make_skb(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, &cork, msg->msg_flags); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) err = udp_send_skb(skb, fl4, &cork); goto out; } lock_sock(sk); if (unlikely(up->pending)) { /* The socket is already corked while preparing it. */ /* ... which is an evident application bug. --ANK */ release_sock(sk); net_dbg_ratelimited("socket already corked\n"); err = -EINVAL; goto out; } /* * Now cork the socket to pend data. */ fl4 = &inet->cork.fl.u.ip4; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; fl4->fl4_sport = inet->inet_sport; WRITE_ONCE(up->pending, AF_INET); do_append_data: up->len += ulen; err = ip_append_data(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, corkreq ? 
msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_flush_pending_frames(sk); else if (!corkreq) err = udp_push_pending_frames(sk); else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) WRITE_ONCE(up->pending, 0); release_sock(sk); out: ip_rt_put(rt); out_free: if (free) kfree(ipc.opt); if (!err) return len; /* * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting * ENOBUFS might not be good (it's not tunable per se), but otherwise * we don't have a good statistic (IpOutDiscards but it can be too many * things). We could add another new stat but at least for now that * seems like overkill. */ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS, is_udplite); } return err; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(&rt->dst, &fl4->daddr); if (!(msg->msg_flags&MSG_PROBE) || len) goto back_from_confirm; err = 0; goto out; } EXPORT_SYMBOL(udp_sendmsg); 其中,以getfrag = ip_generic_getfrag为例: getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
------》 ip_make_skb()
net/ipv4/ip_output.c:
struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, struct inet_cork *cork, unsigned int flags) { struct sk_buff_head queue; int err; if (flags & MSG_PROBE) return NULL; __skb_queue_head_init(&queue); cork->flags = 0; cork->addr = 0; cork->opt = NULL; err = ip_setup_cork(sk, cork, ipc, rtp); if (err) return ERR_PTR(err); err = __ip_append_data(sk, fl4, &queue, cork, ¤t->task_frag, getfrag, from, length, transhdrlen, flags); if (err) { __ip_flush_pending_frames(sk, &queue, cork); return ERR_PTR(err); } return __ip_make_skb(sk, fl4, &queue, cork); }
------》 __ip_append_data()
/*
 * Core IPv4 output routine: splits the payload into one or more skbs
 * sized against the path MTU / fragment boundaries, choosing between
 * linear tailroom, page frags (pfrag) and zerocopy (MSG_ZEROCOPY /
 * msg_ubuf) per skb, and calls getfrag() — ip_generic_getfrag() for
 * plain UDP — to copy (and, when the device cannot offload, checksum)
 * the user data into each fragment. Resulting skbs are queued on
 * @queue for __ip_make_skb()/ip_push_pending_frames().
 * Quoted verbatim from net/ipv4/ip_output.c; formatting follows the
 * source extraction.
 */
static int __ip_append_data(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork, struct page_frag *pfrag, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct ip_options *opt = cork->opt; int hh_len; int exthdrlen; int mtu; int copy; int err; int offset = 0; bool zc = false; unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = dst_rtable(cork->dst); bool paged, hold_tskey, extra_uref = false; unsigned int wmem_alloc_delta = 0; u32 tskey = 0; skb = skb_peek_tail(queue); exthdrlen = !skb ? rt->dst.header_len : 0; mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize; paged = !!cork->gso_size; hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu; if (cork->length + length > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu - (opt ? opt->optlen : 0)); return -EMSGSIZE; } /* * transhdrlen > 0 means that this is the first fragment and we wish * it won't be fragmented in the future. */ if (transhdrlen && length + fragheaderlen <= mtu && rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) && (!(flags & MSG_MORE) || cork->gso_size) && (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) csummode = CHECKSUM_PARTIAL; if ((flags & MSG_ZEROCOPY) && length) { struct msghdr *msg = from; if (getfrag == ip_generic_getfrag && msg->msg_ubuf) { if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb)) return -EINVAL; /* Leave uarg NULL if can't zerocopy, callers should * be able to handle it. 
*/ if ((rt->dst.dev->features & NETIF_F_SG) && csummode == CHECKSUM_PARTIAL) { paged = true; zc = true; uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true; zc = true; } else { uarg_to_msgzc(uarg)->zerocopy = 0; skb_zcopy_set(skb, uarg, &extra_uref); } } } else if ((flags & MSG_SPLICE_PAGES) && length) { if (inet_test_bit(HDRINCL, sk)) return -EPERM; if (rt->dst.dev->features & NETIF_F_SG && getfrag == ip_generic_getfrag) /* We need an empty buffer to attach stuff to */ paged = true; else flags &= ~MSG_SPLICE_PAGES; } cork->length += length; hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP && READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID; if (hold_tskey) tskey = atomic_inc_return(&sk->sk_tskey) - 1; /* So, what's going on in the loop below? ** We use calculated fragment length to generate chained skb,* each of segments is IP fragment ready for sending to network after* adding appropriate IP header.*/if (!skb)goto alloc_new_skb;while (length > 0) {/* Check if the remaining data fits into current packet. 
*/copy = mtu - skb->len;if (copy < length)copy = maxfraglen - skb->len;if (copy <= 0) {char *data;unsigned int datalen;unsigned int fraglen;unsigned int fraggap;unsigned int alloclen, alloc_extra;unsigned int pagedlen;struct sk_buff *skb_prev; alloc_new_skb:skb_prev = skb;if (skb_prev)fraggap = skb_prev->len - maxfraglen;elsefraggap = 0;/** If remaining data exceeds the mtu,* we know we need more fragment(s).*/datalen = length + fraggap;if (datalen > mtu - fragheaderlen)datalen = maxfraglen - fragheaderlen;fraglen = datalen + fragheaderlen;pagedlen = 0;alloc_extra = hh_len + 15;alloc_extra += exthdrlen;/* The last fragment gets additional space at tail.* Note, with MSG_MORE we overallocate on fragments,* because we have no idea what fragment will be* the last.*/if (datalen == length + fraggap)alloc_extra += rt->dst.trailer_len;if ((flags & MSG_MORE) &&!(rt->dst.dev->features&NETIF_F_SG))alloclen = mtu;else if (!paged &&(fraglen + alloc_extra < SKB_MAX_ALLOC ||!(rt->dst.dev->features & NETIF_F_SG)))alloclen = fraglen;else {alloclen = fragheaderlen + transhdrlen;pagedlen = datalen - transhdrlen;}alloclen += alloc_extra;if (transhdrlen) {skb = sock_alloc_send_skb(sk, alloclen,(flags & MSG_DONTWAIT), &err);} else {skb = NULL;if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=2 * sk->sk_sndbuf)skb = alloc_skb(alloclen,sk->sk_allocation);if (unlikely(!skb))err = -ENOBUFS;}if (!skb)goto error;/** Fill in the control structures*/skb->ip_summed = csummode;skb->csum = 0;skb_reserve(skb, hh_len);/** Find where to start putting bytes.*/data = skb_put(skb, fraglen + exthdrlen - pagedlen);skb_set_network_header(skb, exthdrlen);skb->transport_header = (skb->network_header +fragheaderlen);data += fragheaderlen + exthdrlen;if (fraggap) {skb->csum = skb_copy_and_csum_bits(skb_prev, maxfraglen,data + transhdrlen, fraggap);skb_prev->csum = csum_sub(skb_prev->csum,skb->csum);data += fraggap;pskb_trim_unique(skb_prev, maxfraglen);}copy = datalen - transhdrlen - fraggap - 
pagedlen;/* [!] NOTE: copy will be negative if pagedlen>0* because then the equation reduces to -fraggap.*/if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {err = -EFAULT;kfree_skb(skb);goto error;} else if (flags & MSG_SPLICE_PAGES) {copy = 0;}offset += copy;length -= copy + transhdrlen;transhdrlen = 0;exthdrlen = 0;csummode = CHECKSUM_NONE;/* only the initial fragment is time stamped */skb_shinfo(skb)->tx_flags = cork->tx_flags;cork->tx_flags = 0;skb_shinfo(skb)->tskey = tskey;tskey = 0;skb_zcopy_set(skb, uarg, &extra_uref);if ((flags & MSG_CONFIRM) && !skb_prev)skb_set_dst_pending_confirm(skb, 1);/** Put the packet on the pending queue.*/if (!skb->destructor) {skb->destructor = sock_wfree;skb->sk = sk;wmem_alloc_delta += skb->truesize;}__skb_queue_tail(queue, skb);continue;}if (copy > length)copy = length;if (!(rt->dst.dev->features&NETIF_F_SG) &&skb_tailroom(skb) >= copy) {unsigned int off;off = skb->len;if (getfrag(from, skb_put(skb, copy),offset, copy, off, skb) < 0) {__skb_trim(skb, off);err = -EFAULT;goto error;}} else if (flags & MSG_SPLICE_PAGES) {struct msghdr *msg = from;err = -EIO;if (WARN_ON_ONCE(copy > msg->msg_iter.count))goto error;err = skb_splice_from_iter(skb, &msg->msg_iter, copy,sk->sk_allocation);if (err < 0)goto error;copy = err;wmem_alloc_delta += copy;} else if (!zc) {int i = skb_shinfo(skb)->nr_frags;err = -ENOMEM;if (!sk_page_frag_refill(sk, pfrag))goto error;skb_zcopy_downgrade_managed(skb);if (!skb_can_coalesce(skb, i, pfrag->page,pfrag->offset)) {err = -EMSGSIZE;if (i == MAX_SKB_FRAGS)goto error;__skb_fill_page_desc(skb, i, pfrag->page,pfrag->offset, 0);skb_shinfo(skb)->nr_frags = ++i;get_page(pfrag->page);}copy = min_t(int, copy, pfrag->size - pfrag->offset);if (getfrag(from,page_address(pfrag->page) + pfrag->offset,offset, copy, skb->len, skb) < 0)goto error_efault;pfrag->offset += copy;skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);skb_len_add(skb, copy);wmem_alloc_delta += copy;} else 
{err = skb_zerocopy_iter_dgram(skb, from, copy);if (err < 0)goto error;}offset += copy;length -= copy;}if (wmem_alloc_delta)refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);return 0;error_efault:err = -EFAULT; error:net_zcopy_put_abort(uarg, extra_uref);cork->length -= length;IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);if (hold_tskey)atomic_dec(&sk->sk_tskey);return err; }
------》 ip_generic_getfrag()
net/ipv4/ip_output.c:
int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct msghdr *msg = from; if (skb->ip_summed == CHECKSUM_PARTIAL) { if (!copy_from_iter_full(to, len, &msg->msg_iter)) return -EFAULT; } else { __wsum csum = 0; if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter)) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, odd); } return 0; } EXPORT_SYMBOL(ip_generic_getfrag);
int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) {struct msghdr *msg = from;if (skb->ip_summed == CHECKSUM_PARTIAL) {if (!copy_from_iter_full(to, len, &msg->msg_iter))return -EFAULT;} else {__wsum csum = 0;if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter))return -EFAULT;skb->csum = csum_block_add(skb->csum, csum, odd);}return 0; } EXPORT_SYMBOL(ip_generic_getfrag);--------》 csum_and_copy_from_iter_full() ----》 copy_from_user_iter_csum()
net/core/skbuff.c:
bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) { size_t copied; if (WARN_ON_ONCE(!i->data_source)) return false; copied = iterate_and_advance2(i, bytes, addr, csum, copy_from_user_iter_csum, memcpy_from_iter_csum); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } EXPORT_SYMBOL(csum_and_copy_from_iter_full); static __always_inline size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { __wsum next, *csum = priv2; next = csum_and_copy_from_user(iter_from, to + progress, len); *csum = csum_block_add(*csum, next, progress); return next ? 0 : len; }
bool csum_and_copy_from_iter_full(void *addr, size_t bytes,__wsum *csum, struct iov_iter *i) {size_t copied;if (WARN_ON_ONCE(!i->data_source))return false;copied = iterate_and_advance2(i, bytes, addr, csum,copy_from_user_iter_csum,memcpy_from_iter_csum);if (likely(copied == bytes))return true;iov_iter_revert(i, copied);return false; } EXPORT_SYMBOL(csum_and_copy_from_iter_full); static __always_inline size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress,size_t len, void *to, void *priv2) {__wsum next, *csum = priv2;next = csum_and_copy_from_user(iter_from, to + progress, len);*csum = csum_block_add(*csum, next, progress);return next ? 0 : len; }--------》 csum_and_copy_from_user () ----》 copy_from_user()
include/net/checksum.h:
#ifndef _HAVE_ARCH_COPY_AND_CSUM_FROM_USER static __always_inline __wsum csum_and_copy_from_user (const void __user *src, void *dst, int len) { if (copy_from_user(dst, src, len)) return 0; return csum_partial(dst, len, ~0U); } #endif
#ifndef _HAVE_ARCH_COPY_AND_CSUM_FROM_USER static __always_inline __wsum csum_and_copy_from_user (const void __user *src, void *dst,int len) {if (copy_from_user(dst, src, len))return 0;return csum_partial(dst, len, ~0U); } #endif