1. User space
The examples below use Linux with TPACKET_V3.
The application calls pcap_dispatch to fetch packets; pcap_dispatch in turn invokes the packet-handling callback supplied by the user.
Here read_op actually points to pcap_read_linux_mmap_v3:
// pcap.c
int
pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
{
	return (p->read_op(p, cnt, callback, user));
}
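For reference, a minimal caller of this API might look as follows (the device name, snaplen, and timeout are arbitrary example values):

#include <pcap/pcap.h>
#include <stdio.h>

/* user callback, invoked once per captured packet */
static void on_packet(u_char *user, const struct pcap_pkthdr *h, const u_char *bytes)
{
	(void)user; (void)bytes;
	printf("captured %u bytes (wire length %u)\n", h->caplen, h->len);
}

int main(void)
{
	char errbuf[PCAP_ERRBUF_SIZE];
	pcap_t *p = pcap_open_live("eth0", 65535, 1, 1000, errbuf);

	if (p == NULL) {
		fprintf(stderr, "pcap_open_live: %s\n", errbuf);
		return 1;
	}
	/* each pcap_dispatch call drains at most 64 packets; a negative
	 * return value means an error or pcap_breakloop */
	while (pcap_dispatch(p, 64, on_packet, NULL) >= 0)
		;
	pcap_close(p);
	return 0;
}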
1.1 Getting a block
1.1.1 Getting a block by offset
#define RING_GET_FRAME_AT(h, offset) (((union thdr **)h->buffer)[(offset)])
#define RING_GET_CURRENT_FRAME(h) RING_GET_FRAME_AT(h, h->offset)

h.raw = RING_GET_CURRENT_FRAME(handle);
1.1.2 Checking the current block's status
The block's actual state is determined from the block_status value; two values matter here:
TP_STATUS_KERNEL
- the block is in use by the kernel and must not be touched by user space
TP_STATUS_USER
- the block has been filled with data by the kernel; user space may read it, and the kernel must not use it
if (h.h3->hdr.bh1.block_status == TP_STATUS_KERNEL) {
	...
}
1.2 Reading/processing data
If the current block's status is TP_STATUS_USER, start reading the data:
... /* skip ahead to the first packet in the block */
handlep->current_packet = h.raw + h.h3->hdr.bh1.offset_to_first_pkt;
/* number of packets in the current block */
handlep->packets_left = h.h3->hdr.bh1.num_pkts;

while (packets_to_read-- && !handle->break_loop) {
	struct tpacket3_hdr* tp3_hdr = (struct tpacket3_hdr*) handlep->current_packet;

	ret = pcap_handle_packet_mmap(
			handle,
			callback,
			user,
			handlep->current_packet,
			tp3_hdr->tp_len,
			tp3_hdr->tp_mac,
			tp3_hdr->tp_snaplen,
			tp3_hdr->tp_sec,
			handle->opt.tstamp_precision == PCAP_TSTAMP_PRECISION_NANO ?
				tp3_hdr->tp_nsec : tp3_hdr->tp_nsec / 1000,
			VLAN_VALID(tp3_hdr, &tp3_hdr->hv1),
			tp3_hdr->hv1.tp_vlan_tci,
			VLAN_TPID(tp3_hdr, &tp3_hdr->hv1));
	...
	/* move on to the next packet */
	handlep->current_packet += tp3_hdr->tp_next_offset;
	handlep->packets_left--;
}
...
The user-supplied handler is then invoked via the callback:
/* handle a single memory mapped packet */
static int pcap_handle_packet_mmap(pcap_t *handle, pcap_handler callback,
		u_char *user, unsigned char *frame, unsigned int tp_len,
		unsigned int tp_mac, unsigned int tp_snaplen, unsigned int tp_sec,
		unsigned int tp_usec, int tp_vlan_tci_valid, __u16 tp_vlan_tci,
		__u16 tp_vlan_tpid)
{
	...
	/* pass the packet to the user */
	callback(user, &pcaphdr, bp);
	return 1;
}
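The same per-block traversal can be expressed directly against the TPACKET_V3 structures from <linux/if_packet.h>. The sketch below is only an illustration: it assumes the block is already owned by user space and ignores VLAN and timestamp handling, and the handle callback is a placeholder.

#include <linux/if_packet.h>
#include <stdint.h>

/* walk every packet stored in one TPACKET_V3 block */
static void walk_block(struct tpacket_block_desc *bd,
		       void (*handle)(const uint8_t *frame, uint32_t caplen))
{
	struct tpacket3_hdr *ppd =
		(struct tpacket3_hdr *)((uint8_t *)bd + bd->hdr.bh1.offset_to_first_pkt);

	for (uint32_t i = 0; i < bd->hdr.bh1.num_pkts; i++) {
		/* tp_mac is the offset from this header to the link-layer frame */
		handle((const uint8_t *)ppd + ppd->tp_mac, ppd->tp_snaplen);
		/* tp_next_offset chains the packets within the block */
		ppd = (struct tpacket3_hdr *)((uint8_t *)ppd + ppd->tp_next_offset);
	}
}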
1.3 "释放"当前block
当前block的数据包处理完成后,需要将当前block归还给内核,让内核可以继续写数据,只是将状态值设置为TP_STATUS_KERNEL
即可。
if (handlep->packets_left <= 0) {
	h.h3->hdr.bh1.block_status = TP_STATUS_KERNEL;
	...
	/* next block */
	if (++handle->offset >= handle->cc)
		handle->offset = 0;
	handlep->current_packet = NULL;
}
1.4 Waiting for data
Because the blocks form a circular queue, a current block whose status is still TP_STATUS_KERNEL means there is no data behind it either, so the only option is to wait. libpcap waits for data with poll, where the fd is the socket created at the very beginning.
if (h.h3->hdr.bh1.block_status == TP_STATUS_KERNEL) {
	ret = pcap_wait_for_frames_mmap(handle);
	if (ret) {
		return ret;
	}
}
static int pcap_wait_for_frames_mmap(pcap_t *handle)
{
	struct pcap_linux *handlep = handle->priv;
	...
	struct pollfd pollinfo;
	int ret;

	pollinfo.fd = handle->fd;
	pollinfo.events = POLLIN;
	do {
		ret = poll(&pollinfo, 1, handlep->poll_timeout);
		...
	} while (ret < 0);
1.4.1 Blocking mode
Blocking mode is the default, and with TPACKET_V3 the poll timeout is -1 (never times out), so when there is no traffic the call is never woken up.
static void
set_poll_timeout(struct pcap_linux *handlep) {
	...
	if (handlep->tp_version == TPACKET_V3 && !broken_tpacket_v3)
		handlep->poll_timeout = -1;	/* block forever, let TPACKET_V3 wake us up */
	else
	...
}
How can the wait be interrupted early?
Before libpcap 1.10.4 there was no way to wake up early; the only option was to wait for data to arrive. Newer versions add an extra fd used specifically for early wakeup.
https://github.com/the-tcpdump-group/libpcap/pull/741/commits/5c8b13d3e87542527ed9a3a79fb0f9b2edb74df1
- When the handle is created, a poll_breakloop_fd is created alongside it
pcap_t *
pcap_create_interface(const char *device, char *ebuf)
{
	pcap_t *handle;

	handle = PCAP_CREATE_COMMON(ebuf, struct pcap_linux);
	if (handle == NULL)
		return NULL;
	...
	struct pcap_linux *handlep = handle->priv;
	handlep->poll_breakloop_fd = eventfd(0, EFD_NONBLOCK);

	return handle;
}
- When the handle is activated, the corresponding breakloop callback is installed
static int
pcap_activate_linux(pcap_t *handle)
{
	...
	handle->breakloop_op = pcap_breakloop_linux;
	...
}
- When polling, poll_breakloop_fd is watched as well
static int pcap_wait_for_frames_mmap(pcap_t *handle)
{
	struct pcap_linux *handlep = handle->priv;
	...
	struct pollfd pollinfo[2];
	int numpollinfo;

	pollinfo[0].fd = handle->fd;
	pollinfo[0].events = POLLIN;
	...
	pollinfo[1].fd = handlep->poll_breakloop_fd;
	pollinfo[1].events = POLLIN;
	numpollinfo = 2;
	...
- Calling pcap_breakloop triggers the wakeup
void
pcap_breakloop(pcap_t *p)
{
	p->breakloop_op(p);
}
static void pcap_breakloop_linux(pcap_t *handle)
{
	pcap_breakloop_common(handle);

	struct pcap_linux *handlep = handle->priv;
	uint64_t value = 1;

	/* XXX - what if this fails? */
	if (handlep->poll_breakloop_fd != -1)
		(void)write(handlep->poll_breakloop_fd, &value, sizeof(value));
}
- poll is woken up by poll_breakloop_fd
if (pollinfo[1].revents & POLLIN) {
	ssize_t nread;
	uint64_t value;

	nread = read(handlep->poll_breakloop_fd, &value, sizeof(value));
	...
	if (handle->break_loop) {
		handle->break_loop = 0;
		return PCAP_ERROR_BREAK;
	}
}
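With this in place, a capture loop that is blocked in poll can be interrupted from another thread (or a signal handler) by calling pcap_breakloop. A minimal sketch, assuming the pcap_t was opened elsewhere:

#include <pcap/pcap.h>
#include <pthread.h>

static void ignore_packet(u_char *user, const struct pcap_pkthdr *h, const u_char *bytes)
{
	(void)user; (void)h; (void)bytes;	/* no-op handler for the sketch */
}

/* capture thread: blocks inside pcap_dispatch/poll until data arrives
 * or pcap_breakloop is called */
static void *capture_thread(void *arg)
{
	pcap_t *p = arg;

	/* returns PCAP_ERROR_BREAK (-2) once pcap_breakloop has been called */
	while (pcap_dispatch(p, -1, ignore_packet, NULL) >= 0)
		;
	return NULL;
}

/* called from the main thread, e.g. at shutdown */
static void stop_capture(pcap_t *p, pthread_t tid)
{
	pcap_breakloop(p);		/* writes to poll_breakloop_fd and wakes the poll */
	pthread_join(tid, NULL);
}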
1.4.2 Non-blocking mode
When there is no data, the call returns immediately. Non-blocking mode is enabled with the following API:
int
pcap_setnonblock(pcap_t *p, int nonblock, char *errbuf)
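A minimal usage sketch: after switching to non-blocking mode, pcap_dispatch returns 0 right away when no block is ready, so the handle is usually driven from the caller's own poll/epoll loop via pcap_get_selectable_fd:

#include <pcap/pcap.h>
#include <stdio.h>

/* switch an already-opened handle to non-blocking mode and return the
 * selectable fd so it can be added to the caller's event loop */
static int make_nonblocking(pcap_t *p)
{
	char errbuf[PCAP_ERRBUF_SIZE];

	if (pcap_setnonblock(p, 1, errbuf) == -1) {
		fprintf(stderr, "pcap_setnonblock: %s\n", errbuf);
		return -1;
	}
	return pcap_get_selectable_fd(p);
}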
2. Kernel space
When the kernel receives a packet, it calls the corresponding handler to process it:
__netif_receive_skb_core() -> deliver_skb()
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	...
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
Here pt_prev->func is actually tpacket_rcv, the function installed when the rx ring was set up.
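For context, the rx ring that causes tpacket_rcv to be installed is created from user space roughly as follows; libpcap performs the equivalent steps internally when the handle is activated, and the ring geometry below is only an example (error handling omitted):

#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <arpa/inet.h>
#include <string.h>

/* sketch: create an AF_PACKET socket and attach a TPACKET_V3 rx ring */
static int setup_rx_ring(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	int version = TPACKET_V3;
	struct tpacket_req3 req;

	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 1 << 22;	/* 4 MiB per block (example value) */
	req.tp_block_nr   = 64;		/* number of blocks (example value) */
	req.tp_frame_size = 1 << 11;
	req.tp_frame_nr   = (req.tp_block_size / req.tp_frame_size) * req.tp_block_nr;
	req.tp_retire_blk_tov = 60;	/* block retire timeout in ms, see 2.5.2 */
	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));

	/* map the whole ring; the kernel fills blocks and flips block_status
	 * to TP_STATUS_USER when a block is handed to user space */
	mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
	     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	return fd;
}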
2.1 Checking for available room
If there is no room left, the current packet is dropped.
/* If we are flooded, just give up */
if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
	atomic_inc(&po->tp_drops);
	goto drop_n_restore;
}
2.2 Getting an available block
h.raw = packet_current_rx_frame(po, skb,
				TP_STATUS_KERNEL, (macoff + snaplen));
2.3 Copying data into the block
skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2.4 Updating the block metadata
switch (po->tp_version) {
...
case TPACKET_V3:
	h.h3->tp_status |= status;
	h.h3->tp_len = skb->len;
	h.h3->tp_snaplen = snaplen;
	h.h3->tp_mac = macoff;
	h.h3->tp_net = netoff;
	h.h3->tp_sec = ts.tv_sec;
	h.h3->tp_nsec = ts.tv_nsec;
	memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
	hdrlen = sizeof(*h.h3);
	break;
default:
	BUG();
}
2.5 When the block status is updated
1. When the block is full
Every time a block is looked up, the kernel checks whether the current block has enough room for the incoming packet:
static void *packet_current_rx_frame(struct packet_sock *po,
				     struct sk_buff *skb,
				     int status, unsigned int len)
{
	char *curr = NULL;

	switch (po->tp_version) {
	...
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, len);
	...
	}
}
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    unsigned int len)
{
	...
	/* enough room: fill and return the current position */
	if (curr + TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}
When there is not enough room, the current block is closed (i.e. its status is set to TP_STATUS_USER) and the socket fd is notified that data is available, which wakes up the user-space poll.
/* Ok, close the current block */
prb_retire_current_block(pkc, po, 0);
static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
				     struct packet_sock *po, unsigned int status)
{
	...
	prb_close_block(pkc, pbd, po, status);
	...
}
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
			    struct tpacket_block_desc *pbd1,
			    struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;
	...
	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	/* notify the socket that data is ready; poll will be woken up */
	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
			    struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Now update the block status. */
	BLOCK_STATUS(pbd1) = status;
}
2. When the block times out
When traffic is light, a block may never fill up, so packets would stay in the block and the application could never get at them; to handle this, a retire timer is added (its period can be set from user space via tpacket_req3.tp_retire_blk_tov; if zero, the kernel picks a default).
- When the ring buffer is set up, the timer is initialized and its expiry handler installed
static void init_prb_bdqc(struct packet_sock *po,
			  struct packet_ring_buffer *rb,
			  struct pgv *pg_vec,
			  union tpacket_req_u *req_u)
{
	...
	prb_setup_retire_blk_timer(po);
	...
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired, 0);
	pkc->retire_blk_timer.expires = jiffies;
}
- When the timer expires, the callback runs and closes the current block
static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
{
	...
	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		...
		prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
		...
	}
...
}