af_packet 网卡down/up导致丢包

最近碰到一个问题:使用 af_packet 的 mmap 模式发包时,如果把网卡 down 了再 up 起来,up 之后发送的第一个报文总是失败,sendto 返回 -1,errno 为 ENETDOWN(值为100,对应内核中的返回值 -100)。

最初怀疑这个问题和网卡驱动有关(出问题的环境用的X710网卡的SRIOV生成的VF),但是将SRIOV换成virtio网卡后,问题仍然存在,这个就把驱动层的怀疑排除掉了。而且这个问题是在k8s的pod环境发现的,跑着很多业务,干扰因素很多。简单看了下业务代码后,第一感觉这个问题和业务没关系,而是和packet mmap有关,就想了一个办法,通过修改kernel源码中自带的packet mmap例子(linux-stable/tools/testing/selftests/net/psock_tpacket.c),并且在一个新建的没有杂包的VM中运行,问题果然复现了。修改后的版本在文章最后面,这里先看下复现步骤。

如下,编译代码并执行,参数ens8表示会将组装好的数据包通过sendto发送到此网卡。代码中会判断网卡状态(IFF_UP标志),如果为up就发一个数据包,如果为down就不发送数据包。为了复现问题,先将ens8 down掉,再up起来,可以看到up起来后,第一个数据包果然发送失败了,错误同样是ENETDOWN(Network is down)。问题复现了,而能复现的问题就是肯定可以解决的问题。

root@node2:~# gcc psock_tpacket.c -o psock_tpacket
root@node2:~# ./psock_tpacket ens8
send data to NIC ens8
NIC is up, send one packet
send success
NIC is up, send one packet
send success
NIC is up, send one packet
send success
NIC is down, don't send packet    ---> ifconfig ens8 down
NIC is down, don't send packet 
sleep 5 s                         ---> ifconfig ens8 up
NIC is up, send one packet        -->ens8 up后发的第一个包失败
sendto fail
: Network is down
NIC is up, send one packet      -->后续的包都可以发送成功
send success

调用sendto发包的流程大概是下面这样的,其中省略了tc qdisc的处理。
sys_sendto -> sock_sendmsg -> __sock_sendmsg -> packet_sendmsg -> tpacket_snd -> dev_queue_xmit --> __dev_queue_xmit -->dev_hard_start_xmit -> xmit_one -> netdev_start_xmit -> __netdev_start_xmit -> ndo_start_xmit
接下来就要看一下ENETDOWN是在哪设置的,粗略看了下代码,在这个路径上有两个函数明确会返回ENETDOWN: tpacket_snd 和 __dev_queue_xmit,代码片段如下。
对于tpacket_snd来说,因为每次发包是在确定网卡是up的情况下发送的,即 dev->flags 是包含 IFF_UP 的,所以这个可能性也不大。
对于__dev_queue_xmit来说,如果在这个函数丢包了是会有drop计数的,即dev->tx_dropped。查看了网卡的丢包计数,也没发现有值。
所以这两种情况的可能性也不大。

static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
	...
	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_put;
	...
out_put:
	dev_put(dev);
out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}

static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
	...
	if (dev->flags & IFF_UP) {
		...
		skb = dev_hard_start_xmit(skb, dev, txq, &rc);
		...
	}
	rc = -ENETDOWN;
drop:
	rcu_read_unlock_bh();
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return rc;
	...
}

既然发包失败了,存放数据包的skb肯定得释放,所以使用kretprobe抓一下kfree_skb和tpacket_snd的调用栈和返回值。

//首先获取psock_tpacket的pid,后面会用pid过滤输出
root@node2:/root# ps -ef | grep psock_tpacket
root     30924 29468  0 21:45 pts/2    00:00:00 ./psock_tpacket ens8
cd /sys/kernel/debug/tracing
//使能backtrace,查看调用栈
echo 1 > options/stacktrace
//抓取函数tpacket_snd 
echo 'r tpacket_snd ret=$retval' >> kprobe_events
//使能
echo 1 > events/kprobes/r_tpacket_snd_0/enable
//过滤指定进程发生的事件
echo 'common_pid==30924' > events/kprobes/r_tpacket_snd_0/filter
//使能 kfree_skb
echo 'r kfree_skb skb=+0(%si) ret=$retval' >> kprobe_events
echo 1 > events/kprobes/r_kfree_skb_0/enable
echo 'common_pid==30924' > events/kprobes/r_kfree_skb_0/filter
//总开关,使能后,就会开始抓取上面定义的两个事件
echo 1 > tracing_on

网卡ens8从down到up后,发的第一个包打印出如下两个事件,r_kfree_skb_0的出现说明确实是有free skb的操作,而且是在函数tpacket_snd中调用的kfree_skb。r_tpacket_snd_0的返回值0xffffff9c(有符号-100)也证明是在此函数设置的。

root@node2:/sys/kernel/debug/tracing# cat trace
# tracer: nop
#
# entries-in-buffer/entries-written: 6/6   #P:4
#
#                              _-----=> irqs-off
#                             / _----=> need-resched
#                            | / _---=> hardirq/softirq
#                            || / _--=> preempt-depth
#                            ||| /     delay
#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
#              | |       |   ||||       |         |
   psock_tpacket-30924 [000] d... 633441.532440: r_kfree_skb_0: (tpacket_snd+0x582/0xf10 <- kfree_skb) skb=0x8e00000001 ret=0x1
   psock_tpacket-30924 [000] d... 633441.532447: <stack trace>
 => [unknown/kretprobe'd]
 => [unknown/kretprobe'd]
 => sock_sendmsg
 => __sys_sendto
 => __x64_sys_sendto
 => do_syscall_64
 => entry_SYSCALL_64_after_hwframe
   psock_tpacket-30924 [000] d... 633441.532449: r_tpacket_snd_0: (packet_sendmsg+0x1f/0x30 <- tpacket_snd) ret=0xffffff9c
   psock_tpacket-30924 [000] d... 633441.532451: <stack trace>
 => [unknown/kretprobe'd]
 => sock_sendmsg
 => __sys_sendto
 => __x64_sys_sendto
 => do_syscall_64
 => entry_SYSCALL_64_after_hwframe

虽然可以确定丢包发生在函数tpacket_snd中,但依然无法精确定位到是哪一行代码。

看来还得仔细看一下代码,经过全局搜索,发现下面的代码很可疑,也是在af_packet.c文件中

packet_notifier in af_packet.c (linux-3.18.79\net\packet) :                     sk->sk_err = ENETDOWN;

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =    packet_notifier,
};

//packet_init初始化时,会注册网络设备变化通知事件
module_init(packet_init);
static int __init packet_init(void)
{
	...
	register_netdevice_notifier(&packet_netdev_notifier);
	...
}

//在事件处理函数中,如果网卡down了,会将和此网卡关联的所有af_packet类型的socket的sk->sk_err设置成ENETDOWN
static int packet_notifier(struct notifier_block *this,
			   unsigned long msg, void *ptr)
{
	...
	sk_for_each_rcu(sk, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		...
		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				...
				sk->sk_err = ENETDOWN;
				...
			}
			...
		case NETDEV_UP:
			...
		}
	}
	...
}

而在函数tpacket_snd中也调用了sock_error,如下
tpacket_snd -> sock_alloc_send_skb ->sock_alloc_send_pskb->sock_error

问题原因总算找到了:网卡down时会触发回调函数packet_notifier,将sk->sk_err设置成ENETDOWN;但网卡up时并不会清除sk->sk_err中的ENETDOWN。于是up之后第一次发包时,sock_error()取出并返回了这个残留的错误(同时将其清零),所以只有第一个包发送失败,后续的包都能发送成功。另外查了最新的kernel代码,依然没有在网卡up时清除ENETDOWN,可能是有意为之,用来提醒使用者网卡曾经down过。

解决办法,从down到up时,首先调用下面的函数将sk->sk_err清零即可。

static int get_sock_err(int sock)
{
	int ret = 0;
	int opt_val = 0;
	socklen_t optlen = sizeof(opt_val);

	ret = getsockopt(sock, SOL_SOCKET, SO_ERROR, &opt_val, &optlen);
	if (ret) {
		perror("a error getsockopt SO_ERROR");
		return 0;
	}
	return opt_val;
}

//kernel端代码,每次获取sock_error,都会将sock_error清零
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	...
	case SO_ERROR:
		v.val = -sock_error(sk);
		break;
	...
}

static inline int sock_error(struct sock *sk)
{
	int err;

	if (likely(!sk->sk_err))
		return 0;
	err = xchg(&sk->sk_err, 0);
	return -err;
}

参考

上网查了下,很早之前就有人发现此问题了,不过是在接收数据包时,而我这个是在发送数据包时
http://lkml.iu.edu/hypermail/linux/net/0303.0/0034.html

kernel提供的packet mmap文档
https://github.com/torvalds/linux/blob/master/Documentation/networking/packet_mmap.rst

kernel提供的测试packet mmap用例,包含tpacket v1/v2/v3,发送和接收方向,值得一看
https://github.com/torvalds/linux/blob/master/tools/testing/selftests/net/psock_tpacket.c

完整代码

下面代码编译后,按照复现步骤即可复现。
解决问题的办法是:在walk_tx函数中检测到网卡从down恢复为up之后、发送下一个包之前,调用get_sock_err(即getsockopt的SO_ERROR)读取并清除sk->sk_err(注意是sk_err,不是sk_err_soft)。

root@node2:~# cat psock_tpacket.c
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <linux/if_packet.h>
#include <linux/filter.h>
#include <ctype.h>
#include <fcntl.h>
#include <unistd.h>
#include <bits/wordsize.h>
#include <net/ethernet.h>
#include <netinet/ip.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <net/if.h>
#include <inttypes.h>
#include <poll.h>
#ifndef __aligned_tpacket
# define __aligned_tpacket      __attribute__((aligned(TPACKET_ALIGNMENT)))
#endif

#ifndef __align_tpacket
# define __align_tpacket(x)     __attribute__((aligned(TPACKET_ALIGN(x))))
#endif

/* Number of frames walk_tx() queues before it returns. */
#define NUM_PACKETS             100
/* Round x up to the next multiple of 8. */
#define ALIGN_8(x)              (((x) + 8 - 1) & ~(8 - 1))

/* Size and fill byte of the dummy payload written by create_payload(). */
#define DATA_LEN                        100
#define DATA_CHAR                       'a'
#define DATA_CHAR_1                     'b'

/*
 * Book-keeping for one mmap()ed packet ring.
 *
 * rd/rd_len/rd_num : array of iovecs (one per frame for V1/V2) pointing into
 *                    the mapping, its size in bytes and its element count.
 * mm_space/mm_len  : start and length of the mmap()ed ring memory.
 * ll               : link-layer address the socket is bound to.
 * walk             : routine that drives the ring (walk_tx in this program).
 * type/version     : ring type (e.g. PACKET_TX_RING) and TPACKET version.
 * flen             : frame length in bytes.
 * req/req3         : ring geometry handed to setsockopt().
 */
struct ring {
	struct iovec *rd;
	uint8_t *mm_space;
	size_t mm_len, rd_len;
	struct sockaddr_ll ll;
	void (*walk)(int sock, struct ring *ring, char *name);
	int type, rd_num, flen, version;
	union {
		struct tpacket_req  req;
		struct tpacket_req3 req3;
	};
};

struct block_desc {
	uint32_t version;
	uint32_t offset_to_priv;
	struct tpacket_hdr_v1 h1;
};

/* Typed views of one ring frame: V1 header, V2 header, or raw pointer. */
union frame_map {
	struct {
		struct tpacket_hdr tp_h __aligned_tpacket;
		struct sockaddr_ll s_ll __align_tpacket(sizeof(struct tpacket_hdr));
	} *v1;
	struct {
		struct tpacket2_hdr tp_h __aligned_tpacket;
		struct sockaddr_ll s_ll __align_tpacket(sizeof(struct tpacket2_hdr));
	} *v2;
	void *raw;
};

/*
 * Query the IFF_UP flag of interface @name through @sock.
 *
 * Returns non-zero when the interface is administratively up, 0 when it is
 * down or when the query fails (a diagnostic is printed in that case).
 */
static int get_status(int sock, char *name)
{
	struct ifreq ifr;

	/*
	 * ifr_name is a fixed-size buffer: reject names that would not fit
	 * (including the terminator) instead of overflowing it.
	 */
	if (name == NULL || strlen(name) >= sizeof(ifr.ifr_name)) {
		fprintf(stderr, "a error ioctl SIOCGIFFLAGS: invalid interface name\n");
		return 0;
	}

	memset(&ifr, 0, sizeof(ifr));
	memcpy(ifr.ifr_name, name, strlen(name) + 1);

	if (ioctl(sock, SIOCGIFFLAGS, &ifr) == -1) {
		perror("a error ioctl SIOCGIFFLAGS");
		return 0;
	}

	return (ifr.ifr_flags & IFF_UP);
}

/*
 * Read the pending error of @sock with getsockopt(SO_ERROR).
 *
 * Returns the value reported by SO_ERROR, or 0 if getsockopt() itself fails
 * (after printing a diagnostic).
 */
static int get_sock_err(int sock)
{
	int ret = 0;
	int opt_val = 0;
	socklen_t optlen = sizeof(opt_val);

	ret = getsockopt(sock, SOL_SOCKET, SO_ERROR, &opt_val, &optlen);
	if (ret) {
		perror("a error getsockopt SO_ERROR");
		return 0;
	}

	return opt_val;
}
static unsigned int total_packets, total_bytes;static int pfsocket(int ver)
{int ret, sock = socket(PF_PACKET, SOCK_RAW, 0);if (sock == -1) {perror("socket");exit(1);}ret = setsockopt(sock, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));if (ret == -1) {perror("setsockopt");exit(1);}return sock;
}static void status_bar_update(void)
{if (total_packets % 10 == 0) {fprintf(stderr, ".");fflush(stderr);}
}static void create_payload(void *pay, size_t *len)
{int i;struct ethhdr *eth = pay;struct iphdr *ip = pay + sizeof(*eth);/* Lets create some broken crap, that still passes* our BPF filter.*/*len = DATA_LEN + 42;memset(pay, 0xff, ETH_ALEN * 2);eth->h_proto = htons(ETH_P_IP);for (i = 0; i < sizeof(*ip); ++i)((uint8_t *) pay)[i + sizeof(*eth)] = (uint8_t) rand();ip->ihl = 5;ip->version = 4;ip->protocol = 0x11;ip->frag_off = 0;ip->ttl = 64;ip->tot_len = htons((uint16_t) *len - sizeof(*eth));ip->saddr = inet_addr("2.2.2.2");ip->daddr = inet_addr("2.2.2.4");memset(pay + sizeof(*eth) + sizeof(*ip),DATA_CHAR, DATA_LEN);
}static inline int __v1_tx_kernel_ready(struct tpacket_hdr *hdr)
{return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING));
}static inline void __v1_tx_user_ready(struct tpacket_hdr *hdr)
{hdr->tp_status = TP_STATUS_SEND_REQUEST;__sync_synchronize();
}static inline int __v2_tx_kernel_ready(struct tpacket2_hdr *hdr)
{return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING));
}static inline void __v2_tx_user_ready(struct tpacket2_hdr *hdr)
{hdr->tp_status = TP_STATUS_SEND_REQUEST;__sync_synchronize();
}static inline int __v3_tx_kernel_ready(struct tpacket3_hdr *hdr)
{return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING));
}static inline void __v3_tx_user_ready(struct tpacket3_hdr *hdr)
{hdr->tp_status = TP_STATUS_SEND_REQUEST;__sync_synchronize();
}static inline int __tx_kernel_ready(void *base, int version)
{switch (version) {case TPACKET_V1:return __v1_tx_kernel_ready(base);case TPACKET_V2:return __v2_tx_kernel_ready(base);case TPACKET_V3:return __v3_tx_kernel_ready(base);default:return 0;}
}static inline void __tx_user_ready(void *base, int version)
{switch (version) {case TPACKET_V1:__v1_tx_user_ready(base);break;case TPACKET_V2:__v2_tx_user_ready(base);break;case TPACKET_V3:__v3_tx_user_ready(base);break;}
}static void __v1_v2_set_packet_loss_discard(int sock)
{int ret, discard = 1;ret = setsockopt(sock, SOL_PACKET, PACKET_LOSS, (void *) &discard,sizeof(discard));if (ret == -1) {perror("setsockopt");exit(1);}
}static inline void *get_next_frame(struct ring *ring, int n)
{uint8_t *f0 = ring->rd[0].iov_base;switch (ring->version) {case TPACKET_V1:case TPACKET_V2:return ring->rd[n].iov_base;case TPACKET_V3:return f0 + (n * ring->req3.tp_frame_size);}
}static void walk_tx(int sock, struct ring *ring, char* name)
{struct pollfd pfd;int ret;size_t packet_len;union frame_map ppd;char packet[1024];unsigned int frame_num = 0, got = 0;struct sockaddr_ll ll = {.sll_family = PF_PACKET,.sll_halen = ETH_ALEN,};int nframes;/* TPACKET_V{1,2} sets up the ring->rd* related variables based* on frames (e.g., rd_num is tp_frame_nr) whereas V3 sets these* up based on blocks (e.g, rd_num is  tp_block_nr)*/if (ring->version <= TPACKET_V2)nframes = ring->rd_num;elsenframes = ring->req3.tp_frame_nr;memset(&pfd, 0, sizeof(pfd));pfd.fd = sock;pfd.events = POLLOUT | POLLERR;pfd.revents = 0;total_packets = NUM_PACKETS;create_payload(packet, &packet_len);int nic_down= 0;int err = 0;while (total_packets > 0) {if (get_status(sock, name)) {if (nic_down) {printf("sleep 5 s\n");sleep(2);nic_down = 0;//err = get_sock_err(sock);//printf("socket error is %d\n", err);}void *next = get_next_frame(ring, frame_num);if (!__tx_kernel_ready(next, ring->version) || total_packets <= 0)continue;printf("NIC is up, send one packet\n");ppd.raw = next;switch (ring->version) {case TPACKET_V1:ppd.v1->tp_h.tp_snaplen = packet_len;ppd.v1->tp_h.tp_len = packet_len;memcpy((uint8_t *) ppd.raw + TPACKET_HDRLEN -sizeof(struct sockaddr_ll), packet,packet_len);total_bytes += ppd.v1->tp_h.tp_snaplen;break;case TPACKET_V2:ppd.v2->tp_h.tp_snaplen = packet_len;ppd.v2->tp_h.tp_len = packet_len;memcpy((uint8_t *) ppd.raw + TPACKET2_HDRLEN -sizeof(struct sockaddr_ll), packet,packet_len);total_bytes += ppd.v2->tp_h.tp_snaplen;break;case TPACKET_V3: {struct tpacket3_hdr *tx = next;tx->tp_snaplen = packet_len;tx->tp_len = packet_len;tx->tp_next_offset = 0;memcpy((uint8_t *)tx + TPACKET3_HDRLEN -sizeof(struct sockaddr_ll), packet,packet_len);total_bytes += tx->tp_snaplen;break;}}status_bar_update();total_packets--;__tx_user_ready(next, ring->version);frame_num = (frame_num + 1) % nframes;ret = sendto(sock, NULL, 0, 0, NULL, 0);if (ret != -1) {printf("send success\n");}else {perror("sendto fail\n");}}else {printf("NIC is down, don't send 
packet\n");nic_down = 1;}sleep(1);poll(&pfd, 1, 1);}
}static void __v1_v2_fill(struct ring *ring, unsigned int blocks)
{ring->req.tp_block_size = getpagesize() << 2;ring->req.tp_frame_size = TPACKET_ALIGNMENT << 7;ring->req.tp_block_nr = blocks;ring->req.tp_frame_nr = ring->req.tp_block_size /ring->req.tp_frame_size *ring->req.tp_block_nr;ring->mm_len = ring->req.tp_block_size * ring->req.tp_block_nr;ring->walk = walk_tx;ring->rd_num = ring->req.tp_frame_nr;ring->flen = ring->req.tp_frame_size;
}static void setup_ring(int sock, struct ring *ring, int version, int type)
{int ret = 0;unsigned int blocks = 256;ring->type = type;ring->version = version;switch (version) {case TPACKET_V1:case TPACKET_V2:if (type == PACKET_TX_RING)__v1_v2_set_packet_loss_discard(sock);__v1_v2_fill(ring, blocks);ret = setsockopt(sock, SOL_PACKET, type, &ring->req,sizeof(ring->req));break;}if (ret == -1) {perror("setsockopt");exit(1);}ring->rd_len = ring->rd_num * sizeof(*ring->rd);ring->rd = malloc(ring->rd_len);if (ring->rd == NULL) {perror("malloc");exit(1);}total_packets = 0;total_bytes = 0;
}static void mmap_ring(int sock, struct ring *ring)
{int i;ring->mm_space = mmap(0, ring->mm_len, PROT_READ | PROT_WRITE,MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sock, 0);if (ring->mm_space == MAP_FAILED) {perror("mmap");exit(1);}memset(ring->rd, 0, ring->rd_len);for (i = 0; i < ring->rd_num; ++i) {ring->rd[i].iov_base = ring->mm_space + (i * ring->flen);ring->rd[i].iov_len = ring->flen;}
}static void bind_ring(int sock, struct ring *ring, char* name)
{int ret;ring->ll.sll_family = PF_PACKET;ring->ll.sll_protocol = htons(ETH_P_ALL);ring->ll.sll_ifindex = if_nametoindex(name);ring->ll.sll_hatype = 0;ring->ll.sll_pkttype = 0;ring->ll.sll_halen = 0;ret = bind(sock, (struct sockaddr *) &ring->ll, sizeof(ring->ll));if (ret == -1) {perror("bind");exit(1);}
}static void walk_ring(int sock, struct ring *ring, char* name)
{ring->walk(sock, ring, name);
}static void unmap_ring(int sock, struct ring *ring)
{munmap(ring->mm_space, ring->mm_len);free(ring->rd);
}static int test_tpacket(int version, int type, char* name)
{int sock;struct ring ring;sock = pfsocket(version);memset(&ring, 0, sizeof(ring));setup_ring(sock, &ring, version, type);mmap_ring(sock, &ring);bind_ring(sock, &ring, name);printf("send data to NIC %s\n", name);walk_ring(sock, &ring, name);unmap_ring(sock, &ring);close(sock);fprintf(stderr, "\n");return 0;
}int main(int argc, char* argv[])
{test_tpacket(TPACKET_V2, PACKET_TX_RING, argv[1]);return 0;
}

也可参考:af_packet 网卡down/up导致丢包 - 简书 (jianshu.com) 


本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!

相关文章

立即
投稿

微信公众账号

微信扫一扫加关注

返回
顶部