Browse Source

cherry-pick vhost perf regression and mem-leak fix

Fabian Grünbichler 8 years ago
parent
commit
ddad99c986

+ 72 - 0
patches/kernel/0008-vhost-fix-skb-leak-in-handle_rx.patch

@@ -0,0 +1,72 @@
+From 15bea51dda49a0403a9a84fb26f3376636d2a3c2 Mon Sep 17 00:00:00 2001
+From: Wei Xu <[email protected]>
+Date: Fri, 1 Dec 2017 05:10:36 -0500
+Subject: [PATCH 08/13] vhost: fix skb leak in handle_rx()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Matthew found a roughly 40% tcp throughput regression with commit
+c67df11f(vhost_net: try batch dequing from skb array) as discussed
+in the following thread:
+https://www.mail-archive.com/[email protected]/msg187936.html
+
+Eventually we figured out that it was a skb leak in handle_rx()
+when sending packets to the VM. This usually happens when a guest
+can not drain out vq as fast as vhost fills in, afterwards it sets
+off the traffic jam and leaks skb(s) which occurs as no headcount
+to send on the vq from vhost side.
+
+This can be avoided by making sure we have got enough headcount
+before actually consuming a skb from the batched rx array while
+transmitting, which is simply done by moving checking the zero
+headcount a bit ahead.
+
+Signed-off-by: Wei Xu <[email protected]>
+Reported-by: Matthew Rosato <[email protected]>
+Signed-off-by: Fabian Grünbichler <[email protected]>
+---
+ drivers/vhost/net.c | 20 ++++++++++----------
+ 1 file changed, 10 insertions(+), 10 deletions(-)
+
+diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
+index 1c75572f5a3f..010253847022 100644
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -781,16 +781,6 @@ static void handle_rx(struct vhost_net *net)
+ 		/* On error, stop handling until the next kick. */
+ 		if (unlikely(headcount < 0))
+ 			goto out;
+-		if (nvq->rx_array)
+-			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
+-		/* On overrun, truncate and discard */
+-		if (unlikely(headcount > UIO_MAXIOV)) {
+-			iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
+-			err = sock->ops->recvmsg(sock, &msg,
+-						 1, MSG_DONTWAIT | MSG_TRUNC);
+-			pr_debug("Discarded rx packet: len %zd\n", sock_len);
+-			continue;
+-		}
+ 		/* OK, now we need to know about added descriptors. */
+ 		if (!headcount) {
+ 			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+@@ -803,6 +793,16 @@ static void handle_rx(struct vhost_net *net)
+ 			 * they refilled. */
+ 			goto out;
+ 		}
++		if (nvq->rx_array)
++			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
++		/* On overrun, truncate and discard */
++		if (unlikely(headcount > UIO_MAXIOV)) {
++			iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
++			err = sock->ops->recvmsg(sock, &msg,
++						 1, MSG_DONTWAIT | MSG_TRUNC);
++			pr_debug("Discarded rx packet: len %zd\n", sock_len);
++			continue;
++		}
+ 		/* We don't need to be notified again. */
+ 		iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
+ 		fixup = msg.msg_iter;
+-- 
+2.14.2
+

+ 86 - 0
patches/kernel/0009-tun-free-skb-in-early-errors.patch

@@ -0,0 +1,86 @@
+From ccc0b8662620d562798183b77bcd30125d2d14f3 Mon Sep 17 00:00:00 2001
+From: Wei Xu <[email protected]>
+Date: Fri, 1 Dec 2017 05:10:37 -0500
+Subject: [PATCH 09/13] tun: free skb in early errors
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+tun_recvmsg() supports accepting skb by msg_control after
+commit ac77cfd4258f ("tun: support receiving skb through msg_control"),
+the skb if presented should be freed no matter how far it can go
+along, otherwise it would be leaked.
+
+This patch fixes several missed cases.
+
+Signed-off-by: Wei Xu <[email protected]>
+Reported-by: Matthew Rosato <[email protected]>
+Signed-off-by: Fabian Grünbichler <[email protected]>
+---
+ drivers/net/tun.c | 24 ++++++++++++++++++------
+ 1 file changed, 18 insertions(+), 6 deletions(-)
+
+diff --git a/drivers/net/tun.c b/drivers/net/tun.c
+index cb1f7747adad..5143e948d7d1 100644
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -1519,8 +1519,11 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
+ 
+ 	tun_debug(KERN_INFO, tun, "tun_do_read\n");
+ 
+-	if (!iov_iter_count(to))
++	if (!iov_iter_count(to)) {
++		if (skb)
++			kfree_skb(skb);
+ 		return 0;
++	}
+ 
+ 	if (!skb) {
+ 		/* Read frames from ring */
+@@ -1636,22 +1639,24 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
+ {
+ 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
+ 	struct tun_struct *tun = __tun_get(tfile);
++	struct sk_buff *skb = m->msg_control;
+ 	int ret;
+ 
+-	if (!tun)
+-		return -EBADFD;
++	if (!tun) {
++		ret = -EBADFD;
++		goto out_free_skb;
++	}
+ 
+ 	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
+ 		ret = -EINVAL;
+-		goto out;
++		goto out_put_tun;
+ 	}
+ 	if (flags & MSG_ERRQUEUE) {
+ 		ret = sock_recv_errqueue(sock->sk, m, total_len,
+ 					 SOL_PACKET, TUN_TX_TIMESTAMP);
+ 		goto out;
+ 	}
+-	ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT,
+-			  m->msg_control);
++	ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, skb);
+ 	if (ret > (ssize_t)total_len) {
+ 		m->msg_flags |= MSG_TRUNC;
+ 		ret = flags & MSG_TRUNC ? ret : total_len;
+@@ -1659,6 +1664,13 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
+ out:
+ 	tun_put(tun);
+ 	return ret;
++
++out_put_tun:
++	tun_put(tun);
++out_free_skb:
++	if (skb)
++		kfree_skb(skb);
++	return ret;
+ }
+ 
+ static int tun_peek_len(struct socket *sock)
+-- 
+2.14.2
+

+ 58 - 0
patches/kernel/0010-tap-free-skb-if-flags-error.patch

@@ -0,0 +1,58 @@
+From 68783aa6989756cda8e9e305292afbb9f4f5677c Mon Sep 17 00:00:00 2001
+From: Wei Xu <[email protected]>
+Date: Fri, 1 Dec 2017 05:10:38 -0500
+Subject: [PATCH 10/13] tap: free skb if flags error
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+tap_recvmsg() supports accepting skb by msg_control after
+commit 3b4ba04acca8 ("tap: support receiving skb from msg_control"),
+the skb if presented should be freed within the function, otherwise
+it would be leaked.
+
+Signed-off-by: Wei Xu <[email protected]>
+Reported-by: Matthew Rosato <[email protected]>
+Signed-off-by: Fabian Grünbichler <[email protected]>
+---
+ drivers/net/tap.c | 14 ++++++++++----
+ 1 file changed, 10 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/net/tap.c b/drivers/net/tap.c
+index 3570c7576993..4e04b6094f3c 100644
+--- a/drivers/net/tap.c
++++ b/drivers/net/tap.c
+@@ -829,8 +829,11 @@ static ssize_t tap_do_read(struct tap_queue *q,
+ 	DEFINE_WAIT(wait);
+ 	ssize_t ret = 0;
+ 
+-	if (!iov_iter_count(to))
++	if (!iov_iter_count(to)) {
++		if (skb)
++			kfree_skb(skb);
+ 		return 0;
++	}
+ 
+ 	if (skb)
+ 		goto put;
+@@ -1155,11 +1158,14 @@ static int tap_recvmsg(struct socket *sock, struct msghdr *m,
+ 		       size_t total_len, int flags)
+ {
+ 	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
++	struct sk_buff *skb = m->msg_control;
+ 	int ret;
+-	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
++	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) {
++		if (skb)
++			kfree_skb(skb);
+ 		return -EINVAL;
+-	ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT,
+-			  m->msg_control);
++	}
++	ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, skb);
+ 	if (ret > total_len) {
+ 		m->msg_flags |= MSG_TRUNC;
+ 		ret = flags & MSG_TRUNC ? ret : total_len;
+-- 
+2.14.2
+