123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152 |
- From 3a1cc23a75abcd9cea585eb84846507363d58397 Mon Sep 17 00:00:00 2001
- From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <[email protected]>
- Date: Tue, 25 Oct 2022 15:22:45 +0200
- Subject: [PATCH] net: broadcom: bcm4908_enet: use build_skb()
- MIME-Version: 1.0
- Content-Type: text/plain; charset=UTF-8
- Content-Transfer-Encoding: 8bit
- RX code can be more efficient with build_skb(). Allocating the actual
- SKB around the eth packet buffer - right before passing it up - results in
- better cache usage.
- Without RPS (echo 0 > rps_cpus) BCM4908 NAT masq performance "jumps"
- between two speeds: ~900 Mbps and 940 Mbps (it's a 4 CPUs SoC). This
- change bumps the lower speed from 905 Mb/s to 918 Mb/s (tested using
- single stream iperf 2.0.5 traffic).
- There are more optimizations to consider. One obvious to try is GRO
- however as BCM4908 doesn't do hw csum it may actually lower performance.
- Sometimes. Some early testing:
- ┌─────────────────────────────────┬─────────────────────┬────────────────────┐
- │ │ netif_receive_skb() │ napi_gro_receive() │
- ├─────────────────────────────────┼─────────────────────┼────────────────────┤
- │ netdev_alloc_skb() │ 905 Mb/s │ 892 Mb/s │
- │ napi_alloc_frag() + build_skb() │ 918 Mb/s │ 917 Mb/s │
- └─────────────────────────────────┴─────────────────────┴────────────────────┘
- Other ideas:
- 1. napi_build_skb()
- 2. skb_copy_from_linear_data() for small packets
- Those need proper testing first though. That can be done later.
- Signed-off-by: Rafał Miłecki <[email protected]>
- Link: https://lore.kernel.org/r/[email protected]
- Signed-off-by: Paolo Abeni <[email protected]>
- ---
- drivers/net/ethernet/broadcom/bcm4908_enet.c | 53 +++++++++++++-------
- 1 file changed, 36 insertions(+), 17 deletions(-)
- --- a/drivers/net/ethernet/broadcom/bcm4908_enet.c
- +++ b/drivers/net/ethernet/broadcom/bcm4908_enet.c
- @@ -36,13 +36,24 @@
- #define ENET_MAX_ETH_OVERHEAD (ETH_HLEN + BRCM_MAX_TAG_LEN + VLAN_HLEN + \
- ETH_FCS_LEN + 4) /* 32 */
-
- +#define ENET_RX_SKB_BUF_SIZE (NET_SKB_PAD + NET_IP_ALIGN + \
- + ETH_HLEN + BRCM_MAX_TAG_LEN + VLAN_HLEN + \
- + ENET_MTU_MAX + ETH_FCS_LEN + 4)
- +#define ENET_RX_SKB_BUF_ALLOC_SIZE (SKB_DATA_ALIGN(ENET_RX_SKB_BUF_SIZE) + \
- + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
- +#define ENET_RX_BUF_DMA_OFFSET (NET_SKB_PAD + NET_IP_ALIGN)
- +#define ENET_RX_BUF_DMA_SIZE (ENET_RX_SKB_BUF_SIZE - ENET_RX_BUF_DMA_OFFSET)
- +
- struct bcm4908_enet_dma_ring_bd {
- __le32 ctl;
- __le32 addr;
- } __packed;
-
- struct bcm4908_enet_dma_ring_slot {
- - struct sk_buff *skb;
- + union {
- + void *buf; /* RX */
- + struct sk_buff *skb; /* TX */
- + };
- unsigned int len;
- dma_addr_t dma_addr;
- };
- @@ -260,22 +271,21 @@ static int bcm4908_enet_dma_alloc_rx_buf
- u32 tmp;
- int err;
-
- - slot->len = ENET_MTU_MAX + ENET_MAX_ETH_OVERHEAD;
- -
- - slot->skb = netdev_alloc_skb(enet->netdev, slot->len);
- - if (!slot->skb)
- + slot->buf = napi_alloc_frag(ENET_RX_SKB_BUF_ALLOC_SIZE);
- + if (!slot->buf)
- return -ENOMEM;
-
- - slot->dma_addr = dma_map_single(dev, slot->skb->data, slot->len, DMA_FROM_DEVICE);
- + slot->dma_addr = dma_map_single(dev, slot->buf + ENET_RX_BUF_DMA_OFFSET,
- + ENET_RX_BUF_DMA_SIZE, DMA_FROM_DEVICE);
- err = dma_mapping_error(dev, slot->dma_addr);
- if (err) {
- dev_err(dev, "Failed to map DMA buffer: %d\n", err);
- - kfree_skb(slot->skb);
- - slot->skb = NULL;
- + skb_free_frag(slot->buf);
- + slot->buf = NULL;
- return err;
- }
-
- - tmp = slot->len << DMA_CTL_LEN_DESC_BUFLENGTH_SHIFT;
- + tmp = ENET_RX_BUF_DMA_SIZE << DMA_CTL_LEN_DESC_BUFLENGTH_SHIFT;
- tmp |= DMA_CTL_STATUS_OWN;
- if (idx == enet->rx_ring.length - 1)
- tmp |= DMA_CTL_STATUS_WRAP;
- @@ -315,11 +325,11 @@ static void bcm4908_enet_dma_uninit(stru
-
- for (i = rx_ring->length - 1; i >= 0; i--) {
- slot = &rx_ring->slots[i];
- - if (!slot->skb)
- + if (!slot->buf)
- continue;
- dma_unmap_single(dev, slot->dma_addr, slot->len, DMA_FROM_DEVICE);
- - kfree_skb(slot->skb);
- - slot->skb = NULL;
- + skb_free_frag(slot->buf);
- + slot->buf = NULL;
- }
- }
-
- @@ -575,6 +585,7 @@ static int bcm4908_enet_poll_rx(struct n
- while (handled < weight) {
- struct bcm4908_enet_dma_ring_bd *buf_desc;
- struct bcm4908_enet_dma_ring_slot slot;
- + struct sk_buff *skb;
- u32 ctl;
- int len;
- int err;
- @@ -598,16 +609,24 @@ static int bcm4908_enet_poll_rx(struct n
-
- if (len < ETH_ZLEN ||
- (ctl & (DMA_CTL_STATUS_SOP | DMA_CTL_STATUS_EOP)) != (DMA_CTL_STATUS_SOP | DMA_CTL_STATUS_EOP)) {
- - kfree_skb(slot.skb);
- + skb_free_frag(slot.buf);
- enet->netdev->stats.rx_dropped++;
- break;
- }
-
- - dma_unmap_single(dev, slot.dma_addr, slot.len, DMA_FROM_DEVICE);
- + dma_unmap_single(dev, slot.dma_addr, ENET_RX_BUF_DMA_SIZE, DMA_FROM_DEVICE);
- +
- + skb = build_skb(slot.buf, ENET_RX_SKB_BUF_ALLOC_SIZE);
- + if (unlikely(!skb)) {
- + skb_free_frag(slot.buf);
- + enet->netdev->stats.rx_dropped++;
- + break;
- + }
- + skb_reserve(skb, ENET_RX_BUF_DMA_OFFSET);
- + skb_put(skb, len - ETH_FCS_LEN);
- + skb->protocol = eth_type_trans(skb, enet->netdev);
-
- - skb_put(slot.skb, len - ETH_FCS_LEN);
- - slot.skb->protocol = eth_type_trans(slot.skb, enet->netdev);
- - netif_receive_skb(slot.skb);
- + netif_receive_skb(skb);
-
- enet->netdev->stats.rx_packets++;
- enet->netdev->stats.rx_bytes += len;
|