- From: Felix Fietkau <[email protected]>
- Date: Tue, 23 Apr 2024 11:23:03 +0200
- Subject: [PATCH] net: add TCP fraglist GRO support
- When forwarding TCP after GRO, software segmentation is very expensive,
- especially when the checksum needs to be recalculated.
- One case where that's currently unavoidable is when routing packets over
- PPPoE. Performance improves significantly when using fraglist GRO
- implemented in the same way as for UDP.
- Here's a measurement of running 2 TCP streams through a MediaTek MT7622
- device (2-core Cortex-A53) that runs NAT with flow offload enabled from
- one ethernet port to PPPoE on another ethernet port, with a cake qdisc
- set to 1Gbps:
- rx-gro-list off: 630 Mbit/s, CPU 35% idle
- rx-gro-list on: 770 Mbit/s, CPU 40% idle
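- For reference, fraglist GRO is exposed as a per-device feature flag and
- can be toggled at runtime, e.g. with "ethtool -K <dev> rx-gro-list on"
- (the device needs to advertise NETIF_F_GRO_FRAGLIST).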
- Signed-off-by: Felix Fietkau <[email protected]>
- ---
- --- a/include/net/gro.h
- +++ b/include/net/gro.h
- @@ -439,6 +439,7 @@ static inline __wsum ip6_gro_compute_pse
- }
-
- int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
- +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);
-
- /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
- static inline void gro_normal_list(struct napi_struct *napi)
- --- a/include/net/tcp.h
- +++ b/include/net/tcp.h
- @@ -2084,7 +2084,10 @@ void tcp_v4_destroy_sock(struct sock *sk
-
- struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
- netdev_features_t features);
- -struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
- +struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb);
- +struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th);
- +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
- + struct tcphdr *th);
- INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
- INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
- INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
- --- a/net/core/gro.c
- +++ b/net/core/gro.c
- @@ -233,6 +233,33 @@ done:
- return 0;
- }
-
- +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
- +{
- + if (unlikely(p->len + skb->len >= 65536))
- + return -E2BIG;
- +
- + if (NAPI_GRO_CB(p)->last == p)
- + skb_shinfo(p)->frag_list = skb;
- + else
- + NAPI_GRO_CB(p)->last->next = skb;
- +
- + skb_pull(skb, skb_gro_offset(skb));
- +
- + NAPI_GRO_CB(p)->last = skb;
- + NAPI_GRO_CB(p)->count++;
- + p->data_len += skb->len;
- +
- + /* sk ownership - if any - completely transferred to the aggregated packet */
- + skb->destructor = NULL;
- + skb->sk = NULL;
- + p->truesize += skb->truesize;
- + p->len += skb->len;
- +
- + NAPI_GRO_CB(skb)->same_flow = 1;
- +
- + return 0;
- +}
- +
-
- static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
- {
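- (For context: skb_gro_receive_list(), moved here from net/ipv4/udp_offload.c
- further down in this patch, chains each matching segment onto the head skb's
- frag_list and only updates the aggregate counters; payloads are never copied
- or checksummed during aggregation. A minimal userspace model of that
- bookkeeping, with a hypothetical struct pkt standing in for sk_buff and
- pkt_chain() for skb_gro_receive_list(); illustrative only, not kernel code:)
- 
- #include <stdio.h>
- 
- struct pkt {
-         struct pkt *frag_list;  /* head of the chained segment list */
-         struct pkt *last;       /* tail of the chain; the head itself when empty */
-         struct pkt *next;       /* link between chained segments */
-         unsigned int len;       /* aggregate payload length */
-         unsigned int count;     /* number of aggregated segments */
- };
- 
- static int pkt_chain(struct pkt *p, struct pkt *skb)
- {
-         /* same total-length cap as the kernel code (IPv4 total length field) */
-         if (p->len + skb->len >= 65536)
-                 return -1;
- 
-         if (p->last == p)       /* first segment opens the frag list */
-                 p->frag_list = skb;
-         else
-                 p->last->next = skb;
- 
-         p->last = skb;
-         p->count++;
-         p->len += skb->len;
-         return 0;
- }
- 
- int main(void)
- {
-         struct pkt head = { .last = &head, .len = 1448, .count = 1 };
-         struct pkt s1 = { .len = 1448 }, s2 = { .len = 1448 };
- 
-         pkt_chain(&head, &s1);
-         pkt_chain(&head, &s2);
-         printf("len=%u count=%u\n", head.len, head.count); /* len=4344 count=3 */
-         return 0;
- }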
- --- a/net/ipv4/tcp_offload.c
- +++ b/net/ipv4/tcp_offload.c
- @@ -28,6 +28,70 @@ static void tcp_gso_tstamp(struct sk_buf
- }
- }
-
- +static void __tcpv4_gso_segment_csum(struct sk_buff *seg,
- + __be32 *oldip, __be32 newip,
- + __be16 *oldport, __be16 newport)
- +{
- + struct tcphdr *th;
- + struct iphdr *iph;
- +
- + if (*oldip == newip && *oldport == newport)
- + return;
- +
- + th = tcp_hdr(seg);
- + iph = ip_hdr(seg);
- +
- + inet_proto_csum_replace4(&th->check, seg, *oldip, newip, true);
- + inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false);
- + *oldport = newport;
- +
- + csum_replace4(&iph->check, *oldip, newip);
- + *oldip = newip;
- +}
- +
- +static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs)
- +{
- + const struct tcphdr *th;
- + const struct iphdr *iph;
- + struct sk_buff *seg;
- + struct tcphdr *th2;
- + struct iphdr *iph2;
- +
- + seg = segs;
- + th = tcp_hdr(seg);
- + iph = ip_hdr(seg);
- + th2 = tcp_hdr(seg->next);
- + iph2 = ip_hdr(seg->next);
- +
- + if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) &&
- + iph->daddr == iph2->daddr && iph->saddr == iph2->saddr)
- + return segs;
- +
- + while ((seg = seg->next)) {
- + th2 = tcp_hdr(seg);
- + iph2 = ip_hdr(seg);
- +
- + __tcpv4_gso_segment_csum(seg,
- + &iph2->saddr, iph->saddr,
- + &th2->source, th->source);
- + __tcpv4_gso_segment_csum(seg,
- + &iph2->daddr, iph->daddr,
- + &th2->dest, th->dest);
- + }
- +
- + return segs;
- +}
- +
- +static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb,
- + netdev_features_t features)
- +{
- + skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
- + if (IS_ERR(skb))
- + return skb;
- +
- + return __tcpv4_gso_segment_list_csum(skb);
- +}
- +
- static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
- {
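- (For context: the helpers above never recompute the TCP checksum over the
- payload. inet_proto_csum_replace4()/csum_replace4() apply the incremental
- update from RFC 1624, ~C' = ~C + ~m + m', which is what keeps fraglist
- segmentation cheap even when NAT has rewritten addresses or ports. A minimal
- userspace sketch of that arithmetic; all names below are made up, not kernel
- APIs. Both printed values match:)
- 
- #include <stdint.h>
- #include <stdio.h>
- 
- /* Fold a wide accumulator down to a 16-bit ones'-complement sum. */
- static uint16_t csum_fold(uint64_t sum)
- {
-         while (sum >> 16)
-                 sum = (sum & 0xffff) + (sum >> 16);
-         return (uint16_t)sum;
- }
- 
- /* Full checksum over a buffer of 16-bit words, for verification. */
- static uint16_t csum_full(const uint16_t *p, size_t words)
- {
-         uint64_t sum = 0;
- 
-         while (words--)
-                 sum += *p++;
-         return (uint16_t)~csum_fold(sum);
- }
- 
- /* Incremental update when one word changes: ~C' = ~C + ~m + m'. */
- static uint16_t csum_update(uint16_t check, uint16_t old16, uint16_t new16)
- {
-         uint64_t sum = (uint16_t)~check;
- 
-         sum += (uint16_t)~old16;
-         sum += new16;
-         return (uint16_t)~csum_fold(sum);
- }
- 
- int main(void)
- {
-         uint16_t buf[4] = { 0x1234, 0xabcd, 0x0001, 0xffff };
-         uint16_t check = csum_full(buf, 4);
-         uint16_t old16 = buf[2];
- 
-         buf[2] = 0x4242;        /* e.g. a NATed port */
-         check = csum_update(check, old16, buf[2]);
- 
-         printf("incremental 0x%04x full 0x%04x\n", check, csum_full(buf, 4));
-         return 0;
- }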
- @@ -37,6 +101,9 @@ static struct sk_buff *tcp4_gso_segment(
- if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
- return ERR_PTR(-EINVAL);
-
- + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
- + return __tcp4_gso_segment_list(skb, features);
- +
- if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
- const struct iphdr *iph = ip_hdr(skb);
- struct tcphdr *th = tcp_hdr(skb);
- @@ -181,61 +248,76 @@ out:
- return segs;
- }
-
- -struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
- +struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
- {
- - struct sk_buff *pp = NULL;
- + struct tcphdr *th2;
- struct sk_buff *p;
- +
- + list_for_each_entry(p, head, list) {
- + if (!NAPI_GRO_CB(p)->same_flow)
- + continue;
- +
- + th2 = tcp_hdr(p);
- + if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
- + NAPI_GRO_CB(p)->same_flow = 0;
- + continue;
- + }
- +
- + return p;
- + }
- +
- + return NULL;
- +}
- +
- +struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
- +{
- + unsigned int thlen, hlen, off;
- struct tcphdr *th;
- - struct tcphdr *th2;
- - unsigned int len;
- - unsigned int thlen;
- - __be32 flags;
- - unsigned int mss = 1;
- - unsigned int hlen;
- - unsigned int off;
- - int flush = 1;
- - int i;
-
- off = skb_gro_offset(skb);
- hlen = off + sizeof(*th);
- th = skb_gro_header(skb, hlen, off);
- if (unlikely(!th))
- - goto out;
- + return NULL;
-
- thlen = th->doff * 4;
- if (thlen < sizeof(*th))
- - goto out;
- + return NULL;
-
- hlen = off + thlen;
- if (skb_gro_header_hard(skb, hlen)) {
- th = skb_gro_header_slow(skb, hlen, off);
- if (unlikely(!th))
- - goto out;
- + return NULL;
- }
-
- skb_gro_pull(skb, thlen);
-
- - len = skb_gro_len(skb);
- - flags = tcp_flag_word(th);
- -
- - list_for_each_entry(p, head, list) {
- - if (!NAPI_GRO_CB(p)->same_flow)
- - continue;
- + return th;
- +}
-
- - th2 = tcp_hdr(p);
- +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
- + struct tcphdr *th)
- +{
- + unsigned int thlen = th->doff * 4;
- + struct sk_buff *pp = NULL;
- + struct sk_buff *p;
- + struct tcphdr *th2;
- + unsigned int len;
- + __be32 flags;
- + unsigned int mss = 1;
- + int flush = 1;
- + int i;
-
- - if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
- - NAPI_GRO_CB(p)->same_flow = 0;
- - continue;
- - }
- + len = skb_gro_len(skb);
- + flags = tcp_flag_word(th);
-
- - goto found;
- - }
- - p = NULL;
- - goto out_check_final;
- + p = tcp_gro_lookup(head, th);
- + if (!p)
- + goto out_check_final;
-
- -found:
- /* Include the IP ID check below from the inner most IP hdr */
- + th2 = tcp_hdr(p);
- flush = NAPI_GRO_CB(p)->flush;
- flush |= (__force int)(flags & TCP_FLAG_CWR);
- flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
- @@ -272,6 +354,19 @@ found:
- flush |= p->decrypted ^ skb->decrypted;
- #endif
-
- + if (unlikely(NAPI_GRO_CB(p)->is_flist)) {
- + flush |= (__force int)(flags ^ tcp_flag_word(th2));
- + flush |= skb->ip_summed != p->ip_summed;
- + flush |= skb->csum_level != p->csum_level;
- + flush |= !pskb_may_pull(skb, skb_gro_offset(skb));
- + flush |= NAPI_GRO_CB(p)->count >= 64;
- +
- + if (flush || skb_gro_receive_list(p, skb))
- + mss = 1;
- +
- + goto out_check_final;
- + }
- +
- if (flush || skb_gro_receive(p, skb)) {
- mss = 1;
- goto out_check_final;
- @@ -293,7 +388,6 @@ out_check_final:
- if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
- pp = p;
-
- -out:
- NAPI_GRO_CB(skb)->flush |= (flush != 0);
-
- return pp;
- @@ -317,18 +411,58 @@ void tcp_gro_complete(struct sk_buff *sk
- }
- EXPORT_SYMBOL(tcp_gro_complete);
-
- +static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
- + struct tcphdr *th)
- +{
- + const struct iphdr *iph;
- + struct sk_buff *p;
- + struct sock *sk;
- + struct net *net;
- + int iif, sdif;
- +
- + if (!(skb->dev->features & NETIF_F_GRO_FRAGLIST))
- + return;
- +
- + p = tcp_gro_lookup(head, th);
- + if (p) {
- + NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
- + return;
- + }
- +
- + inet_get_iif_sdif(skb, &iif, &sdif);
- + iph = skb_gro_network_header(skb);
- + net = dev_net(skb->dev);
- + sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- + iph->saddr, th->source,
- + iph->daddr, ntohs(th->dest),
- + iif, sdif);
- + NAPI_GRO_CB(skb)->is_flist = !sk;
- + if (sk)
- + sock_put(sk);
- +}
- +
- INDIRECT_CALLABLE_SCOPE
- struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
- {
- + struct tcphdr *th;
- +
- /* Don't bother verifying checksum if we're going to flush anyway. */
- if (!NAPI_GRO_CB(skb)->flush &&
- skb_gro_checksum_validate(skb, IPPROTO_TCP,
- - inet_gro_compute_pseudo)) {
- - NAPI_GRO_CB(skb)->flush = 1;
- - return NULL;
- - }
- + inet_gro_compute_pseudo))
- + goto flush;
- +
- + th = tcp_gro_pull_header(skb);
- + if (!th)
- + goto flush;
-
- - return tcp_gro_receive(head, skb);
- + tcp4_check_fraglist_gro(head, skb, th);
- +
- + return tcp_gro_receive(head, skb, th);
- +
- +flush:
- + NAPI_GRO_CB(skb)->flush = 1;
- + return NULL;
- }
-
- INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
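- (Note on the lookup above: tcp4_check_fraglist_gro() marks a flow for
- fraglist GRO only when no established local socket matches, i.e. when the
- packet is presumably going to be forwarded. Locally terminated flows keep
- the regular GRO path, since they normally never need to be re-segmented.)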
- @@ -336,6 +470,15 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_com
- const struct iphdr *iph = ip_hdr(skb);
- struct tcphdr *th = tcp_hdr(skb);
-
- + if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
- + skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4;
- + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
- +
- + __skb_incr_checksum_unnecessary(skb);
- +
- + return 0;
- + }
- +
- th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
- iph->daddr, 0);
- skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
- --- a/net/ipv4/udp_offload.c
- +++ b/net/ipv4/udp_offload.c
- @@ -451,33 +451,6 @@ out:
- return segs;
- }
-
- -static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
- -{
- - if (unlikely(p->len + skb->len >= 65536))
- - return -E2BIG;
- -
- - if (NAPI_GRO_CB(p)->last == p)
- - skb_shinfo(p)->frag_list = skb;
- - else
- - NAPI_GRO_CB(p)->last->next = skb;
- -
- - skb_pull(skb, skb_gro_offset(skb));
- -
- - NAPI_GRO_CB(p)->last = skb;
- - NAPI_GRO_CB(p)->count++;
- - p->data_len += skb->len;
- -
- - /* sk ownership - if any - completely transferred to the aggregated packet */
- - skb->destructor = NULL;
- - skb->sk = NULL;
- - p->truesize += skb->truesize;
- - p->len += skb->len;
- -
- - NAPI_GRO_CB(skb)->same_flow = 1;
- -
- - return 0;
- -}
- -
-
- #define UDP_GRO_CNT_MAX 64
- static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
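- (The function removed here is the same code that was moved to net/core/gro.c
- earlier in this patch, so the UDP and TCP fraglist paths now share a single
- implementation of skb_gro_receive_list().)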
- --- a/net/ipv6/tcpv6_offload.c
- +++ b/net/ipv6/tcpv6_offload.c
- @@ -7,24 +7,67 @@
- */
- #include <linux/indirect_call_wrapper.h>
- #include <linux/skbuff.h>
- +#include <net/inet6_hashtables.h>
- #include <net/gro.h>
- #include <net/protocol.h>
- #include <net/tcp.h>
- #include <net/ip6_checksum.h>
- #include "ip6_offload.h"
-
- +static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
- + struct tcphdr *th)
- +{
- +#if IS_ENABLED(CONFIG_IPV6)
- + const struct ipv6hdr *hdr;
- + struct sk_buff *p;
- + struct sock *sk;
- + struct net *net;
- + int iif, sdif;
- +
- + if (!(skb->dev->features & NETIF_F_GRO_FRAGLIST))
- + return;
- +
- + p = tcp_gro_lookup(head, th);
- + if (p) {
- + NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
- + return;
- + }
- +
- + inet6_get_iif_sdif(skb, &iif, &sdif);
- + hdr = skb_gro_network_header(skb);
- + net = dev_net(skb->dev);
- + sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- + &hdr->saddr, th->source,
- + &hdr->daddr, ntohs(th->dest),
- + iif, sdif);
- + NAPI_GRO_CB(skb)->is_flist = !sk;
- + if (sk)
- + sock_put(sk);
- +#endif /* IS_ENABLED(CONFIG_IPV6) */
- +}
- +
- INDIRECT_CALLABLE_SCOPE
- struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
- {
- + struct tcphdr *th;
- +
- /* Don't bother verifying checksum if we're going to flush anyway. */
- if (!NAPI_GRO_CB(skb)->flush &&
- skb_gro_checksum_validate(skb, IPPROTO_TCP,
- - ip6_gro_compute_pseudo)) {
- - NAPI_GRO_CB(skb)->flush = 1;
- - return NULL;
- - }
- + ip6_gro_compute_pseudo))
- + goto flush;
-
- - return tcp_gro_receive(head, skb);
- + th = tcp_gro_pull_header(skb);
- + if (!th)
- + goto flush;
- +
- + tcp6_check_fraglist_gro(head, skb, th);
- +
- + return tcp_gro_receive(head, skb, th);
- +
- +flush:
- + NAPI_GRO_CB(skb)->flush = 1;
- + return NULL;
- }
-
- INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff)
- @@ -32,6 +75,15 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_com
- const struct ipv6hdr *iph = ipv6_hdr(skb);
- struct tcphdr *th = tcp_hdr(skb);
-
- + if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
- + skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV6;
- + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
- +
- + __skb_incr_checksum_unnecessary(skb);
- +
- + return 0;
- + }
- +
- th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr,
- &iph->daddr, 0);
- skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
- @@ -40,6 +92,61 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_com
- return 0;
- }
-
- +static void __tcpv6_gso_segment_csum(struct sk_buff *seg,
- + __be16 *oldport, __be16 newport)
- +{
- + struct tcphdr *th;
- +
- + if (*oldport == newport)
- + return;
- +
- + th = tcp_hdr(seg);
- + inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false);
- + *oldport = newport;
- +}
- +
- +static struct sk_buff *__tcpv6_gso_segment_list_csum(struct sk_buff *segs)
- +{
- + const struct tcphdr *th;
- + const struct ipv6hdr *iph;
- + struct sk_buff *seg;
- + struct tcphdr *th2;
- + struct ipv6hdr *iph2;
- +
- + seg = segs;
- + th = tcp_hdr(seg);
- + iph = ipv6_hdr(seg);
- + th2 = tcp_hdr(seg->next);
- + iph2 = ipv6_hdr(seg->next);
- +
- + if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) &&
- + ipv6_addr_equal(&iph->saddr, &iph2->saddr) &&
- + ipv6_addr_equal(&iph->daddr, &iph2->daddr))
- + return segs;
- +
- + while ((seg = seg->next)) {
- + th2 = tcp_hdr(seg);
- + iph2 = ipv6_hdr(seg);
- +
- + iph2->saddr = iph->saddr;
- + iph2->daddr = iph->daddr;
- + __tcpv6_gso_segment_csum(seg, &th2->source, th->source);
- + __tcpv6_gso_segment_csum(seg, &th2->dest, th->dest);
- + }
- +
- + return segs;
- +}
- +
- +static struct sk_buff *__tcp6_gso_segment_list(struct sk_buff *skb,
- + netdev_features_t features)
- +{
- + skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
- + if (IS_ERR(skb))
- + return skb;
- +
- + return __tcpv6_gso_segment_list_csum(skb);
- +}
- +
- static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
- {
- @@ -51,6 +158,9 @@ static struct sk_buff *tcp6_gso_segment(
- if (!pskb_may_pull(skb, sizeof(*th)))
- return ERR_PTR(-EINVAL);
-
- + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
- + return __tcp6_gso_segment_list(skb, features);
- +
- if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
- const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
- struct tcphdr *th = tcp_hdr(skb);
|