- From: Felix Fietkau <[email protected]>
- Date: Thu, 16 Feb 2023 18:39:04 +0100
- Subject: [PATCH] net/core: add optional threading for backlog processing
- When dealing with few flows or an imbalance in CPU utilization, static RPS
- CPU assignment can be too inflexible. Add support for enabling threaded NAPI
- for backlog processing in order to allow the scheduler to balance the work,
- which helps spread the load across otherwise idle CPUs.
- Signed-off-by: Felix Fietkau <[email protected]>
- ---
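
With this applied, the knob is exposed as the `net.core.backlog_threaded` sysctl (registered in the sysctl_net_core.c hunk below). The following is a minimal userspace sketch for flipping it at runtime, equivalent to `sysctl -w net.core.backlog_threaded=1`; the helper name is illustrative only and not part of the patch:

```c
/*
 * Illustrative helper, not part of the patch: toggle the
 * net.core.backlog_threaded sysctl added by the sysctl_net_core.c
 * hunk below. Equivalent to `sysctl -w net.core.backlog_threaded=1`.
 */
#include <stdio.h>

static int set_backlog_threaded(int enable)
{
	FILE *f = fopen("/proc/sys/net/core/backlog_threaded", "w");

	if (!f) {
		perror("backlog_threaded");
		return -1;
	}
	fprintf(f, "%d\n", enable);
	return fclose(f);
}

int main(void)
{
	return set_backlog_threaded(1) ? 1 : 0;
}
```

Writing 0 clears NAPIF_STATE_THREADED on every CPU's backlog NAPI and falls back to softirq processing; the `napi/backlog-<cpu>` kthreads created on first enable are kept around rather than stopped.
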
- --- a/include/linux/netdevice.h
- +++ b/include/linux/netdevice.h
- @@ -520,6 +520,7 @@ static inline bool napi_complete(struct
- }
-
- int dev_set_threaded(struct net_device *dev, bool threaded);
- +int backlog_set_threaded(bool threaded);
-
- /**
- * napi_disable - prevent NAPI from scheduling
- @@ -3128,6 +3129,7 @@ struct softnet_data {
- unsigned int processed;
- unsigned int time_squeeze;
- unsigned int received_rps;
- + unsigned int process_queue_empty;
- #ifdef CONFIG_RPS
- struct softnet_data *rps_ipi_list;
- #endif
- --- a/net/core/dev.c
- +++ b/net/core/dev.c
- @@ -4597,7 +4597,7 @@ static int napi_schedule_rps(struct soft
- struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
-
- #ifdef CONFIG_RPS
- - if (sd != mysd) {
- + if (sd != mysd && !test_bit(NAPI_STATE_THREADED, &sd->backlog.state)) {
- sd->rps_ipi_next = mysd->rps_ipi_list;
- mysd->rps_ipi_list = sd;
-
- @@ -5778,6 +5778,8 @@ static DEFINE_PER_CPU(struct work_struct
- /* Network device is going away, flush any packets still pending */
- static void flush_backlog(struct work_struct *work)
- {
- + unsigned int process_queue_empty;
- + bool threaded, flush_processq;
- struct sk_buff *skb, *tmp;
- struct softnet_data *sd;
-
- @@ -5792,8 +5794,17 @@ static void flush_backlog(struct work_st
- input_queue_head_incr(sd);
- }
- }
- +
- + threaded = test_bit(NAPI_STATE_THREADED, &sd->backlog.state);
- + flush_processq = threaded &&
- + !skb_queue_empty_lockless(&sd->process_queue);
- + if (flush_processq)
- + process_queue_empty = sd->process_queue_empty;
- rps_unlock_irq_enable(sd);
-
- + if (threaded)
- + goto out;
- +
- skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
- if (skb->dev->reg_state == NETREG_UNREGISTERING) {
- __skb_unlink(skb, &sd->process_queue);
- @@ -5801,7 +5812,16 @@ static void flush_backlog(struct work_st
- input_queue_head_incr(sd);
- }
- }
- +
- +out:
- local_bh_enable();
- +
- + while (flush_processq) {
- + msleep(1);
- + rps_lock_irq_disable(sd);
- + flush_processq = process_queue_empty == sd->process_queue_empty;
- + rps_unlock_irq_enable(sd);
- + }
- }
-
- static bool flush_required(int cpu)
- @@ -5933,6 +5953,7 @@ static int process_backlog(struct napi_s
- }
-
- rps_lock_irq_disable(sd);
- + sd->process_queue_empty++;
- if (skb_queue_empty(&sd->input_pkt_queue)) {
- /*
- * Inline a custom version of __napi_complete().
- @@ -5942,7 +5963,8 @@ static int process_backlog(struct napi_s
- * We can use a plain write instead of clear_bit(),
- * and we dont need an smp_mb() memory barrier.
- */
- - napi->state = 0;
- + napi->state &= ~(NAPIF_STATE_SCHED |
- + NAPIF_STATE_SCHED_THREADED);
- again = false;
- } else {
- skb_queue_splice_tail_init(&sd->input_pkt_queue,
- @@ -6358,6 +6380,55 @@ int dev_set_threaded(struct net_device *
- }
- EXPORT_SYMBOL(dev_set_threaded);
-
- +int backlog_set_threaded(bool threaded)
- +{
- + static bool backlog_threaded;
- + int err = 0;
- + int i;
- +
- + if (backlog_threaded == threaded)
- + return 0;
- +
- + for_each_possible_cpu(i) {
- + struct softnet_data *sd = &per_cpu(softnet_data, i);
- + struct napi_struct *n = &sd->backlog;
- +
- + if (n->thread)
- + continue;
- + n->thread = kthread_run(napi_threaded_poll, n, "napi/backlog-%d", i);
- + if (IS_ERR(n->thread)) {
- + err = PTR_ERR(n->thread);
- + pr_err("kthread_run failed with err %d\n", err);
- + n->thread = NULL;
- + threaded = false;
- + break;
- + }
- +
- + }
- +
- + backlog_threaded = threaded;
- +
- + /* Make sure kthread is created before THREADED bit
- + * is set.
- + */
- + smp_mb__before_atomic();
- +
- + for_each_possible_cpu(i) {
- + struct softnet_data *sd = &per_cpu(softnet_data, i);
- + struct napi_struct *n = &sd->backlog;
- + unsigned long flags;
- +
- + rps_lock_irqsave(sd, &flags);
- + if (threaded)
- + n->state |= NAPIF_STATE_THREADED;
- + else
- + n->state &= ~NAPIF_STATE_THREADED;
- + rps_unlock_irq_restore(sd, &flags);
- + }
- +
- + return err;
- +}
- +
- void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
- int (*poll)(struct napi_struct *, int), int weight)
- {
- @@ -11130,6 +11201,9 @@ static int dev_cpu_dead(unsigned int old
- raise_softirq_irqoff(NET_TX_SOFTIRQ);
- local_irq_enable();
-
- + if (test_bit(NAPI_STATE_THREADED, &oldsd->backlog.state))
- + return 0;
- +
- #ifdef CONFIG_RPS
- remsd = oldsd->rps_ipi_list;
- oldsd->rps_ipi_list = NULL;
- @@ -11433,6 +11507,7 @@ static int __init net_dev_init(void)
- INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
- spin_lock_init(&sd->defer_lock);
-
- + INIT_LIST_HEAD(&sd->backlog.poll_list);
- init_gro_hash(&sd->backlog);
- sd->backlog.poll = process_backlog;
- sd->backlog.weight = weight_p;
- --- a/net/core/sysctl_net_core.c
- +++ b/net/core/sysctl_net_core.c
- @@ -29,6 +29,7 @@ static int int_3600 = 3600;
- static int min_sndbuf = SOCK_MIN_SNDBUF;
- static int min_rcvbuf = SOCK_MIN_RCVBUF;
- static int max_skb_frags = MAX_SKB_FRAGS;
- +static int backlog_threaded;
-
- static int net_msg_warn; /* Unused, but still a sysctl */
-
- @@ -112,6 +113,23 @@ static int rps_sock_flow_sysctl(struct c
- }
- #endif /* CONFIG_RPS */
-
- +static int backlog_threaded_sysctl(struct ctl_table *table, int write,
- + void *buffer, size_t *lenp, loff_t *ppos)
- +{
- + static DEFINE_MUTEX(backlog_threaded_mutex);
- + int ret;
- +
- + mutex_lock(&backlog_threaded_mutex);
- +
- + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- + if (write && !ret)
- + ret = backlog_set_threaded(backlog_threaded);
- +
- + mutex_unlock(&backlog_threaded_mutex);
- +
- + return ret;
- +}
- +
- #ifdef CONFIG_NET_FLOW_LIMIT
- static DEFINE_MUTEX(flow_limit_update_mutex);
-
- @@ -473,6 +491,15 @@ static struct ctl_table net_core_table[]
- .proc_handler = rps_sock_flow_sysctl
- },
- #endif
- + {
- + .procname = "backlog_threaded",
- + .data = &backlog_threaded,
- + .maxlen = sizeof(unsigned int),
- + .mode = 0644,
- + .proc_handler = backlog_threaded_sysctl,
- + .extra1 = SYSCTL_ZERO,
- + .extra2 = SYSCTL_ONE
- + },
- #ifdef CONFIG_NET_FLOW_LIMIT
- {
- .procname = "flow_limit_cpu_bitmap",