From: Felix Fietkau <[email protected]>
Date: Thu, 16 Feb 2023 18:39:04 +0100
Subject: [PATCH] net/core: add optional threading for backlog processing

When dealing with few flows or an imbalance in CPU utilization, static RPS
CPU assignment can be too inflexible. Add support for enabling threaded NAPI
for backlog processing in order to allow the scheduler to better balance
processing. This helps spread the load more evenly across idle CPUs.
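
The new behaviour is disabled by default and can be toggled at runtime
through the backlog_threaded sysctl added below, e.g. (assuming the usual
net.core sysctl path):

    sysctl -w net.core.backlog_threaded=1

Once enabled, each possible CPU gets a "napi/backlog-<cpu>" kthread that
takes over backlog processing from the NET_RX softirq.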
 
Signed-off-by: Felix Fietkau <[email protected]>
---
 
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -502,6 +502,7 @@ static inline bool napi_complete(struct
 }
 
 int dev_set_threaded(struct net_device *dev, bool threaded);
+int backlog_set_threaded(bool threaded);
 
 /**
  *	napi_disable - prevent NAPI from scheduling
@@ -3365,6 +3366,7 @@ struct softnet_data {
 	unsigned int		processed;
 	unsigned int		time_squeeze;
 	unsigned int		received_rps;
+	unsigned int		process_queue_empty;
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
 #endif
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4574,7 +4574,7 @@ static int rps_ipi_queued(struct softnet
 #ifdef CONFIG_RPS
 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 
-	if (sd != mysd) {
+	if (sd != mysd && !test_bit(NAPI_STATE_THREADED, &sd->backlog.state)) {
 		sd->rps_ipi_next = mysd->rps_ipi_list;
 		mysd->rps_ipi_list = sd;
 
@@ -5755,6 +5755,8 @@ static DEFINE_PER_CPU(struct work_struct
 /* Network device is going away, flush any packets still pending */
 static void flush_backlog(struct work_struct *work)
 {
+	unsigned int process_queue_empty;
+	bool threaded, flush_processq;
 	struct sk_buff *skb, *tmp;
 	struct softnet_data *sd;
 
@@ -5770,9 +5772,18 @@ static void flush_backlog(struct work_st
 			input_queue_head_incr(sd);
 		}
 	}
+
+	threaded = test_bit(NAPI_STATE_THREADED, &sd->backlog.state);
+	flush_processq = threaded &&
+			 !skb_queue_empty_lockless(&sd->process_queue);
+	if (flush_processq)
+		process_queue_empty = sd->process_queue_empty;
 	rps_unlock(sd);
 	local_irq_enable();
 
+	if (threaded)
+		goto out;
+
 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 			__skb_unlink(skb, &sd->process_queue);
@@ -5780,7 +5791,18 @@ static void flush_backlog(struct work_st
 			input_queue_head_incr(sd);
 		}
 	}
+
+out:
 	local_bh_enable();
+
+	while (flush_processq) {
+		msleep(1);
+		local_irq_disable();
+		rps_lock(sd);
+		flush_processq = process_queue_empty == sd->process_queue_empty;
+		rps_unlock(sd);
+		local_irq_enable();
+	}
 }
 
 static bool flush_required(int cpu)
@@ -6463,6 +6485,7 @@ static int process_backlog(struct napi_s
 
 		local_irq_disable();
 		rps_lock(sd);
+		sd->process_queue_empty++;
 		if (skb_queue_empty(&sd->input_pkt_queue)) {
 			/*
 			 * Inline a custom version of __napi_complete().
@@ -6472,7 +6495,8 @@ static int process_backlog(struct napi_s
 			 * We can use a plain write instead of clear_bit(),
 			 * and we dont need an smp_mb() memory barrier.
 			 */
-			napi->state = 0;
+			napi->state &= ~(NAPIF_STATE_SCHED |
+					 NAPIF_STATE_SCHED_THREADED);
 			again = false;
 		} else {
 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
@@ -6889,6 +6913,57 @@ int dev_set_threaded(struct net_device *
 }
 EXPORT_SYMBOL(dev_set_threaded);
 
+int backlog_set_threaded(bool threaded)
+{
+	static bool backlog_threaded;
+	int err = 0;
+	int i;
+
+	if (backlog_threaded == threaded)
+		return 0;
+
+	for_each_possible_cpu(i) {
+		struct softnet_data *sd = &per_cpu(softnet_data, i);
+		struct napi_struct *n = &sd->backlog;
+
+		if (n->thread)
+			continue;
+		n->thread = kthread_run(napi_threaded_poll, n, "napi/backlog-%d", i);
+		if (IS_ERR(n->thread)) {
+			err = PTR_ERR(n->thread);
+			pr_err("kthread_run failed with err %d\n", err);
+			n->thread = NULL;
+			threaded = false;
+			break;
+		}
+
+	}
+
+	backlog_threaded = threaded;
+
+	/* Make sure kthread is created before THREADED bit
+	 * is set.
+	 */
+	smp_mb__before_atomic();
+
+	for_each_possible_cpu(i) {
+		struct softnet_data *sd = &per_cpu(softnet_data, i);
+		struct napi_struct *n = &sd->backlog;
+		unsigned long flags;
+
+		local_irq_save(flags);
+		rps_lock(sd);
+		if (threaded)
+			n->state |= NAPIF_STATE_THREADED;
+		else
+			n->state &= ~NAPIF_STATE_THREADED;
+		rps_unlock(sd);
+		local_irq_restore(flags);
+	}
+
+	return err;
+}
+
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		    int (*poll)(struct napi_struct *, int), int weight)
 {
@@ -11367,6 +11442,9 @@ static int dev_cpu_dead(unsigned int old
 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 	local_irq_enable();
 
+	if (test_bit(NAPI_STATE_THREADED, &oldsd->backlog.state))
+		return 0;
+
 #ifdef CONFIG_RPS
 	remsd = oldsd->rps_ipi_list;
 	oldsd->rps_ipi_list = NULL;
@@ -11706,6 +11784,7 @@ static int __init net_dev_init(void)
 		sd->cpu = i;
 #endif
 
+		INIT_LIST_HEAD(&sd->backlog.poll_list);
 		init_gro_hash(&sd->backlog);
 		sd->backlog.poll = process_backlog;
 		sd->backlog.weight = weight_p;
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -28,6 +28,7 @@ static int int_3600 = 3600;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 static int max_skb_frags = MAX_SKB_FRAGS;
+static int backlog_threaded;
 static long long_one __maybe_unused = 1;
 static long long_max __maybe_unused = LONG_MAX;
 
@@ -114,6 +115,23 @@ static int rps_sock_flow_sysctl(struct c
 }
 #endif /* CONFIG_RPS */
 
+static int backlog_threaded_sysctl(struct ctl_table *table, int write,
+			       void *buffer, size_t *lenp, loff_t *ppos)
+{
+	static DEFINE_MUTEX(backlog_threaded_mutex);
+	int ret;
+
+	mutex_lock(&backlog_threaded_mutex);
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (write && !ret)
+		ret = backlog_set_threaded(backlog_threaded);
+
+	mutex_unlock(&backlog_threaded_mutex);
+
+	return ret;
+}
+
 #ifdef CONFIG_NET_FLOW_LIMIT
 static DEFINE_MUTEX(flow_limit_update_mutex);
 
@@ -470,6 +488,15 @@ static struct ctl_table net_core_table[]
 		.proc_handler	= rps_sock_flow_sysctl
 	},
 #endif
+	{
+		.procname	= "backlog_threaded",
+		.data		= &backlog_threaded,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= backlog_threaded_sysctl,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
+	},
 #ifdef CONFIG_NET_FLOW_LIMIT
 	{
 		.procname	= "flow_limit_cpu_bitmap",
 