- From: Felix Fietkau <[email protected]>
- Date: Sun, 26 Jul 2020 14:03:21 +0200
- Subject: [PATCH] net: add support for threaded NAPI polling
- For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
- poll function does not perform well. Since NAPI poll is bound to the CPU it
- was scheduled from, we can easily end up with a few very busy CPUs spending
- most of their time in softirq/ksoftirqd and some idle ones.
- Introduce threaded NAPI for such drivers based on a workqueue. The API is the
- same except for using netif_threaded_napi_add instead of netif_napi_add.
- In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
- improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
- NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
- thread.
- With threaded NAPI it seems stable and consistent (and higher than the best
- results I got without it).
- Based on a patch by Hillf Danton
- Cc: Hillf Danton <[email protected]>
- Signed-off-by: Felix Fietkau <[email protected]>
- ---
- --- a/include/linux/netdevice.h
- +++ b/include/linux/netdevice.h
- @@ -340,6 +340,7 @@ struct napi_struct {
- struct list_head dev_list;
- struct hlist_node napi_hash_node;
- unsigned int napi_id;
- + struct work_struct work;
- };
-
- enum {
- @@ -350,6 +351,7 @@ enum {
- NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */
- NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
- NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
- + NAPI_STATE_THREADED, /* Use threaded NAPI */
- };
-
- enum {
- @@ -360,6 +362,7 @@ enum {
- NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED),
- NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
- NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
- + NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
- };
-
- enum gro_result {
- @@ -2249,6 +2252,26 @@ void netif_napi_add(struct net_device *d
- int (*poll)(struct napi_struct *, int), int weight);
-
- /**
- + * netif_threaded_napi_add - initialize a threaded NAPI context
- + * @dev: network device
- + * @napi: NAPI context
- + * @poll: polling function
- + * @weight: default weight
- + *
- + * This variant of netif_napi_add() should be used by drivers whose NAPI
- + * poll functions are CPU intensive. Polling is scheduled from a
- + * high-priority unbound workqueue instead of from softirq context.
- + */
- +static inline void netif_threaded_napi_add(struct net_device *dev,
- + struct napi_struct *napi,
- + int (*poll)(struct napi_struct *, int),
- + int weight)
- +{
- + set_bit(NAPI_STATE_THREADED, &napi->state);
- + netif_napi_add(dev, napi, poll, weight);
- +}
- +
- +/**
- * netif_tx_napi_add - initialize a NAPI context
- * @dev: network device
- * @napi: NAPI context
- --- a/net/core/dev.c
- +++ b/net/core/dev.c
- @@ -156,6 +156,7 @@ static DEFINE_SPINLOCK(offload_lock);
- struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
- struct list_head ptype_all __read_mostly; /* Taps */
- static struct list_head offload_base __read_mostly;
- +static struct workqueue_struct *napi_workq __read_mostly;
-
- static int netif_rx_internal(struct sk_buff *skb);
- static int call_netdevice_notifiers_info(unsigned long val,
- @@ -5910,6 +5911,11 @@ void __napi_schedule(struct napi_struct
- {
- unsigned long flags;
-
- + if (test_bit(NAPI_STATE_THREADED, &n->state)) {
- + queue_work(napi_workq, &n->work);
- + return;
- + }
- +
- local_irq_save(flags);
- ____napi_schedule(this_cpu_ptr(&softnet_data), n);
- local_irq_restore(flags);
- @@ -5957,6 +5963,11 @@ EXPORT_SYMBOL(napi_schedule_prep);
- */
- void __napi_schedule_irqoff(struct napi_struct *n)
- {
- + if (test_bit(NAPI_STATE_THREADED, &n->state)) {
- + queue_work(napi_workq, &n->work);
- + return;
- + }
- +
- ____napi_schedule(this_cpu_ptr(&softnet_data), n);
- }
- EXPORT_SYMBOL(__napi_schedule_irqoff);
- @@ -6218,6 +6229,84 @@ static void init_gro_hash(struct napi_st
- napi->gro_bitmask = 0;
- }
-
- +static int __napi_poll(struct napi_struct *n, bool *repoll)
- +{
- + int work, weight;
- +
- + weight = n->weight;
- +
- + /* This NAPI_STATE_SCHED test is for avoiding a race
- + * with netpoll's poll_napi(). Only the entity which
- + * obtains the lock and sees NAPI_STATE_SCHED set will
- + * actually make the ->poll() call. Therefore we avoid
- + * accidentally calling ->poll() when NAPI is not scheduled.
- + */
- + work = 0;
- + if (test_bit(NAPI_STATE_SCHED, &n->state)) {
- + work = n->poll(n, weight);
- + trace_napi_poll(n, work, weight);
- + }
- +
- + WARN_ON_ONCE(work > weight);
- +
- + if (likely(work < weight))
- + return work;
- +
- + /* Drivers must not modify the NAPI state if they
- + * consume the entire weight. In such cases this code
- + * still "owns" the NAPI instance and therefore can
- + * move the instance around on the list at-will.
- + */
- + if (unlikely(napi_disable_pending(n))) {
- + napi_complete(n);
- + return work;
- + }
- +
- + if (n->gro_bitmask) {
- + /* flush too old packets
- + * If HZ < 1000, flush all packets.
- + */
- + napi_gro_flush(n, HZ >= 1000);
- + }
- +
- + gro_normal_list(n);
- +
- + *repoll = true;
- +
- + return work;
- +}
- +
- +static void napi_workfn(struct work_struct *work)
- +{
- + struct napi_struct *n = container_of(work, struct napi_struct, work);
- + void *have;
- +
- + for (;;) {
- + bool repoll = false;
- +
- + local_bh_disable();
- +
- + have = netpoll_poll_lock(n);
- + __napi_poll(n, &repoll);
- + netpoll_poll_unlock(have);
- +
- + local_bh_enable();
- +
- + if (!repoll)
- + return;
- +
- + if (!need_resched())
- + continue;
- +
- + /*
- + * NAPI is still scheduled, but the caller needs the CPU:
- + * requeue the work and accept the task-switch latency.
- + */
- + queue_work(napi_workq, work);
- + return;
- + }
- +}
- +
- void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
- int (*poll)(struct napi_struct *, int), int weight)
- {
- @@ -6237,6 +6326,7 @@ void netif_napi_add(struct net_device *d
- #ifdef CONFIG_NETPOLL
- napi->poll_owner = -1;
- #endif
- + INIT_WORK(&napi->work, napi_workfn);
- set_bit(NAPI_STATE_SCHED, &napi->state);
- set_bit(NAPI_STATE_NPSVC, &napi->state);
- list_add_rcu(&napi->dev_list, &dev->napi_list);
- @@ -6277,6 +6367,7 @@ static void flush_gro_hash(struct napi_s
- void netif_napi_del(struct napi_struct *napi)
- {
- might_sleep();
- + cancel_work_sync(&napi->work);
- if (napi_hash_del(napi))
- synchronize_net();
- list_del_init(&napi->dev_list);
- @@ -6289,50 +6380,18 @@ EXPORT_SYMBOL(netif_napi_del);
-
- static int napi_poll(struct napi_struct *n, struct list_head *repoll)
- {
- + bool do_repoll = false;
- void *have;
- - int work, weight;
- + int work;
-
- list_del_init(&n->poll_list);
-
- have = netpoll_poll_lock(n);
-
- - weight = n->weight;
- + work = __napi_poll(n, &do_repoll);
-
- - /* This NAPI_STATE_SCHED test is for avoiding a race
- - * with netpoll's poll_napi(). Only the entity which
- - * obtains the lock and sees NAPI_STATE_SCHED set will
- - * actually make the ->poll() call. Therefore we avoid
- - * accidentally calling ->poll() when NAPI is not scheduled.
- - */
- - work = 0;
- - if (test_bit(NAPI_STATE_SCHED, &n->state)) {
- - work = n->poll(n, weight);
- - trace_napi_poll(n, work, weight);
- - }
- -
- - WARN_ON_ONCE(work > weight);
- -
- - if (likely(work < weight))
- - goto out_unlock;
- -
- - /* Drivers must not modify the NAPI state if they
- - * consume the entire weight. In such cases this code
- - * still "owns" the NAPI instance and therefore can
- - * move the instance around on the list at-will.
- - */
- - if (unlikely(napi_disable_pending(n))) {
- - napi_complete(n);
- + if (!do_repoll)
- goto out_unlock;
- - }
- -
- - if (n->gro_bitmask) {
- - /* flush too old packets
- - * If HZ < 1000, flush all packets.
- - */
- - napi_gro_flush(n, HZ >= 1000);
- - }
- -
- - gro_normal_list(n);
-
- /* Some drivers may have called napi_schedule
- * prior to exhausting their budget.
- @@ -10270,6 +10329,10 @@ static int __init net_dev_init(void)
- sd->backlog.weight = weight_p;
- }
-
- + napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI,
- + WQ_UNBOUND_MAX_ACTIVE | WQ_SYSFS);
- + BUG_ON(!napi_workq);
- +
- dev_boot_phase = 0;
-
- /* The loopback device is special if any other network devices
- --- a/net/core/net-sysfs.c
- +++ b/net/core/net-sysfs.c
- @@ -442,6 +442,52 @@ static ssize_t proto_down_store(struct d
- }
- NETDEVICE_SHOW_RW(proto_down, fmt_dec);
-
- +static int change_napi_threaded(struct net_device *dev, unsigned long val)
- +{
- + struct napi_struct *napi;
- +
- + if (list_empty(&dev->napi_list))
- + return -EOPNOTSUPP;
- +
- + list_for_each_entry(napi, &dev->napi_list, dev_list) {
- + if (val)
- + set_bit(NAPI_STATE_THREADED, &napi->state);
- + else
- + clear_bit(NAPI_STATE_THREADED, &napi->state);
- + }
- +
- + return 0;
- +}
- +
- +static ssize_t napi_threaded_store(struct device *dev,
- + struct device_attribute *attr,
- + const char *buf, size_t len)
- +{
- + return netdev_store(dev, attr, buf, len, change_napi_threaded);
- +}
- +
- +static ssize_t napi_threaded_show(struct device *dev,
- + struct device_attribute *attr,
- + char *buf)
- +{
- + struct net_device *netdev = to_net_dev(dev);
- + struct napi_struct *napi;
- + bool enabled = false;
- +
- + if (!rtnl_trylock())
- + return restart_syscall();
- +
- + list_for_each_entry(napi, &netdev->napi_list, dev_list) {
- + if (test_bit(NAPI_STATE_THREADED, &napi->state))
- + enabled = true;
- + }
- +
- + rtnl_unlock();
- +
- + return sprintf(buf, fmt_dec, enabled);
- +}
- +static DEVICE_ATTR_RW(napi_threaded);
- +
- static ssize_t phys_port_id_show(struct device *dev,
- struct device_attribute *attr, char *buf)
- {
- @@ -532,6 +578,7 @@ static struct attribute *net_class_attrs
- &dev_attr_flags.attr,
- &dev_attr_tx_queue_len.attr,
- &dev_attr_gro_flush_timeout.attr,
- + &dev_attr_napi_threaded.attr,
- &dev_attr_phys_port_id.attr,
- &dev_attr_phys_port_name.attr,
- &dev_attr_phys_switch_id.attr,
|