From: Felix Fietkau <[email protected]>
Date: Sun, 26 Jul 2020 14:03:21 +0200
Subject: [PATCH] net: add support for threaded NAPI polling

For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
poll function does not perform well. Since NAPI poll is bound to the CPU it
was scheduled from, we can easily end up with a few very busy CPUs spending
most of their time in softirq/ksoftirqd while others sit idle.

Introduce threaded NAPI for such drivers, based on a workqueue. The API is the
same, except that drivers call netif_threaded_napi_add instead of
netif_napi_add.
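
To illustrate the driver-side change (a minimal sketch; mydrv_priv, priv->ndev,
mydrv_poll and MYDRV_NAPI_WEIGHT are made-up names for illustration, not part
of this patch):

	/* Hypothetical rx setup in a driver; only the *_napi_add
	 * call differs from a conventional NAPI driver.
	 */
	static void mydrv_init_napi(struct mydrv_priv *priv)
	{
		/* Before (softirq polling, bound to the scheduling CPU):
		 * netif_napi_add(priv->ndev, &priv->napi, mydrv_poll,
		 *		  MYDRV_NAPI_WEIGHT);
		 */

		/* After: poll runs from the high-priority napi_workq */
		netif_threaded_napi_add(priv->ndev, &priv->napi,
					mydrv_poll, MYDRV_NAPI_WEIGHT);
		napi_enable(&priv->napi);
	}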

In my tests with mt76 on MT7621, using threaded NAPI plus a thread for tx
scheduling improves LAN->WLAN bridging throughput by 10-50%. Throughput
without threaded NAPI is wildly inconsistent, depending on which CPU runs
the tx scheduling thread.

With threaded NAPI it seems stable and consistent (and higher than the best
results I got without it).
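
The patch also exposes the setting through a new napi_threaded sysfs
attribute, so threaded polling can be toggled at runtime for all NAPI
instances of a device, e.g. "echo 1 > /sys/class/net/wlan0/napi_threaded"
(wlan0 being just an example interface). The write fails with EOPNOTSUPP if
the device has no NAPI instances registered, and reading the attribute
returns 1 if threaded mode is enabled on any of them.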

Based on a patch by Hillf Danton

Cc: Hillf Danton <[email protected]>
Signed-off-by: Felix Fietkau <[email protected]>
---

--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -340,6 +340,7 @@ struct napi_struct {
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
 	unsigned int		napi_id;
+	struct work_struct	work;
 };

 enum {
@@ -350,6 +351,7 @@ enum {
 	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
 	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
 	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+	NAPI_STATE_THREADED,	/* Use threaded NAPI */
 };

 enum {
@@ -360,6 +362,7 @@ enum {
 	NAPIF_STATE_HASHED	 = BIT(NAPI_STATE_HASHED),
 	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
 	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+	NAPIF_STATE_THREADED	 = BIT(NAPI_STATE_THREADED),
 };

 enum gro_result {
@@ -2101,6 +2104,7 @@ struct net_device {
 	struct lock_class_key	addr_list_lock_key;
 	bool			proto_down;
 	unsigned		wol_enabled:1;
+	unsigned		threaded:1;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)

@@ -2281,6 +2285,26 @@ void netif_napi_add(struct net_device *d
 		    int (*poll)(struct napi_struct *, int), int weight);

 /**
+ *	netif_threaded_napi_add - initialize a NAPI context
+ *	@dev:  network device
+ *	@napi: NAPI context
+ *	@poll: polling function
+ *	@weight: default weight
+ *
+ * This variant of netif_napi_add() should be used from drivers using NAPI
+ * with CPU-intensive poll functions.
+ * Polling will be scheduled from a high-priority workqueue.
+ */
+static inline void netif_threaded_napi_add(struct net_device *dev,
+					   struct napi_struct *napi,
+					   int (*poll)(struct napi_struct *, int),
+					   int weight)
+{
+	set_bit(NAPI_STATE_THREADED, &napi->state);
+	netif_napi_add(dev, napi, poll, weight);
+}
+
+/**
 *	netif_tx_napi_add - initialize a NAPI context
 *	@dev:  network device
 *	@napi: NAPI context
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -156,6 +156,7 @@ static DEFINE_SPINLOCK(offload_lock);
 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 struct list_head ptype_all __read_mostly;	/* Taps */
 static struct list_head offload_base __read_mostly;
+static struct workqueue_struct *napi_workq __read_mostly;

 static int netif_rx_internal(struct sk_buff *skb);
 static int call_netdevice_notifiers_info(unsigned long val,
@@ -5940,6 +5941,11 @@ void __napi_schedule(struct napi_struct
 {
 	unsigned long flags;

+	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+		queue_work(napi_workq, &n->work);
+		return;
+	}
+
 	local_irq_save(flags);
 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 	local_irq_restore(flags);
@@ -5991,6 +5997,10 @@ EXPORT_SYMBOL(napi_schedule_prep);
  */
 void __napi_schedule_irqoff(struct napi_struct *n)
 {
+	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+		queue_work(napi_workq, &n->work);
+		return;
+	}
 	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 		____napi_schedule(this_cpu_ptr(&softnet_data), n);
 	else
@@ -6255,9 +6265,89 @@ static void init_gro_hash(struct napi_st
 	napi->gro_bitmask = 0;
 }

+static int __napi_poll(struct napi_struct *n, bool *repoll)
+{
+	int work, weight;
+
+	weight = n->weight;
+
+	/* This NAPI_STATE_SCHED test is for avoiding a race
+	 * with netpoll's poll_napi(). Only the entity which
+	 * obtains the lock and sees NAPI_STATE_SCHED set will
+	 * actually make the ->poll() call. Therefore we avoid
+	 * accidentally calling ->poll() when NAPI is not scheduled.
+	 */
+	work = 0;
+	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+		work = n->poll(n, weight);
+		trace_napi_poll(n, work, weight);
+	}
+
+	WARN_ON_ONCE(work > weight);
+
+	if (likely(work < weight))
+		return work;
+
+	/* Drivers must not modify the NAPI state if they
+	 * consume the entire weight. In such cases this code
+	 * still "owns" the NAPI instance and therefore can
+	 * move the instance around on the list at-will.
+	 */
+	if (unlikely(napi_disable_pending(n))) {
+		napi_complete(n);
+		return work;
+	}
+
+	if (n->gro_bitmask) {
+		/* flush too old packets
+		 * If HZ < 1000, flush all packets.
+		 */
+		napi_gro_flush(n, HZ >= 1000);
+	}
+
+	gro_normal_list(n);
+
+	*repoll = true;
+
+	return work;
+}
+
+static void napi_workfn(struct work_struct *work)
+{
+	struct napi_struct *n = container_of(work, struct napi_struct, work);
+	void *have;
+
+	for (;;) {
+		bool repoll = false;
+
+		local_bh_disable();
+
+		have = netpoll_poll_lock(n);
+		__napi_poll(n, &repoll);
+		netpoll_poll_unlock(have);
+
+		local_bh_enable();
+
+		if (!repoll)
+			return;
+
+		if (!need_resched())
+			continue;
+
+		/*
+		 * Reschedule pending: requeue and pay the latency of
+		 * a task switch even though NAPI is still scheduled.
+		 */
+		queue_work(napi_workq, work);
+		return;
+	}
+}
+
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		    int (*poll)(struct napi_struct *, int), int weight)
 {
+	if (dev->threaded)
+		set_bit(NAPI_STATE_THREADED, &napi->state);
 	INIT_LIST_HEAD(&napi->poll_list);
 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	napi->timer.function = napi_watchdog;
@@ -6274,6 +6364,7 @@ void netif_napi_add(struct net_device *d
 #ifdef CONFIG_NETPOLL
 	napi->poll_owner = -1;
 #endif
+	INIT_WORK(&napi->work, napi_workfn);
 	set_bit(NAPI_STATE_SCHED, &napi->state);
 	set_bit(NAPI_STATE_NPSVC, &napi->state);
 	list_add_rcu(&napi->dev_list, &dev->napi_list);
@@ -6314,6 +6405,7 @@ static void flush_gro_hash(struct napi_s
 void netif_napi_del(struct napi_struct *napi)
 {
 	might_sleep();
+	cancel_work_sync(&napi->work);
 	if (napi_hash_del(napi))
 		synchronize_net();
 	list_del_init(&napi->dev_list);
@@ -6326,50 +6418,18 @@ EXPORT_SYMBOL(netif_napi_del);

 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 {
+	bool do_repoll = false;
 	void *have;
-	int work, weight;
+	int work;

 	list_del_init(&n->poll_list);

 	have = netpoll_poll_lock(n);

-	weight = n->weight;
-
-	/* This NAPI_STATE_SCHED test is for avoiding a race
-	 * with netpoll's poll_napi(). Only the entity which
-	 * obtains the lock and sees NAPI_STATE_SCHED set will
-	 * actually make the ->poll() call. Therefore we avoid
-	 * accidentally calling ->poll() when NAPI is not scheduled.
-	 */
-	work = 0;
-	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
-		work = n->poll(n, weight);
-		trace_napi_poll(n, work, weight);
-	}
-
-	WARN_ON_ONCE(work > weight);
+	work = __napi_poll(n, &do_repoll);

-	if (likely(work < weight))
-		goto out_unlock;
-
-	/* Drivers must not modify the NAPI state if they
-	 * consume the entire weight. In such cases this code
-	 * still "owns" the NAPI instance and therefore can
-	 * move the instance around on the list at-will.
-	 */
-	if (unlikely(napi_disable_pending(n))) {
-		napi_complete(n);
+	if (!do_repoll)
 		goto out_unlock;
-	}
-
-	if (n->gro_bitmask) {
-		/* flush too old packets
-		 * If HZ < 1000, flush all packets.
-		 */
-		napi_gro_flush(n, HZ >= 1000);
-	}
-
-	gro_normal_list(n);

 	/* Some drivers may have called napi_schedule
 	 * prior to exhausting their budget.
@@ -10349,6 +10409,10 @@ static int __init net_dev_init(void)
 		sd->backlog.weight = weight_p;
 	}

+	napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI |
+				     WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
+	BUG_ON(!napi_workq);
+
 	dev_boot_phase = 0;

 	/* The loopback device is special if any other network devices
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -470,6 +470,52 @@ static ssize_t proto_down_store(struct d
 }
 NETDEVICE_SHOW_RW(proto_down, fmt_dec);

+static int change_napi_threaded(struct net_device *dev, unsigned long val)
+{
+	struct napi_struct *napi;
+
+	if (list_empty(&dev->napi_list))
+		return -EOPNOTSUPP;
+
+	list_for_each_entry(napi, &dev->napi_list, dev_list) {
+		if (val)
+			set_bit(NAPI_STATE_THREADED, &napi->state);
+		else
+			clear_bit(NAPI_STATE_THREADED, &napi->state);
+	}
+
+	return 0;
+}
+
+static ssize_t napi_threaded_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t len)
+{
+	return netdev_store(dev, attr, buf, len, change_napi_threaded);
+}
+
+static ssize_t napi_threaded_show(struct device *dev,
+				  struct device_attribute *attr,
+				  char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	struct napi_struct *napi;
+	bool enabled = false;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	list_for_each_entry(napi, &netdev->napi_list, dev_list) {
+		if (test_bit(NAPI_STATE_THREADED, &napi->state))
+			enabled = true;
+	}
+
+	rtnl_unlock();
+
+	return sprintf(buf, fmt_dec, enabled);
+}
+static DEVICE_ATTR_RW(napi_threaded);
+
 static ssize_t phys_port_id_show(struct device *dev,
 				 struct device_attribute *attr, char *buf)
 {
@@ -581,6 +627,7 @@ static struct attribute *net_class_attrs
 	&dev_attr_flags.attr,
 	&dev_attr_tx_queue_len.attr,
 	&dev_attr_gro_flush_timeout.attr,
+	&dev_attr_napi_threaded.attr,
 	&dev_attr_phys_port_id.attr,
 	&dev_attr_phys_port_name.attr,
 	&dev_attr_phys_switch_id.attr,