760-net-core-add-optional-threading-for-backlog-processi.patch

From: Felix Fietkau <[email protected]>
Date: Thu, 16 Feb 2023 18:39:04 +0100
Subject: [PATCH] net/core: add optional threading for backlog processing

When dealing with few flows or an imbalance in CPU utilization, static RPS
CPU assignment can be too inflexible. Add support for enabling threaded NAPI
for backlog processing in order to allow the scheduler to better balance
processing. This helps better spread the load across idle CPUs.

Signed-off-by: Felix Fietkau <[email protected]>
---
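
Usage note (not part of the patch itself): the entry added to net_core_table below registers the knob as "backlog_threaded", so on a patched kernel it is expected to show up as net.core.backlog_threaded, i.e. /proc/sys/net/core/backlog_threaded. The sketch below only illustrates how userspace might toggle it under that assumption; it is not code from this patch.

    /*
     * Illustrative sketch only: enable threaded backlog NAPI by writing "1"
     * to the sysctl added by this patch (procfs path assumed below).
     * Needs root; writing "0" clears the THREADED bit again.
     */
    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/net/core/backlog_threaded", "w");

        if (!f) {
            perror("backlog_threaded");
            return 1;
        }
        fputs("1", f);
        fclose(f);
        return 0;
    }

Equivalently, sysctl -w net.core.backlog_threaded=1 should have the same effect. Per the backlog_set_threaded() implementation below, the per-CPU napi/backlog-N kthreads are created on the first enable and remain after the knob is switched back to 0; disabling only clears NAPIF_STATE_THREADED.
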
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -520,6 +520,7 @@ static inline bool napi_complete(struct
 }
 
 int dev_set_threaded(struct net_device *dev, bool threaded);
+int backlog_set_threaded(bool threaded);
 
 /**
  *	napi_disable - prevent NAPI from scheduling
@@ -3128,6 +3129,7 @@ struct softnet_data {
 	unsigned int		processed;
 	unsigned int		time_squeeze;
 	unsigned int		received_rps;
+	unsigned int		process_queue_empty;
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
 #endif
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4593,7 +4593,7 @@ static int napi_schedule_rps(struct soft
 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 
 #ifdef CONFIG_RPS
-	if (sd != mysd) {
+	if (sd != mysd && !test_bit(NAPI_STATE_THREADED, &sd->backlog.state)) {
 		sd->rps_ipi_next = mysd->rps_ipi_list;
 		mysd->rps_ipi_list = sd;
 
@@ -5774,6 +5774,8 @@ static DEFINE_PER_CPU(struct work_struct
 /* Network device is going away, flush any packets still pending */
 static void flush_backlog(struct work_struct *work)
 {
+	unsigned int process_queue_empty;
+	bool threaded, flush_processq;
 	struct sk_buff *skb, *tmp;
 	struct softnet_data *sd;
 
@@ -5788,8 +5790,17 @@ static void flush_backlog(struct work_st
 			input_queue_head_incr(sd);
 		}
 	}
+
+	threaded = test_bit(NAPI_STATE_THREADED, &sd->backlog.state);
+	flush_processq = threaded &&
+			 !skb_queue_empty_lockless(&sd->process_queue);
+	if (flush_processq)
+		process_queue_empty = sd->process_queue_empty;
 	rps_unlock_irq_enable(sd);
 
+	if (threaded)
+		goto out;
+
 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 			__skb_unlink(skb, &sd->process_queue);
@@ -5797,7 +5808,16 @@ static void flush_backlog(struct work_st
 			input_queue_head_incr(sd);
 		}
 	}
+
+out:
 	local_bh_enable();
+
+	while (flush_processq) {
+		msleep(1);
+		rps_lock_irq_disable(sd);
+		flush_processq = process_queue_empty == sd->process_queue_empty;
+		rps_unlock_irq_enable(sd);
+	}
 }
 
 static bool flush_required(int cpu)
@@ -5929,6 +5949,7 @@ static int process_backlog(struct napi_s
 		}
 
 		rps_lock_irq_disable(sd);
+		sd->process_queue_empty++;
 		if (skb_queue_empty(&sd->input_pkt_queue)) {
 			/*
 			 * Inline a custom version of __napi_complete().
@@ -5938,7 +5959,8 @@ static int process_backlog(struct napi_s
 			 * We can use a plain write instead of clear_bit(),
 			 * and we dont need an smp_mb() memory barrier.
 			 */
-			napi->state = 0;
+			napi->state &= ~(NAPIF_STATE_SCHED |
+					 NAPIF_STATE_SCHED_THREADED);
 			again = false;
 		} else {
 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
@@ -6354,6 +6376,55 @@ int dev_set_threaded(struct net_device *
 }
 EXPORT_SYMBOL(dev_set_threaded);
 
+int backlog_set_threaded(bool threaded)
+{
+	static bool backlog_threaded;
+	int err = 0;
+	int i;
+
+	if (backlog_threaded == threaded)
+		return 0;
+
+	for_each_possible_cpu(i) {
+		struct softnet_data *sd = &per_cpu(softnet_data, i);
+		struct napi_struct *n = &sd->backlog;
+
+		if (n->thread)
+			continue;
+		n->thread = kthread_run(napi_threaded_poll, n, "napi/backlog-%d", i);
+		if (IS_ERR(n->thread)) {
+			err = PTR_ERR(n->thread);
+			pr_err("kthread_run failed with err %d\n", err);
+			n->thread = NULL;
+			threaded = false;
+			break;
+		}
+
+	}
+
+	backlog_threaded = threaded;
+
+	/* Make sure kthread is created before THREADED bit
+	 * is set.
+	 */
+	smp_mb__before_atomic();
+
+	for_each_possible_cpu(i) {
+		struct softnet_data *sd = &per_cpu(softnet_data, i);
+		struct napi_struct *n = &sd->backlog;
+		unsigned long flags;
+
+		rps_lock_irqsave(sd, &flags);
+		if (threaded)
+			n->state |= NAPIF_STATE_THREADED;
+		else
+			n->state &= ~NAPIF_STATE_THREADED;
+		rps_unlock_irq_restore(sd, &flags);
+	}
+
+	return err;
+}
+
 void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 			   int (*poll)(struct napi_struct *, int), int weight)
 {
@@ -11126,6 +11197,9 @@ static int dev_cpu_dead(unsigned int old
 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 	local_irq_enable();
 
+	if (test_bit(NAPI_STATE_THREADED, &oldsd->backlog.state))
+		return 0;
+
 #ifdef CONFIG_RPS
 	remsd = oldsd->rps_ipi_list;
 	oldsd->rps_ipi_list = NULL;
@@ -11429,6 +11503,7 @@ static int __init net_dev_init(void)
 		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
 		spin_lock_init(&sd->defer_lock);
 
+		INIT_LIST_HEAD(&sd->backlog.poll_list);
 		init_gro_hash(&sd->backlog);
 		sd->backlog.poll = process_backlog;
 		sd->backlog.weight = weight_p;
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -29,6 +29,7 @@ static int int_3600 = 3600;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 static int max_skb_frags = MAX_SKB_FRAGS;
+static int backlog_threaded;
 
 static int net_msg_warn;	/* Unused, but still a sysctl */
 
@@ -112,6 +113,23 @@ static int rps_sock_flow_sysctl(struct c
 }
 #endif /* CONFIG_RPS */
 
+static int backlog_threaded_sysctl(struct ctl_table *table, int write,
+				   void *buffer, size_t *lenp, loff_t *ppos)
+{
+	static DEFINE_MUTEX(backlog_threaded_mutex);
+	int ret;
+
+	mutex_lock(&backlog_threaded_mutex);
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (write && !ret)
+		ret = backlog_set_threaded(backlog_threaded);
+
+	mutex_unlock(&backlog_threaded_mutex);
+
+	return ret;
+}
+
 #ifdef CONFIG_NET_FLOW_LIMIT
 static DEFINE_MUTEX(flow_limit_update_mutex);
 
@@ -473,6 +491,15 @@ static struct ctl_table net_core_table[]
 		.proc_handler	= rps_sock_flow_sysctl
 	},
 #endif
+	{
+		.procname	= "backlog_threaded",
+		.data		= &backlog_threaded,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= backlog_threaded_sysctl,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
+	},
 #ifdef CONFIG_NET_FLOW_LIMIT
 	{
 		.procname	= "flow_limit_cpu_bitmap",