ceph-scheduler-fix.patch

commit 8974189222159154c55f24ddad33e3613960521a
Author: Peter Zijlstra <[email protected]>
Date:   Thu Jun 16 10:50:40 2016 +0200

sched/fair: Fix cfs_rq avg tracking underflow

As per commit:

  b7fa30c9cc48 ("sched/fair: Fix post_init_entity_util_avg() serialization")

> the code generated from update_cfs_rq_load_avg():
>
>        if (atomic_long_read(&cfs_rq->removed_load_avg)) {
>                s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
>                sa->load_avg = max_t(long, sa->load_avg - r, 0);
>                sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
>                removed_load = 1;
>        }
>
> turns into:
>
>   ffffffff81087064:       49 8b 85 98 00 00 00    mov    0x98(%r13),%rax
>   ffffffff8108706b:       48 85 c0                test   %rax,%rax
>   ffffffff8108706e:       74 40                   je     ffffffff810870b0 <update_blocked_averages+0xc0>
>   ffffffff81087070:       4c 89 f8                mov    %r15,%rax
>   ffffffff81087073:       49 87 85 98 00 00 00    xchg   %rax,0x98(%r13)
>   ffffffff8108707a:       49 29 45 70             sub    %rax,0x70(%r13)
>   ffffffff8108707e:       4c 89 f9                mov    %r15,%rcx
>   ffffffff81087081:       bb 01 00 00 00          mov    $0x1,%ebx
>   ffffffff81087086:       49 83 7d 70 00          cmpq   $0x0,0x70(%r13)
>   ffffffff8108708b:       49 0f 49 4d 70          cmovns 0x70(%r13),%rcx
>
> Which you'll note ends up with sa->load_avg -= r in memory at
> ffffffff8108707a.

So I _should_ have looked at other unserialized users of ->load_avg, but
alas. Luckily nikbor reported a similar divide-by-zero from task_h_load(),
which instantly triggered recollection of this problem.

Aside from the intermediate value hitting memory and causing problems,
there is another problem: the underflow detection relies on the sign bit.
This reduces the effective width of the variables; IOW, it is effectively
the same as declaring these variables with a signed type.

This patch changes to a different means of unsigned underflow detection
that does not rely on the sign bit. This allows the variables to use the
'full' unsigned range. And it does so with an explicit LOAD - STORE, so
that no intermediate value is ever visible in memory, which is what allows
these unserialized loads.

Note: GCC generates poor code for this; it might warrant a look later.

Note2: I say 'full' above; if we ever end up at U*_MAX we'll still explode,
so maybe we should do clamping on add too.

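As an aside, the range limitation of the sign-bit check can be reproduced in
plain user space. The sketch below is not part of this patch; it merely models
the old max_t()-style clamp next to the new wrap-around check for a valid
value above LONG_MAX:

#include <limits.h>
#include <stdio.h>

int main(void)
{
        /* A value above LONG_MAX is still a perfectly valid unsigned long. */
        unsigned long var = (unsigned long)LONG_MAX + 10;
        unsigned long val = 1;

        /* Sign-bit detection (models the old max_t(long, ...) clamp):
         * the value is misread as negative and wrongly clamped to 0. */
        long old_res = (long)(var - val);
        unsigned long old_way = old_res < 0 ? 0 : (unsigned long)old_res;

        /* Wrap-around detection (models the new check): the subtraction
         * only clamps when it actually underflows. */
        unsigned long res = var - val;
        unsigned long new_way = (res > var) ? 0 : res;

        printf("sign-bit clamp: %lu\nwrap check:     %lu\n", old_way, new_way);
        return 0;
}

(The unsigned-to-signed conversion above is what the kernel relies on in
practice; strictly speaking it is implementation-defined C.)
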
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Andrey Ryabinin <[email protected]>
Cc: Chris Wilson <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Yuyang Du <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking")
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
 kernel/sched/fair.c | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2682,6 +2682,23 @@ static inline void update_tg_load_avg(st
 
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {                           \
+        typeof(_ptr) ptr = (_ptr);                              \
+        typeof(*ptr) val = (_val);                              \
+        typeof(*ptr) res, var = READ_ONCE(*ptr);                \
+        res = var - val;                                        \
+        if (res > var)                                          \
+                res = 0;                                        \
+        WRITE_ONCE(*ptr, res);                                  \
+} while (0)
+
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
@@ -2690,15 +2707,15 @@ static inline int update_cfs_rq_load_avg
 
         if (atomic_long_read(&cfs_rq->removed_load_avg)) {
                 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
-                sa->load_avg = max_t(long, sa->load_avg - r, 0);
-                sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+                sub_positive(&sa->load_avg, r);
+                sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
                 removed = 1;
         }
 
         if (atomic_long_read(&cfs_rq->removed_util_avg)) {
                 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-                sa->util_avg = max_t(long, sa->util_avg - r, 0);
-                sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+                sub_positive(&sa->util_avg, r);
+                sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
         }
 
         decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2764,10 +2781,10 @@ static void detach_entity_load_avg(struc
                           &se->avg, se->on_rq * scale_load_down(se->load.weight),
                           cfs_rq->curr == se, NULL);
 
-        cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-        cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-        cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-        cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+        sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+        sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+        sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+        sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
 }
 
 /* Add the load generated by se into cfs_rq's load average */
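
For readers who want to poke at the new helper outside the kernel, the
stand-alone sketch below is not from the patch: it mirrors sub_positive()
with hypothetical DEMO_READ_ONCE()/DEMO_WRITE_ONCE() macros standing in for
the kernel's READ_ONCE()/WRITE_ONCE(), so the subtraction is done on a local
copy and only the clamped result is ever stored back:

#include <stdio.h>

/* Stand-ins for the kernel's READ_ONCE()/WRITE_ONCE(). */
#define DEMO_READ_ONCE(x)      (*(volatile __typeof__(x) *)&(x))
#define DEMO_WRITE_ONCE(x, v)  (*(volatile __typeof__(x) *)&(x) = (v))

/* Mirror of sub_positive(): load, subtract, clamp on underflow, store. */
#define demo_sub_positive(_ptr, _val) do {                      \
        __typeof__(_ptr) ptr = (_ptr);                          \
        __typeof__(*ptr) val = (_val);                          \
        __typeof__(*ptr) res, var = DEMO_READ_ONCE(*ptr);       \
        res = var - val;                                        \
        if (res > var)                                          \
                res = 0;                                        \
        DEMO_WRITE_ONCE(*ptr, res);                             \
} while (0)

int main(void)
{
        unsigned long avg = 100;

        demo_sub_positive(&avg, 40);    /* 100 - 40 -> 60 */
        printf("%lu\n", avg);
        demo_sub_positive(&avg, 200);   /* would underflow -> clamped to 0 */
        printf("%lu\n", avg);
        return 0;
}

Because the intermediate res lives in a local variable, a concurrent lockless
reader of avg can only ever observe the old value or the final result, never
a wrapped-around intermediate.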