gpu.go 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. /*
  2. Copyright 2020 Docker Compose CLI authors
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package ecs
  14. import (
  15. "fmt"
  16. "math"
  17. "strconv"
  18. "github.com/compose-spec/compose-go/types"
  19. "github.com/docker/go-units"
  20. )
  21. type machine struct {
  22. id string
  23. cpus float64
  24. memory types.UnitBytes
  25. gpus int64
  26. }
  27. type family []machine
  28. var gpufamily = family{
  29. {
  30. id: "g4dn.xlarge",
  31. cpus: 4,
  32. memory: 16 * units.GiB,
  33. gpus: 1,
  34. },
  35. {
  36. id: "g4dn.2xlarge",
  37. cpus: 8,
  38. memory: 32 * units.GiB,
  39. gpus: 1,
  40. },
  41. {
  42. id: "g4dn.4xlarge",
  43. cpus: 16,
  44. memory: 64 * units.GiB,
  45. gpus: 1,
  46. },
  47. {
  48. id: "g4dn.8xlarge",
  49. cpus: 32,
  50. memory: 128 * units.GiB,
  51. gpus: 1,
  52. },
  53. {
  54. id: "g4dn.12xlarge",
  55. cpus: 48,
  56. memory: 192 * units.GiB,
  57. gpus: 4,
  58. },
  59. {
  60. id: "g4dn.16xlarge",
  61. cpus: 64,
  62. memory: 256 * units.GiB,
  63. gpus: 1,
  64. },
  65. {
  66. id: "g4dn.metal",
  67. cpus: 96,
  68. memory: 384 * units.GiB,
  69. gpus: 8,
  70. },
  71. }
  72. type filterFn func(machine) bool
  73. func (f family) filter(fn filterFn) family {
  74. var filtered family
  75. for _, machine := range f {
  76. if fn(machine) {
  77. filtered = append(filtered, machine)
  78. }
  79. }
  80. return filtered
  81. }
  82. func (f family) firstOrError(msg string, args ...interface{}) (machine, error) {
  83. if len(f) == 0 {
  84. return machine{}, fmt.Errorf(msg, args...)
  85. }
  86. return f[0], nil
  87. }
  88. func guessMachineType(project *types.Project) (string, error) {
  89. // we select a machine type to match all gpus-bound services requirements
  90. // once https://github.com/aws/containers-roadmap/issues/631 is implemented we can define dedicated CapacityProviders per service.
  91. requirements, err := getResourceRequirements(project)
  92. if err != nil {
  93. return "", err
  94. }
  95. instanceType, err := gpufamily.
  96. filter(func(m machine) bool {
  97. return m.memory > requirements.memory // actual memory available for ECS tasks < total machine memory
  98. }).
  99. filter(func(m machine) bool {
  100. return m.cpus >= requirements.cpus
  101. }).
  102. filter(func(m machine) bool {
  103. return m.gpus >= requirements.gpus
  104. }).
  105. firstOrError("none of the Amazon EC2 G4 instance types meet the requirements for memory:%d cpu:%f gpus:%d", requirements.memory, requirements.cpus, requirements.gpus)
  106. if err != nil {
  107. return "", err
  108. }
  109. return instanceType.id, nil
  110. }
  111. type resourceRequirements struct {
  112. memory types.UnitBytes
  113. cpus float64
  114. gpus int64
  115. }
  116. func getResourceRequirements(project *types.Project) (*resourceRequirements, error) {
  117. return toResourceRequirementsSlice(project).
  118. filter(func(requirements *resourceRequirements) bool {
  119. return requirements != nil && requirements.gpus != 0
  120. }).
  121. max()
  122. }
  123. type eitherRequirementsOrError struct {
  124. requirements []*resourceRequirements
  125. err error
  126. }
  127. func toResourceRequirementsSlice(project *types.Project) eitherRequirementsOrError {
  128. var requirements []*resourceRequirements
  129. for _, service := range project.Services {
  130. r, err := toResourceRequirements(service)
  131. if err != nil {
  132. return eitherRequirementsOrError{nil, err}
  133. }
  134. requirements = append(requirements, r)
  135. }
  136. return eitherRequirementsOrError{requirements, nil}
  137. }
  138. func (r eitherRequirementsOrError) filter(fn func(*resourceRequirements) bool) eitherRequirementsOrError {
  139. if r.err != nil {
  140. return r
  141. }
  142. var requirements []*resourceRequirements
  143. for _, req := range r.requirements {
  144. if fn(req) {
  145. requirements = append(requirements, req)
  146. }
  147. }
  148. return eitherRequirementsOrError{requirements, nil}
  149. }
  150. func toResourceRequirements(service types.ServiceConfig) (*resourceRequirements, error) {
  151. if service.Deploy == nil {
  152. return nil, nil
  153. }
  154. reservations := service.Deploy.Resources.Reservations
  155. if reservations == nil {
  156. return nil, nil
  157. }
  158. var requiredGPUs int64
  159. for _, r := range reservations.GenericResources {
  160. if r.DiscreteResourceSpec.Kind == "gpus" {
  161. requiredGPUs = r.DiscreteResourceSpec.Value
  162. break
  163. }
  164. }
  165. for _, r := range reservations.Devices {
  166. requiresGpus := false
  167. for _, c := range r.Capabilities {
  168. if c == "gpu" {
  169. requiresGpus = true
  170. break
  171. }
  172. }
  173. if requiresGpus {
  174. requiredGPUs = r.Count
  175. if requiredGPUs <= 0 {
  176. requiredGPUs = 1
  177. }
  178. break
  179. }
  180. }
  181. var nanocpu float64
  182. if reservations.NanoCPUs != "" {
  183. v, err := strconv.ParseFloat(reservations.NanoCPUs, 64)
  184. if err != nil {
  185. return nil, err
  186. }
  187. nanocpu = v
  188. }
  189. return &resourceRequirements{
  190. memory: reservations.MemoryBytes,
  191. cpus: nanocpu,
  192. gpus: requiredGPUs,
  193. }, nil
  194. }
  195. func (r resourceRequirements) combine(o *resourceRequirements) resourceRequirements {
  196. if o == nil {
  197. return r
  198. }
  199. return resourceRequirements{
  200. memory: maxUnitBytes(r.memory, o.memory),
  201. cpus: math.Max(r.cpus, o.cpus),
  202. gpus: maxInt64(r.gpus, o.gpus),
  203. }
  204. }
  205. func (r eitherRequirementsOrError) max() (*resourceRequirements, error) {
  206. if r.err != nil {
  207. return nil, r.err
  208. }
  209. min := resourceRequirements{}
  210. for _, req := range r.requirements {
  211. min = min.combine(req)
  212. }
  213. return &min, nil
  214. }
  // maxInt64 returns the larger of a and b.
  func maxInt64(a, b int64) int64 {
  	if b >= a {
  		return b
  	}
  	return a
  }
  221. func maxUnitBytes(a, b types.UnitBytes) types.UnitBytes {
  222. if a > b {
  223. return a
  224. }
  225. return b
  226. }