Ver Fonte

lib/relay, lib/svcutil: Improve service logging (fixes #7580) (#7647)

Simon Frei há 4 anos atrás
pai
commit
c7f4f15272
3 ficheiros alterados com 43 adições e 11 exclusões
  1. 2 1
      lib/relay/client/dynamic.go
  2. 8 9
      lib/relay/client/static.go
  3. 33 1
      lib/svcutil/svcutil.go

+ 2 - 1
lib/relay/client/dynamic.go

@@ -93,7 +93,8 @@ func (c *dynamicClient) serve(ctx context.Context) error {
 			c.client = client
 			c.mut.Unlock()
 
-			c.client.Serve(ctx)
+			err = c.client.Serve(ctx)
+			l.Debugf("Disconnected from %s://%s: %v", c.client.URI().Scheme, c.client.URI().Host, err)
 
 			c.mut.Lock()
 			c.client = nil

+ 8 - 9
lib/relay/client/static.go

@@ -48,7 +48,7 @@ func newStaticClient(uri *url.URL, certs []tls.Certificate, invitations chan pro
 
 func (c *staticClient) serve(ctx context.Context) error {
 	if err := c.connect(ctx); err != nil {
-		l.Infof("Could not connect to relay %s: %s", c.uri, err)
+		l.Debugf("Could not connect to relay %s: %s", c.uri, err)
 		return err
 	}
 
@@ -56,17 +56,16 @@ func (c *staticClient) serve(ctx context.Context) error {
 	defer c.disconnect()
 
 	if err := c.join(); err != nil {
-		l.Infof("Could not join relay %s: %s", c.uri, err)
+		l.Debugf("Could not join relay %s: %s", c.uri, err)
 		return err
 	}
 
 	if err := c.conn.SetDeadline(time.Time{}); err != nil {
-		l.Infoln("Relay set deadline:", err)
+		l.Debugln("Relay set deadline:", err)
 		return err
 	}
 
 	l.Infof("Joined relay %s://%s", c.uri.Scheme, c.uri.Host)
-	defer l.Infof("Disconnected from relay %s://%s", c.uri.Scheme, c.uri.Host)
 
 	c.mut.Lock()
 	c.connected = true
@@ -88,7 +87,7 @@ func (c *staticClient) serve(ctx context.Context) error {
 			switch msg := message.(type) {
 			case protocol.Ping:
 				if err := protocol.WriteMessage(c.conn, protocol.Pong{}); err != nil {
-					l.Infoln("Relay write:", err)
+					l.Debugln("Relay write:", err)
 					return err
 				}
 				l.Debugln(c, "sent pong")
@@ -106,20 +105,20 @@ func (c *staticClient) serve(ctx context.Context) error {
 				}
 
 			case protocol.RelayFull:
-				l.Infof("Disconnected from relay %s due to it becoming full.", c.uri)
+				l.Debugf("Disconnected from relay %s due to it becoming full.", c.uri)
 				return errors.New("relay full")
 
 			default:
-				l.Infoln("Relay: protocol error: unexpected message %v", msg)
+				l.Debugln("Relay: protocol error: unexpected message %v", msg)
 				return fmt.Errorf("protocol error: unexpected message %v", msg)
 			}
 
 		case <-ctx.Done():
 			l.Debugln(c, "stopping")
-			return nil
+			return ctx.Err()
 
 		case err := <-errorsc:
-			l.Infof("Disconnecting from relay %s due to error: %s", c.uri, err)
+			l.Debugf("Disconnecting from relay %s due to error: %s", c.uri, err)
 			return err
 
 		case <-timeout.C:

+ 33 - 1
lib/svcutil/svcutil.go

@@ -160,7 +160,7 @@ func SpecWithDebugLogger(l logger.Logger) suture.Spec {
 }
 
 func SpecWithInfoLogger(l logger.Logger) suture.Spec {
-	return spec(func(e suture.Event) { l.Infoln(e) })
+	return spec(infoEventHook(l))
 }
 
 func spec(eventHook suture.EventHook) suture.Spec {
@@ -171,3 +171,35 @@ func spec(eventHook suture.EventHook) suture.Spec {
 		DontPropagateTermination: false,
 	}
 }
+
+// infoEventHook prints service failures and failures to stop services at level
+// info. All other events and identical, consecutive failures are logged at
+// debug only.
+func infoEventHook(l logger.Logger) suture.EventHook {
+	var prevTerminate suture.EventServiceTerminate
+	return func(ei suture.Event) {
+		switch e := ei.(type) {
+		case suture.EventStopTimeout:
+			l.Infof("%s: Service %s failed to terminate in a timely manner", e.SupervisorName, e.ServiceName)
+		case suture.EventServicePanic:
+			l.Warnln("Caught a service panic, which shouldn't happen")
+			l.Infoln(e)
+		case suture.EventServiceTerminate:
+			msg := fmt.Sprintf("%s: service %s failed: %s", e.SupervisorName, e.ServiceName, e.Err)
+			if e.ServiceName == prevTerminate.ServiceName && e.Err == prevTerminate.Err {
+				l.Debugln(msg)
+			} else {
+				l.Infoln(msg)
+			}
+			prevTerminate = e
+			l.Debugln(e) // Contains some backoff statistics
+		case suture.EventBackoff:
+			l.Debugf("%s: exiting the backoff state.", e.SupervisorName)
+		case suture.EventResume:
+			l.Debugf("%s: too many service failures - entering the backoff state.", e.SupervisorName)
+		default:
+			l.Warnln("Unknown suture supervisor event type", e.Type())
+			l.Infoln(e)
+		}
+	}
+}