Browse Source

logtail/filch: fix filch test panic (#18660)

Updates rotateLocked so that we hold the activeStderrWriteForTest write
lock around the dup2Stderr call, rather than acquiring it only after
dup2 was already compelete.  This ensures no stderrWriteForTest calls
can race with the dup2 syscall. The now unused waitIdleStderrForTest has
been removed.

On macOS, dup2 and write on the same file descriptor are not atomic with
respect to each other, when rotateLocked called dup2Stderr to redirect
the stderr fd to a new file, concurrent goroutines calling
stderrWriteForTest could observe the fd in a transiently invalid state,
resulting in the bad file descripter.

Fixes tailscale/corp#36953

Signed-off-by: James Scott <[email protected]>
James Scott 3 weeks ago
parent
commit
6cbfc2f3ba
2 changed files with 14 additions and 27 deletions
  1. 14 13
      logtail/filch/filch.go
  2. 0 14
      logtail/filch/filch_test.go

+ 14 - 13
logtail/filch/filch.go

@@ -297,12 +297,6 @@ func stderrWriteForTest(b []byte) int {
 	return must.Get(os.Stderr.Write(b))
 }
 
-// waitIdleStderrForTest waits until there are no active stderrWriteForTest calls.
-func waitIdleStderrForTest() {
-	activeStderrWriteForTest.Lock()
-	defer activeStderrWriteForTest.Unlock()
-}
-
 // rotateLocked swaps f.newer and f.older such that:
 //
 //   - f.newer will be truncated and future writes will be appended to the end.
@@ -350,8 +344,15 @@ func (f *Filch) rotateLocked() error {
 	// Note that mutex does not prevent stderr writes.
 	prevSize := f.newlyWrittenBytes + f.newlyFilchedBytes
 	f.newlyWrittenBytes, f.newlyFilchedBytes = 0, 0
+
+	// Hold the write lock around dup2 to prevent concurrent
+	// stderrWriteForTest calls from racing with dup2 on the same fd.
+	// On macOS, dup2 and write are not atomic with respect to each other,
+	// so a concurrent write can observe a bad file descriptor.
+	activeStderrWriteForTest.Lock()
 	if f.OrigStderr != nil {
 		if err := dup2Stderr(f.newer); err != nil {
+			activeStderrWriteForTest.Unlock()
 			return err
 		}
 	}
@@ -369,15 +370,15 @@ func (f *Filch) rotateLocked() error {
 	// In rare cases, it is possible that [Filch.TryReadLine] consumes
 	// the entire older file before the write commits,
 	// leading to dropped stderr lines.
-	waitIdleStderrForTest()
-	if fi, err := f.older.Stat(); err != nil {
+	fi, err := f.older.Stat()
+	activeStderrWriteForTest.Unlock()
+	if err != nil {
 		return err
-	} else {
-		filchedBytes := max(0, fi.Size()-prevSize)
-		f.writeBytes.Add(filchedBytes)
-		f.filchedBytes.Add(filchedBytes)
-		f.storedBytes.Set(fi.Size()) // newer has been truncated, so only older matters
 	}
+	filchedBytes := max(0, fi.Size()-prevSize)
+	f.writeBytes.Add(filchedBytes)
+	f.filchedBytes.Add(filchedBytes)
+	f.storedBytes.Set(fi.Size()) // newer has been truncated, so only older matters
 
 	// Start reading from the start of older.
 	if _, err := f.older.Seek(0, io.SeekStart); err != nil {

+ 0 - 14
logtail/filch/filch_test.go

@@ -5,7 +5,6 @@ package filch
 
 import (
 	"bytes"
-	"crypto/sha256"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -121,19 +120,7 @@ func setupStderr(t *testing.T) {
 	tstest.Replace(t, &os.Stderr, pipeW)
 }
 
-func skipDarwin(t testing.TB) {
-	if runtime.GOOS != "darwin" {
-		return
-	}
-	src := must.Get(os.ReadFile("filch.go"))
-	if fmt.Sprintf("%x", sha256.Sum256(src)) != "a32da5e22034823c19ac7f29960e3646f540d67f85a0028832cab1f1557fc693" {
-		t.Errorf("filch.go has changed since this test was skipped; please delete this skip")
-	}
-	t.Skip("skipping known failing test on darwin; fixed in progress by https://github.com/tailscale/tailscale/pull/18660")
-}
-
 func TestConcurrentWriteAndRead(t *testing.T) {
-	skipDarwin(t)
 	if replaceStderrSupportedForTest {
 		setupStderr(t)
 	}
@@ -296,7 +283,6 @@ func TestMaxLineSize(t *testing.T) {
 }
 
 func TestMaxFileSize(t *testing.T) {
-	skipDarwin(t)
 	if replaceStderrSupportedForTest {
 		t.Run("ReplaceStderr:true", func(t *testing.T) { testMaxFileSize(t, true) })
 	}