
chore(sqlite): use normalised tables for file names and versions (#10383)

This changes the files table to use normalised lookup tables for the
names and versions. The idea is that these values are often common to
all remote devices, and repeating a small integer is cheaper than
repeating a long string. A new benchmark bears this out: for a database
with 100k files shared between 31 devices, with worst-case assumptions
on version vector size, the database shrinks by roughly 50% and the
test finishes quicker:

    Current:
        db_bench_test.go:322: Total size: 6263.70 MiB
    --- PASS: TestBenchmarkSizeManyFilesRemotes (1084.89s)

    New:
        db_bench_test.go:326: Total size: 3049.95 MiB
    --- PASS: TestBenchmarkSizeManyFilesRemotes (776.97s)

The other benchmarks end up about the same, within the margin of
variability; the one possible exception is that RemoteNeed seems to be
a little slower on average:

                                          old files/s   new files/s
    Update/n=RemoteNeed/size=1000-8            5.051k        4.654k
    Update/n=RemoteNeed/size=2000-8            5.201k        4.384k
    Update/n=RemoteNeed/size=4000-8            4.943k        4.242k
    Update/n=RemoteNeed/size=8000-8            5.099k        3.527k
    Update/n=RemoteNeed/size=16000-8           3.686k        3.847k
    Update/n=RemoteNeed/size=30000-8           4.456k        3.482k

I'm not sure why; possibly that query can be optimised further in any case.
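
For illustration, here is a minimal standalone sketch of the interning
pattern this change introduces: names (and, equivalently, versions) are
upserted into a lookup table once, and every files row stores only the
integer index. The table shapes are simplified, and the driver choice
and error handling are illustrative only, not taken from this change.

    package main

    import (
        "database/sql"
        "fmt"
        "log"

        _ "modernc.org/sqlite" // any SQLite driver with RETURNING support; this import is an assumption
    )

    func main() {
        db, err := sql.Open("sqlite", ":memory:")
        if err != nil {
            log.Fatal(err)
        }
        defer db.Close()

        // Simplified shapes of the new file_names and files tables.
        ddl := []string{
            `CREATE TABLE file_names (idx INTEGER NOT NULL PRIMARY KEY, name TEXT NOT NULL UNIQUE)`,
            `CREATE TABLE files (device_idx INTEGER NOT NULL, name_idx INTEGER NOT NULL REFERENCES file_names(idx))`,
        }
        for _, stmt := range ddl {
            if _, err := db.Exec(stmt); err != nil {
                log.Fatal(err)
            }
        }

        name := "some/fairly/long/path/that/every/device/announces.txt"
        for dev := int64(1); dev <= 3; dev++ {
            // Intern the name. The seemingly pointless DO UPDATE (instead of
            // DO NOTHING) is what makes RETURNING yield the existing row's
            // idx when the name is already present.
            var nameIdx int64
            if err := db.QueryRow(`
                INSERT INTO file_names(name) VALUES (?)
                ON CONFLICT(name) DO UPDATE SET name = excluded.name
                RETURNING idx`, name).Scan(&nameIdx); err != nil {
                log.Fatal(err)
            }
            // Each device's row stores only the small integer reference.
            if _, err := db.Exec(`INSERT INTO files(device_idx, name_idx) VALUES (?, ?)`, dev, nameIdx); err != nil {
                log.Fatal(err)
            }
        }

        var names, rows int
        _ = db.QueryRow(`SELECT count(*) FROM file_names`).Scan(&names)
        _ = db.QueryRow(`SELECT count(*) FROM files`).Scan(&rows)
        fmt.Printf("%d name row, %d files rows\n", names, rows) // prints: 1 name row, 3 files rows
    }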

Signed-off-by: Jakob Borg <[email protected]>
Jakob Borg, 1 month ago
commit 9ee208b441

+ 2 - 19
cmd/syncthing/perfstats_unix.go

@@ -18,6 +18,7 @@ import (
 
 	"github.com/syncthing/syncthing/lib/build"
 	"github.com/syncthing/syncthing/lib/locations"
+	"github.com/syncthing/syncthing/lib/osutil"
 	"github.com/syncthing/syncthing/lib/protocol"
 	"golang.org/x/exp/constraints"
 )
@@ -61,7 +62,7 @@ func savePerfStats(file string) {
 			rss,
 			rate(prevIn, in, timeDiff, 1e3),
 			rate(prevOut, out, timeDiff, 1e3),
-			dirsize(locations.Get(locations.Database))/1024,
+			osutil.DirSize(locations.Get(locations.Database))/1024,
 		)
 
 		prevTime = t
@@ -84,21 +85,3 @@ func rate[T number](prev, cur T, d time.Duration, div float64) float64 {
 	rate := float64(diff) / d.Seconds() / div
 	return rate
 }
-
-func dirsize(location string) int64 {
-	entries, err := os.ReadDir(location)
-	if err != nil {
-		return 0
-	}
-
-	var size int64
-	for _, entry := range entries {
-		fi, err := entry.Info()
-		if err != nil {
-			continue
-		}
-		size += fi.Size()
-	}
-
-	return size
-}

+ 49 - 3
internal/db/sqlite/basedb.go

@@ -7,6 +7,7 @@
 package sqlite
 
 import (
+	"context"
 	"database/sql"
 	"embed"
 	"io/fs"
@@ -26,7 +27,7 @@ import (
 )
 
 const (
-	currentSchemaVersion = 4
+	currentSchemaVersion = 5
 	applicationIDMain    = 0x53546d6e // "STmn", Syncthing main database
 	applicationIDFolder  = 0x53546664 // "STfd", Syncthing folder database
 )
@@ -87,7 +88,31 @@ func openBase(path string, maxConns int, pragmas, schemaScripts, migrationScript
 		},
 	}
 
-	tx, err := db.sql.Beginx()
+	// Create a specific connection for the schema setup and migration to
+	// run in. We do this because we need to disable foreign keys for the
+	// duration, which is a thing that needs to happen outside of a
+	// transaction and affects the connection it's run on. So we need to a)
+	// make sure all our commands run on this specific connection (which the
+	// transaction accomplishes naturally) and b) make sure these pragmas
+	// don't leak to anyone else afterwards.
+	ctx := context.TODO()
+	conn, err := db.sql.Connx(ctx)
+	if err != nil {
+		return nil, wrap(err)
+	}
+	defer func() {
+		_, _ = conn.ExecContext(ctx, "PRAGMA foreign_keys = ON")
+		_, _ = conn.ExecContext(ctx, "PRAGMA legacy_alter_table = OFF")
+		conn.Close()
+	}()
+	if _, err := conn.ExecContext(ctx, "PRAGMA foreign_keys = OFF"); err != nil {
+		return nil, wrap(err)
+	}
+	if _, err := conn.ExecContext(ctx, "PRAGMA legacy_alter_table = ON"); err != nil {
+		return nil, wrap(err)
+	}
+
+	tx, err := conn.BeginTxx(ctx, nil)
 	if err != nil {
 		return nil, wrap(err)
 	}
@@ -124,6 +149,22 @@ func openBase(path string, maxConns int, pragmas, schemaScripts, migrationScript
 				return nil, wrap(err)
 			}
 		}
+
+		// Run the initial schema scripts once more. This is generally a
+		// no-op. However, dropping a table removes associated triggers etc,
+		// and that's a thing we sometimes do in migrations. To avoid having
+		// to repeat the setup of associated triggers and indexes in the
+		// migration, we re-run the initial schema scripts.
+		for _, script := range schemaScripts {
+			if err := db.runScripts(tx, script); err != nil {
+				return nil, wrap(err)
+			}
+		}
+
+		// Finally, ensure nothing we've done along the way has violated key integrity.
+		if _, err := conn.ExecContext(ctx, "PRAGMA foreign_key_check"); err != nil {
+			return nil, wrap(err)
+		}
 	}
 
 	// Set the current schema version, if not already set
@@ -271,7 +312,12 @@ nextScript:
 		// also statement-internal semicolons in the triggers.
 		for _, stmt := range strings.Split(string(bs), "\n;") {
 			if _, err := tx.Exec(s.expandTemplateVars(stmt)); err != nil {
-				return wrap(err, stmt)
+				if strings.Contains(stmt, "syncthing:ignore-failure") {
+					// We're ok with this failing. Just note it.
+					slog.Debug("Script failed, but with ignore-failure annotation", slog.String("script", scr), slogutil.Error(wrap(err, stmt)))
+				} else {
+					return wrap(err, stmt)
+				}
 			}
 		}
 	}
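
The dedicated connection above matters because database/sql hands out
connections from a pool: a PRAGMA issued via db.Exec could run on a
different connection than the one that later opens the migration
transaction, and PRAGMA foreign_keys is a no-op while a transaction is
open. A hypothetical helper isolating that pattern with plain
database/sql (sqlx's Connx/BeginTxx above do the same thing on a pinned
connection) might look like this:

    // Hypothetical helper, not part of this change: it isolates the
    // "pin a connection, flip the pragma, restore afterwards" pattern
    // using plain database/sql instead of sqlx.
    package sqlitemig

    import (
        "context"
        "database/sql"
    )

    // connWithoutFKs pins one pooled connection, disables foreign key
    // enforcement on it, and returns the connection together with a restore
    // function that re-enables the pragma and releases the connection.
    func connWithoutFKs(ctx context.Context, db *sql.DB) (*sql.Conn, func(), error) {
        conn, err := db.Conn(ctx)
        if err != nil {
            return nil, nil, err
        }
        // Must run on this connection and before BeginTx: the pragma is
        // per-connection and ignored once a transaction is open.
        if _, err := conn.ExecContext(ctx, "PRAGMA foreign_keys = OFF"); err != nil {
            conn.Close()
            return nil, nil, err
        }
        restore := func() {
            _, _ = conn.ExecContext(ctx, "PRAGMA foreign_keys = ON")
            conn.Close()
        }
        return conn, restore, nil
    }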

+ 61 - 1
internal/db/sqlite/db_bench_test.go

@@ -8,11 +8,13 @@ package sqlite
 
 import (
 	"fmt"
+	"os"
 	"testing"
 	"time"
 
 	"github.com/syncthing/syncthing/internal/timeutil"
 	"github.com/syncthing/syncthing/lib/config"
+	"github.com/syncthing/syncthing/lib/osutil"
 	"github.com/syncthing/syncthing/lib/protocol"
 	"github.com/syncthing/syncthing/lib/rand"
 )
@@ -223,7 +225,7 @@ func BenchmarkUpdate(b *testing.B) {
 }
 
 func TestBenchmarkDropAllRemote(t *testing.T) {
-	if testing.Short() {
+	if testing.Short() || os.Getenv("LONG_TEST") == "" {
 		t.Skip("slow test")
 	}
 
@@ -266,3 +268,61 @@ func TestBenchmarkDropAllRemote(t *testing.T) {
 	d := time.Since(t0)
 	t.Log("drop all took", d)
 }
+
+func TestBenchmarkSizeManyFilesRemotes(t *testing.T) {
+	// Reports the database size for a setup with many files and many remote
+	// devices each announcing every file, with fairly long file names and
+	// "worst case" version vectors.
+
+	if testing.Short() || os.Getenv("LONG_TEST") == "" {
+		t.Skip("slow test")
+	}
+
+	dir := t.TempDir()
+	db, err := Open(dir)
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Cleanup(func() {
+		if err := db.Close(); err != nil {
+			t.Fatal(err)
+		}
+	})
+
+	// This is equivalent to about 800 GiB in 100k files (i.e., 8 MiB per
+	// file), shared between 31 devices, each of which has touched every file.
+	const numFiles = 1e5
+	const numRemotes = 30
+	const numBlocks = 64
+	const filenameLen = 64
+
+	fs := make([]protocol.FileInfo, 1000)
+	n := 0
+	seq := 0
+	for n < numFiles {
+		for i := range fs {
+			seq++
+			fs[i] = genFile(rand.String(filenameLen), numBlocks, seq)
+			for r := range numRemotes {
+				fs[i].Version = fs[i].Version.Update(42 + protocol.ShortID(r))
+			}
+		}
+		if err := db.Update(folderID, protocol.LocalDeviceID, fs); err != nil {
+			t.Fatal(err)
+		}
+		for r := range numRemotes {
+			if err := db.Update(folderID, protocol.DeviceID{byte(42 + r)}, fs); err != nil {
+				t.Fatal(err)
+			}
+		}
+		n += len(fs)
+		t.Log(n, (numRemotes+1)*n)
+	}
+
+	if err := db.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	size := osutil.DirSize(dir)
+	t.Logf("Total size: %.02f MiB", float64(size)/1024/1024)
+}
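
To run this benchmark locally, both gates above have to pass: the
LONG_TEST environment variable must be non-empty and -short must not be
given, so something along the lines of
LONG_TEST=1 go test -v -run TestBenchmarkSizeManyFilesRemotes ./internal/db/sqlite
(package path inferred from the file location) should reproduce the
size figures quoted in the commit message.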

+ 31 - 0
internal/db/sqlite/db_service.go

@@ -125,6 +125,9 @@ func (s *Service) periodic(ctx context.Context) error {
 			if err := garbageCollectOldDeletedLocked(ctx, fdb); err != nil {
 				return wrap(err)
 			}
+			if err := garbageCollectNamesAndVersions(ctx, fdb); err != nil {
+				return wrap(err)
+			}
 			if err := garbageCollectBlocklistsAndBlocksLocked(ctx, fdb); err != nil {
 				return wrap(err)
 			}
@@ -152,6 +155,34 @@ func tidy(ctx context.Context, db *sqlx.DB) error {
 	return nil
 }
 
+func garbageCollectNamesAndVersions(ctx context.Context, fdb *folderDB) error {
+	l := slog.With("folder", fdb.folderID, "fdb", fdb.baseName)
+
+	res, err := fdb.stmt(`
+		DELETE FROM file_names
+		WHERE NOT EXISTS (SELECT 1 FROM files f WHERE f.name_idx = idx)
+	`).Exec()
+	if err != nil {
+		return wrap(err, "delete names")
+	}
+	if aff, err := res.RowsAffected(); err == nil {
+		l.DebugContext(ctx, "Removed old file names", "affected", aff)
+	}
+
+	res, err = fdb.stmt(`
+		DELETE FROM file_versions
+		WHERE NOT EXISTS (SELECT 1 FROM files f WHERE f.version_idx = idx)
+	`).Exec()
+	if err != nil {
+		return wrap(err, "delete versions")
+	}
+	if aff, err := res.RowsAffected(); err == nil {
+		l.DebugContext(ctx, "Removed old file versions", "affected", aff)
+	}
+
+	return nil
+}
+
 func garbageCollectOldDeletedLocked(ctx context.Context, fdb *folderDB) error {
 	l := slog.With("folder", fdb.folderID, "fdb", fdb.baseName)
 	if fdb.deleteRetention <= 0 {
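
The two anti-join deletes in garbageCollectNamesAndVersions are why the
schema file below adds the files_name_idx_only and files_version_idx_only
indexes: without an index on files(name_idx) and files(version_idx), each
NOT EXISTS probe would fall back to scanning the files table.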

+ 2 - 2
internal/db/sqlite/folderdb_counts.go

@@ -84,7 +84,7 @@ func (s *folderDB) needSizeRemote(device protocol.DeviceID) (db.Counts, error) {
 		WHERE g.local_flags & {{.FlagLocalGlobal}} != 0 AND NOT g.deleted AND g.local_flags & {{.LocalInvalidFlags}} = 0 AND NOT EXISTS (
 			SELECT 1 FROM FILES f
 			INNER JOIN devices d ON d.idx = f.device_idx
-			WHERE f.name = g.name AND f.version = g.version AND d.device_id = ?
+			WHERE f.name_idx = g.name_idx AND f.version_idx = g.version_idx AND d.device_id = ?
 		)
 		GROUP BY g.type, g.local_flags, g.deleted
 
@@ -94,7 +94,7 @@ func (s *folderDB) needSizeRemote(device protocol.DeviceID) (db.Counts, error) {
 		WHERE g.local_flags & {{.FlagLocalGlobal}} != 0 AND g.deleted AND g.local_flags & {{.LocalInvalidFlags}} = 0 AND EXISTS (
 			SELECT 1 FROM FILES f
 			INNER JOIN devices d ON d.idx = f.device_idx
-			WHERE f.name = g.name AND d.device_id = ? AND NOT f.deleted AND f.local_flags & {{.LocalInvalidFlags}} = 0
+			WHERE f.name_idx = g.name_idx AND d.device_id = ? AND NOT f.deleted AND f.local_flags & {{.LocalInvalidFlags}} = 0
 		)
 		GROUP BY g.type, g.local_flags, g.deleted
 	`).Select(&res, device.String(),

+ 21 - 14
internal/db/sqlite/folderdb_global.go

@@ -27,7 +27,8 @@ func (s *folderDB) GetGlobalFile(file string) (protocol.FileInfo, bool, error) {
 		SELECT fi.fiprotobuf, bl.blprotobuf FROM fileinfos fi
 		INNER JOIN files f on fi.sequence = f.sequence
 		LEFT JOIN blocklists bl ON bl.blocklist_hash = f.blocklist_hash
-		WHERE f.name = ? AND f.local_flags & {{.FlagLocalGlobal}} != 0
+		INNER JOIN file_names n ON f.name_idx = n.idx
+		WHERE n.name = ? AND f.local_flags & {{.FlagLocalGlobal}} != 0
 	`).Get(&ind, file)
 	if errors.Is(err, sql.ErrNoRows) {
 		return protocol.FileInfo{}, false, nil
@@ -49,8 +50,9 @@ func (s *folderDB) GetGlobalAvailability(file string) ([]protocol.DeviceID, erro
 	err := s.stmt(`
 		SELECT d.device_id FROM files f
 		INNER JOIN devices d ON d.idx = f.device_idx
-		INNER JOIN files g ON g.version = f.version AND g.name = f.name
-		WHERE g.name = ? AND g.local_flags & {{.FlagLocalGlobal}} != 0 AND f.device_idx != {{.LocalDeviceIdx}}
+		INNER JOIN files g ON g.version_idx = f.version_idx AND g.name_idx = f.name_idx
+		INNER JOIN file_names n ON f.name_idx = n.idx
+		WHERE n.name = ? AND g.local_flags & {{.FlagLocalGlobal}} != 0 AND f.device_idx != {{.LocalDeviceIdx}}
 		ORDER BY d.device_id
 	`).Select(&devStrs, file)
 	if errors.Is(err, sql.ErrNoRows) {
@@ -74,9 +76,10 @@ func (s *folderDB) GetGlobalAvailability(file string) ([]protocol.DeviceID, erro
 
 func (s *folderDB) AllGlobalFiles() (iter.Seq[db.FileMetadata], func() error) {
 	it, errFn := iterStructs[db.FileMetadata](s.stmt(`
-		SELECT f.sequence, f.name, f.type, f.modified as modnanos, f.size, f.deleted, f.local_flags as localflags FROM files f
+		SELECT f.sequence, n.name, f.type, f.modified as modnanos, f.size, f.deleted, f.local_flags as localflags FROM files f
+		INNER JOIN file_names n ON f.name_idx = n.idx
 		WHERE f.local_flags & {{.FlagLocalGlobal}} != 0
-		ORDER BY f.name
+		ORDER BY n.name
 	`).Queryx())
 	return itererr.Map(it, errFn, func(m db.FileMetadata) (db.FileMetadata, error) {
 		m.Name = osutil.NativeFilename(m.Name)
@@ -93,9 +96,10 @@ func (s *folderDB) AllGlobalFilesPrefix(prefix string) (iter.Seq[db.FileMetadata
 	end := prefixEnd(prefix)
 
 	it, errFn := iterStructs[db.FileMetadata](s.stmt(`
-		SELECT f.sequence, f.name, f.type, f.modified as modnanos, f.size, f.deleted, f.local_flags as localflags FROM files f
-		WHERE f.name >= ? AND f.name < ? AND f.local_flags & {{.FlagLocalGlobal}} != 0
-		ORDER BY f.name
+		SELECT f.sequence, n.name, f.type, f.modified as modnanos, f.size, f.deleted, f.local_flags as localflags FROM files f
+		INNER JOIN file_names n ON f.name_idx = n.idx
+		WHERE n.name >= ? AND n.name < ? AND f.local_flags & {{.FlagLocalGlobal}} != 0
+		ORDER BY n.name
 	`).Queryx(prefix, end))
 	return itererr.Map(it, errFn, func(m db.FileMetadata) (db.FileMetadata, error) {
 		m.Name = osutil.NativeFilename(m.Name)
@@ -109,7 +113,7 @@ func (s *folderDB) AllNeededGlobalFiles(device protocol.DeviceID, order config.P
 	case config.PullOrderRandom:
 		selectOpts = "ORDER BY RANDOM()"
 	case config.PullOrderAlphabetic:
-		selectOpts = "ORDER BY g.name ASC"
+		selectOpts = "ORDER BY n.name ASC"
 	case config.PullOrderSmallestFirst:
 		selectOpts = "ORDER BY g.size ASC"
 	case config.PullOrderLargestFirst:
@@ -137,9 +141,10 @@ func (s *folderDB) AllNeededGlobalFiles(device protocol.DeviceID, order config.P
 func (s *folderDB) neededGlobalFilesLocal(selectOpts string) (iter.Seq[protocol.FileInfo], func() error) {
 	// Select all the non-ignored files with the need bit set.
 	it, errFn := iterStructs[indirectFI](s.stmt(`
-		SELECT fi.fiprotobuf, bl.blprotobuf, g.name, g.size, g.modified FROM fileinfos fi
+		SELECT fi.fiprotobuf, bl.blprotobuf, n.name, g.size, g.modified FROM fileinfos fi
 		INNER JOIN files g on fi.sequence = g.sequence
 		LEFT JOIN blocklists bl ON bl.blocklist_hash = g.blocklist_hash
+		INNER JOIN file_names n ON g.name_idx = n.idx
 		WHERE g.local_flags & {{.FlagLocalIgnored}} = 0 AND g.local_flags & {{.FlagLocalNeeded}} != 0
 	` + selectOpts).Queryx())
 	return itererr.Map(it, errFn, indirectFI.FileInfo)
@@ -155,24 +160,26 @@ func (s *folderDB) neededGlobalFilesRemote(device protocol.DeviceID, selectOpts
 	//   non-deleted and valid remote file (of any version)
 
 	it, errFn := iterStructs[indirectFI](s.stmt(`
-		SELECT fi.fiprotobuf, bl.blprotobuf, g.name, g.size, g.modified FROM fileinfos fi
+		SELECT fi.fiprotobuf, bl.blprotobuf, n.name, g.size, g.modified FROM fileinfos fi
 		INNER JOIN files g on fi.sequence = g.sequence
 		LEFT JOIN blocklists bl ON bl.blocklist_hash = g.blocklist_hash
+		INNER JOIN file_names n ON g.name_idx = n.idx
 		WHERE g.local_flags & {{.FlagLocalGlobal}} != 0 AND NOT g.deleted AND g.local_flags & {{.LocalInvalidFlags}} = 0 AND NOT EXISTS (
 			SELECT 1 FROM FILES f
 			INNER JOIN devices d ON d.idx = f.device_idx
-			WHERE f.name = g.name AND f.version = g.version AND d.device_id = ?
+			WHERE f.name_idx = g.name_idx AND f.version_idx = g.version_idx AND d.device_id = ?
 		)
 
 		UNION ALL
 
-		SELECT fi.fiprotobuf, bl.blprotobuf, g.name, g.size, g.modified FROM fileinfos fi
+		SELECT fi.fiprotobuf, bl.blprotobuf, n.name, g.size, g.modified FROM fileinfos fi
 		INNER JOIN files g on fi.sequence = g.sequence
 		LEFT JOIN blocklists bl ON bl.blocklist_hash = g.blocklist_hash
+		INNER JOIN file_names n ON g.name_idx = n.idx
 		WHERE g.local_flags & {{.FlagLocalGlobal}} != 0 AND g.deleted AND g.local_flags & {{.LocalInvalidFlags}} = 0 AND EXISTS (
 			SELECT 1 FROM FILES f
 			INNER JOIN devices d ON d.idx = f.device_idx
-			WHERE f.name = g.name AND d.device_id = ? AND NOT f.deleted AND f.local_flags & {{.LocalInvalidFlags}} = 0 
+			WHERE f.name_idx = g.name_idx AND d.device_id = ? AND NOT f.deleted AND f.local_flags & {{.LocalInvalidFlags}} = 0
 		)
 	`+selectOpts).Queryx(
 		device.String(),

+ 13 - 7
internal/db/sqlite/folderdb_local.go

@@ -32,7 +32,8 @@ func (s *folderDB) GetDeviceFile(device protocol.DeviceID, file string) (protoco
 		INNER JOIN files f on fi.sequence = f.sequence
 		LEFT JOIN blocklists bl ON bl.blocklist_hash = f.blocklist_hash
 		INNER JOIN devices d ON f.device_idx = d.idx
-		WHERE d.device_id = ? AND f.name = ?
+		INNER JOIN file_names n ON f.name_idx = n.idx
+		WHERE d.device_id = ? AND n.name = ?
 	`).Get(&ind, device.String(), file)
 	if errors.Is(err, sql.ErrNoRows) {
 		return protocol.FileInfo{}, false, nil
@@ -87,14 +88,16 @@ func (s *folderDB) AllLocalFilesWithPrefix(device protocol.DeviceID, prefix stri
 		INNER JOIN files f on fi.sequence = f.sequence
 		LEFT JOIN blocklists bl ON bl.blocklist_hash = f.blocklist_hash
 		INNER JOIN devices d ON d.idx = f.device_idx
-		WHERE d.device_id = ? AND f.name >= ? AND f.name < ?
+		INNER JOIN file_names n ON f.name_idx = n.idx
+		WHERE d.device_id = ? AND n.name >= ? AND n.name < ?
 	`, device.String(), prefix, end))
 	return itererr.Map(it, errFn, indirectFI.FileInfo)
 }
 
 func (s *folderDB) AllLocalFilesWithBlocksHash(h []byte) (iter.Seq[db.FileMetadata], func() error) {
 	return iterStructs[db.FileMetadata](s.stmt(`
-		SELECT f.sequence, f.name, f.type, f.modified as modnanos, f.size, f.deleted, f.local_flags as localflags FROM files f
+		SELECT f.sequence, n.name, f.type, f.modified as modnanos, f.size, f.deleted, f.local_flags as localflags FROM files f
+		INNER JOIN file_names n ON f.name_idx = n.idx
 		WHERE f.device_idx = {{.LocalDeviceIdx}} AND f.blocklist_hash = ?
 	`).Queryx(h))
 }
@@ -104,7 +107,8 @@ func (s *folderDB) AllLocalBlocksWithHash(hash []byte) (iter.Seq[db.BlockMapEntr
 	// & blocklists is deferred (garbage collected) while the files list is
 	// not. This filters out blocks that are in fact deleted.
 	return iterStructs[db.BlockMapEntry](s.stmt(`
-		SELECT f.blocklist_hash as blocklisthash, b.idx as blockindex, b.offset, b.size, f.name as filename FROM files f
+		SELECT f.blocklist_hash as blocklisthash, b.idx as blockindex, b.offset, b.size, n.name as filename FROM files f
+		INNER JOIN file_names n ON f.name_idx = n.idx
 		LEFT JOIN blocks b ON f.blocklist_hash = b.blocklist_hash
 		WHERE f.device_idx = {{.LocalDeviceIdx}} AND b.hash = ?
 	`).Queryx(hash))
@@ -170,10 +174,12 @@ func (s *folderDB) DebugFilePattern(out io.Writer, name string) error {
 	}
 	name = "%" + name + "%"
 	res := itererr.Zip(iterStructs[hashFileMetadata](s.stmt(`
-		SELECT f.sequence, f.name, f.type, f.modified as modnanos, f.size, f.deleted, f.local_flags as localflags, f.version, f.blocklist_hash as blocklisthash, d.device_id as deviceid FROM files f
+		SELECT f.sequence, n.name, f.type, f.modified as modnanos, f.size, f.deleted, f.local_flags as localflags, v.version, f.blocklist_hash as blocklisthash, d.device_id as deviceid FROM files f
 		INNER JOIN devices d ON d.idx = f.device_idx
-		WHERE f.name LIKE ?
-		ORDER BY f.name, f.device_idx
+		INNER JOIN file_names n ON n.idx = f.name_idx
+		INNER JOIN file_versions v ON v.idx = f.version_idx
+		WHERE n.name LIKE ?
+		ORDER BY n.name, f.device_idx
 	`).Queryx(name)))
 
 	delMap := map[bool]string{

+ 5 - 8
internal/db/sqlite/folderdb_open.go

@@ -95,16 +95,13 @@ func openFolderDBForMigration(folder, path string, deleteRetention time.Duration
 
 func (s *folderDB) deviceIdxLocked(deviceID protocol.DeviceID) (int64, error) {
 	devStr := deviceID.String()
-	if _, err := s.stmt(`
-		INSERT OR IGNORE INTO devices(device_id)
-		VALUES (?)
-	`).Exec(devStr); err != nil {
-		return 0, wrap(err)
-	}
 	var idx int64
 	if err := s.stmt(`
-		SELECT idx FROM devices
-		WHERE device_id = ?
+		INSERT INTO devices(device_id)
+		VALUES (?)
+		ON CONFLICT(device_id) DO UPDATE
+			SET device_id = excluded.device_id
+		RETURNING idx
 	`).Get(&idx, devStr); err != nil {
 		return 0, wrap(err)
 	}

+ 51 - 11
internal/db/sqlite/folderdb_update.go

@@ -46,9 +46,33 @@ func (s *folderDB) Update(device protocol.DeviceID, fs []protocol.FileInfo) erro
 	defer tx.Rollback() //nolint:errcheck
 	txp := &txPreparedStmts{Tx: tx}
 
+	//nolint:sqlclosecheck
+	insertNameStmt, err := txp.Preparex(`
+		INSERT INTO file_names(name)
+		VALUES (?)
+		ON CONFLICT(name) DO UPDATE
+			SET name = excluded.name
+		RETURNING idx
+	`)
+	if err != nil {
+		return wrap(err, "prepare insert name")
+	}
+
+	//nolint:sqlclosecheck
+	insertVersionStmt, err := txp.Preparex(`
+		INSERT INTO file_versions (version)
+		VALUES (?)
+		ON CONFLICT(version) DO UPDATE
+			SET version = excluded.version
+		RETURNING idx
+	`)
+	if err != nil {
+		return wrap(err, "prepare insert version")
+	}
+
 	//nolint:sqlclosecheck
 	insertFileStmt, err := txp.Preparex(`
-		INSERT OR REPLACE INTO files (device_idx, remote_sequence, name, type, modified, size, version, deleted, local_flags, blocklist_hash)
+		INSERT OR REPLACE INTO files (device_idx, remote_sequence, type, modified, size, deleted, local_flags, blocklist_hash, name_idx, version_idx)
 		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
 		RETURNING sequence
 	`)
@@ -102,8 +126,19 @@ func (s *folderDB) Update(device protocol.DeviceID, fs []protocol.FileInfo) erro
 			prevRemoteSeq = f.Sequence
 			remoteSeq = &f.Sequence
 		}
+
+		var nameIdx int64
+		if err := insertNameStmt.Get(&nameIdx, f.Name); err != nil {
+			return wrap(err, "insert name")
+		}
+
+		var versionIdx int64
+		if err := insertVersionStmt.Get(&versionIdx, f.Version.String()); err != nil {
+			return wrap(err, "insert version")
+		}
+
 		var localSeq int64
-		if err := insertFileStmt.Get(&localSeq, deviceIdx, remoteSeq, f.Name, f.Type, f.ModTime().UnixNano(), f.Size, f.Version.String(), f.IsDeleted(), f.LocalFlags, blockshash); err != nil {
+		if err := insertFileStmt.Get(&localSeq, deviceIdx, remoteSeq, f.Type, f.ModTime().UnixNano(), f.Size, f.IsDeleted(), f.LocalFlags, blockshash, nameIdx, versionIdx); err != nil {
 			return wrap(err, "insert file")
 		}
 
@@ -246,7 +281,9 @@ func (s *folderDB) DropFilesNamed(device protocol.DeviceID, names []string) erro
 
 	query, args, err := sqlx.In(`
 		DELETE FROM files
-		WHERE device_idx = ? AND name IN (?)
+		WHERE device_idx = ? AND name_idx IN (
+			SELECT idx FROM file_names WHERE name IN (?)
+		)
 	`, deviceIdx, names)
 	if err != nil {
 		return wrap(err)
@@ -299,12 +336,13 @@ func (s *folderDB) recalcGlobalForFolder(txp *txPreparedStmts) error {
 	// recalculate.
 	//nolint:sqlclosecheck
 	namesStmt, err := txp.Preparex(`
-		SELECT f.name FROM files f
+		SELECT n.name FROM files f
+		INNER JOIN file_names n ON n.idx = f.name_idx
 		WHERE NOT EXISTS (
 			SELECT 1 FROM files g
-			WHERE g.name = f.name AND g.local_flags & ? != 0
+			WHERE g.name_idx = f.name_idx AND g.local_flags & ? != 0
 		)
-		GROUP BY name
+		GROUP BY n.name
 	`)
 	if err != nil {
 		return wrap(err)
@@ -329,11 +367,13 @@ func (s *folderDB) recalcGlobalForFolder(txp *txPreparedStmts) error {
 func (s *folderDB) recalcGlobalForFile(txp *txPreparedStmts, file string) error {
 	//nolint:sqlclosecheck
 	selStmt, err := txp.Preparex(`
-		SELECT name, device_idx, sequence, modified, version, deleted, local_flags FROM files
-		WHERE name = ?
+		SELECT n.name, f.device_idx, f.sequence, f.modified, v.version, f.deleted, f.local_flags FROM files f
+		INNER JOIN file_versions v ON v.idx = f.version_idx
+		INNER JOIN file_names n ON n.idx = f.name_idx
+		WHERE n.name = ?
 	`)
 	if err != nil {
-		return wrap(err)
+		return wrap(err, "prepare select")
 	}
 	es, err := itererr.Collect(iterStructs[fileRow](selStmt.Queryx(file)))
 	if err != nil {
@@ -389,10 +429,10 @@ func (s *folderDB) recalcGlobalForFile(txp *txPreparedStmts, file string) error
 	//nolint:sqlclosecheck
 	upStmt, err = txp.Preparex(`
 		UPDATE files SET local_flags = local_flags & ?
-		WHERE name = ? AND sequence != ? AND local_flags & ? != 0
+		WHERE name_idx = (SELECT idx FROM file_names WHERE name = ?) AND sequence != ? AND local_flags & ? != 0
 	`)
 	if err != nil {
-		return wrap(err)
+		return wrap(err, "prepare update")
 	}
 	if _, err := upStmt.Exec(^(protocol.FlagLocalNeeded | protocol.FlagLocalGlobal), global.Name, global.Sequence, protocol.FlagLocalNeeded|protocol.FlagLocalGlobal); err != nil {
 		return wrap(err)

+ 53 - 0
internal/db/sqlite/sql/migrations/folder/05-normalize-files.sql

@@ -0,0 +1,53 @@
+-- Copyright (C) 2025 The Syncthing Authors.
+--
+-- This Source Code Form is subject to the terms of the Mozilla Public
+-- License, v. 2.0. If a copy of the MPL was not distributed with this file,
+-- You can obtain one at https://mozilla.org/MPL/2.0/.
+
+-- Grab all unique names into the names table
+
+INSERT INTO file_names (idx, name) SELECT DISTINCT null, name FROM files
+;
+
+-- Grab all unique versions into the versions table
+
+INSERT INTO file_versions (idx, version) SELECT DISTINCT null, version FROM files
+;
+
+-- Create the new files table
+
+DROP TABLE IF EXISTS files_v5
+;
+
+CREATE TABLE files_v5 (
+    device_idx INTEGER NOT NULL,
+    sequence INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
+    remote_sequence INTEGER,
+    name_idx INTEGER NOT NULL, -- changed
+    type INTEGER NOT NULL,
+    modified INTEGER NOT NULL,
+    size INTEGER NOT NULL,
+    version_idx INTEGER NOT NULL, -- changed
+    deleted INTEGER NOT NULL,
+    local_flags INTEGER NOT NULL,
+    blocklist_hash BLOB,
+    FOREIGN KEY(device_idx) REFERENCES devices(idx) ON DELETE CASCADE,
+    FOREIGN KEY(name_idx) REFERENCES file_names(idx), -- added
+    FOREIGN KEY(version_idx) REFERENCES file_versions(idx) -- added
+) STRICT
+;
+
+-- Populate the new files table and move it in place
+
+INSERT INTO files_v5
+    SELECT f.device_idx, f.sequence, f.remote_sequence, n.idx as name_idx, f.type, f.modified, f.size, v.idx as version_idx, f.deleted, f.local_flags, f.blocklist_hash
+    FROM files f
+    INNER JOIN file_names n ON n.name = f.name
+    INNER JOIN file_versions v ON v.version = f.version
+;
+
+DROP TABLE files
+;
+
+ALTER TABLE files_v5 RENAME TO files
+;
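
Rebuilding into files_v5 and renaming, rather than altering in place, is
what the pragma handling and the schema re-run in basedb.go above exist
to support: SQLite's ALTER TABLE cannot change a column's type or add
foreign keys, dropping the old files table takes its indexes and
triggers with it (hence re-running the schema scripts afterwards), and
the final foreign_key_check verifies that the copied name_idx and
version_idx references are intact.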

+ 25 - 7
internal/db/sqlite/sql/schema/folder/20-files.sql

@@ -25,15 +25,27 @@ CREATE TABLE IF NOT EXISTS files (
     device_idx INTEGER NOT NULL, -- actual device ID or LocalDeviceID
     sequence INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, -- our local database sequence, for each and every entry
     remote_sequence INTEGER, -- remote device's sequence number, null for local or synthetic entries
-    name TEXT NOT NULL COLLATE BINARY,
+    name_idx INTEGER NOT NULL,
     type INTEGER NOT NULL, -- protocol.FileInfoType
     modified INTEGER NOT NULL, -- Unix nanos
     size INTEGER NOT NULL,
-    version TEXT NOT NULL COLLATE BINARY,
+    version_idx INTEGER NOT NULL,
     deleted INTEGER NOT NULL, -- boolean
     local_flags INTEGER NOT NULL,
     blocklist_hash BLOB, -- null when there are no blocks
-    FOREIGN KEY(device_idx) REFERENCES devices(idx) ON DELETE CASCADE
+    FOREIGN KEY(device_idx) REFERENCES devices(idx) ON DELETE CASCADE,
+    FOREIGN KEY(name_idx) REFERENCES file_names(idx),
+    FOREIGN KEY(version_idx) REFERENCES file_versions(idx)
+) STRICT
+;
+CREATE TABLE IF NOT EXISTS file_names (
+    idx INTEGER NOT NULL PRIMARY KEY,
+    name TEXT NOT NULL UNIQUE COLLATE BINARY
+) STRICT
+;
+CREATE TABLE IF NOT EXISTS file_versions (
+    idx INTEGER NOT NULL PRIMARY KEY,
+    version TEXT NOT NULL UNIQUE COLLATE BINARY
 ) STRICT
 ;
 -- FileInfos store the actual protobuf object. We do this separately to keep
@@ -49,11 +61,17 @@ CREATE UNIQUE INDEX IF NOT EXISTS files_remote_sequence ON files (device_idx, re
     WHERE remote_sequence IS NOT NULL
 ;
 -- There can be only one file per folder, device, and name
-CREATE UNIQUE INDEX IF NOT EXISTS files_device_name ON files (device_idx, name)
-;
--- We want to be able to look up & iterate files based on just folder and name
-CREATE INDEX IF NOT EXISTS files_name_only ON files (name)
+CREATE UNIQUE INDEX IF NOT EXISTS files_device_name ON files (device_idx, name_idx)
 ;
 -- We want to be able to look up & iterate files based on blocks hash
 CREATE INDEX IF NOT EXISTS files_blocklist_hash_only ON files (blocklist_hash, device_idx) WHERE blocklist_hash IS NOT NULL
 ;
+-- We need to look by name_idx or version_idx for garbage collection.
+-- This will fail pre-migration for v4 schemas, which is fine.
+-- syncthing:ignore-failure
+CREATE INDEX IF NOT EXISTS files_name_idx_only ON files (name_idx)
+;
+-- This will fail pre-migration for v4 schemas, which is fine.
+-- syncthing:ignore-failure
+CREATE INDEX IF NOT EXISTS files_version_idx_only ON files (version_idx)
+;

+ 19 - 0
lib/osutil/osutil.go

@@ -8,6 +8,7 @@
 package osutil
 
 import (
+	"os"
 	"path/filepath"
 	"strings"
 	"sync"
@@ -142,3 +143,21 @@ func IsDeleted(ffs fs.Filesystem, name string) bool {
 	}
 	return false
 }
+
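+// DirSize returns the sum of the sizes of the entries directly inside
+// location. It does not recurse into subdirectories; entries whose info
+// cannot be read are skipped, and a directory that cannot be read at all
+// yields zero.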
+func DirSize(location string) int64 {
+	entries, err := os.ReadDir(location)
+	if err != nil {
+		return 0
+	}
+
+	var size int64
+	for _, entry := range entries {
+		fi, err := entry.Info()
+		if err != nil {
+			continue
+		}
+		size += fi.Size()
+	}
+
+	return size
+}