diff --git a/linux-tkg-patches/6.6/0008-6.6-bcachefs.patch b/linux-tkg-patches/6.6/0008-6.6-bcachefs.patch index fc6193a..08b210f 100644 --- a/linux-tkg-patches/6.6/0008-6.6-bcachefs.patch +++ b/linux-tkg-patches/6.6/0008-6.6-bcachefs.patch @@ -1,6 +1,6 @@ -From eba9f7cba146ee634aa1f329b87e5fe09eef9f51 Mon Sep 17 00:00:00 2001 +From 0a195912e89bd49e868e7e4879d137091f0762c8 Mon Sep 17 00:00:00 2001 From: Piotr Gorski -Date: Fri, 27 Oct 2023 15:37:35 +0200 +Date: Wed, 8 Nov 2023 08:07:19 +0100 Subject: [PATCH] bcachefs Signed-off-by: Piotr Gorski @@ -13,99 +13,101 @@ Signed-off-by: Piotr Gorski drivers/md/bcache/util.h | 3 +- fs/Kconfig | 1 + fs/Makefile | 1 + - fs/bcachefs/Kconfig | 85 + - fs/bcachefs/Makefile | 88 + + fs/bcachefs/Kconfig | 83 + + fs/bcachefs/Makefile | 89 + fs/bcachefs/acl.c | 463 +++ fs/bcachefs/acl.h | 60 + - fs/bcachefs/alloc_background.c | 2146 +++++++++++ - fs/bcachefs/alloc_background.h | 258 ++ - fs/bcachefs/alloc_foreground.c | 1576 ++++++++ + fs/bcachefs/alloc_background.c | 2159 +++++++++++ + fs/bcachefs/alloc_background.h | 259 ++ + fs/bcachefs/alloc_foreground.c | 1600 ++++++++ fs/bcachefs/alloc_foreground.h | 224 ++ fs/bcachefs/alloc_types.h | 126 + - fs/bcachefs/backpointers.c | 868 +++++ - fs/bcachefs/backpointers.h | 131 + - fs/bcachefs/bbpos.h | 48 + - fs/bcachefs/bcachefs.h | 1156 ++++++ - fs/bcachefs/bcachefs_format.h | 2413 +++++++++++++ + fs/bcachefs/backpointers.c | 860 +++++ + fs/bcachefs/backpointers.h | 140 + + fs/bcachefs/bbpos.h | 37 + + fs/bcachefs/bbpos_types.h | 18 + + fs/bcachefs/bcachefs.h | 1161 ++++++ + fs/bcachefs/bcachefs_format.h | 2425 ++++++++++++ fs/bcachefs/bcachefs_ioctl.h | 368 ++ fs/bcachefs/bkey.c | 1120 ++++++ - fs/bcachefs/bkey.h | 782 ++++ + fs/bcachefs/bkey.h | 778 ++++ fs/bcachefs/bkey_buf.h | 61 + fs/bcachefs/bkey_cmp.h | 129 + - fs/bcachefs/bkey_methods.c | 458 +++ - fs/bcachefs/bkey_methods.h | 188 + - fs/bcachefs/bkey_sort.c | 201 ++ + fs/bcachefs/bkey_methods.c | 459 +++ + fs/bcachefs/bkey_methods.h | 179 + + fs/bcachefs/bkey_sort.c | 201 + fs/bcachefs/bkey_sort.h | 54 + fs/bcachefs/bset.c | 1592 ++++++++ fs/bcachefs/bset.h | 541 +++ - fs/bcachefs/btree_cache.c | 1202 ++++++ - fs/bcachefs/btree_cache.h | 130 + - fs/bcachefs/btree_gc.c | 2111 +++++++++++ + fs/bcachefs/btree_cache.c | 1215 ++++++ + fs/bcachefs/btree_cache.h | 131 + + fs/bcachefs/btree_gc.c | 2145 +++++++++++ fs/bcachefs/btree_gc.h | 114 + - fs/bcachefs/btree_io.c | 2223 ++++++++++++ + fs/bcachefs/btree_io.c | 2298 ++++++++++++ fs/bcachefs/btree_io.h | 228 ++ - fs/bcachefs/btree_iter.c | 3215 +++++++++++++++++ - fs/bcachefs/btree_iter.h | 939 +++++ + fs/bcachefs/btree_iter.c | 3242 +++++++++++++++++ + fs/bcachefs/btree_iter.h | 943 +++++ fs/bcachefs/btree_journal_iter.c | 531 +++ fs/bcachefs/btree_journal_iter.h | 57 + fs/bcachefs/btree_key_cache.c | 1072 ++++++ fs/bcachefs/btree_key_cache.h | 48 + - fs/bcachefs/btree_locking.c | 791 ++++ - fs/bcachefs/btree_locking.h | 423 +++ - fs/bcachefs/btree_trans_commit.c | 1150 ++++++ - fs/bcachefs/btree_types.h | 739 ++++ + fs/bcachefs/btree_locking.c | 817 +++++ + fs/bcachefs/btree_locking.h | 433 +++ + fs/bcachefs/btree_trans_commit.c | 1145 ++++++ + fs/bcachefs/btree_types.h | 756 ++++ fs/bcachefs/btree_update.c | 933 +++++ fs/bcachefs/btree_update.h | 340 ++ - fs/bcachefs/btree_update_interior.c | 2480 +++++++++++++ + fs/bcachefs/btree_update_interior.c | 2474 +++++++++++++ fs/bcachefs/btree_update_interior.h | 337 ++ fs/bcachefs/btree_write_buffer.c | 375 ++ fs/bcachefs/btree_write_buffer.h | 14 + fs/bcachefs/btree_write_buffer_types.h | 44 + - fs/bcachefs/buckets.c | 2106 +++++++++++ - fs/bcachefs/buckets.h | 443 +++ + fs/bcachefs/buckets.c | 2168 +++++++++++ + fs/bcachefs/buckets.h | 458 +++ fs/bcachefs/buckets_types.h | 92 + fs/bcachefs/buckets_waiting_for_journal.c | 166 + fs/bcachefs/buckets_waiting_for_journal.h | 15 + .../buckets_waiting_for_journal_types.h | 23 + fs/bcachefs/chardev.c | 784 ++++ fs/bcachefs/chardev.h | 31 + - fs/bcachefs/checksum.c | 804 +++++ + fs/bcachefs/checksum.c | 804 ++++ fs/bcachefs/checksum.h | 213 ++ fs/bcachefs/clock.c | 193 + fs/bcachefs/clock.h | 38 + fs/bcachefs/clock_types.h | 37 + - fs/bcachefs/compress.c | 710 ++++ - fs/bcachefs/compress.h | 55 + + fs/bcachefs/compress.c | 728 ++++ + fs/bcachefs/compress.h | 73 + fs/bcachefs/counters.c | 107 + fs/bcachefs/counters.h | 17 + - fs/bcachefs/darray.h | 87 + - fs/bcachefs/data_update.c | 558 +++ - fs/bcachefs/data_update.h | 43 + + fs/bcachefs/darray.h | 93 + + fs/bcachefs/data_update.c | 551 +++ + fs/bcachefs/data_update.h | 44 + fs/bcachefs/debug.c | 954 +++++ fs/bcachefs/debug.h | 32 + - fs/bcachefs/dirent.c | 587 +++ + fs/bcachefs/dirent.c | 577 +++ fs/bcachefs/dirent.h | 70 + - fs/bcachefs/disk_groups.c | 550 +++ - fs/bcachefs/disk_groups.h | 106 + - fs/bcachefs/ec.c | 1966 ++++++++++ + fs/bcachefs/disk_groups.c | 620 ++++ + fs/bcachefs/disk_groups.h | 111 + + fs/bcachefs/disk_groups_types.h | 18 + + fs/bcachefs/ec.c | 1969 ++++++++++ fs/bcachefs/ec.h | 260 ++ fs/bcachefs/ec_types.h | 41 + fs/bcachefs/errcode.c | 68 + - fs/bcachefs/errcode.h | 265 ++ - fs/bcachefs/error.c | 293 ++ - fs/bcachefs/error.h | 206 ++ + fs/bcachefs/errcode.h | 269 ++ + fs/bcachefs/error.c | 299 ++ + fs/bcachefs/error.h | 242 ++ fs/bcachefs/extent_update.c | 173 + fs/bcachefs/extent_update.h | 12 + - fs/bcachefs/extents.c | 1403 +++++++ - fs/bcachefs/extents.h | 758 ++++ + fs/bcachefs/extents.c | 1516 ++++++++ + fs/bcachefs/extents.h | 765 ++++ fs/bcachefs/extents_types.h | 40 + fs/bcachefs/eytzinger.h | 281 ++ fs/bcachefs/fifo.h | 127 + fs/bcachefs/fs-common.c | 501 +++ fs/bcachefs/fs-common.h | 43 + - fs/bcachefs/fs-io-buffered.c | 1093 ++++++ + fs/bcachefs/fs-io-buffered.c | 1106 ++++++ fs/bcachefs/fs-io-buffered.h | 27 + - fs/bcachefs/fs-io-direct.c | 679 ++++ + fs/bcachefs/fs-io-direct.c | 680 ++++ fs/bcachefs/fs-io-direct.h | 16 + fs/bcachefs/fs-io-pagecache.c | 791 ++++ fs/bcachefs/fs-io-pagecache.h | 176 + @@ -113,22 +115,22 @@ Signed-off-by: Piotr Gorski fs/bcachefs/fs-io.h | 184 + fs/bcachefs/fs-ioctl.c | 572 +++ fs/bcachefs/fs-ioctl.h | 81 + - fs/bcachefs/fs.c | 1980 ++++++++++ + fs/bcachefs/fs.c | 1977 ++++++++++ fs/bcachefs/fs.h | 209 ++ - fs/bcachefs/fsck.c | 2417 +++++++++++++ - fs/bcachefs/fsck.h | 14 + - fs/bcachefs/inode.c | 1133 ++++++ - fs/bcachefs/inode.h | 207 ++ - fs/bcachefs/io_misc.c | 515 +++ + fs/bcachefs/fsck.c | 2490 +++++++++++++ + fs/bcachefs/fsck.h | 15 + + fs/bcachefs/inode.c | 1198 ++++++ + fs/bcachefs/inode.h | 217 ++ + fs/bcachefs/io_misc.c | 524 +++ fs/bcachefs/io_misc.h | 34 + - fs/bcachefs/io_read.c | 1210 +++++++ + fs/bcachefs/io_read.c | 1210 ++++++ fs/bcachefs/io_read.h | 158 + - fs/bcachefs/io_write.c | 1671 +++++++++ + fs/bcachefs/io_write.c | 1675 +++++++++ fs/bcachefs/io_write.h | 110 + fs/bcachefs/io_write_types.h | 96 + - fs/bcachefs/journal.c | 1449 ++++++++ - fs/bcachefs/journal.h | 548 +++ - fs/bcachefs/journal_io.c | 1894 ++++++++++ + fs/bcachefs/journal.c | 1468 ++++++++ + fs/bcachefs/journal.h | 549 +++ + fs/bcachefs/journal_io.c | 1947 ++++++++++ fs/bcachefs/journal_io.h | 65 + fs/bcachefs/journal_reclaim.c | 876 +++++ fs/bcachefs/journal_reclaim.h | 87 + @@ -142,76 +144,79 @@ Signed-off-by: Piotr Gorski fs/bcachefs/keylist_types.h | 16 + fs/bcachefs/logged_ops.c | 112 + fs/bcachefs/logged_ops.h | 20 + - fs/bcachefs/lru.c | 162 + + fs/bcachefs/lru.c | 164 + fs/bcachefs/lru.h | 69 + fs/bcachefs/mean_and_variance.c | 159 + fs/bcachefs/mean_and_variance.h | 198 + fs/bcachefs/mean_and_variance_test.c | 240 ++ fs/bcachefs/migrate.c | 179 + fs/bcachefs/migrate.h | 7 + - fs/bcachefs/move.c | 1159 ++++++ - fs/bcachefs/move.h | 96 + + fs/bcachefs/move.c | 1198 ++++++ + fs/bcachefs/move.h | 139 + fs/bcachefs/move_types.h | 36 + - fs/bcachefs/movinggc.c | 414 +++ + fs/bcachefs/movinggc.c | 431 +++ fs/bcachefs/movinggc.h | 12 + fs/bcachefs/nocow_locking.c | 144 + fs/bcachefs/nocow_locking.h | 50 + fs/bcachefs/nocow_locking_types.h | 20 + - fs/bcachefs/opts.c | 605 ++++ + fs/bcachefs/opts.c | 602 +++ fs/bcachefs/opts.h | 564 +++ fs/bcachefs/printbuf.c | 425 +++ fs/bcachefs/printbuf.h | 284 ++ - fs/bcachefs/quota.c | 978 +++++ + fs/bcachefs/quota.c | 979 +++++ fs/bcachefs/quota.h | 74 + fs/bcachefs/quota_types.h | 43 + - fs/bcachefs/rebalance.c | 366 ++ - fs/bcachefs/rebalance.h | 28 + - fs/bcachefs/rebalance_types.h | 26 + - fs/bcachefs/recovery.c | 1049 ++++++ + fs/bcachefs/rebalance.c | 464 +++ + fs/bcachefs/rebalance.h | 27 + + fs/bcachefs/rebalance_types.h | 37 + + fs/bcachefs/recovery.c | 1057 ++++++ fs/bcachefs/recovery.h | 33 + - fs/bcachefs/recovery_types.h | 49 + - fs/bcachefs/reflink.c | 405 +++ + fs/bcachefs/recovery_types.h | 53 + + fs/bcachefs/reflink.c | 406 +++ fs/bcachefs/reflink.h | 81 + - fs/bcachefs/replicas.c | 1058 ++++++ + fs/bcachefs/replicas.c | 1050 ++++++ fs/bcachefs/replicas.h | 91 + fs/bcachefs/replicas_types.h | 27 + - fs/bcachefs/sb-clean.c | 395 ++ + fs/bcachefs/sb-clean.c | 398 ++ fs/bcachefs/sb-clean.h | 16 + - fs/bcachefs/sb-members.c | 339 ++ - fs/bcachefs/sb-members.h | 182 + + fs/bcachefs/sb-errors.c | 172 + + fs/bcachefs/sb-errors.h | 270 ++ + fs/bcachefs/sb-errors_types.h | 16 + + fs/bcachefs/sb-members.c | 420 +++ + fs/bcachefs/sb-members.h | 227 ++ fs/bcachefs/seqmutex.h | 48 + fs/bcachefs/siphash.c | 173 + fs/bcachefs/siphash.h | 87 + - fs/bcachefs/six.c | 913 +++++ + fs/bcachefs/six.c | 917 +++++ fs/bcachefs/six.h | 393 ++ - fs/bcachefs/snapshot.c | 1689 +++++++++ - fs/bcachefs/snapshot.h | 270 ++ + fs/bcachefs/snapshot.c | 1713 +++++++++ + fs/bcachefs/snapshot.h | 268 ++ fs/bcachefs/str_hash.h | 370 ++ - fs/bcachefs/subvolume.c | 450 +++ + fs/bcachefs/subvolume.c | 437 +++ fs/bcachefs/subvolume.h | 35 + fs/bcachefs/subvolume_types.h | 31 + - fs/bcachefs/super-io.c | 1258 +++++++ - fs/bcachefs/super-io.h | 124 + - fs/bcachefs/super.c | 2022 +++++++++++ + fs/bcachefs/super-io.c | 1266 +++++++ + fs/bcachefs/super-io.h | 94 + + fs/bcachefs/super.c | 2017 ++++++++++ fs/bcachefs/super.h | 52 + - fs/bcachefs/super_types.h | 52 + - fs/bcachefs/sysfs.c | 1031 ++++++ + fs/bcachefs/super_types.h | 40 + + fs/bcachefs/sysfs.c | 1034 ++++++ fs/bcachefs/sysfs.h | 48 + fs/bcachefs/tests.c | 919 +++++ fs/bcachefs/tests.h | 15 + - fs/bcachefs/trace.c | 16 + - fs/bcachefs/trace.h | 1284 +++++++ + fs/bcachefs/trace.c | 17 + + fs/bcachefs/trace.h | 1334 +++++++ fs/bcachefs/two_state_shared_lock.c | 8 + fs/bcachefs/two_state_shared_lock.h | 59 + - fs/bcachefs/util.c | 1141 ++++++ - fs/bcachefs/util.h | 852 +++++ + fs/bcachefs/util.c | 1159 ++++++ + fs/bcachefs/util.h | 833 +++++ fs/bcachefs/varint.c | 129 + fs/bcachefs/varint.h | 11 + fs/bcachefs/vstructs.h | 63 + - fs/bcachefs/xattr.c | 651 ++++ + fs/bcachefs/xattr.c | 643 ++++ fs/bcachefs/xattr.h | 50 + fs/dcache.c | 12 +- - .../md/bcache => include/linux}/closure.h | 46 +- + .../md/bcache => include/linux}/closure.h | 56 +- include/linux/dcache.h | 1 + include/linux/exportfs.h | 6 + include/linux/generic-radix-tree.h | 68 +- @@ -223,12 +228,12 @@ Signed-off-by: Piotr Gorski lib/Kconfig | 3 + lib/Kconfig.debug | 9 + lib/Makefile | 2 + - {drivers/md/bcache => lib}/closure.c | 36 +- + {drivers/md/bcache => lib}/closure.c | 45 +- lib/errname.c | 1 + lib/generic-radix-tree.c | 76 +- lib/string_helpers.c | 10 +- tools/objtool/noreturns.h | 2 + - 223 files changed, 95037 insertions(+), 56 deletions(-) + 228 files changed, 96727 insertions(+), 60 deletions(-) create mode 100644 fs/bcachefs/Kconfig create mode 100644 fs/bcachefs/Makefile create mode 100644 fs/bcachefs/acl.c @@ -241,6 +246,7 @@ Signed-off-by: Piotr Gorski create mode 100644 fs/bcachefs/backpointers.c create mode 100644 fs/bcachefs/backpointers.h create mode 100644 fs/bcachefs/bbpos.h + create mode 100644 fs/bcachefs/bbpos_types.h create mode 100644 fs/bcachefs/bcachefs.h create mode 100644 fs/bcachefs/bcachefs_format.h create mode 100644 fs/bcachefs/bcachefs_ioctl.h @@ -303,6 +309,7 @@ Signed-off-by: Piotr Gorski create mode 100644 fs/bcachefs/dirent.h create mode 100644 fs/bcachefs/disk_groups.c create mode 100644 fs/bcachefs/disk_groups.h + create mode 100644 fs/bcachefs/disk_groups_types.h create mode 100644 fs/bcachefs/ec.c create mode 100644 fs/bcachefs/ec.h create mode 100644 fs/bcachefs/ec_types.h @@ -393,6 +400,9 @@ Signed-off-by: Piotr Gorski create mode 100644 fs/bcachefs/replicas_types.h create mode 100644 fs/bcachefs/sb-clean.c create mode 100644 fs/bcachefs/sb-clean.h + create mode 100644 fs/bcachefs/sb-errors.c + create mode 100644 fs/bcachefs/sb-errors.h + create mode 100644 fs/bcachefs/sb-errors_types.h create mode 100644 fs/bcachefs/sb-members.c create mode 100644 fs/bcachefs/sb-members.h create mode 100644 fs/bcachefs/seqmutex.h @@ -426,14 +436,14 @@ Signed-off-by: Piotr Gorski create mode 100644 fs/bcachefs/vstructs.h create mode 100644 fs/bcachefs/xattr.c create mode 100644 fs/bcachefs/xattr.h - rename {drivers/md/bcache => include/linux}/closure.h (93%) - rename {drivers/md/bcache => lib}/closure.c (88%) + rename {drivers/md/bcache => include/linux}/closure.h (91%) + rename {drivers/md/bcache => lib}/closure.c (83%) diff --git a/MAINTAINERS b/MAINTAINERS -index 7a7bd8bd80e9..0b57e61205f9 100644 +index 2894f0777537..ce1c7073f40c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -3481,6 +3481,14 @@ W: http://bcache.evilpiepirate.org +@@ -3482,6 +3482,14 @@ W: http://bcache.evilpiepirate.org C: irc://irc.oftc.net/bcache F: drivers/md/bcache/ @@ -448,7 +458,7 @@ index 7a7bd8bd80e9..0b57e61205f9 100644 BDISP ST MEDIA DRIVER M: Fabien Dessenne L: linux-media@vger.kernel.org -@@ -5067,6 +5075,14 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core +@@ -5068,6 +5076,14 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core F: Documentation/devicetree/bindings/timer/ F: drivers/clocksource/ @@ -463,7 +473,7 @@ index 7a7bd8bd80e9..0b57e61205f9 100644 CMPC ACPI DRIVER M: Thadeu Lima de Souza Cascardo M: Daniel Oliveira Nascimento -@@ -8747,6 +8763,13 @@ S: Supported +@@ -8748,6 +8764,13 @@ S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/linux-pm.git F: drivers/pmdomain/ @@ -597,10 +607,10 @@ index f9541f40be4e..75522f88e763 100644 obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig new file mode 100644 -index 000000000000..df13a4f9a6e3 +index 000000000000..c08c2c7d6fbb --- /dev/null +++ b/fs/bcachefs/Kconfig -@@ -0,0 +1,85 @@ +@@ -0,0 +1,83 @@ + +config BCACHEFS_FS + tristate "bcachefs filesystem support (EXPERIMENTAL)" @@ -627,7 +637,6 @@ index 000000000000..df13a4f9a6e3 + select XXHASH + select SRCU + select SYMBOLIC_ERRNAME -+ select MEAN_AND_VARIANCE + help + The bcachefs filesystem - a modern, copy on write filesystem, with + support for multiple devices, compression, checksumming, etc. @@ -645,7 +654,6 @@ index 000000000000..df13a4f9a6e3 +config BCACHEFS_DEBUG_TRANSACTIONS + bool "bcachefs runtime info" + depends on BCACHEFS_FS -+ default y + help + This makes the list of running btree transactions available in debugfs. + @@ -681,17 +689,17 @@ index 000000000000..df13a4f9a6e3 +config MEAN_AND_VARIANCE_UNIT_TEST + tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS + depends on KUNIT -+ select MEAN_AND_VARIANCE ++ depends on BCACHEFS_FS + default KUNIT_ALL_TESTS + help + This option enables the kunit tests for mean_and_variance module. + If unsure, say N. diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile new file mode 100644 -index 000000000000..0749731b9072 +index 000000000000..45b64f89258c --- /dev/null +++ b/fs/bcachefs/Makefile -@@ -0,0 +1,88 @@ +@@ -0,0 +1,89 @@ + +obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o + @@ -764,6 +772,7 @@ index 000000000000..0749731b9072 + reflink.o \ + replicas.o \ + sb-clean.o \ ++ sb-errors.o \ + sb-members.o \ + siphash.o \ + six.o \ @@ -1317,10 +1326,10 @@ index 000000000000..27e7eec0f278 +#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c new file mode 100644 -index 000000000000..2d516207e223 +index 000000000000..1fec0e67891f --- /dev/null +++ b/fs/bcachefs/alloc_background.c -@@ -0,0 +1,2146 @@ +@@ -0,0 +1,2159 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -1515,123 +1524,109 @@ index 000000000000..2d516207e223 + return DIV_ROUND_UP(bytes, sizeof(u64)); +} + -+int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); ++ int ret = 0; + + /* allow for unknown fields */ -+ if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) { -+ prt_printf(err, "incorrect value size (%zu < %u)", -+ bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; ++ bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err, ++ alloc_v1_val_size_bad, ++ "incorrect value size (%zu < %u)", ++ bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); ++fsck_err: ++ return ret; +} + -+int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_alloc_unpacked u; ++ int ret = 0; + -+ if (bch2_alloc_unpack_v2(&u, k)) { -+ prt_printf(err, "unpack error"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; ++ bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err, ++ alloc_v2_unpack_error, ++ "unpack error"); ++fsck_err: ++ return ret; +} + -+int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_alloc_unpacked u; ++ int ret = 0; + -+ if (bch2_alloc_unpack_v3(&u, k)) { -+ prt_printf(err, "unpack error"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; ++ bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err, ++ alloc_v2_unpack_error, ++ "unpack error"); ++fsck_err: ++ return ret; +} + -+int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) +{ + struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); ++ int ret = 0; + -+ if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) { -+ prt_printf(err, "bad val size (%u > %zu)", -+ alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err, ++ alloc_v4_val_size_bad, ++ "bad val size (%u > %zu)", ++ alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); + -+ if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && -+ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) { -+ prt_printf(err, "invalid backpointers_start"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && ++ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err, ++ alloc_v4_backpointers_start_bad, ++ "invalid backpointers_start"); + -+ if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { -+ prt_printf(err, "invalid data type (got %u should be %u)", -+ a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err, ++ alloc_key_data_type_bad, ++ "invalid data type (got %u should be %u)", ++ a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + + switch (a.v->data_type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: -+ if (a.v->dirty_sectors || -+ a.v->cached_sectors || -+ a.v->stripe) { -+ prt_printf(err, "empty data type free but have data"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(a.v->dirty_sectors || ++ a.v->cached_sectors || ++ a.v->stripe, c, err, ++ alloc_key_empty_but_have_data, ++ "empty data type free but have data"); + break; + case BCH_DATA_sb: + case BCH_DATA_journal: + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: -+ if (!a.v->dirty_sectors) { -+ prt_printf(err, "data_type %s but dirty_sectors==0", -+ bch2_data_types[a.v->data_type]); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(!a.v->dirty_sectors, c, err, ++ alloc_key_dirty_sectors_0, ++ "data_type %s but dirty_sectors==0", ++ bch2_data_types[a.v->data_type]); + break; + case BCH_DATA_cached: -+ if (!a.v->cached_sectors || -+ a.v->dirty_sectors || -+ a.v->stripe) { -+ prt_printf(err, "data type inconsistency"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(!a.v->cached_sectors || ++ a.v->dirty_sectors || ++ a.v->stripe, c, err, ++ alloc_key_cached_inconsistency, ++ "data type inconsistency"); + -+ if (!a.v->io_time[READ] && -+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) { -+ prt_printf(err, "cached bucket with read_time == 0"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(!a.v->io_time[READ] && ++ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, ++ c, err, ++ alloc_key_cached_but_read_time_zero, ++ "cached bucket with read_time == 0"); + break; + case BCH_DATA_stripe: + break; + } -+ -+ return 0; -+} -+ -+static inline u64 swab40(u64 x) -+{ -+ return (((x & 0x00000000ffULL) << 32)| -+ ((x & 0x000000ff00ULL) << 16)| -+ ((x & 0x0000ff0000ULL) >> 0)| -+ ((x & 0x00ff000000ULL) >> 16)| -+ ((x & 0xff00000000ULL) >> 32)); ++fsck_err: ++ return ret; +} + +void bch2_alloc_v4_swab(struct bkey_s k) @@ -1647,6 +1642,7 @@ index 000000000000..2d516207e223 + a->io_time[1] = swab64(a->io_time[1]); + a->stripe = swab32(a->stripe); + a->nr_external_backpointers = swab32(a->nr_external_backpointers); ++ a->fragmentation_lru = swab64(a->fragmentation_lru); + + bps = alloc_v4_backpointers(a); + for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { @@ -1844,17 +1840,18 @@ index 000000000000..2d516207e223 + : 0; +} + -+int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ -+ if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) { -+ prt_printf(err, "bad val size (%zu != %zu)", -+ bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); -+ return -BCH_ERR_invalid_bkey; -+ } ++ int ret = 0; + -+ return 0; ++ bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err, ++ bucket_gens_val_size_bad, ++ "bad val size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); ++fsck_err: ++ return ret; +} + +void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) @@ -2050,7 +2047,7 @@ index 000000000000..2d516207e223 + "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" + " for %s", + set ? "setting" : "clearing", -+ bch2_btree_ids[btree], ++ bch2_btree_id_str(btree), + iter.pos.inode, + iter.pos.offset, + bch2_bkey_types[old.k->type], @@ -2309,6 +2306,7 @@ index 000000000000..2d516207e223 + int ret; + + if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, ++ alloc_key_to_missing_dev_bucket, + "alloc key for invalid device:bucket %llu:%llu", + alloc_k.k->p.inode, alloc_k.k->p.offset)) + return bch2_btree_delete_at(trans, alloc_iter, 0); @@ -2328,7 +2326,8 @@ index 000000000000..2d516207e223 + + if (k.k->type != discard_key_type && + (c->opts.reconstruct_alloc || -+ fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n" ++ fsck_err(c, need_discard_key_wrong, ++ "incorrect key in need_discard btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[discard_key_type], @@ -2358,7 +2357,8 @@ index 000000000000..2d516207e223 + + if (k.k->type != freespace_key_type && + (c->opts.reconstruct_alloc || -+ fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n" ++ fsck_err(c, freespace_key_wrong, ++ "incorrect key in freespace btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[freespace_key_type], @@ -2389,7 +2389,8 @@ index 000000000000..2d516207e223 + + if (a->gen != alloc_gen(k, gens_offset) && + (c->opts.reconstruct_alloc || -+ fsck_err(c, "incorrect gen in bucket_gens btree (got %u should be %u)\n" ++ fsck_err(c, bucket_gens_key_wrong, ++ "incorrect gen in bucket_gens btree (got %u should be %u)\n" + " %s", + alloc_gen(k, gens_offset), a->gen, + (printbuf_reset(&buf), @@ -2447,7 +2448,8 @@ index 000000000000..2d516207e223 + + if (k.k->type != KEY_TYPE_set && + (c->opts.reconstruct_alloc || -+ fsck_err(c, "hole in alloc btree missing in freespace btree\n" ++ fsck_err(c, freespace_hole_missing, ++ "hole in alloc btree missing in freespace btree\n" + " device %llu buckets %llu-%llu", + freespace_iter->pos.inode, + freespace_iter->pos.offset, @@ -2510,6 +2512,7 @@ index 000000000000..2d516207e223 + + for (i = gens_offset; i < gens_end_offset; i++) { + if (fsck_err_on(g.v.gens[i], c, ++ bucket_gens_hole_wrong, + "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", + bucket_gens_pos_to_alloc(k.k->p, i).inode, + bucket_gens_pos_to_alloc(k.k->p, i).offset, @@ -2567,8 +2570,9 @@ index 000000000000..2d516207e223 + return ret; + + if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, ++ need_discard_freespace_key_to_invalid_dev_bucket, + "entry in %s btree for nonexistant dev:bucket %llu:%llu", -+ bch2_btree_ids[iter->btree_id], pos.inode, pos.offset)) ++ bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset)) + goto delete; + + a = bch2_alloc_to_v4(alloc_k, &a_convert); @@ -2576,9 +2580,10 @@ index 000000000000..2d516207e223 + if (fsck_err_on(a->data_type != state || + (state == BCH_DATA_free && + genbits != alloc_freespace_genbits(*a)), c, ++ need_discard_freespace_key_bad, + "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), -+ bch2_btree_ids[iter->btree_id], ++ bch2_btree_id_str(iter->btree_id), + iter->pos.inode, + iter->pos.offset, + a->data_type == state, @@ -2643,6 +2648,7 @@ index 000000000000..2d516207e223 + dev_exists = bch2_dev_exists2(c, k.k->p.inode); + if (!dev_exists) { + if (fsck_err_on(!dev_exists, c, ++ bucket_gens_to_invalid_dev, + "bucket_gens key for invalid device:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); @@ -2653,6 +2659,7 @@ index 000000000000..2d516207e223 + ca = bch_dev_bkey_exists(c, k.k->p.inode); + if (fsck_err_on(end <= ca->mi.first_bucket || + start >= ca->mi.nbuckets, c, ++ bucket_gens_to_invalid_buckets, + "bucket_gens key for invalid buckets:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); @@ -2661,6 +2668,7 @@ index 000000000000..2d516207e223 + + for (b = start; b < ca->mi.first_bucket; b++) + if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, ++ bucket_gens_nonzero_for_invalid_buckets, + "bucket_gens key has nonzero gen for invalid bucket")) { + g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; + need_update = true; @@ -2668,6 +2676,7 @@ index 000000000000..2d516207e223 + + for (b = ca->mi.nbuckets; b < end; b++) + if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, ++ bucket_gens_nonzero_for_invalid_buckets, + "bucket_gens key has nonzero gen for invalid bucket")) { + g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; + need_update = true; @@ -2818,11 +2827,13 @@ index 000000000000..2d516207e223 + return ret; + + if (fsck_err_on(!a->io_time[READ], c, ++ alloc_key_cached_but_read_time_zero, + "cached bucket with read_time 0\n" + " %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || + fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, ++ alloc_key_to_missing_lru_entry, + "missing lru entry\n" + " %s", + (printbuf_reset(&buf), @@ -3398,6 +3409,17 @@ index 000000000000..2d516207e223 + closure_wake_up(&c->freelist_wait); +} + ++u64 bch2_min_rw_member_capacity(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ u64 ret = U64_MAX; ++ ++ for_each_rw_member(ca, c, i) ++ ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); ++ return ret; ++} ++ +static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) +{ + struct open_bucket *ob; @@ -3469,10 +3491,10 @@ index 000000000000..2d516207e223 +} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h new file mode 100644 -index 000000000000..97042067d2a9 +index 000000000000..73faf99a222a --- /dev/null +++ b/fs/bcachefs/alloc_background.h -@@ -0,0 +1,258 @@ +@@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_BACKGROUND_H +#define _BCACHEFS_ALLOC_BACKGROUND_H @@ -3624,13 +3646,13 @@ index 000000000000..97042067d2a9 + +int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + -+int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); -+int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); -+int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); -+int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_alloc_v4_swab(struct bkey_s); +void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -3668,7 +3690,7 @@ index 000000000000..97042067d2a9 + .min_val_size = 48, \ +}) + -+int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + @@ -3724,6 +3746,7 @@ index 000000000000..97042067d2a9 +int bch2_fs_freespace_init(struct bch_fs *); + +void bch2_recalc_capacity(struct bch_fs *); ++u64 bch2_min_rw_member_capacity(struct bch_fs *); + +void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); +void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); @@ -3733,10 +3756,10 @@ index 000000000000..97042067d2a9 +#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c new file mode 100644 -index 000000000000..3bc4abd3d7d5 +index 000000000000..b85c7765272f --- /dev/null +++ b/fs/bcachefs/alloc_foreground.c -@@ -0,0 +1,1576 @@ +@@ -0,0 +1,1600 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2012 Google, Inc. @@ -4138,12 +4161,23 @@ index 000000000000..3bc4abd3d7d5 + struct bucket_alloc_state *s, + struct closure *cl) +{ -+ struct btree_iter iter; -+ struct bkey_s_c k; ++ struct btree_iter iter, citer; ++ struct bkey_s_c k, ck; + struct open_bucket *ob = NULL; -+ u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); -+ u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor)); ++ u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); ++ u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor)); ++ u64 alloc_cursor = alloc_start; + int ret; ++ ++ /* ++ * Scan with an uncached iterator to avoid polluting the key cache. An ++ * uncached iter will return a cached key if one exists, but if not ++ * there is no other underlying protection for the associated key cache ++ * slot. To avoid racing bucket allocations, look up the cached key slot ++ * of any likely allocation candidate before attempting to proceed with ++ * the allocation. This provides proper exclusion on the associated ++ * bucket. ++ */ +again: + for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), + BTREE_ITER_SLOTS, k, ret) { @@ -4158,25 +4192,38 @@ index 000000000000..3bc4abd3d7d5 + continue; + + a = bch2_alloc_to_v4(k, &a_convert); -+ + if (a->data_type != BCH_DATA_free) + continue; + ++ /* now check the cached key to serialize concurrent allocs of the bucket */ ++ ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED); ++ ret = bkey_err(ck); ++ if (ret) ++ break; ++ ++ a = bch2_alloc_to_v4(ck, &a_convert); ++ if (a->data_type != BCH_DATA_free) ++ goto next; ++ + s->buckets_seen++; + + ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); ++next: ++ citer.path->preserve = false; ++ bch2_trans_iter_exit(trans, &citer); + if (ob) + break; + } + bch2_trans_iter_exit(trans, &iter); + ++ alloc_cursor = iter.pos.offset; + ca->alloc_cursor = alloc_cursor; + + if (!ob && ret) + ob = ERR_PTR(ret); + -+ if (!ob && alloc_cursor > alloc_start) { -+ alloc_cursor = alloc_start; ++ if (!ob && alloc_start > first_bucket) { ++ alloc_cursor = alloc_start = first_bucket; + goto again; + } + @@ -5677,10 +5724,10 @@ index 000000000000..b91b7a461056 +#endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c new file mode 100644 -index 000000000000..cc856150a948 +index 000000000000..ef02c9bb0354 --- /dev/null +++ b/fs/bcachefs/backpointers.c -@@ -0,0 +1,868 @@ +@@ -0,0 +1,860 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bbpos.h" @@ -5688,6 +5735,7 @@ index 000000000000..cc856150a948 +#include "backpointers.h" +#include "btree_cache.h" +#include "btree_update.h" ++#include "btree_update_interior.h" +#include "btree_write_buffer.h" +#include "error.h" + @@ -5720,25 +5768,26 @@ index 000000000000..cc856150a948 + return false; +} + -+int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); ++ int ret = 0; + -+ if (!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) { -+ prt_str(err, "backpointer at wrong pos"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; ++ bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)), ++ c, err, ++ backpointer_pos_wrong, ++ "backpointer at wrong pos"); ++fsck_err: ++ return ret; +} + +void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) +{ + prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=", -+ bch2_btree_ids[bp->btree_id], ++ bch2_btree_id_str(bp->btree_id), + bp->level, + (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), + (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), @@ -5759,7 +5808,7 @@ index 000000000000..cc856150a948 +{ + struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); + -+ bp.v->bucket_offset = swab32(bp.v->bucket_offset); ++ bp.v->bucket_offset = swab40(bp.v->bucket_offset); + bp.v->bucket_len = swab32(bp.v->bucket_len); + bch2_bpos_swab(&bp.v->pos); +} @@ -5902,18 +5951,22 @@ index 000000000000..cc856150a948 +static void backpointer_not_found(struct btree_trans *trans, + struct bpos bp_pos, + struct bch_backpointer bp, -+ struct bkey_s_c k, -+ const char *thing_it_points_to) ++ struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); + ++ /* ++ * If we're using the btree write buffer, the backpointer we were ++ * looking at may have already been deleted - failure to find what it ++ * pointed to is not an error: ++ */ + if (likely(!bch2_backpointers_no_use_write_buffer)) + return; + + prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", -+ thing_it_points_to); ++ bp.level ? "btree node" : "extent"); + prt_printf(&buf, "bucket: "); + bch2_bpos_to_text(&buf, bucket); + prt_printf(&buf, "\n "); @@ -5939,56 +5992,37 @@ index 000000000000..cc856150a948 + struct bch_backpointer bp, + unsigned iter_flags) +{ -+ struct bch_fs *c = trans->c; -+ struct btree_root *r = bch2_btree_id_root(c, bp.btree_id); -+ struct bpos bucket = bp_pos_to_bucket(c, bp_pos); -+ struct bkey_s_c k; -+ -+ bch2_trans_node_iter_init(trans, iter, -+ bp.btree_id, -+ bp.pos, -+ 0, -+ min(bp.level, r->level), -+ iter_flags); -+ k = bch2_btree_iter_peek_slot(iter); -+ if (bkey_err(k)) { -+ bch2_trans_iter_exit(trans, iter); -+ return k; -+ } -+ -+ if (bp.level == r->level + 1) -+ k = bkey_i_to_s_c(&r->key); -+ -+ if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) -+ return k; -+ -+ bch2_trans_iter_exit(trans, iter); -+ -+ if (unlikely(bch2_backpointers_no_use_write_buffer)) { -+ if (bp.level) { -+ struct btree *b; -+ -+ /* -+ * If a backpointer for a btree node wasn't found, it may be -+ * because it was overwritten by a new btree node that hasn't -+ * been written out yet - backpointer_get_node() checks for -+ * this: -+ */ -+ b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); -+ if (!IS_ERR_OR_NULL(b)) -+ return bkey_i_to_s_c(&b->key); ++ if (likely(!bp.level)) { ++ struct bch_fs *c = trans->c; ++ struct bpos bucket = bp_pos_to_bucket(c, bp_pos); ++ struct bkey_s_c k; + ++ bch2_trans_node_iter_init(trans, iter, ++ bp.btree_id, ++ bp.pos, ++ 0, 0, ++ iter_flags); ++ k = bch2_btree_iter_peek_slot(iter); ++ if (bkey_err(k)) { + bch2_trans_iter_exit(trans, iter); -+ -+ if (IS_ERR(b)) -+ return bkey_s_c_err(PTR_ERR(b)); -+ return bkey_s_c_null; ++ return k; + } + -+ backpointer_not_found(trans, bp_pos, bp, k, "extent"); -+ } ++ if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) ++ return k; + -+ return bkey_s_c_null; ++ bch2_trans_iter_exit(trans, iter); ++ backpointer_not_found(trans, bp_pos, bp, k); ++ return bkey_s_c_null; ++ } else { ++ struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); ++ ++ if (IS_ERR_OR_NULL(b)) { ++ bch2_trans_iter_exit(trans, iter); ++ return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null; ++ } ++ return bkey_i_to_s_c(&b->key); ++ } +} + +struct btree *bch2_backpointer_get_node(struct btree_trans *trans, @@ -6012,6 +6046,8 @@ index 000000000000..cc856150a948 + if (IS_ERR(b)) + goto err; + ++ BUG_ON(b->c.level != bp.level - 1); ++ + if (b && extent_matches_bp(c, bp.btree_id, bp.level, + bkey_i_to_s_c(&b->key), + bucket, bp)) @@ -6020,8 +6056,7 @@ index 000000000000..cc856150a948 + if (b && btree_node_will_make_reachable(b)) { + b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); + } else { -+ backpointer_not_found(trans, bp_pos, bp, -+ bkey_i_to_s_c(&b->key), "btree node"); ++ backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key)); + b = NULL; + } +err: @@ -6039,6 +6074,7 @@ index 000000000000..cc856150a948 + int ret = 0; + + if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, ++ backpointer_to_missing_device, + "backpointer for missing device:\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, bp_iter, 0); @@ -6052,6 +6088,7 @@ index 000000000000..cc856150a948 + goto out; + + if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c, ++ backpointer_to_missing_alloc, + "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", + alloc_iter.pos.inode, alloc_iter.pos.offset, + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { @@ -6136,14 +6173,14 @@ index 000000000000..cc856150a948 + return ret; +missing: + prt_printf(&buf, "missing backpointer for btree=%s l=%u ", -+ bch2_btree_ids[bp.btree_id], bp.level); ++ bch2_btree_id_str(bp.btree_id), bp.level); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_printf(&buf, "\nbp pos "); + bch2_bpos_to_text(&buf, bp_iter.pos); + + if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers || + c->opts.reconstruct_alloc || -+ fsck_err(c, "%s", buf.buf)) ++ fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) + ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); + + goto out; @@ -6476,7 +6513,9 @@ index 000000000000..cc856150a948 + } + + if (fsck_err_on(!k.k, c, -+ "backpointer for missing extent\n %s", ++ backpointer_to_missing_ptr, ++ "backpointer for missing %s\n %s", ++ bp.v->level ? "btree node" : "extent", + (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { + ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); + goto out; @@ -6551,10 +6590,10 @@ index 000000000000..cc856150a948 +} diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h new file mode 100644 -index 000000000000..547e0617602a +index 000000000000..ab866feeaf66 --- /dev/null +++ b/fs/bcachefs/backpointers.h -@@ -0,0 +1,131 @@ +@@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H @@ -6564,7 +6603,16 @@ index 000000000000..547e0617602a +#include "buckets.h" +#include "super.h" + -+int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, ++static inline u64 swab40(u64 x) ++{ ++ return (((x & 0x00000000ffULL) << 32)| ++ ((x & 0x000000ff00ULL) << 16)| ++ ((x & 0x0000ff0000ULL) >> 0)| ++ ((x & 0x00ff000000ULL) >> 16)| ++ ((x & 0xff00000000ULL) >> 32)); ++} ++ ++int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k, + enum bkey_invalid_flags, struct printbuf *); +void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); +void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -6688,28 +6736,17 @@ index 000000000000..547e0617602a +#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h new file mode 100644 -index 000000000000..1fbed1f8378d +index 000000000000..be2edced5213 --- /dev/null +++ b/fs/bcachefs/bbpos.h -@@ -0,0 +1,48 @@ +@@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BBPOS_H +#define _BCACHEFS_BBPOS_H + ++#include "bbpos_types.h" +#include "bkey_methods.h" -+ -+struct bbpos { -+ enum btree_id btree; -+ struct bpos pos; -+}; -+ -+static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) -+{ -+ return (struct bbpos) { btree, pos }; -+} -+ -+#define BBPOS_MIN BBPOS(0, POS_MIN) -+#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) ++#include "btree_cache.h" + +static inline int bbpos_cmp(struct bbpos l, struct bbpos r) +{ @@ -6734,18 +6771,42 @@ index 000000000000..1fbed1f8378d + +static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) +{ -+ prt_str(out, bch2_btree_ids[pos.btree]); ++ prt_str(out, bch2_btree_id_str(pos.btree)); + prt_char(out, ':'); + bch2_bpos_to_text(out, pos.pos); +} + +#endif /* _BCACHEFS_BBPOS_H */ +diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h +new file mode 100644 +index 000000000000..5198e94cf3b8 +--- /dev/null ++++ b/fs/bcachefs/bbpos_types.h +@@ -0,0 +1,18 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BBPOS_TYPES_H ++#define _BCACHEFS_BBPOS_TYPES_H ++ ++struct bbpos { ++ enum btree_id btree; ++ struct bpos pos; ++}; ++ ++static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) ++{ ++ return (struct bbpos) { btree, pos }; ++} ++ ++#define BBPOS_MIN BBPOS(0, POS_MIN) ++#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) ++ ++#endif /* _BCACHEFS_BBPOS_TYPES_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h new file mode 100644 -index 000000000000..53ffa88cae16 +index 000000000000..9cb8684959ee --- /dev/null +++ b/fs/bcachefs/bcachefs.h -@@ -0,0 +1,1156 @@ +@@ -0,0 +1,1161 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_H +#define _BCACHEFS_H @@ -6957,6 +7018,7 @@ index 000000000000..53ffa88cae16 +#include "nocow_locking_types.h" +#include "opts.h" +#include "recovery_types.h" ++#include "sb-errors_types.h" +#include "seqmutex.h" +#include "util.h" + @@ -7166,6 +7228,7 @@ index 000000000000..53ffa88cae16 +#include "buckets_types.h" +#include "buckets_waiting_for_journal_types.h" +#include "clock_types.h" ++#include "disk_groups_types.h" +#include "ec_types.h" +#include "journal_types.h" +#include "keylist_types.h" @@ -7211,6 +7274,7 @@ index 000000000000..53ffa88cae16 + GC_PHASE_BTREE_snapshot_trees, + GC_PHASE_BTREE_deleted_inodes, + GC_PHASE_BTREE_logged_ops, ++ GC_PHASE_BTREE_rebalance_work, + + GC_PHASE_PENDING_DELETE, +}; @@ -7248,6 +7312,8 @@ index 000000000000..53ffa88cae16 + * Committed by bch2_write_super() -> bch_fs_mi_update() + */ + struct bch_member_cpu mi; ++ atomic64_t errors[BCH_MEMBER_ERROR_NR]; ++ + __uuid_t uuid; + char name[BDEVNAME_SIZE]; + @@ -7326,7 +7392,7 @@ index 000000000000..53ffa88cae16 + BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ + BCH_FS_NEED_ANOTHER_GC, + -+ BCH_FS_HAVE_DELETED_SNAPSHOTS, ++ BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, + + /* errors: */ + BCH_FS_ERROR, @@ -7686,9 +7752,6 @@ index 000000000000..53ffa88cae16 + struct list_head moving_context_list; + struct mutex moving_context_lock; + -+ struct list_head data_progress_list; -+ struct mutex data_progress_lock; -+ + /* REBALANCE */ + struct bch_fs_rebalance rebalance; + @@ -7739,11 +7802,6 @@ index 000000000000..53ffa88cae16 + struct bio_set dio_read_bioset; + struct bio_set nocow_flush_bioset; + -+ /* ERRORS */ -+ struct list_head fsck_errors; -+ struct mutex fsck_error_lock; -+ bool fsck_alloc_err; -+ + /* QUOTAS */ + struct bch_memquota_type quotas[QTYP_NR]; + @@ -7792,6 +7850,14 @@ index 000000000000..53ffa88cae16 + struct bch2_time_stats times[BCH_TIME_STAT_NR]; + + struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; ++ ++ /* ERRORS */ ++ struct list_head fsck_error_msgs; ++ struct mutex fsck_error_msgs_lock; ++ bool fsck_alloc_msgs_err; ++ ++ bch_sb_errors_cpu fsck_error_counts; ++ struct mutex fsck_error_counts_lock; +}; + +extern struct wait_queue_head bch2_read_only_wait; @@ -7904,10 +7970,10 @@ index 000000000000..53ffa88cae16 +#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h new file mode 100644 -index 000000000000..99749f3315fe +index 000000000000..0a750953ff92 --- /dev/null +++ b/fs/bcachefs/bcachefs_format.h -@@ -0,0 +1,2413 @@ +@@ -0,0 +1,2425 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FORMAT_H +#define _BCACHEFS_FORMAT_H @@ -8523,31 +8589,17 @@ index 000000000000..99749f3315fe +#endif +}; + -+struct bch_extent_reservation { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:6, -+ unused:22, -+ replicas:4, -+ generation:32; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 generation:32, -+ replicas:4, -+ unused:22, -+ type:6; -+#endif -+}; -+ +struct bch_extent_rebalance { +#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:7, -+ unused:33, -+ compression:8, ++ __u64 type:6, ++ unused:34, ++ compression:8, /* enum bch_compression_opt */ + target:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 target:16, + compression:8, -+ unused:33, -+ type:7; ++ unused:34, ++ type:6; +#endif +}; + @@ -8748,34 +8800,30 @@ index 000000000000..99749f3315fe + Inode_opt_nr, +}; + -+enum { -+ /* -+ * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL -+ * flags) -+ */ -+ __BCH_INODE_SYNC = 0, -+ __BCH_INODE_IMMUTABLE = 1, -+ __BCH_INODE_APPEND = 2, -+ __BCH_INODE_NODUMP = 3, -+ __BCH_INODE_NOATIME = 4, ++#define BCH_INODE_FLAGS() \ ++ x(sync, 0) \ ++ x(immutable, 1) \ ++ x(append, 2) \ ++ x(nodump, 3) \ ++ x(noatime, 4) \ ++ x(i_size_dirty, 5) \ ++ x(i_sectors_dirty, 6) \ ++ x(unlinked, 7) \ ++ x(backptr_untrusted, 8) + -+ __BCH_INODE_I_SIZE_DIRTY = 5, /* obsolete */ -+ __BCH_INODE_I_SECTORS_DIRTY = 6, /* obsolete */ -+ __BCH_INODE_UNLINKED = 7, -+ __BCH_INODE_BACKPTR_UNTRUSTED = 8, ++/* bits 20+ reserved for packed fields below: */ + -+ /* bits 20+ reserved for packed fields below: */ ++enum bch_inode_flags { ++#define x(t, n) BCH_INODE_##t = 1U << n, ++ BCH_INODE_FLAGS() ++#undef x +}; + -+#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) -+#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) -+#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) -+#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) -+#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) -+#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) -+#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) -+#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) -+#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED) ++enum __bch_inode_flags { ++#define x(t, n) __BCH_INODE_##t = n, ++ BCH_INODE_FLAGS() ++#undef x ++}; + +LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); +LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); @@ -9142,7 +9190,8 @@ index 000000000000..99749f3315fe + x(journal_seq_blacklist, 8) \ + x(journal_v2, 9) \ + x(counters, 10) \ -+ x(members_v2, 11) ++ x(members_v2, 11) \ ++ x(errors, 12) + +enum bch_sb_field_type { +#define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -9192,6 +9241,18 @@ index 000000000000..99749f3315fe + BCH_IOPS_NR +}; + ++#define BCH_MEMBER_ERROR_TYPES() \ ++ x(read, 0) \ ++ x(write, 1) \ ++ x(checksum, 2) ++ ++enum bch_member_error_type { ++#define x(t, n) BCH_MEMBER_ERROR_##t = n, ++ BCH_MEMBER_ERROR_TYPES() ++#undef x ++ BCH_MEMBER_ERROR_NR ++}; ++ +struct bch_member { + __uuid_t uuid; + __le64 nbuckets; /* device size */ @@ -9202,6 +9263,9 @@ index 000000000000..99749f3315fe + + __le64 flags; + __le32 iops[4]; ++ __le64 errors[BCH_MEMBER_ERROR_NR]; ++ __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; ++ __le64 errors_reset_time; +}; + +#define BCH_MEMBER_V1_BYTES 56 @@ -9525,11 +9589,20 @@ index 000000000000..99749f3315fe + +struct bch_sb_field_journal_seq_blacklist { + struct bch_sb_field field; -+ -+ struct journal_seq_blacklist_entry start[0]; -+ __u64 _data[]; ++ struct journal_seq_blacklist_entry start[]; +}; + ++struct bch_sb_field_errors { ++ struct bch_sb_field field; ++ struct bch_sb_field_error_entry { ++ __le64 v; ++ __le64 last_error_time; ++ } entries[]; ++}; ++ ++LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); ++LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); ++ +/* Superblock: */ + +/* @@ -9592,7 +9665,9 @@ index 000000000000..99749f3315fe + x(snapshot_skiplists, BCH_VERSION(1, 1), \ + BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \ + x(deleted_inodes, BCH_VERSION(1, 2), \ -+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) ++ BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \ ++ x(rebalance_work, BCH_VERSION(1, 3), \ ++ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) + +enum bcachefs_metadata_version { + bcachefs_metadata_version_min = 9, @@ -9603,7 +9678,7 @@ index 000000000000..99749f3315fe +}; + +static const __maybe_unused -+unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor; ++unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work; + +#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) + @@ -10157,7 +10232,8 @@ index 000000000000..99749f3315fe +enum btree_id_flags { + BTREE_ID_EXTENTS = BIT(0), + BTREE_ID_SNAPSHOTS = BIT(1), -+ BTREE_ID_DATA = BIT(2), ++ BTREE_ID_SNAPSHOT_FIELD = BIT(2), ++ BTREE_ID_DATA = BIT(3), +}; + +#define BCH_BTREE_IDS() \ @@ -10212,11 +10288,13 @@ index 000000000000..99749f3315fe + BIT_ULL(KEY_TYPE_bucket_gens)) \ + x(snapshot_trees, 15, 0, \ + BIT_ULL(KEY_TYPE_snapshot_tree)) \ -+ x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \ ++ x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \ + BIT_ULL(KEY_TYPE_set)) \ + x(logged_ops, 17, 0, \ + BIT_ULL(KEY_TYPE_logged_op_truncate)| \ -+ BIT_ULL(KEY_TYPE_logged_op_finsert)) ++ BIT_ULL(KEY_TYPE_logged_op_finsert)) \ ++ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ ++ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) + +enum btree_id { +#define x(name, nr, ...) BTREE_ID_##name = nr, @@ -11823,10 +11901,10 @@ index 000000000000..abdb05507d16 +#endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h new file mode 100644 -index 000000000000..518450209236 +index 000000000000..831be01809f2 --- /dev/null +++ b/fs/bcachefs/bkey.h -@@ -0,0 +1,782 @@ +@@ -0,0 +1,778 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_H +#define _BCACHEFS_BKEY_H @@ -11921,19 +11999,15 @@ index 000000000000..518450209236 +#define bkey_lr_packed(_l, _r) \ + ((_l)->format + ((_r)->format << 1)) + -+#define bkey_copy(_dst, _src) \ -+do { \ -+ BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \ -+ !type_is(_dst, struct bkey_packed *)); \ -+ BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \ -+ !type_is(_src, struct bkey_packed *)); \ -+ EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \ -+ (u64 *) (_dst) < (u64 *) (_src) + \ -+ ((struct bkey *) (_src))->u64s); \ -+ \ -+ memcpy_u64s_small((_dst), (_src), \ -+ ((struct bkey *) (_src))->u64s); \ -+} while (0) ++static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src) ++{ ++ memcpy_u64s_small(dst, src, src->u64s); ++} ++ ++static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src) ++{ ++ memcpy_u64s_small(dst, src, src->k.u64s); ++} + +struct btree; + @@ -12813,15 +12887,16 @@ index 000000000000..5f42a6e69360 +#endif /* _BCACHEFS_BKEY_CMP_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c new file mode 100644 -index 000000000000..d9fb1fc81f1e +index 000000000000..761f5e33b1e6 --- /dev/null +++ b/fs/bcachefs/bkey_methods.c -@@ -0,0 +1,458 @@ +@@ -0,0 +1,459 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "backpointers.h" +#include "bkey_methods.h" ++#include "btree_cache.h" +#include "btree_types.h" +#include "alloc_background.h" +#include "dirent.h" @@ -12844,7 +12919,7 @@ index 000000000000..d9fb1fc81f1e + NULL +}; + -+static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, ++static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) +{ + return 0; @@ -12858,23 +12933,24 @@ index 000000000000..d9fb1fc81f1e + .key_invalid = deleted_key_invalid, \ +}) + -+static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, ++static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) +{ -+ if (bkey_val_bytes(k.k)) { -+ prt_printf(err, "incorrect value size (%zu != 0)", -+ bkey_val_bytes(k.k)); -+ return -BCH_ERR_invalid_bkey; -+ } ++ int ret = 0; + -+ return 0; ++ bkey_fsck_err_on(bkey_val_bytes(k.k), c, err, ++ bkey_val_size_nonzero, ++ "incorrect value size (%zu != 0)", ++ bkey_val_bytes(k.k)); ++fsck_err: ++ return ret; +} + +#define bch2_bkey_ops_error ((struct bkey_ops) { \ + .key_invalid = empty_val_key_invalid, \ +}) + -+static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, ++static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) +{ + return 0; @@ -12889,7 +12965,7 @@ index 000000000000..d9fb1fc81f1e + .key_invalid = empty_val_key_invalid, \ +}) + -+static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, ++static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) +{ + return 0; @@ -12910,18 +12986,6 @@ index 000000000000..d9fb1fc81f1e + .val_to_text = key_type_inline_data_to_text, \ +}) + -+static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, struct printbuf *err) -+{ -+ if (bkey_val_bytes(k.k)) { -+ prt_printf(err, "incorrect value size (%zu != %zu)", -+ bkey_val_bytes(k.k), sizeof(struct bch_cookie)); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ +static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) +{ + bch2_key_resize(l.k, l.k->size + r.k->size); @@ -12929,7 +12993,7 @@ index 000000000000..d9fb1fc81f1e +} + +#define bch2_bkey_ops_set ((struct bkey_ops) { \ -+ .key_invalid = key_type_set_invalid, \ ++ .key_invalid = empty_val_key_invalid, \ + .key_merge = key_type_set_merge, \ +}) + @@ -12947,84 +13011,95 @@ index 000000000000..d9fb1fc81f1e + struct printbuf *err) +{ + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); ++ int ret = 0; + -+ if (bkey_val_bytes(k.k) < ops->min_val_size) { -+ prt_printf(err, "bad val size (%zu < %u)", -+ bkey_val_bytes(k.k), ops->min_val_size); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err, ++ bkey_val_size_too_small, ++ "bad val size (%zu < %u)", ++ bkey_val_bytes(k.k), ops->min_val_size); + + if (!ops->key_invalid) + return 0; + -+ return ops->key_invalid(c, k, flags, err); ++ ret = ops->key_invalid(c, k, flags, err); ++fsck_err: ++ return ret; +} + +static u64 bch2_key_types_allowed[] = { -+#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys, -+ BCH_BTREE_IDS() -+#undef x + [BKEY_TYPE_btree] = + BIT_ULL(KEY_TYPE_deleted)| + BIT_ULL(KEY_TYPE_btree_ptr)| + BIT_ULL(KEY_TYPE_btree_ptr_v2), ++#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys, ++ BCH_BTREE_IDS() ++#undef x +}; + ++const char *bch2_btree_node_type_str(enum btree_node_type type) ++{ ++ return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1); ++} ++ +int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ -+ if (k.k->u64s < BKEY_U64s) { -+ prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); -+ return -BCH_ERR_invalid_bkey; -+ } ++ int ret = 0; + -+ if (flags & BKEY_INVALID_COMMIT && -+ !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type))) { -+ prt_printf(err, "invalid key type for btree %s (%s)", -+ bch2_btree_ids[type], bch2_bkey_types[k.k->type]); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err, ++ bkey_u64s_too_small, ++ "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); ++ ++ if (type >= BKEY_TYPE_NR) ++ return 0; ++ ++ bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && ++ !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, ++ bkey_invalid_type_for_btree, ++ "invalid key type for btree %s (%s)", ++ bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]); + + if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { -+ if (k.k->size == 0) { -+ prt_printf(err, "size == 0"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(k.k->size == 0, c, err, ++ bkey_extent_size_zero, ++ "size == 0"); + -+ if (k.k->size > k.k->p.offset) { -+ prt_printf(err, "size greater than offset (%u > %llu)", -+ k.k->size, k.k->p.offset); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err, ++ bkey_extent_size_greater_than_offset, ++ "size greater than offset (%u > %llu)", ++ k.k->size, k.k->p.offset); + } else { -+ if (k.k->size) { -+ prt_printf(err, "size != 0"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(k.k->size, c, err, ++ bkey_size_nonzero, ++ "size != 0"); + } + + if (type != BKEY_TYPE_btree) { -+ if (!btree_type_has_snapshots((enum btree_id) type) && -+ k.k->p.snapshot) { -+ prt_printf(err, "nonzero snapshot"); -+ return -BCH_ERR_invalid_bkey; ++ enum btree_id btree = type - 1; ++ ++ if (btree_type_has_snapshots(btree)) { ++ bkey_fsck_err_on(!k.k->p.snapshot, c, err, ++ bkey_snapshot_zero, ++ "snapshot == 0"); ++ } else if (!btree_type_has_snapshot_field(btree)) { ++ bkey_fsck_err_on(k.k->p.snapshot, c, err, ++ bkey_snapshot_nonzero, ++ "nonzero snapshot"); ++ } else { ++ /* ++ * btree uses snapshot field but it's not required to be ++ * nonzero ++ */ + } + -+ if (btree_type_has_snapshots((enum btree_id) type) && -+ !k.k->p.snapshot) { -+ prt_printf(err, "snapshot == 0"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (bkey_eq(k.k->p, POS_MAX)) { -+ prt_printf(err, "key at POS_MAX"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err, ++ bkey_at_pos_max, ++ "key at POS_MAX"); + } -+ -+ return 0; ++fsck_err: ++ return ret; +} + +int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -13036,20 +13111,20 @@ index 000000000000..d9fb1fc81f1e + bch2_bkey_val_invalid(c, k, flags, err); +} + -+int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, -+ struct printbuf *err) ++int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, ++ struct bkey_s_c k, struct printbuf *err) +{ -+ if (bpos_lt(k.k->p, b->data->min_key)) { -+ prt_printf(err, "key before start of btree node"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ int ret = 0; + -+ if (bpos_gt(k.k->p, b->data->max_key)) { -+ prt_printf(err, "key past end of btree node"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err, ++ bkey_before_start_of_btree_node, ++ "key before start of btree node"); + -+ return 0; ++ bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err, ++ bkey_after_end_of_btree_node, ++ "key past end of btree node"); ++fsck_err: ++ return ret; +} + +void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) @@ -13277,10 +13352,10 @@ index 000000000000..d9fb1fc81f1e +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h new file mode 100644 -index 000000000000..668f595e2fcf +index 000000000000..3a370b7087ac --- /dev/null +++ b/fs/bcachefs/bkey_methods.h -@@ -0,0 +1,188 @@ +@@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_METHODS_H +#define _BCACHEFS_BKEY_METHODS_H @@ -13304,7 +13379,7 @@ index 000000000000..668f595e2fcf + * being read or written; more aggressive checks can be enabled when rw == WRITE. + */ +struct bkey_ops { -+ int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, ++ int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err); + void (*val_to_text)(struct printbuf *, struct bch_fs *, + struct bkey_s_c); @@ -13338,7 +13413,8 @@ index 000000000000..668f595e2fcf + enum bkey_invalid_flags, struct printbuf *); +int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, + enum bkey_invalid_flags, struct printbuf *); -+int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); ++int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, ++ struct bkey_s_c, struct printbuf *); + +void bch2_bpos_to_text(struct printbuf *, struct bpos); +void bch2_bkey_to_text(struct printbuf *, const struct bkey *); @@ -13402,16 +13478,6 @@ index 000000000000..668f595e2fcf +#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) +#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) + -+#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ -+ ((1U << KEY_TYPE_alloc)| \ -+ (1U << KEY_TYPE_alloc_v2)| \ -+ (1U << KEY_TYPE_alloc_v3)| \ -+ (1U << KEY_TYPE_alloc_v4)| \ -+ (1U << KEY_TYPE_stripe)| \ -+ (1U << KEY_TYPE_inode)| \ -+ (1U << KEY_TYPE_inode_v2)| \ -+ (1U << KEY_TYPE_snapshot)) -+ +static inline int bch2_trans_mark_key(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, @@ -13471,7 +13537,7 @@ index 000000000000..668f595e2fcf +#endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c new file mode 100644 -index 000000000000..b9aa027c881b +index 000000000000..bcca9e76a0b4 --- /dev/null +++ b/fs/bcachefs/bkey_sort.c @@ -0,0 +1,201 @@ @@ -13583,7 +13649,7 @@ index 000000000000..b9aa027c881b + while ((k = sort_iter_peek(iter))) { + if (!bkey_deleted(k) && + !should_drop_next_key(iter)) { -+ bkey_copy(out, k); ++ bkey_p_copy(out, k); + btree_keys_account_key_add(&nr, 0, out); + out = bkey_p_next(out); + } @@ -13614,7 +13680,7 @@ index 000000000000..b9aa027c881b + continue; + + if (!transform) -+ bkey_copy(out, in); ++ bkey_p_copy(out, in); + else if (bch2_bkey_transform(out_f, out, bkey_packed(in) + ? in_f : &bch2_bkey_format_current, in)) + out->format = KEY_FORMAT_LOCAL_BTREE; @@ -13668,7 +13734,7 @@ index 000000000000..b9aa027c881b + memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in)); + set_bkeyp_val_u64s(f, out, 0); + } else { -+ bkey_copy(out, in); ++ bkey_p_copy(out, in); + } + out->needs_whiteout |= needs_whiteout; + out = bkey_p_next(out); @@ -15883,10 +15949,10 @@ index 000000000000..632c2b8c5460 +#endif /* _BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c new file mode 100644 -index 000000000000..82cf243aa288 +index 000000000000..0b084fbc478a --- /dev/null +++ b/fs/bcachefs/btree_cache.c -@@ -0,0 +1,1202 @@ +@@ -0,0 +1,1215 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -16365,7 +16431,7 @@ index 000000000000..82cf243aa288 + bc->shrink.count_objects = bch2_btree_cache_count; + bc->shrink.scan_objects = bch2_btree_cache_scan; + bc->shrink.seeks = 4; -+ ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name); ++ ret = register_shrinker(&bc->shrink, "%s-btree_cache", c->name); + if (ret) + goto err; + @@ -16672,12 +16738,12 @@ index 000000000000..82cf243aa288 + "btree node header doesn't match ptr\n" + "btree %s level %u\n" + "ptr: ", -+ bch2_btree_ids[b->c.btree_id], b->c.level); ++ bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + prt_printf(&buf, "\nheader: btree %s level %llu\n" + "min ", -+ bch2_btree_ids[BTREE_NODE_ID(b->data)], ++ bch2_btree_id_str(BTREE_NODE_ID(b->data)), + BTREE_NODE_LEVEL(b->data)); + bch2_bpos_to_text(&buf, b->data->min_key); + @@ -17040,8 +17106,21 @@ index 000000000000..82cf243aa288 + six_unlock_intent(&b->c.lock); +} + -+void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, -+ const struct btree *b) ++const char *bch2_btree_id_str(enum btree_id btree) ++{ ++ return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)"; ++} ++ ++void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) ++{ ++ prt_printf(out, "%s level %u/%u\n ", ++ bch2_btree_id_str(b->c.btree_id), ++ b->c.level, ++ bch2_btree_id_root(c, b->c.btree_id)->level); ++ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++} ++ ++void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) +{ + struct bset_stats stats; + @@ -17091,10 +17170,10 @@ index 000000000000..82cf243aa288 +} diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h new file mode 100644 -index 000000000000..1e562b6efa62 +index 000000000000..cfb80b201d61 --- /dev/null +++ b/fs/bcachefs/btree_cache.h -@@ -0,0 +1,130 @@ +@@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_CACHE_H +#define _BCACHEFS_BTREE_CACHE_H @@ -17220,17 +17299,18 @@ index 000000000000..1e562b6efa62 + return bch2_btree_id_root(c, b->c.btree_id)->b; +} + -+void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, -+ const struct btree *); ++const char *bch2_btree_id_str(enum btree_id); ++void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); ++void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); +void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c new file mode 100644 -index 000000000000..693ed067b1a7 +index 000000000000..0b5d09c8475d --- /dev/null +++ b/fs/bcachefs/btree_gc.c -@@ -0,0 +1,2111 @@ +@@ -0,0 +1,2145 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet @@ -17328,15 +17408,15 @@ index 000000000000..693ed067b1a7 + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); + + if (__fsck_err(c, -+ FSCK_CAN_FIX| -+ FSCK_CAN_IGNORE| -+ FSCK_NO_RATELIMIT, -+ "btree node with incorrect min_key at btree %s level %u:\n" -+ " prev %s\n" -+ " cur %s", -+ bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1.buf, buf2.buf) && -+ should_restart_for_topology_repair(c)) { ++ FSCK_CAN_FIX| ++ FSCK_CAN_IGNORE| ++ FSCK_NO_RATELIMIT, ++ btree_node_topology_bad_min_key, ++ "btree node with incorrect min_key at btree %s level %u:\n" ++ " prev %s\n" ++ " cur %s", ++ bch2_btree_id_str(b->c.btree_id), b->c.level, ++ buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); + goto err; @@ -17355,14 +17435,12 @@ index 000000000000..693ed067b1a7 + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); + bch2_bpos_to_text(&buf2, node_end); + -+ if (__fsck_err(c, -+ FSCK_CAN_FIX| -+ FSCK_CAN_IGNORE| -+ FSCK_NO_RATELIMIT, ++ if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT, ++ btree_node_topology_bad_max_key, + "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", -+ bch2_btree_ids[b->c.btree_id], b->c.level, ++ bch2_btree_id_str(b->c.btree_id), b->c.level, + buf1.buf, buf2.buf) && + should_restart_for_topology_repair(c)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); @@ -17520,10 +17598,11 @@ index 000000000000..693ed067b1a7 + + if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key, + cur->data->min_key), c, ++ btree_node_topology_overwritten_by_next_node, + "btree node overwritten by next node at btree %s level %u:\n" + " node %s\n" + " next %s", -+ bch2_btree_ids[b->c.btree_id], b->c.level, ++ bch2_btree_id_str(b->c.btree_id), b->c.level, + buf1.buf, buf2.buf)) { + ret = DROP_PREV_NODE; + goto out; @@ -17531,10 +17610,11 @@ index 000000000000..693ed067b1a7 + + if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p, + bpos_predecessor(cur->data->min_key)), c, ++ btree_node_topology_bad_max_key, + "btree node with incorrect max_key at btree %s level %u:\n" + " node %s\n" + " next %s", -+ bch2_btree_ids[b->c.btree_id], b->c.level, ++ bch2_btree_id_str(b->c.btree_id), b->c.level, + buf1.buf, buf2.buf)) + ret = set_node_max(c, prev, + bpos_predecessor(cur->data->min_key)); @@ -17543,20 +17623,22 @@ index 000000000000..693ed067b1a7 + + if (mustfix_fsck_err_on(bpos_ge(expected_start, + cur->data->max_key), c, ++ btree_node_topology_overwritten_by_prev_node, + "btree node overwritten by prev node at btree %s level %u:\n" + " prev %s\n" + " node %s", -+ bch2_btree_ids[b->c.btree_id], b->c.level, ++ bch2_btree_id_str(b->c.btree_id), b->c.level, + buf1.buf, buf2.buf)) { + ret = DROP_THIS_NODE; + goto out; + } + + if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c, ++ btree_node_topology_bad_min_key, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " node %s", -+ bch2_btree_ids[b->c.btree_id], b->c.level, ++ bch2_btree_id_str(b->c.btree_id), b->c.level, + buf1.buf, buf2.buf)) + ret = set_node_min(c, cur, expected_start); + } @@ -17577,10 +17659,11 @@ index 000000000000..693ed067b1a7 + bch2_bpos_to_text(&buf2, b->key.k.p); + + if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c, ++ btree_node_topology_bad_max_key, + "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", -+ bch2_btree_ids[b->c.btree_id], b->c.level, ++ bch2_btree_id_str(b->c.btree_id), b->c.level, + buf1.buf, buf2.buf)) { + ret = set_node_max(c, child, b->key.k.p); + if (ret) @@ -17629,9 +17712,10 @@ index 000000000000..693ed067b1a7 + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); + + if (mustfix_fsck_err_on(ret == -EIO, c, ++ btree_node_unreadable, + "Topology repair: unreadable btree node at btree %s level %u:\n" + " %s", -+ bch2_btree_ids[b->c.btree_id], ++ bch2_btree_id_str(b->c.btree_id), + b->c.level - 1, + buf.buf)) { + bch2_btree_node_evict(trans, cur_k.k); @@ -17737,9 +17821,10 @@ index 000000000000..693ed067b1a7 + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + if (mustfix_fsck_err_on(!have_child, c, ++ btree_node_topology_interior_node_empty, + "empty interior btree node at btree %s level %u\n" + " %s", -+ bch2_btree_ids[b->c.btree_id], ++ bch2_btree_id_str(b->c.btree_id), + b->c.level, buf.buf)) + ret = DROP_THIS_NODE; +err: @@ -17815,7 +17900,8 @@ index 000000000000..693ed067b1a7 + + if (!g->gen_valid && + (c->opts.reconstruct_alloc || -+ fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" ++ fsck_err(c, ptr_to_missing_alloc_key, ++ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], @@ -17832,7 +17918,8 @@ index 000000000000..693ed067b1a7 + + if (gen_cmp(p.ptr.gen, g->gen) > 0 && + (c->opts.reconstruct_alloc || -+ fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" ++ fsck_err(c, ptr_gen_newer_than_bucket_gen, ++ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], @@ -17853,7 +17940,8 @@ index 000000000000..693ed067b1a7 + + if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX && + (c->opts.reconstruct_alloc || -+ fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" ++ fsck_err(c, ptr_gen_newer_than_bucket_gen, ++ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_types[ptr_data_type(k->k, &p.ptr)], @@ -17864,7 +17952,8 @@ index 000000000000..693ed067b1a7 + + if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 && + (c->opts.reconstruct_alloc || -+ fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" ++ fsck_err(c, stale_dirty_ptr, ++ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], @@ -17878,6 +17967,7 @@ index 000000000000..693ed067b1a7 + + if (fsck_err_on(bucket_data_type(g->data_type) && + bucket_data_type(g->data_type) != data_type, c, ++ ptr_bucket_data_type_mismatch, + "bucket %u:%zu different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), @@ -17897,6 +17987,7 @@ index 000000000000..693ed067b1a7 + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); + + if (fsck_err_on(!m || !m->alive, c, ++ ptr_to_missing_stripe, + "pointer to nonexistent stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, @@ -17905,6 +17996,7 @@ index 000000000000..693ed067b1a7 + do_update = true; + + if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, ++ ptr_to_incorrect_stripe, + "pointer does not match stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, @@ -18044,6 +18136,7 @@ index 000000000000..693ed067b1a7 + goto err; + + if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, ++ bkey_version_in_future, + "key version number higher than recorded: %llu > %llu", + k->k->version.lo, + atomic64_read(&c->key_version))) @@ -18201,9 +18294,10 @@ index 000000000000..693ed067b1a7 + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| + FSCK_NO_RATELIMIT, ++ btree_node_read_error, + "Unreadable btree node at btree %s level %u:\n" + " %s", -+ bch2_btree_ids[b->c.btree_id], ++ bch2_btree_id_str(b->c.btree_id), + b->c.level - 1, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && @@ -18258,6 +18352,7 @@ index 000000000000..693ed067b1a7 + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->min_key); + if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c, ++ btree_root_bad_min_key, + "btree root with incorrect min_key: %s", buf.buf)) { + bch_err(c, "repair unimplemented"); + ret = -BCH_ERR_fsck_repair_unimplemented; @@ -18267,6 +18362,7 @@ index 000000000000..693ed067b1a7 + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->max_key); + if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c, ++ btree_root_bad_max_key, + "btree root with incorrect max_key: %s", buf.buf)) { + bch_err(c, "repair unimplemented"); + ret = -BCH_ERR_fsck_repair_unimplemented; @@ -18440,16 +18536,16 @@ index 000000000000..693ed067b1a7 + + percpu_down_write(&c->mark_lock); + -+#define copy_field(_f, _msg, ...) \ ++#define copy_field(_err, _f, _msg, ...) \ + if (dst->_f != src->_f && \ + (!verify || \ -+ fsck_err(c, _msg ": got %llu, should be %llu" \ ++ fsck_err(c, _err, _msg ": got %llu, should be %llu" \ + , ##__VA_ARGS__, dst->_f, src->_f))) \ + dst->_f = src->_f -+#define copy_dev_field(_f, _msg, ...) \ -+ copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) -+#define copy_fs_field(_f, _msg, ...) \ -+ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) ++#define copy_dev_field(_err, _f, _msg, ...) \ ++ copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) ++#define copy_fs_field(_err, _f, _msg, ...) \ ++ copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__) + + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); @@ -18460,13 +18556,17 @@ index 000000000000..693ed067b1a7 + bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, + dev_usage_u64s()); + -+ copy_dev_field(buckets_ec, "buckets_ec"); -+ + for (i = 0; i < BCH_DATA_NR; i++) { -+ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); -+ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); -+ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); ++ copy_dev_field(dev_usage_buckets_wrong, ++ d[i].buckets, "%s buckets", bch2_data_types[i]); ++ copy_dev_field(dev_usage_sectors_wrong, ++ d[i].sectors, "%s sectors", bch2_data_types[i]); ++ copy_dev_field(dev_usage_fragmented_wrong, ++ d[i].fragmented, "%s fragmented", bch2_data_types[i]); + } ++ ++ copy_dev_field(dev_usage_buckets_ec_wrong, ++ buckets_ec, "buckets_ec"); + } + + { @@ -18475,17 +18575,24 @@ index 000000000000..693ed067b1a7 + struct bch_fs_usage *src = (void *) + bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); + -+ copy_fs_field(hidden, "hidden"); -+ copy_fs_field(btree, "btree"); ++ copy_fs_field(fs_usage_hidden_wrong, ++ hidden, "hidden"); ++ copy_fs_field(fs_usage_btree_wrong, ++ btree, "btree"); + + if (!metadata_only) { -+ copy_fs_field(data, "data"); -+ copy_fs_field(cached, "cached"); -+ copy_fs_field(reserved, "reserved"); -+ copy_fs_field(nr_inodes,"nr_inodes"); ++ copy_fs_field(fs_usage_data_wrong, ++ data, "data"); ++ copy_fs_field(fs_usage_cached_wrong, ++ cached, "cached"); ++ copy_fs_field(fs_usage_reserved_wrong, ++ reserved, "reserved"); ++ copy_fs_field(fs_usage_nr_inodes_wrong, ++ nr_inodes,"nr_inodes"); + + for (i = 0; i < BCH_REPLICAS_MAX; i++) -+ copy_fs_field(persistent_reserved[i], ++ copy_fs_field(fs_usage_persistent_reserved_wrong, ++ persistent_reserved[i], + "persistent_reserved[%i]", i); + } + @@ -18501,7 +18608,8 @@ index 000000000000..693ed067b1a7 + printbuf_reset(&buf); + bch2_replicas_entry_to_text(&buf, e); + -+ copy_fs_field(replicas[i], "%s", buf.buf); ++ copy_fs_field(fs_usage_replicas_wrong, ++ replicas[i], "%s", buf.buf); + } + } + @@ -18637,6 +18745,7 @@ index 000000000000..693ed067b1a7 + + if (c->opts.reconstruct_alloc || + fsck_err_on(new.data_type != gc.data_type, c, ++ alloc_key_data_type_wrong, + "bucket %llu:%llu gen %u has wrong data_type" + ": got %s, should be %s", + iter->pos.inode, iter->pos.offset, @@ -18645,9 +18754,9 @@ index 000000000000..693ed067b1a7 + bch2_data_types[gc.data_type])) + new.data_type = gc.data_type; + -+#define copy_bucket_field(_f) \ ++#define copy_bucket_field(_errtype, _f) \ + if (c->opts.reconstruct_alloc || \ -+ fsck_err_on(new._f != gc._f, c, \ ++ fsck_err_on(new._f != gc._f, c, _errtype, \ + "bucket %llu:%llu gen %u data type %s has wrong " #_f \ + ": got %u, should be %u", \ + iter->pos.inode, iter->pos.offset, \ @@ -18656,11 +18765,16 @@ index 000000000000..693ed067b1a7 + new._f, gc._f)) \ + new._f = gc._f; \ + -+ copy_bucket_field(gen); -+ copy_bucket_field(dirty_sectors); -+ copy_bucket_field(cached_sectors); -+ copy_bucket_field(stripe_redundancy); -+ copy_bucket_field(stripe); ++ copy_bucket_field(alloc_key_gen_wrong, ++ gen); ++ copy_bucket_field(alloc_key_dirty_sectors_wrong, ++ dirty_sectors); ++ copy_bucket_field(alloc_key_cached_sectors_wrong, ++ cached_sectors); ++ copy_bucket_field(alloc_key_stripe_wrong, ++ stripe); ++ copy_bucket_field(alloc_key_stripe_redundancy_wrong, ++ stripe_redundancy); +#undef copy_bucket_field + + if (!bch2_alloc_v4_cmp(*old, new)) @@ -18817,6 +18931,7 @@ index 000000000000..693ed067b1a7 + } + + if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, ++ reflink_v_refcount_wrong, + "reflink key has wrong refcount:\n" + " %s\n" + " should be %u", @@ -18942,7 +19057,8 @@ index 000000000000..693ed067b1a7 + if (bad) + bch2_bkey_val_to_text(&buf, c, k); + -+ if (fsck_err_on(bad, c, "%s", buf.buf)) { ++ if (fsck_err_on(bad, c, stripe_sector_count_wrong, ++ "%s", buf.buf)) { + struct bkey_i_stripe *new; + + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); @@ -19187,19 +19303,17 @@ index 000000000000..693ed067b1a7 + trans = bch2_trans_get(c); + + for_each_member_device(ca, c, i) { -+ struct bucket_gens *gens; ++ struct bucket_gens *gens = bucket_gens(ca); + + BUG_ON(ca->oldest_gen); + -+ ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL); ++ ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL); + if (!ca->oldest_gen) { + percpu_ref_put(&ca->ref); + ret = -BCH_ERR_ENOMEM_gc_gens; + goto err; + } + -+ gens = bucket_gens(ca); -+ + for (b = gens->first_bucket; + b < gens->nbuckets; b++) + ca->oldest_gen[b] = gens->b[b]; @@ -19464,10 +19578,10 @@ index 000000000000..607575f83a00 +#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c new file mode 100644 -index 000000000000..a869cf6ac7c6 +index 000000000000..37d896edb06e --- /dev/null +++ b/fs/bcachefs/btree_io.c -@@ -0,0 +1,2223 @@ +@@ -0,0 +1,2298 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -19654,7 +19768,7 @@ index 000000000000..a869cf6ac7c6 + k = new_whiteouts; + + while (ptrs != ptrs_end) { -+ bkey_copy(k, *ptrs); ++ bkey_p_copy(k, *ptrs); + k = bkey_p_next(k); + ptrs++; + } @@ -19730,7 +19844,7 @@ index 000000000000..a869cf6ac7c6 + n = bkey_p_next(k); + + if (!bkey_deleted(k)) { -+ bkey_copy(out, k); ++ bkey_p_copy(out, k); + out = bkey_p_next(out); + } else { + BUG_ON(k->needs_whiteout); @@ -19980,16 +20094,6 @@ index 000000000000..a869cf6ac7c6 + bch2_trans_node_reinit_iter(trans, b); +} + -+static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, -+ struct btree *b) -+{ -+ prt_printf(out, "%s level %u/%u\n ", -+ bch2_btree_ids[b->c.btree_id], -+ b->c.level, -+ bch2_btree_id_root(c, b->c.btree_id)->level); -+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); -+} -+ +static void btree_err_msg(struct printbuf *out, struct bch_fs *c, + struct bch_dev *ca, + struct btree *b, struct bset *i, @@ -20002,7 +20106,7 @@ index 000000000000..a869cf6ac7c6 + if (ca) + prt_printf(out, "on %s ", ca->name); + prt_printf(out, "at btree "); -+ btree_pos_to_text(out, c, b); ++ bch2_btree_pos_to_text(out, c, b); + + prt_printf(out, "\n node offset %u", b->written); + if (i) @@ -20010,7 +20114,7 @@ index 000000000000..a869cf6ac7c6 + prt_str(out, ": "); +} + -+__printf(8, 9) ++__printf(9, 10) +static int __btree_err(int ret, + struct bch_fs *c, + struct bch_dev *ca, @@ -20018,6 +20122,7 @@ index 000000000000..a869cf6ac7c6 + struct bset *i, + int write, + bool have_retry, ++ enum bch_sb_error_id err_type, + const char *fmt, ...) +{ + struct printbuf out = PRINTBUF; @@ -20042,9 +20147,15 @@ index 000000000000..a869cf6ac7c6 + if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) + ret = -BCH_ERR_btree_node_read_err_bad_node; + ++ if (ret != -BCH_ERR_btree_node_read_err_fixable) ++ bch2_sb_error_count(c, err_type); ++ + switch (ret) { + case -BCH_ERR_btree_node_read_err_fixable: -+ mustfix_fsck_err(c, "%s", out.buf); ++ ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf); ++ if (ret != -BCH_ERR_fsck_fix && ++ ret != -BCH_ERR_fsck_ignore) ++ goto fsck_err; + ret = -BCH_ERR_fsck_fix; + break; + case -BCH_ERR_btree_node_read_err_want_retry: @@ -20069,9 +20180,11 @@ index 000000000000..a869cf6ac7c6 + return ret; +} + -+#define btree_err(type, c, ca, b, i, msg, ...) \ ++#define btree_err(type, c, ca, b, i, _err_type, msg, ...) \ +({ \ -+ int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\ ++ int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \ ++ BCH_FSCK_ERR_##_err_type, \ ++ msg, ##__VA_ARGS__); \ + \ + if (_ret != -BCH_ERR_fsck_fix) { \ + ret = _ret; \ @@ -20146,13 +20259,17 @@ index 000000000000..a869cf6ac7c6 + int ret = 0; + + btree_err_on(!bch2_version_compatible(version), -+ -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_incompatible, ++ c, ca, b, i, ++ btree_node_unsupported_version, + "unsupported bset version %u.%u", + BCH_VERSION_MAJOR(version), + BCH_VERSION_MINOR(version)); + + if (btree_err_on(version < c->sb.version_min, -+ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, ++ c, NULL, b, i, ++ btree_node_bset_older_than_sb_min, + "bset version %u older than superblock version_min %u", + version, c->sb.version_min)) { + mutex_lock(&c->sb_lock); @@ -20163,7 +20280,9 @@ index 000000000000..a869cf6ac7c6 + + if (btree_err_on(BCH_VERSION_MAJOR(version) > + BCH_VERSION_MAJOR(c->sb.version), -+ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, ++ c, NULL, b, i, ++ btree_node_bset_newer_than_sb, + "bset version %u newer than superblock version %u", + version, c->sb.version)) { + mutex_lock(&c->sb_lock); @@ -20173,11 +20292,15 @@ index 000000000000..a869cf6ac7c6 + } + + btree_err_on(BSET_SEPARATE_WHITEOUTS(i), -+ -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_incompatible, ++ c, ca, b, i, ++ btree_node_unsupported_version, + "BSET_SEPARATE_WHITEOUTS no longer supported"); + + if (btree_err_on(offset + sectors > btree_sectors(c), -+ -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, ++ c, ca, b, i, ++ bset_past_end_of_btree_node, + "bset past end of btree node")) { + i->u64s = 0; + ret = 0; @@ -20185,12 +20308,15 @@ index 000000000000..a869cf6ac7c6 + } + + btree_err_on(offset && !i->u64s, -+ -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, ++ c, ca, b, i, ++ bset_empty, + "empty bset"); + -+ btree_err_on(BSET_OFFSET(i) && -+ BSET_OFFSET(i) != offset, -+ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, ++ btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset, ++ -BCH_ERR_btree_node_read_err_want_retry, ++ c, ca, b, i, ++ bset_wrong_sector_offset, + "bset at wrong sector offset"); + + if (!offset) { @@ -20204,16 +20330,22 @@ index 000000000000..a869cf6ac7c6 + + /* XXX endianness */ + btree_err_on(bp->seq != bn->keys.seq, -+ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_must_retry, ++ c, ca, b, NULL, ++ bset_bad_seq, + "incorrect sequence number (wrong btree node)"); + } + + btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, -+ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_must_retry, ++ c, ca, b, i, ++ btree_node_bad_btree, + "incorrect btree id"); + + btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, -+ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_must_retry, ++ c, ca, b, i, ++ btree_node_bad_level, + "incorrect level"); + + if (!write) @@ -20230,7 +20362,9 @@ index 000000000000..a869cf6ac7c6 + } + + btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), -+ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_must_retry, ++ c, ca, b, NULL, ++ btree_node_bad_min_key, + "incorrect min_key: got %s should be %s", + (printbuf_reset(&buf1), + bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), @@ -20239,7 +20373,9 @@ index 000000000000..a869cf6ac7c6 + } + + btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), -+ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_must_retry, ++ c, ca, b, i, ++ btree_node_bad_max_key, + "incorrect max key %s", + (printbuf_reset(&buf1), + bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); @@ -20249,7 +20385,9 @@ index 000000000000..a869cf6ac7c6 + BSET_BIG_ENDIAN(i), write, bn); + + btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), -+ -BCH_ERR_btree_node_read_err_bad_node, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_bad_node, ++ c, ca, b, i, ++ btree_node_bad_format, + "invalid bkey format: %s\n %s", buf1.buf, + (printbuf_reset(&buf2), + bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); @@ -20272,7 +20410,7 @@ index 000000000000..a869cf6ac7c6 + struct printbuf *err) +{ + return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?: -+ (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?: ++ (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?: + (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); +} + @@ -20293,14 +20431,18 @@ index 000000000000..a869cf6ac7c6 + struct bkey tmp; + + if (btree_err_on(bkey_p_next(k) > vstruct_last(i), -+ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, ++ c, NULL, b, i, ++ btree_node_bkey_past_bset_end, + "key extends past end of bset")) { + i->u64s = cpu_to_le16((u64 *) k - i->_data); + break; + } + + if (btree_err_on(k->format > KEY_FORMAT_CURRENT, -+ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, ++ c, NULL, b, i, ++ btree_node_bkey_bad_format, + "invalid bkey format %u", k->format)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), @@ -20319,12 +20461,14 @@ index 000000000000..a869cf6ac7c6 + printbuf_reset(&buf); + if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { + printbuf_reset(&buf); -+ prt_printf(&buf, "invalid bkey: "); + bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); + -+ btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf); ++ btree_err(-BCH_ERR_btree_node_read_err_fixable, ++ c, NULL, b, i, ++ btree_node_bad_bkey, ++ "invalid bkey: %s", buf.buf); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), @@ -20348,7 +20492,10 @@ index 000000000000..a869cf6ac7c6 + + bch2_dump_bset(c, b, i, 0); + -+ if (btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf)) { ++ if (btree_err(-BCH_ERR_btree_node_read_err_fixable, ++ c, NULL, b, i, ++ btree_node_bkey_out_of_order, ++ "%s", buf.buf)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); @@ -20389,47 +20536,62 @@ index 000000000000..a869cf6ac7c6 + sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2); + + if (bch2_meta_read_fault("btree")) -+ btree_err(-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, ++ btree_err(-BCH_ERR_btree_node_read_err_must_retry, ++ c, ca, b, NULL, ++ btree_node_fault_injected, + "dynamic fault"); + + btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), -+ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_must_retry, ++ c, ca, b, NULL, ++ btree_node_bad_magic, + "bad magic: want %llx, got %llx", + bset_magic(c), le64_to_cpu(b->data->magic)); + -+ btree_err_on(!b->data->keys.seq, -+ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, -+ "bad btree header: seq 0"); -+ + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + btree_err_on(b->data->keys.seq != bp->seq, -+ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_must_retry, ++ c, ca, b, NULL, ++ btree_node_bad_seq, + "got wrong btree node (seq %llx want %llx)", + b->data->keys.seq, bp->seq); ++ } else { ++ btree_err_on(!b->data->keys.seq, ++ -BCH_ERR_btree_node_read_err_must_retry, ++ c, ca, b, NULL, ++ btree_node_bad_seq, ++ "bad btree header: seq 0"); + } + + while (b->written < (ptr_written ?: btree_sectors(c))) { + unsigned sectors; + struct nonce nonce; -+ struct bch_csum csum; + bool first = !b->written; ++ bool csum_bad; + + if (!b->written) { + i = &b->data->keys; + + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -+ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, -+ "unknown checksum type %llu", -+ BSET_CSUM_TYPE(i)); ++ -BCH_ERR_btree_node_read_err_want_retry, ++ c, ca, b, i, ++ bset_unknown_csum, ++ "unknown checksum type %llu", BSET_CSUM_TYPE(i)); + + nonce = btree_nonce(i, b->written << 9); -+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + -+ btree_err_on(bch2_crc_cmp(csum, b->data->csum), -+ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, ++ csum_bad = bch2_crc_cmp(b->data->csum, ++ csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data)); ++ if (csum_bad) ++ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); ++ ++ btree_err_on(csum_bad, ++ -BCH_ERR_btree_node_read_err_want_retry, ++ c, ca, b, i, ++ bset_bad_csum, + "invalid checksum"); + + ret = bset_encrypt(c, i, b->written << 9); @@ -20439,7 +20601,9 @@ index 000000000000..a869cf6ac7c6 + + btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), -+ -BCH_ERR_btree_node_read_err_incompatible, c, NULL, b, NULL, ++ -BCH_ERR_btree_node_read_err_incompatible, ++ c, NULL, b, NULL, ++ btree_node_unsupported_version, + "btree node does not have NEW_EXTENT_OVERWRITE set"); + + sectors = vstruct_sectors(b->data, c->block_bits); @@ -20451,15 +20615,21 @@ index 000000000000..a869cf6ac7c6 + break; + + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -+ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, -+ "unknown checksum type %llu", -+ BSET_CSUM_TYPE(i)); ++ -BCH_ERR_btree_node_read_err_want_retry, ++ c, ca, b, i, ++ bset_unknown_csum, ++ "unknown checksum type %llu", BSET_CSUM_TYPE(i)); + + nonce = btree_nonce(i, b->written << 9); -+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); ++ csum_bad = bch2_crc_cmp(bne->csum, ++ csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne)); ++ if (csum_bad) ++ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + -+ btree_err_on(bch2_crc_cmp(csum, bne->csum), -+ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, ++ btree_err_on(csum_bad, ++ -BCH_ERR_btree_node_read_err_want_retry, ++ c, ca, b, i, ++ bset_bad_csum, + "invalid checksum"); + + ret = bset_encrypt(c, i, b->written << 9); @@ -20492,12 +20662,16 @@ index 000000000000..a869cf6ac7c6 + true); + + btree_err_on(blacklisted && first, -+ -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, ++ c, ca, b, i, ++ bset_blacklisted_journal_seq, + "first btree node bset has blacklisted journal seq (%llu)", + le64_to_cpu(i->journal_seq)); + + btree_err_on(blacklisted && ptr_written, -+ -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, ++ c, ca, b, i, ++ first_bset_blacklisted_journal_seq, + "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", + le64_to_cpu(i->journal_seq), + b->written, b->written + sectors, ptr_written); @@ -20514,7 +20688,9 @@ index 000000000000..a869cf6ac7c6 + + if (ptr_written) { + btree_err_on(b->written < ptr_written, -+ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_want_retry, ++ c, ca, b, NULL, ++ btree_node_data_missing, + "btree node data missing: expected %u sectors, found %u", + ptr_written, b->written); + } else { @@ -20525,7 +20701,9 @@ index 000000000000..a869cf6ac7c6 + !bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), + true), -+ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_want_retry, ++ c, ca, b, NULL, ++ btree_node_bset_after_end, + "found bset signature after last bset"); + } + @@ -20567,7 +20745,10 @@ index 000000000000..a869cf6ac7c6 + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); + -+ btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf); ++ btree_err(-BCH_ERR_btree_node_read_err_fixable, ++ c, NULL, b, i, ++ btree_node_bad_bkey, ++ "%s", buf.buf); + + btree_keys_account_key_drop(&b->nr, 0, k); + @@ -20647,8 +20828,9 @@ index 000000000000..a869cf6ac7c6 + } +start: + printbuf_reset(&buf); -+ btree_pos_to_text(&buf, c, b); -+ bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", ++ bch2_btree_pos_to_text(&buf, c, b); ++ bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, ++ "btree read error %s for %s", + bch2_blk_status_to_str(bio->bi_status), buf.buf); + if (rb->have_ioref) + percpu_ref_put(&ca->io_ref); @@ -20683,7 +20865,7 @@ index 000000000000..a869cf6ac7c6 + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->key.k.p); + bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", -+ __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf); ++ __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf); + + bch2_btree_node_rewrite_async(c, b); + } @@ -20792,14 +20974,20 @@ index 000000000000..a869cf6ac7c6 + } + + written2 = btree_node_sectors_written(c, ra->buf[i]); -+ if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, ++ if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, ++ c, NULL, b, NULL, ++ btree_node_replicas_sectors_written_mismatch, + "btree node sectors written mismatch: %u != %u", + written, written2) || + btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), -+ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, ++ -BCH_ERR_btree_node_read_err_fixable, ++ c, NULL, b, NULL, ++ btree_node_bset_after_end, + "found bset signature after last bset") || + btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), -+ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, ++ -BCH_ERR_btree_node_read_err_fixable, ++ c, NULL, b, NULL, ++ btree_node_replicas_data_mismatch, + "btree node replicas content mismatch")) + dump_bset_maps = true; + @@ -20994,7 +21182,7 @@ index 000000000000..a869cf6ac7c6 + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "btree node read error: no device to read from\n at "); -+ btree_pos_to_text(&buf, c, b); ++ bch2_btree_pos_to_text(&buf, c, b); + bch_err(c, "%s", buf.buf); + + if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && @@ -21229,7 +21417,8 @@ index 000000000000..a869cf6ac7c6 + if (wbio->have_ioref) + bch2_latency_acct(ca, wbio->submit_time, WRITE); + -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s", ++ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, ++ "btree write error: %s", + bch2_blk_status_to_str(bio->bi_status)) || + bch2_meta_write_fault("btree")) { + spin_lock_irqsave(&c->btree_write_error_lock, flags); @@ -21927,10 +22116,10 @@ index 000000000000..7e03dd76fb38 +#endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c new file mode 100644 -index 000000000000..1d79514754d7 +index 000000000000..c2adf3fbb0b3 --- /dev/null +++ b/fs/bcachefs/btree_iter.c -@@ -0,0 +1,3215 @@ +@@ -0,0 +1,3242 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -22190,7 +22379,7 @@ index 000000000000..1d79514754d7 + + BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && -+ !btree_type_has_snapshots(iter->btree_id)); ++ !btree_type_has_snapshot_field(iter->btree_id)); + + if (iter->update_path) + bch2_btree_path_verify(trans, iter->update_path); @@ -22295,7 +22484,7 @@ index 000000000000..1d79514754d7 + bch2_bpos_to_text(&buf, pos); + + panic("not locked: %s %s%s\n", -+ bch2_btree_ids[id], buf.buf, ++ bch2_btree_id_str(id), buf.buf, + key_cache ? " cached" : ""); +} + @@ -23042,6 +23231,9 @@ index 000000000000..1d79514754d7 + if (unlikely(ret)) + goto out; + ++ if (unlikely(!trans->srcu_held)) ++ bch2_trans_srcu_lock(trans); ++ + /* + * Ensure we obey path->should_be_locked: if it's set, we can't unlock + * and re-traverse the path without a transaction restart: @@ -23304,7 +23496,7 @@ index 000000000000..1d79514754d7 + struct bkey_s_c old = { &i->old_k, i->old_v }; + + prt_printf(buf, "update: btree=%s cached=%u %pS", -+ bch2_btree_ids[i->btree_id], ++ bch2_btree_id_str(i->btree_id), + i->cached, + (void *) i->ip_allocated); + prt_newline(buf); @@ -23320,7 +23512,7 @@ index 000000000000..1d79514754d7 + + trans_for_each_wb_update(trans, wb) { + prt_printf(buf, "update: btree=%s wb=1 %pS", -+ bch2_btree_ids[wb->btree], ++ bch2_btree_id_str(wb->btree), + (void *) i->ip_allocated); + prt_newline(buf); + @@ -23349,7 +23541,7 @@ index 000000000000..1d79514754d7 + path->idx, path->ref, path->intent_ref, + path->preserve ? 'P' : ' ', + path->should_be_locked ? 'S' : ' ', -+ bch2_btree_ids[path->btree_id], ++ bch2_btree_id_str(path->btree_id), + path->level); + bch2_bpos_to_text(out, path->pos); + @@ -23456,6 +23648,7 @@ index 000000000000..1d79514754d7 + path->ref = 0; + path->intent_ref = 0; + path->nodes_locked = 0; ++ path->alloc_seq++; + + btree_path_list_add(trans, pos, path); + trans->paths_sorted = false; @@ -23531,7 +23724,7 @@ index 000000000000..1d79514754d7 + + locks_want = min(locks_want, BTREE_MAX_DEPTH); + if (locks_want > path->locks_want) -+ bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want); ++ bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL); + + return path; +} @@ -24762,18 +24955,36 @@ index 000000000000..1d79514754d7 + return p; +} + -+static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans) ++static inline void check_srcu_held_too_long(struct btree_trans *trans) +{ -+ struct bch_fs *c = trans->c; -+ struct btree_path *path; ++ WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10), ++ "btree trans held srcu lock (delaying memory reclaim) for %lu seconds", ++ (jiffies - trans->srcu_lock_time) / HZ); ++} + -+ trans_for_each_path(trans, path) -+ if (path->cached && !btree_node_locked(path, 0)) -+ path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); ++void bch2_trans_srcu_unlock(struct btree_trans *trans) ++{ ++ if (trans->srcu_held) { ++ struct bch_fs *c = trans->c; ++ struct btree_path *path; + -+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); -+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); -+ trans->srcu_lock_time = jiffies; ++ trans_for_each_path(trans, path) ++ if (path->cached && !btree_node_locked(path, 0)) ++ path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); ++ ++ check_srcu_held_too_long(trans); ++ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); ++ trans->srcu_held = false; ++ } ++} ++ ++void bch2_trans_srcu_lock(struct btree_trans *trans) ++{ ++ if (!trans->srcu_held) { ++ trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier); ++ trans->srcu_lock_time = jiffies; ++ trans->srcu_held = true; ++ } +} + +/** @@ -24827,8 +25038,9 @@ index 000000000000..1d79514754d7 + } + trans->last_begin_time = now; + -+ if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) -+ bch2_trans_reset_srcu_lock(trans); ++ if (unlikely(trans->srcu_held && ++ time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) ++ bch2_trans_srcu_unlock(trans); + + trans->last_begin_ip = _RET_IP_; + if (trans->restarted) { @@ -24913,8 +25125,9 @@ index 000000000000..1d79514754d7 + trans->wb_updates_size = s->wb_updates_size; + } + -+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); ++ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + trans->srcu_lock_time = jiffies; ++ trans->srcu_held = true; + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { + struct btree_trans *pos; @@ -24958,7 +25171,7 @@ index 000000000000..1d79514754d7 + trans_for_each_path(trans, path) + if (path->ref) + printk(KERN_ERR " btree %s %pS\n", -+ bch2_btree_ids[path->btree_id], ++ bch2_btree_id_str(path->btree_id), + (void *) path->ip_allocated); + /* Be noisy about this: */ + bch2_fatal_error(c); @@ -24991,7 +25204,10 @@ index 000000000000..1d79514754d7 + + check_btree_paths_leaked(trans); + -+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); ++ if (trans->srcu_held) { ++ check_srcu_held_too_long(trans); ++ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); ++ } + + bch2_journal_preres_put(&c->journal, &trans->journal_preres); + @@ -25033,7 +25249,7 @@ index 000000000000..1d79514754d7 + + prt_tab(out); + prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b', -+ b->level, bch2_btree_ids[b->btree_id]); ++ b->level, bch2_btree_id_str(b->btree_id)); + bch2_bpos_to_text(out, btree_node_pos(b)); + + prt_tab(out); @@ -25063,7 +25279,7 @@ index 000000000000..1d79514754d7 + path->idx, + path->cached ? 'c' : 'b', + path->level, -+ bch2_btree_ids[path->btree_id]); ++ bch2_btree_id_str(path->btree_id)); + bch2_bpos_to_text(out, path->pos); + prt_newline(out); + @@ -25148,10 +25364,10 @@ index 000000000000..1d79514754d7 +} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h new file mode 100644 -index 000000000000..fbe273453db3 +index 000000000000..85e7cb52f6b6 --- /dev/null +++ b/fs/bcachefs/btree_iter.h -@@ -0,0 +1,939 @@ +@@ -0,0 +1,943 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_ITER_H +#define _BCACHEFS_BTREE_ITER_H @@ -25428,6 +25644,7 @@ index 000000000000..fbe273453db3 +int bch2_trans_relock(struct btree_trans *); +int bch2_trans_relock_notrace(struct btree_trans *); +void bch2_trans_unlock(struct btree_trans *); ++void bch2_trans_unlock_long(struct btree_trans *); +bool bch2_trans_locked(struct btree_trans *); + +static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count) @@ -25565,11 +25782,11 @@ index 000000000000..fbe273453db3 + flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; + + if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && -+ btree_node_type_is_extents(btree_id)) ++ btree_id_is_extents(btree_id)) + flags |= BTREE_ITER_IS_EXTENTS; + + if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && -+ !btree_type_has_snapshots(btree_id)) ++ !btree_type_has_snapshot_field(btree_id)) + flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + + if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && @@ -25733,6 +25950,9 @@ index 000000000000..fbe273453db3 + __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \ + KEY_TYPE_##_type, sizeof(*_val), _val) + ++void bch2_trans_srcu_unlock(struct btree_trans *); ++void bch2_trans_srcu_lock(struct btree_trans *); ++ +u32 bch2_trans_begin(struct btree_trans *); + +/* @@ -26693,7 +26913,7 @@ index 000000000000..5d64e7e22f26 +#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c new file mode 100644 -index 000000000000..29a0b566a4fe +index 000000000000..3304bff7d464 --- /dev/null +++ b/fs/bcachefs/btree_key_cache.c @@ -0,0 +1,1072 @@ @@ -27023,7 +27243,7 @@ index 000000000000..29a0b566a4fe + ck = bkey_cached_reuse(bc); + if (unlikely(!ck)) { + bch_err(c, "error allocating memory for key cache item, btree %s", -+ bch2_btree_ids[path->btree_id]); ++ bch2_btree_id_str(path->btree_id)); + return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); + } + @@ -27106,7 +27326,7 @@ index 000000000000..29a0b566a4fe + new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); + if (!new_k) { + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", -+ bch2_btree_ids[ck->key.btree_id], new_u64s); ++ bch2_btree_id_str(ck->key.btree_id), new_u64s); + ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; + goto err; + } @@ -27208,7 +27428,7 @@ index 000000000000..29a0b566a4fe + * path->uptodate yet: + */ + if (!path->locks_want && -+ !__bch2_btree_path_upgrade(trans, path, 1)) { ++ !__bch2_btree_path_upgrade(trans, path, 1, NULL)) { + trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); + goto err; @@ -27741,7 +27961,7 @@ index 000000000000..29a0b566a4fe + bc->shrink.seeks = 0; + bc->shrink.count_objects = bch2_btree_key_cache_count; + bc->shrink.scan_objects = bch2_btree_key_cache_scan; -+ if (register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name)) ++ if (register_shrinker(&bc->shrink, "%s-btree_key_cache", c->name)) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return 0; +} @@ -27825,10 +28045,10 @@ index 000000000000..be3acde2caa0 +#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c new file mode 100644 -index 000000000000..40c8ed8f7bf1 +index 000000000000..3d48834d091f --- /dev/null +++ b/fs/bcachefs/btree_locking.c -@@ -0,0 +1,791 @@ +@@ -0,0 +1,817 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -28262,7 +28482,8 @@ index 000000000000..40c8ed8f7bf1 + +static inline bool btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, -+ bool upgrade) ++ bool upgrade, ++ struct get_locks_fail *f) +{ + unsigned l = path->level; + int fail_idx = -1; @@ -28273,8 +28494,14 @@ index 000000000000..40c8ed8f7bf1 + + if (!(upgrade + ? bch2_btree_node_upgrade(trans, path, l) -+ : bch2_btree_node_relock(trans, path, l))) -+ fail_idx = l; ++ : bch2_btree_node_relock(trans, path, l))) { ++ fail_idx = l; ++ ++ if (f) { ++ f->l = l; ++ f->b = path->l[l].b; ++ } ++ } + + l++; + } while (l < path->locks_want); @@ -28415,7 +28642,9 @@ index 000000000000..40c8ed8f7bf1 +bool bch2_btree_path_relock_norestart(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ -+ return btree_path_get_locks(trans, path, false); ++ struct get_locks_fail f; ++ ++ return btree_path_get_locks(trans, path, false, &f); +} + +int __bch2_btree_path_relock(struct btree_trans *trans, @@ -28431,22 +28660,24 @@ index 000000000000..40c8ed8f7bf1 + +bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, + struct btree_path *path, -+ unsigned new_locks_want) ++ unsigned new_locks_want, ++ struct get_locks_fail *f) +{ + EBUG_ON(path->locks_want >= new_locks_want); + + path->locks_want = new_locks_want; + -+ return btree_path_get_locks(trans, path, true); ++ return btree_path_get_locks(trans, path, true, f); +} + +bool __bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, -+ unsigned new_locks_want) ++ unsigned new_locks_want, ++ struct get_locks_fail *f) +{ + struct btree_path *linked; + -+ if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want)) ++ if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f)) + return true; + + /* @@ -28475,7 +28706,7 @@ index 000000000000..40c8ed8f7bf1 + linked->btree_id == path->btree_id && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; -+ btree_path_get_locks(trans, linked, true); ++ btree_path_get_locks(trans, linked, true, NULL); + } + + return false; @@ -28487,6 +28718,9 @@ index 000000000000..40c8ed8f7bf1 +{ + unsigned l; + ++ if (trans->restarted) ++ return; ++ + EBUG_ON(path->locks_want < new_locks_want); + + path->locks_want = new_locks_want; @@ -28505,6 +28739,9 @@ index 000000000000..40c8ed8f7bf1 + } + + bch2_btree_path_verify_locks(path); ++ ++ path->downgrade_seq++; ++ trace_path_downgrade(trans, _RET_IP_, path); +} + +/* Btree transaction locking: */ @@ -28513,6 +28750,9 @@ index 000000000000..40c8ed8f7bf1 +{ + struct btree_path *path; + ++ if (trans->restarted) ++ return; ++ + trans_for_each_path(trans, path) + bch2_btree_path_downgrade(trans, path); +} @@ -28564,6 +28804,12 @@ index 000000000000..40c8ed8f7bf1 + __bch2_btree_path_unlock(trans, path); +} + ++void bch2_trans_unlock_long(struct btree_trans *trans) ++{ ++ bch2_trans_unlock(trans); ++ bch2_trans_srcu_unlock(trans); ++} ++ +bool bch2_trans_locked(struct btree_trans *trans) +{ + struct btree_path *path; @@ -28622,10 +28868,10 @@ index 000000000000..40c8ed8f7bf1 +#endif diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h new file mode 100644 -index 000000000000..6231e9ffc5d7 +index 000000000000..11b0a2c8cd69 --- /dev/null +++ b/fs/bcachefs/btree_locking.h -@@ -0,0 +1,423 @@ +@@ -0,0 +1,433 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_LOCKING_H +#define _BCACHEFS_BTREE_LOCKING_H @@ -28983,26 +29229,36 @@ index 000000000000..6231e9ffc5d7 + +/* upgrade */ + ++ ++struct get_locks_fail { ++ unsigned l; ++ struct btree *b; ++}; ++ +bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, -+ struct btree_path *, unsigned); ++ struct btree_path *, unsigned, ++ struct get_locks_fail *); ++ +bool __bch2_btree_path_upgrade(struct btree_trans *, -+ struct btree_path *, unsigned); ++ struct btree_path *, unsigned, ++ struct get_locks_fail *); + +static inline int bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ ++ struct get_locks_fail f; + unsigned old_locks_want = path->locks_want; + + new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); + + if (path->locks_want < new_locks_want -+ ? __bch2_btree_path_upgrade(trans, path, new_locks_want) ++ ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f) + : path->uptodate == BTREE_ITER_UPTODATE) + return 0; + + trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, -+ old_locks_want, new_locks_want); ++ old_locks_want, new_locks_want, &f); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); +} + @@ -29051,10 +29307,10 @@ index 000000000000..6231e9ffc5d7 +#endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c new file mode 100644 -index 000000000000..04c1f4610972 +index 000000000000..decad7b66c59 --- /dev/null +++ b/fs/bcachefs/btree_trans_commit.c -@@ -0,0 +1,1150 @@ +@@ -0,0 +1,1145 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -29326,6 +29582,7 @@ index 000000000000..04c1f4610972 + BUG_ON(i->level != i->path->level); + BUG_ON(i->btree_id != i->path->btree_id); + EBUG_ON(!i->level && ++ btree_type_has_snapshots(i->btree_id) && + !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && + test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && + i->k->k.p.snapshot && @@ -29406,7 +29663,7 @@ index 000000000000..04c1f4610972 + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); + if (!new_k) { + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", -+ bch2_btree_ids[path->btree_id], new_u64s); ++ bch2_btree_id_str(path->btree_id), new_u64s); + return -BCH_ERR_ENOMEM_btree_key_cache_insert; + } + @@ -29436,11 +29693,10 @@ index 000000000000..04c1f4610972 + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + -+ if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id)) ++ if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id))) + return 0; + -+ if (old_ops->atomic_trigger == new_ops->atomic_trigger && -+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ if (old_ops->atomic_trigger == new_ops->atomic_trigger) { + ret = bch2_mark_key(trans, i->btree_id, i->level, + old, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); @@ -29482,8 +29738,7 @@ index 000000000000..04c1f4610972 + + if (!i->insert_trigger_run && + !i->overwrite_trigger_run && -+ old_ops->trans_trigger == new_ops->trans_trigger && -+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ old_ops->trans_trigger == new_ops->trans_trigger) { + i->overwrite_trigger_run = true; + i->insert_trigger_run = true; + return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, @@ -29740,7 +29995,7 @@ index 000000000000..04c1f4610972 + BCH_JSET_ENTRY_overwrite, + i->btree_id, i->level, + i->old_k.u64s); -+ bkey_reassemble(&entry->start[0], ++ bkey_reassemble((struct bkey_i *) entry->start, + (struct bkey_s_c) { &i->old_k, i->old_v }); + } + @@ -29748,7 +30003,7 @@ index 000000000000..04c1f4610972 + BCH_JSET_ENTRY_btree_keys, + i->btree_id, i->level, + i->k->k.u64s); -+ bkey_copy(&entry->start[0], i->k); ++ bkey_copy((struct bkey_i *) entry->start, i->k); + } + + trans_for_each_wb_update(trans, wb) { @@ -29756,7 +30011,7 @@ index 000000000000..04c1f4610972 + BCH_JSET_ENTRY_btree_keys, + wb->btree, 0, + wb->k.k.u64s); -+ bkey_copy(&entry->start[0], &wb->k); ++ bkey_copy((struct bkey_i *) entry->start, &wb->k); + } + + if (trans->journal_seq) @@ -29833,12 +30088,12 @@ index 000000000000..04c1f4610972 + bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); +} + -+static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, ++static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, ++ enum bkey_invalid_flags flags, + struct btree_insert_entry *i, + struct printbuf *err) +{ + struct bch_fs *c = trans->c; -+ int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; + + printbuf_reset(err); + prt_printf(err, "invalid bkey on insert from %s -> %ps", @@ -29849,8 +30104,7 @@ index 000000000000..04c1f4610972 + bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); + prt_newline(err); + -+ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), -+ i->bkey_type, rw, err); ++ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err); + bch2_print_string_as_lines(KERN_ERR, err->buf); + + bch2_inconsistent_error(c); @@ -29921,12 +30175,7 @@ index 000000000000..04c1f4610972 + */ + bch2_journal_res_put(&c->journal, &trans->journal_res); + -+ if (unlikely(ret)) -+ return ret; -+ -+ bch2_trans_downgrade(trans); -+ -+ return 0; ++ return ret; +} + +static int journal_reclaim_wait_done(struct bch_fs *c) @@ -30091,7 +30340,7 @@ index 000000000000..04c1f4610972 + + if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, invalid_flags, &buf))) -+ ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); ++ ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf); + btree_insert_entry_checks(trans, i); + printbuf_exit(&buf); + @@ -30195,6 +30444,8 @@ index 000000000000..04c1f4610972 + if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) + bch2_write_ref_put(c, BCH_WRITE_REF_trans); +out_reset: ++ if (!ret) ++ bch2_trans_downgrade(trans); + bch2_trans_reset_updates(trans); + + return ret; @@ -30207,10 +30458,10 @@ index 000000000000..04c1f4610972 +} diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h new file mode 100644 -index 000000000000..c9a38e254949 +index 000000000000..3ab773005484 --- /dev/null +++ b/fs/bcachefs/btree_types.h -@@ -0,0 +1,739 @@ +@@ -0,0 +1,756 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_TYPES_H +#define _BCACHEFS_BTREE_TYPES_H @@ -30441,6 +30692,8 @@ index 000000000000..c9a38e254949 + u8 sorted_idx; + u8 ref; + u8 intent_ref; ++ u32 alloc_seq; ++ u32 downgrade_seq; + + /* btree_iter_copy starts here: */ + struct bpos pos; @@ -30637,6 +30890,7 @@ index 000000000000..c9a38e254949 + u8 nr_updates; + u8 nr_wb_updates; + u8 wb_updates_size; ++ bool srcu_held:1; + bool used_mempool:1; + bool in_traverse_all:1; + bool paths_sorted:1; @@ -30849,16 +31103,17 @@ index 000000000000..c9a38e254949 +} + +enum btree_node_type { -+#define x(kwd, val, ...) BKEY_TYPE_##kwd = val, ++ BKEY_TYPE_btree, ++#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1, + BCH_BTREE_IDS() +#undef x -+ BKEY_TYPE_btree, ++ BKEY_TYPE_NR +}; + +/* Type of a key in btree @id at level @level: */ +static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) +{ -+ return level ? BKEY_TYPE_btree : (enum btree_node_type) id; ++ return level ? BKEY_TYPE_btree : (unsigned) id + 1; +} + +/* Type of keys @b contains: */ @@ -30867,19 +31122,21 @@ index 000000000000..c9a38e254949 + return __btree_node_type(b->c.level, b->c.btree_id); +} + ++const char *bch2_btree_node_type_str(enum btree_node_type); ++ +#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ -+ (BIT(BKEY_TYPE_extents)| \ -+ BIT(BKEY_TYPE_alloc)| \ -+ BIT(BKEY_TYPE_inodes)| \ -+ BIT(BKEY_TYPE_stripes)| \ -+ BIT(BKEY_TYPE_reflink)| \ -+ BIT(BKEY_TYPE_btree)) ++ (BIT_ULL(BKEY_TYPE_extents)| \ ++ BIT_ULL(BKEY_TYPE_alloc)| \ ++ BIT_ULL(BKEY_TYPE_inodes)| \ ++ BIT_ULL(BKEY_TYPE_stripes)| \ ++ BIT_ULL(BKEY_TYPE_reflink)| \ ++ BIT_ULL(BKEY_TYPE_btree)) + +#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ -+ (BIT(BKEY_TYPE_alloc)| \ -+ BIT(BKEY_TYPE_inodes)| \ -+ BIT(BKEY_TYPE_stripes)| \ -+ BIT(BKEY_TYPE_snapshots)) ++ (BIT_ULL(BKEY_TYPE_alloc)| \ ++ BIT_ULL(BKEY_TYPE_inodes)| \ ++ BIT_ULL(BKEY_TYPE_stripes)| \ ++ BIT_ULL(BKEY_TYPE_snapshots)) + +#define BTREE_NODE_TYPE_HAS_TRIGGERS \ + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ @@ -30887,13 +31144,13 @@ index 000000000000..c9a38e254949 + +static inline bool btree_node_type_needs_gc(enum btree_node_type type) +{ -+ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); ++ return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type); +} + +static inline bool btree_node_type_is_extents(enum btree_node_type type) +{ + const unsigned mask = 0 -+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << nr) ++#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1)) + BCH_BTREE_IDS() +#undef x + ; @@ -30903,7 +31160,7 @@ index 000000000000..c9a38e254949 + +static inline bool btree_id_is_extents(enum btree_id btree) +{ -+ return btree_node_type_is_extents((enum btree_node_type) btree); ++ return btree_node_type_is_extents(__btree_node_type(0, btree)); +} + +static inline bool btree_type_has_snapshots(enum btree_id id) @@ -30917,6 +31174,17 @@ index 000000000000..c9a38e254949 + return (1U << id) & mask; +} + ++static inline bool btree_type_has_snapshot_field(enum btree_id id) ++{ ++ const unsigned mask = 0 ++#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr) ++ BCH_BTREE_IDS() ++#undef x ++ ; ++ ++ return (1U << id) & mask; ++} ++ +static inline bool btree_type_has_ptrs(enum btree_id id) +{ + const unsigned mask = 0 @@ -32237,10 +32505,10 @@ index 000000000000..9816d2286540 +#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c new file mode 100644 -index 000000000000..7dbf6b6c7f34 +index 000000000000..39c2db68123b --- /dev/null +++ b/fs/bcachefs/btree_update_interior.c -@@ -0,0 +1,2480 @@ +@@ -0,0 +1,2474 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -33517,14 +33785,14 @@ index 000000000000..7dbf6b6c7f34 + + if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), + btree_node_type(b), WRITE, &buf) ?: -+ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) { ++ bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) { + printbuf_reset(&buf); + prt_printf(&buf, "inserting invalid bkey\n "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + prt_printf(&buf, "\n "); + bch2_bkey_invalid(c, bkey_i_to_s_c(insert), + btree_node_type(b), WRITE, &buf); -+ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf); ++ bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf); + + bch2_fs_inconsistent(c, "%s", buf.buf); + dump_stack(); @@ -34230,7 +34498,7 @@ index 000000000000..7dbf6b6c7f34 +out: + if (new_path) + bch2_path_put(trans, new_path, true); -+ bch2_btree_path_downgrade(trans, iter->path); ++ bch2_trans_downgrade(trans); + return ret; +err: + bch2_btree_node_free_never_used(as, trans, n); @@ -34654,30 +34922,24 @@ index 000000000000..7dbf6b6c7f34 + + r->level = entry->level; + r->alive = true; -+ bkey_copy(&r->key, &entry->start[0]); ++ bkey_copy(&r->key, (struct bkey_i *) entry->start); + + mutex_unlock(&c->btree_root_lock); +} + +struct jset_entry * +bch2_btree_roots_to_journal_entries(struct bch_fs *c, -+ struct jset_entry *start, -+ struct jset_entry *end) ++ struct jset_entry *end, ++ unsigned long skip) +{ -+ struct jset_entry *entry; -+ unsigned long have = 0; + unsigned i; + -+ for (entry = start; entry < end; entry = vstruct_next(entry)) -+ if (entry->type == BCH_JSET_ENTRY_btree_root) -+ __set_bit(entry->btree_id, &have); -+ + mutex_lock(&c->btree_root_lock); + + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + -+ if (r->alive && !test_bit(i, &have)) { ++ if (r->alive && !test_bit(i, &skip)) { + journal_entry_set(end, BCH_JSET_ENTRY_btree_root, + i, r->level, &r->key, r->key.k.u64s); + end = vstruct_next(end); @@ -34723,7 +34985,7 @@ index 000000000000..7dbf6b6c7f34 +} diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h new file mode 100644 -index 000000000000..5e0a467fe905 +index 000000000000..4df21512d640 --- /dev/null +++ b/fs/bcachefs/btree_update_interior.h @@ -0,0 +1,337 @@ @@ -35000,7 +35262,7 @@ index 000000000000..5e0a467fe905 + struct btree_node_entry *bne = max(write_block(b), + (void *) btree_bkey_last(b, bset_tree_last(b))); + ssize_t remaining_space = -+ __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); ++ __bch_btree_u64s_remaining(c, b, bne->keys.start); + + if (unlikely(bset_written(b, bset(b, t)))) { + if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) @@ -35032,7 +35294,7 @@ index 000000000000..5e0a467fe905 + k.needs_whiteout = true; + + b->whiteout_u64s += k.u64s; -+ bkey_copy(unwritten_whiteouts_start(c, b), &k); ++ bkey_p_copy(unwritten_whiteouts_start(c, b), &k); +} + +/* @@ -35054,7 +35316,7 @@ index 000000000000..5e0a467fe905 + +void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); +struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, -+ struct jset_entry *, struct jset_entry *); ++ struct jset_entry *, unsigned long); + +void bch2_do_pending_node_rewrites(struct bch_fs *); +void bch2_free_pending_node_rewrites(struct bch_fs *); @@ -35517,10 +35779,10 @@ index 000000000000..99993ba77aea +#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c new file mode 100644 -index 000000000000..a1a4b5feadaa +index 000000000000..58d8c6ffd955 --- /dev/null +++ b/fs/bcachefs/buckets.c -@@ -0,0 +1,2106 @@ +@@ -0,0 +1,2168 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for manipulating bucket marks for garbage collection. @@ -35893,8 +36155,8 @@ index 000000000000..a1a4b5feadaa + + idx = bch2_replicas_entry_idx(c, r); + if (idx < 0 && -+ fsck_err(c, "no replicas entry\n" -+ " while marking %s", ++ fsck_err(c, ptr_to_missing_replicas_entry, ++ "no replicas entry\n while marking %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + percpu_up_read(&c->mark_lock); + ret = bch2_mark_replicas(c, r); @@ -36218,6 +36480,7 @@ index 000000000000..a1a4b5feadaa + + if (gen_after(ptr->gen, b_gen)) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, @@ -36230,6 +36493,7 @@ index 000000000000..a1a4b5feadaa + + if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ BCH_FSCK_ERR_ptr_too_stale, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, @@ -36243,6 +36507,7 @@ index 000000000000..a1a4b5feadaa + + if (b_gen != ptr->gen && !ptr->cached) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ BCH_FSCK_ERR_stale_dirty_ptr, + "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, @@ -36264,6 +36529,7 @@ index 000000000000..a1a4b5feadaa + ptr_data_type && + bucket_data_type != ptr_data_type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ BCH_FSCK_ERR_ptr_bucket_data_type_mismatch, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, @@ -36277,6 +36543,7 @@ index 000000000000..a1a4b5feadaa + + if ((u64) bucket_sectors + sectors > U32_MAX) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ BCH_FSCK_ERR_bucket_sector_count_overflow, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, @@ -36458,14 +36725,12 @@ index 000000000000..a1a4b5feadaa + return 0; +} + -+int bch2_mark_extent(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++static int __mark_extent(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c k, unsigned flags) +{ + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; @@ -36541,6 +36806,14 @@ index 000000000000..a1a4b5feadaa + return 0; +} + ++int bch2_mark_extent(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags); ++} ++ +int bch2_mark_stripe(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, @@ -36647,13 +36920,11 @@ index 000000000000..a1a4b5feadaa + return 0; +} + -+int bch2_mark_reservation(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++static int __mark_reservation(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c k, unsigned flags) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; + struct bch_fs_usage *fs_usage; + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; @@ -36680,6 +36951,14 @@ index 000000000000..a1a4b5feadaa + return 0; +} + ++int bch2_mark_reservation(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags); ++} ++ +static s64 __bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 start, u64 end, @@ -36706,7 +36985,8 @@ index 000000000000..a1a4b5feadaa + *idx = r->offset; + return 0; +not_found: -+ if (fsck_err(c, "pointer to missing indirect extent\n" ++ if (fsck_err(c, reflink_p_to_missing_reflink_v, ++ "pointer to missing indirect extent\n" + " %s\n" + " missing range %llu-%llu", + (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), @@ -36734,13 +37014,11 @@ index 000000000000..a1a4b5feadaa + return ret; +} + -+int bch2_mark_reflink_p(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++static int __mark_reflink_p(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c k, unsigned flags) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + struct reflink_gc *ref; + size_t l, r, m; @@ -36774,6 +37052,14 @@ index 000000000000..a1a4b5feadaa + return ret; +} + ++int bch2_mark_reflink_p(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags); ++} ++ +void bch2_trans_fs_usage_revert(struct btree_trans *trans, + struct replicas_delta_list *deltas) +{ @@ -36821,7 +37107,7 @@ index 000000000000..a1a4b5feadaa + struct bch_fs *c = trans->c; + static int warned_disk_usage = 0; + bool warn = false; -+ unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; ++ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; + struct replicas_delta *d, *d2; + struct replicas_delta *top = (void *) deltas->d + deltas->used; + struct bch_fs_usage *dst; @@ -36880,7 +37166,7 @@ index 000000000000..a1a4b5feadaa + + if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) + bch2_trans_inconsistent(trans, -+ "disk usage increased %lli more than %u sectors reserved)", ++ "disk usage increased %lli more than %llu sectors reserved)", + should_not_have_added, disk_res_sectors); + return 0; +need_mark: @@ -36975,15 +37261,11 @@ index 000000000000..a1a4b5feadaa + return ret; +} + -+int bch2_trans_mark_extent(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_i *new, -+ unsigned flags) ++static int __trans_mark_extent(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c k, unsigned flags) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE -+ ? old -+ : bkey_i_to_s_c(new); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; @@ -37040,6 +37322,24 @@ index 000000000000..a1a4b5feadaa + return ret; +} + ++int bch2_trans_mark_extent(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) - ++ (int) bch2_bkey_needs_rebalance(c, old); ++ ++ if (mod) { ++ int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0); ++ if (ret) ++ return ret; ++ } ++ ++ return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags); ++} ++ +static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c_stripe s, + unsigned idx, bool deleting) @@ -37193,15 +37493,10 @@ index 000000000000..a1a4b5feadaa + return ret; +} + -+int bch2_trans_mark_reservation(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, -+ struct bkey_i *new, -+ unsigned flags) ++static int __trans_mark_reservation(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c k, unsigned flags) +{ -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE -+ ? old -+ : bkey_i_to_s_c(new); + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + struct replicas_delta_list *d; @@ -37223,7 +37518,16 @@ index 000000000000..a1a4b5feadaa + return 0; +} + -+static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, ++int bch2_trans_mark_reservation(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned flags) ++{ ++ return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags); ++} ++ ++static int trans_mark_reflink_p_segment(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 *idx, unsigned flags) +{ @@ -37290,35 +37594,38 @@ index 000000000000..a1a4b5feadaa + return ret; +} + -+int bch2_trans_mark_reflink_p(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, -+ struct bkey_i *new, -+ unsigned flags) ++static int __trans_mark_reflink_p(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c k, unsigned flags) +{ -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE -+ ? old -+ : bkey_i_to_s_c(new); + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx, end_idx; + int ret = 0; + -+ if (flags & BTREE_TRIGGER_INSERT) { -+ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; -+ -+ v->front_pad = v->back_pad = 0; -+ } -+ + idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); + end_idx = le64_to_cpu(p.v->idx) + p.k->size + + le32_to_cpu(p.v->back_pad); + + while (idx < end_idx && !ret) -+ ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags); -+ ++ ret = trans_mark_reflink_p_segment(trans, p, &idx, flags); + return ret; +} + ++int bch2_trans_mark_reflink_p(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned flags) ++{ ++ if (flags & BTREE_TRIGGER_INSERT) { ++ struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v; ++ ++ v->front_pad = v->back_pad = 0; ++ } ++ ++ return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags); ++} ++ +static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct bch_dev *ca, size_t b, + enum bch_data_type type, @@ -37341,6 +37648,7 @@ index 000000000000..a1a4b5feadaa + + if (a->v.data_type && type && a->v.data_type != type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ BCH_FSCK_ERR_bucket_metadata_type_mismatch, + "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + iter.pos.inode, iter.pos.offset, a->v.gen, @@ -37348,16 +37656,16 @@ index 000000000000..a1a4b5feadaa + bch2_data_types[type], + bch2_data_types[type]); + ret = -EIO; -+ goto out; ++ goto err; + } + -+ a->v.data_type = type; -+ a->v.dirty_sectors = sectors; -+ -+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); -+ if (ret) -+ goto out; -+out: ++ if (a->v.data_type != type || ++ a->v.dirty_sectors != sectors) { ++ a->v.data_type = type; ++ a->v.dirty_sectors = sectors; ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++ } ++err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} @@ -37452,6 +37760,22 @@ index 000000000000..a1a4b5feadaa + return ret; +} + ++int bch2_trans_mark_dev_sbs(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) { ++ int ret = bch2_trans_mark_dev_sb(c, ca); ++ if (ret) { ++ percpu_ref_put(&ca->ref); ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ +/* Disk reservations: */ + +#define SECTORS_CACHE 1024 @@ -37629,10 +37953,10 @@ index 000000000000..a1a4b5feadaa +} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h new file mode 100644 -index 000000000000..bf8d7f407e9c +index 000000000000..21f6cb356921 --- /dev/null +++ b/fs/bcachefs/buckets.h -@@ -0,0 +1,443 @@ +@@ -0,0 +1,458 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code for manipulating bucket marks for garbage collection. @@ -37974,12 +38298,27 @@ index 000000000000..bf8d7f407e9c +int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); + ++#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\ ++({ \ ++ int ret = 0; \ ++ \ ++ if (_old.k->type) \ ++ ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \ ++ if (!ret && _new.k->type) \ ++ ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \ ++ ret; \ ++}) ++ ++#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \ ++ mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags) ++ +void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); +int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); + +int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, + size_t, enum bch_data_type, unsigned); +int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); ++int bch2_trans_mark_dev_sbs(struct bch_fs *); + +static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) +{ @@ -38398,7 +38737,7 @@ index 000000000000..e593db061d81 +#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c new file mode 100644 -index 000000000000..f69e15dc699c +index 000000000000..4bb88aefed12 --- /dev/null +++ b/fs/bcachefs/chardev.c @@ -0,0 +1,784 @@ @@ -38736,8 +39075,8 @@ index 000000000000..f69e15dc699c + struct bch_ioctl_data_event e = { + .type = BCH_DATA_EVENT_PROGRESS, + .p.data_type = ctx->stats.data_type, -+ .p.btree_id = ctx->stats.btree_id, -+ .p.pos = ctx->stats.pos, ++ .p.btree_id = ctx->stats.pos.btree, ++ .p.pos = ctx->stats.pos.pos, + .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), + .p.sectors_total = bch2_fs_usage_read_short(c).used, + }; @@ -40540,10 +40879,10 @@ index 000000000000..5fae0012d808 +#endif /* _BCACHEFS_CLOCK_TYPES_H */ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c new file mode 100644 -index 000000000000..1480b64547b0 +index 000000000000..a8b148ec2a2b --- /dev/null +++ b/fs/bcachefs/compress.c -@@ -0,0 +1,710 @@ +@@ -0,0 +1,728 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" @@ -41243,35 +41582,59 @@ index 000000000000..1480b64547b0 + return ret; +} + ++void bch2_compression_opt_to_text(struct printbuf *out, u64 v) ++{ ++ struct bch_compression_opt opt = bch2_compression_decode(v); ++ ++ if (opt.type < BCH_COMPRESSION_OPT_NR) ++ prt_str(out, bch2_compression_opts[opt.type]); ++ else ++ prt_printf(out, "(unknown compression opt %u)", opt.type); ++ if (opt.level) ++ prt_printf(out, ":%u", opt.level); ++} ++ +void bch2_opt_compression_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_sb *sb, + u64 v) +{ -+ struct bch_compression_opt opt = bch2_compression_decode(v); ++ return bch2_compression_opt_to_text(out, v); ++} + -+ prt_str(out, bch2_compression_opts[opt.type]); -+ if (opt.level) -+ prt_printf(out, ":%u", opt.level); ++int bch2_opt_compression_validate(u64 v, struct printbuf *err) ++{ ++ if (!bch2_compression_opt_valid(v)) { ++ prt_printf(err, "invalid compression opt %llu", v); ++ return -BCH_ERR_invalid_sb_opt_compression; ++ } ++ ++ return 0; +} diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h new file mode 100644 -index 000000000000..052ea303241f +index 000000000000..607fd5e232c9 --- /dev/null +++ b/fs/bcachefs/compress.h -@@ -0,0 +1,55 @@ +@@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_COMPRESS_H +#define _BCACHEFS_COMPRESS_H + +#include "extents_types.h" + ++static const unsigned __bch2_compression_opt_to_type[] = { ++#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, ++ BCH_COMPRESSION_OPTS() ++#undef x ++}; ++ +struct bch_compression_opt { + u8 type:4, + level:4; +}; + -+static inline struct bch_compression_opt bch2_compression_decode(unsigned v) ++static inline struct bch_compression_opt __bch2_compression_decode(unsigned v) +{ + return (struct bch_compression_opt) { + .type = v & 15, @@ -41279,17 +41642,25 @@ index 000000000000..052ea303241f + }; +} + ++static inline bool bch2_compression_opt_valid(unsigned v) ++{ ++ struct bch_compression_opt opt = __bch2_compression_decode(v); ++ ++ return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level); ++} ++ ++static inline struct bch_compression_opt bch2_compression_decode(unsigned v) ++{ ++ return bch2_compression_opt_valid(v) ++ ? __bch2_compression_decode(v) ++ : (struct bch_compression_opt) { 0 }; ++} ++ +static inline unsigned bch2_compression_encode(struct bch_compression_opt opt) +{ + return opt.type|(opt.level << 4); +} + -+static const unsigned __bch2_compression_opt_to_type[] = { -+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, -+ BCH_COMPRESSION_OPTS() -+#undef x -+}; -+ +static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) +{ + return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; @@ -41306,12 +41677,16 @@ index 000000000000..052ea303241f +void bch2_fs_compress_exit(struct bch_fs *); +int bch2_fs_compress_init(struct bch_fs *); + ++void bch2_compression_opt_to_text(struct printbuf *, u64); ++ +int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); +void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); ++int bch2_opt_compression_validate(u64, struct printbuf *); + +#define bch2_opt_compression (struct bch_opt_fn) { \ -+ .parse = bch2_opt_compression_parse, \ -+ .to_text = bch2_opt_compression_to_text, \ ++ .parse = bch2_opt_compression_parse, \ ++ .to_text = bch2_opt_compression_to_text, \ ++ .validate = bch2_opt_compression_validate, \ +} + +#endif /* _BCACHEFS_COMPRESS_H */ @@ -41453,10 +41828,10 @@ index 000000000000..4778aa19bf34 +#endif // _BCACHEFS_COUNTERS_H diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h new file mode 100644 -index 000000000000..114f86b45fd5 +index 000000000000..87b4b2d1ec76 --- /dev/null +++ b/fs/bcachefs/darray.h -@@ -0,0 +1,87 @@ +@@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DARRAY_H +#define _BCACHEFS_DARRAY_H @@ -41528,9 +41903,15 @@ index 000000000000..114f86b45fd5 + _ret; \ +}) + ++#define darray_remove_item(_d, _pos) \ ++ array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) ++ +#define darray_for_each(_d, _i) \ + for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) + ++#define darray_for_each_reverse(_d, _i) \ ++ for (_i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) ++ +#define darray_init(_d) \ +do { \ + (_d)->data = NULL; \ @@ -41546,10 +41927,10 @@ index 000000000000..114f86b45fd5 +#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c new file mode 100644 -index 000000000000..899ff46de8e0 +index 000000000000..0771a6d880bf --- /dev/null +++ b/fs/bcachefs/data_update.c -@@ -0,0 +1,558 @@ +@@ -0,0 +1,551 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -41565,6 +41946,7 @@ index 000000000000..899ff46de8e0 +#include "keylist.h" +#include "move.h" +#include "nocow_locking.h" ++#include "rebalance.h" +#include "subvolume.h" +#include "trace.h" + @@ -41713,11 +42095,7 @@ index 000000000000..899ff46de8e0 + if (((1U << i) & m->data_opts.rewrite_ptrs) && + (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && + !ptr->cached) { -+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr); -+ /* -+ * See comment below: + bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr); -+ */ + rewrites_found |= 1U << i; + } + i++; @@ -41763,14 +42141,8 @@ index 000000000000..899ff46de8e0 + if (!p.ptr.cached && + durability - ptr_durability >= m->op.opts.data_replicas) { + durability -= ptr_durability; -+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr); -+ /* -+ * Currently, we're dropping unneeded replicas -+ * instead of marking them as cached, since -+ * cached data in stripe buckets prevents them -+ * from being reused: ++ + bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr); -+ */ + goto restart_drop_extra_replicas; + } + } @@ -41803,11 +42175,11 @@ index 000000000000..899ff46de8e0 + ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, + k.k->p, bkey_start_pos(&insert->k)) ?: + bch2_insert_snapshot_whiteouts(trans, m->btree_id, -+ k.k->p, insert->k.p); -+ if (ret) -+ goto err; -+ -+ ret = bch2_trans_update(trans, &iter, insert, ++ k.k->p, insert->k.p) ?: ++ bch2_bkey_set_needs_rebalance(c, insert, ++ op->opts.background_target, ++ op->opts.background_compression) ?: ++ bch2_trans_update(trans, &iter, insert, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, &op->res, + NULL, @@ -41833,11 +42205,11 @@ index 000000000000..899ff46de8e0 + } + continue; +nowork: -+ if (m->ctxt && m->ctxt->stats) { ++ if (m->stats && m->stats) { + BUG_ON(k.k->p.offset <= iter.pos.offset); -+ atomic64_inc(&m->ctxt->stats->keys_raced); ++ atomic64_inc(&m->stats->keys_raced); + atomic64_add(k.k->p.offset - iter.pos.offset, -+ &m->ctxt->stats->sectors_raced); ++ &m->stats->sectors_raced); + } + + this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]); @@ -41991,6 +42363,8 @@ index 000000000000..899ff46de8e0 + bch2_bkey_buf_reassemble(&m->k, c, k); + m->btree_id = btree_id; + m->data_opts = data_opts; ++ m->ctxt = ctxt; ++ m->stats = ctxt ? ctxt->stats : NULL; + + bch2_write_op_init(&m->op, c, io_opts); + m->op.pos = bkey_start_pos(k.k); @@ -42039,7 +42413,7 @@ index 000000000000..899ff46de8e0 + + if (c->opts.nocow_enabled) { + if (ctxt) { -+ move_ctxt_wait_event(ctxt, trans, ++ move_ctxt_wait_event(ctxt, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0)) || + !atomic_read(&ctxt->read_sectors)); @@ -42110,10 +42484,10 @@ index 000000000000..899ff46de8e0 +} diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h new file mode 100644 -index 000000000000..7ca1f98d7e94 +index 000000000000..9dc17b9d8379 --- /dev/null +++ b/fs/bcachefs/data_update.h -@@ -0,0 +1,43 @@ +@@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BCACHEFS_DATA_UPDATE_H @@ -42139,6 +42513,7 @@ index 000000000000..7ca1f98d7e94 + struct bkey_buf k; + struct data_update_opts data_opts; + struct moving_context *ctxt; ++ struct bch_move_stats *stats; + struct bch_write_op op; +}; + @@ -42159,7 +42534,7 @@ index 000000000000..7ca1f98d7e94 +#endif /* _BCACHEFS_DATA_UPDATE_H */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c new file mode 100644 -index 000000000000..75a3dc7cbd47 +index 000000000000..57c5128db173 --- /dev/null +++ b/fs/bcachefs/debug.c @@ -0,0 +1,954 @@ @@ -42682,7 +43057,7 @@ index 000000000000..75a3dc7cbd47 + + prt_printf(out, "%px btree=%s l=%u ", + b, -+ bch2_btree_ids[b->c.btree_id], ++ bch2_btree_id_str(b->c.btree_id), + b->c.level); + prt_newline(out); + @@ -43084,18 +43459,18 @@ index 000000000000..75a3dc7cbd47 + bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); + bd++) { + bd->id = bd - c->btree_debug; -+ debugfs_create_file(bch2_btree_ids[bd->id], ++ debugfs_create_file(bch2_btree_id_str(bd->id), + 0400, c->btree_debug_dir, bd, + &btree_debug_ops); + + snprintf(name, sizeof(name), "%s-formats", -+ bch2_btree_ids[bd->id]); ++ bch2_btree_id_str(bd->id)); + + debugfs_create_file(name, 0400, c->btree_debug_dir, bd, + &btree_format_debug_ops); + + snprintf(name, sizeof(name), "%s-bfloat-failed", -+ bch2_btree_ids[bd->id]); ++ bch2_btree_id_str(bd->id)); + + debugfs_create_file(name, 0400, c->btree_debug_dir, bd, + &bfloat_failed_debug_ops); @@ -43157,10 +43532,10 @@ index 000000000000..2c37143b5fd1 +#endif /* _BCACHEFS_DEBUG_H */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c new file mode 100644 -index 000000000000..6c6c8d57d72b +index 000000000000..1a0f2d571569 --- /dev/null +++ b/fs/bcachefs/dirent.c -@@ -0,0 +1,587 @@ +@@ -0,0 +1,577 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -43260,61 +43635,51 @@ index 000000000000..6c6c8d57d72b + .is_visible = dirent_is_visible, +}; + -+int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + struct qstr d_name = bch2_dirent_get_name(d); ++ int ret = 0; + -+ if (!d_name.len) { -+ prt_printf(err, "empty name"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(!d_name.len, c, err, ++ dirent_empty_name, ++ "empty name"); + -+ if (bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len)) { -+ prt_printf(err, "value too big (%zu > %u)", -+ bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err, ++ dirent_val_too_big, ++ "value too big (%zu > %u)", ++ bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); + + /* + * Check new keys don't exceed the max length + * (older keys may be larger.) + */ -+ if ((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX) { -+ prt_printf(err, "dirent name too big (%u > %u)", -+ d_name.len, BCH_NAME_MAX); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err, ++ dirent_name_too_long, ++ "dirent name too big (%u > %u)", ++ d_name.len, BCH_NAME_MAX); + -+ if (d_name.len != strnlen(d_name.name, d_name.len)) { -+ prt_printf(err, "dirent has stray data after name's NUL"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err, ++ dirent_name_embedded_nul, ++ "dirent has stray data after name's NUL"); + -+ if (d_name.len == 1 && !memcmp(d_name.name, ".", 1)) { -+ prt_printf(err, "invalid name"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) || ++ (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err, ++ dirent_name_dot_or_dotdot, ++ "invalid name"); + -+ if (d_name.len == 2 && !memcmp(d_name.name, "..", 2)) { -+ prt_printf(err, "invalid name"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err, ++ dirent_name_has_slash, ++ "name with /"); + -+ if (memchr(d_name.name, '/', d_name.len)) { -+ prt_printf(err, "invalid name"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (d.v->d_type != DT_SUBVOL && -+ le64_to_cpu(d.v->d_inum) == d.k->p.inode) { -+ prt_printf(err, "dirent points to own directory"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; ++ bkey_fsck_err_on(d.v->d_type != DT_SUBVOL && ++ le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err, ++ dirent_to_itself, ++ "dirent points to own directory"); ++fsck_err: ++ return ret; +} + +void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, @@ -43750,7 +44115,7 @@ index 000000000000..6c6c8d57d72b +} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h new file mode 100644 -index 000000000000..e9fa1df38232 +index 000000000000..cd262bf4d9c5 --- /dev/null +++ b/fs/bcachefs/dirent.h @@ -0,0 +1,70 @@ @@ -43763,7 +44128,7 @@ index 000000000000..e9fa1df38232 +enum bkey_invalid_flags; +extern const struct bch_hash_desc bch2_dirent_hash_desc; + -+int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + @@ -43826,10 +44191,10 @@ index 000000000000..e9fa1df38232 +#endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c new file mode 100644 -index 000000000000..e00133b6ea51 +index 000000000000..d613695abf9f --- /dev/null +++ b/fs/bcachefs/disk_groups.c -@@ -0,0 +1,550 @@ +@@ -0,0 +1,620 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "disk_groups.h" @@ -44007,6 +44372,7 @@ index 000000000000..e00133b6ea51 + + dst->deleted = BCH_GROUP_DELETED(src); + dst->parent = BCH_GROUP_PARENT(src); ++ memcpy(dst->label, src->label, sizeof(dst->label)); + } + + for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { @@ -44214,7 +44580,57 @@ index 000000000000..e00133b6ea51 + return v; +} + -+void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v) ++void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) ++{ ++ struct bch_disk_groups_cpu *groups; ++ struct bch_disk_group_cpu *g; ++ unsigned nr = 0; ++ u16 path[32]; ++ ++ out->atomic++; ++ rcu_read_lock(); ++ groups = rcu_dereference(c->disk_groups); ++ if (!groups) ++ goto invalid; ++ ++ while (1) { ++ if (nr == ARRAY_SIZE(path)) ++ goto invalid; ++ ++ if (v >= groups->nr) ++ goto invalid; ++ ++ g = groups->entries + v; ++ ++ if (g->deleted) ++ goto invalid; ++ ++ path[nr++] = v; ++ ++ if (!g->parent) ++ break; ++ ++ v = g->parent - 1; ++ } ++ ++ while (nr) { ++ v = path[--nr]; ++ g = groups->entries + v; ++ ++ prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); ++ if (nr) ++ prt_printf(out, "."); ++ } ++out: ++ rcu_read_unlock(); ++ out->atomic--; ++ return; ++invalid: ++ prt_printf(out, "invalid label %u", v); ++ goto out; ++} ++ ++void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) +{ + struct bch_sb_field_disk_groups *groups = + bch2_sb_field_get(sb, disk_groups); @@ -44325,10 +44741,7 @@ index 000000000000..e00133b6ea51 + return -EINVAL; +} + -+void bch2_opt_target_to_text(struct printbuf *out, -+ struct bch_fs *c, -+ struct bch_sb *sb, -+ u64 v) ++void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) +{ + struct target t = target_decode(v); + @@ -44336,60 +44749,84 @@ index 000000000000..e00133b6ea51 + case TARGET_NULL: + prt_printf(out, "none"); + break; -+ case TARGET_DEV: -+ if (c) { -+ struct bch_dev *ca; ++ case TARGET_DEV: { ++ struct bch_dev *ca; + -+ rcu_read_lock(); -+ ca = t.dev < c->sb.nr_devices -+ ? rcu_dereference(c->devs[t.dev]) -+ : NULL; ++ rcu_read_lock(); ++ ca = t.dev < c->sb.nr_devices ++ ? rcu_dereference(c->devs[t.dev]) ++ : NULL; + -+ if (ca && percpu_ref_tryget(&ca->io_ref)) { -+ prt_printf(out, "/dev/%pg", ca->disk_sb.bdev); -+ percpu_ref_put(&ca->io_ref); -+ } else if (ca) { -+ prt_printf(out, "offline device %u", t.dev); -+ } else { -+ prt_printf(out, "invalid device %u", t.dev); -+ } -+ -+ rcu_read_unlock(); ++ if (ca && percpu_ref_tryget(&ca->io_ref)) { ++ prt_printf(out, "/dev/%pg", ca->disk_sb.bdev); ++ percpu_ref_put(&ca->io_ref); ++ } else if (ca) { ++ prt_printf(out, "offline device %u", t.dev); + } else { -+ struct bch_member m = bch2_sb_member_get(sb, t.dev); -+ -+ if (bch2_dev_exists(sb, t.dev)) { -+ prt_printf(out, "Device "); -+ pr_uuid(out, m.uuid.b); -+ prt_printf(out, " (%u)", t.dev); -+ } else { -+ prt_printf(out, "Bad device %u", t.dev); -+ } ++ prt_printf(out, "invalid device %u", t.dev); + } ++ ++ rcu_read_unlock(); + break; ++ } + case TARGET_GROUP: -+ if (c) { -+ mutex_lock(&c->sb_lock); -+ bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); -+ mutex_unlock(&c->sb_lock); -+ } else { -+ bch2_disk_path_to_text(out, sb, t.group); -+ } ++ bch2_disk_path_to_text(out, c, t.group); + break; + default: + BUG(); + } +} ++ ++void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) ++{ ++ struct target t = target_decode(v); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ prt_printf(out, "none"); ++ break; ++ case TARGET_DEV: { ++ struct bch_member m = bch2_sb_member_get(sb, t.dev); ++ ++ if (bch2_dev_exists(sb, t.dev)) { ++ prt_printf(out, "Device "); ++ pr_uuid(out, m.uuid.b); ++ prt_printf(out, " (%u)", t.dev); ++ } else { ++ prt_printf(out, "Bad device %u", t.dev); ++ } ++ break; ++ } ++ case TARGET_GROUP: ++ bch2_disk_path_to_text_sb(out, sb, t.group); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_opt_target_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_sb *sb, ++ u64 v) ++{ ++ if (c) ++ bch2_target_to_text(out, c, v); ++ else ++ bch2_target_to_text_sb(out, sb, v); ++} diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h new file mode 100644 -index 000000000000..bd7711767fd4 +index 000000000000..441826fff224 --- /dev/null +++ b/fs/bcachefs/disk_groups.h -@@ -0,0 +1,106 @@ +@@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_GROUPS_H +#define _BCACHEFS_DISK_GROUPS_H + ++#include "disk_groups_types.h" ++ +extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; + +static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) @@ -44471,7 +44908,10 @@ index 000000000000..bd7711767fd4 +/* Exported for userspace bcachefs-tools: */ +int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); + -+void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); ++void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned); ++void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned); ++ ++void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned); + +int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); +void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); @@ -44492,12 +44932,36 @@ index 000000000000..bd7711767fd4 +void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *); + +#endif /* _BCACHEFS_DISK_GROUPS_H */ +diff --git a/fs/bcachefs/disk_groups_types.h b/fs/bcachefs/disk_groups_types.h +new file mode 100644 +index 000000000000..a54ef085b13d +--- /dev/null ++++ b/fs/bcachefs/disk_groups_types.h +@@ -0,0 +1,18 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H ++#define _BCACHEFS_DISK_GROUPS_TYPES_H ++ ++struct bch_disk_group_cpu { ++ bool deleted; ++ u16 parent; ++ u8 label[BCH_SB_LABEL_SIZE]; ++ struct bch_devs_mask devs; ++}; ++ ++struct bch_disk_groups_cpu { ++ struct rcu_head rcu; ++ unsigned nr; ++ struct bch_disk_group_cpu entries[] __counted_by(nr); ++}; ++ ++#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c new file mode 100644 -index 000000000000..8646856e4539 +index 000000000000..875f7c5a6fca --- /dev/null +++ b/fs/bcachefs/ec.c -@@ -0,0 +1,1966 @@ +@@ -0,0 +1,1969 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* erasure coding */ @@ -44605,29 +45069,26 @@ index 000000000000..8646856e4539 + +/* Stripes btree keys: */ + -+int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ int ret = 0; + -+ if (bkey_eq(k.k->p, POS_MIN)) { -+ prt_printf(err, "stripe at POS_MIN"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) || ++ bpos_gt(k.k->p, POS(0, U32_MAX)), c, err, ++ stripe_pos_bad, ++ "stripe at bad pos"); + -+ if (k.k->p.inode) { -+ prt_printf(err, "nonzero inode field"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err, ++ stripe_val_size_bad, ++ "incorrect value size (%zu < %u)", ++ bkey_val_u64s(k.k), stripe_val_u64s(s)); + -+ if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) { -+ prt_printf(err, "incorrect value size (%zu < %u)", -+ bkey_val_u64s(k.k), stripe_val_u64s(s)); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return bch2_bkey_ptrs_invalid(c, k, flags, err); ++ ret = bch2_bkey_ptrs_invalid(c, k, flags, err); ++fsck_err: ++ return ret; +} + +void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, @@ -44653,6 +45114,7 @@ index 000000000000..8646856e4539 + prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset); + if (i < nr_data) + prt_printf(out, "#%u", stripe_blockcount_get(s, i)); ++ prt_printf(out, " gen %u", ptr->gen); + if (ptr_stale(ca, ptr)) + prt_printf(out, " stale"); + } @@ -44806,16 +45268,21 @@ index 000000000000..8646856e4539 + struct bch_csum got = ec_block_checksum(buf, i, offset); + + if (bch2_crc_cmp(want, got)) { -+ struct printbuf buf2 = PRINTBUF; ++ struct printbuf err = PRINTBUF; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev); + -+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key)); ++ prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n", ++ want.hi, want.lo, ++ got.hi, got.lo, ++ bch2_csum_types[v->csum_type]); ++ prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); ++ bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); ++ bch_err_ratelimited(ca, "%s", err.buf); ++ printbuf_exit(&err); + -+ bch_err_ratelimited(c, -+ "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", -+ (void *) _RET_IP_, i, j, v->csum_type, -+ want.lo, got.lo, buf2.buf); -+ printbuf_exit(&buf2); + clear_bit(i, buf->valid); ++ ++ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + break; + } + @@ -44873,7 +45340,11 @@ index 000000000000..8646856e4539 + struct bch_dev *ca = ec_bio->ca; + struct closure *cl = bio->bi_private; + -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s", ++ if (bch2_dev_io_err_on(bio->bi_status, ca, ++ bio_data_dir(bio) ++ ? BCH_MEMBER_ERROR_write ++ : BCH_MEMBER_ERROR_read, ++ "erasure coding %s error: %s", + bio_data_dir(bio) ? "write" : "read", + bch2_blk_status_to_str(bio->bi_status))) + clear_bit(ec_bio->idx, ec_bio->buf->valid); @@ -44974,14 +45445,10 @@ index 000000000000..8646856e4539 + return ret; +} + -+static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) -+{ -+ return bch2_trans_run(c, get_stripe_key_trans(trans, idx, stripe)); -+} -+ +/* recovery read path: */ -+int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) ++int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) +{ ++ struct bch_fs *c = trans->c; + struct ec_stripe_buf *buf; + struct closure cl; + struct bch_stripe *v; @@ -44996,7 +45463,7 @@ index 000000000000..8646856e4539 + if (!buf) + return -BCH_ERR_ENOMEM_ec_read_extent; + -+ ret = get_stripe_key(c, rbio->pick.ec.idx, buf); ++ ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); + if (ret) { + bch_err_ratelimited(c, + "error doing reconstruct read: error %i looking up stripe", ret); @@ -46466,7 +46933,7 @@ index 000000000000..8646856e4539 +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h new file mode 100644 -index 000000000000..966d165a3b66 +index 000000000000..7d0237c9819f --- /dev/null +++ b/fs/bcachefs/ec.h @@ -0,0 +1,260 @@ @@ -46480,7 +46947,7 @@ index 000000000000..966d165a3b66 + +enum bkey_invalid_flags; + -+int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); @@ -46671,7 +47138,7 @@ index 000000000000..966d165a3b66 + struct ec_stripe_new *s; +}; + -+int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); ++int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *); + +void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); + @@ -46853,15 +47320,17 @@ index 000000000000..d260ff9bbfeb +} diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h new file mode 100644 -index 000000000000..7cc083776a2e +index 000000000000..68a1a96bb7ca --- /dev/null +++ b/fs/bcachefs/errcode.h -@@ -0,0 +1,265 @@ +@@ -0,0 +1,269 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERRCODE_H +#define _BCACHEFS_ERRCODE_H + +#define BCH_ERRCODES() \ ++ x(ERANGE, ERANGE_option_too_small) \ ++ x(ERANGE, ERANGE_option_too_big) \ + x(ENOMEM, ENOMEM_stripe_buf) \ + x(ENOMEM, ENOMEM_replicas_table) \ + x(ENOMEM, ENOMEM_cpu_replicas) \ @@ -47072,6 +47541,8 @@ index 000000000000..7cc083776a2e + x(BCH_ERR_invalid_sb, invalid_sb_crypt) \ + x(BCH_ERR_invalid_sb, invalid_sb_clean) \ + x(BCH_ERR_invalid_sb, invalid_sb_quota) \ ++ x(BCH_ERR_invalid_sb, invalid_sb_errors) \ ++ x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \ + x(BCH_ERR_invalid, invalid_bkey) \ + x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ + x(EIO, btree_node_read_err) \ @@ -47124,10 +47595,10 @@ index 000000000000..7cc083776a2e +#endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c new file mode 100644 -index 000000000000..2a5af8872613 +index 000000000000..7b28d37922fd --- /dev/null +++ b/fs/bcachefs/error.c -@@ -0,0 +1,293 @@ +@@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "error.h" @@ -47186,8 +47657,9 @@ index 000000000000..2a5af8872613 + up_write(&c->state_lock); +} + -+void bch2_io_error(struct bch_dev *ca) ++void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) +{ ++ atomic64_inc(&ca->errors[type]); + //queue_work(system_long_wq, &ca->io_error_work); +} + @@ -47246,31 +47718,34 @@ index 000000000000..2a5af8872613 + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + return NULL; + -+ list_for_each_entry(s, &c->fsck_errors, list) ++ list_for_each_entry(s, &c->fsck_error_msgs, list) + if (s->fmt == fmt) { + /* + * move it to the head of the list: repeated fsck errors + * are common + */ -+ list_move(&s->list, &c->fsck_errors); ++ list_move(&s->list, &c->fsck_error_msgs); + return s; + } + + s = kzalloc(sizeof(*s), GFP_NOFS); + if (!s) { -+ if (!c->fsck_alloc_err) ++ if (!c->fsck_alloc_msgs_err) + bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); -+ c->fsck_alloc_err = true; ++ c->fsck_alloc_msgs_err = true; + return NULL; + } + + INIT_LIST_HEAD(&s->list); + s->fmt = fmt; -+ list_add(&s->list, &c->fsck_errors); ++ list_add(&s->list, &c->fsck_error_msgs); + return s; +} + -+int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) ++int bch2_fsck_err(struct bch_fs *c, ++ enum bch_fsck_flags flags, ++ enum bch_sb_error_id err, ++ const char *fmt, ...) +{ + struct fsck_err_state *s = NULL; + va_list args; @@ -47278,11 +47753,13 @@ index 000000000000..2a5af8872613 + struct printbuf buf = PRINTBUF, *out = &buf; + int ret = -BCH_ERR_fsck_ignore; + ++ bch2_sb_error_count(c, err); ++ + va_start(args, fmt); + prt_vprintf(out, fmt, args); + va_end(args); + -+ mutex_lock(&c->fsck_error_lock); ++ mutex_lock(&c->fsck_error_msgs_lock); + s = fsck_err_get(c, fmt); + if (s) { + /* @@ -47292,7 +47769,7 @@ index 000000000000..2a5af8872613 + */ + if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { + ret = s->ret; -+ mutex_unlock(&c->fsck_error_lock); ++ mutex_unlock(&c->fsck_error_msgs_lock); + printbuf_exit(&buf); + return ret; + } @@ -47387,7 +47864,7 @@ index 000000000000..2a5af8872613 + if (s) + s->ret = ret; + -+ mutex_unlock(&c->fsck_error_lock); ++ mutex_unlock(&c->fsck_error_msgs_lock); + + printbuf_exit(&buf); + @@ -47408,9 +47885,9 @@ index 000000000000..2a5af8872613 +{ + struct fsck_err_state *s, *n; + -+ mutex_lock(&c->fsck_error_lock); ++ mutex_lock(&c->fsck_error_msgs_lock); + -+ list_for_each_entry_safe(s, n, &c->fsck_errors, list) { ++ list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) { + if (s->ratelimited && s->last_msg) + bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); + @@ -47419,20 +47896,21 @@ index 000000000000..2a5af8872613 + kfree(s); + } + -+ mutex_unlock(&c->fsck_error_lock); ++ mutex_unlock(&c->fsck_error_msgs_lock); +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h new file mode 100644 -index 000000000000..7ce9540052e5 +index 000000000000..d167d65986e0 --- /dev/null +++ b/fs/bcachefs/error.h -@@ -0,0 +1,206 @@ +@@ -0,0 +1,242 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERROR_H +#define _BCACHEFS_ERROR_H + +#include +#include ++#include "sb-errors.h" + +struct bch_dev; +struct bch_fs; @@ -47530,18 +48008,26 @@ index 000000000000..7ce9540052e5 + char *last_msg; +}; + -+#define FSCK_CAN_FIX (1 << 0) -+#define FSCK_CAN_IGNORE (1 << 1) -+#define FSCK_NEED_FSCK (1 << 2) -+#define FSCK_NO_RATELIMIT (1 << 3) ++enum bch_fsck_flags { ++ FSCK_CAN_FIX = 1 << 0, ++ FSCK_CAN_IGNORE = 1 << 1, ++ FSCK_NEED_FSCK = 1 << 2, ++ FSCK_NO_RATELIMIT = 1 << 3, ++}; + -+__printf(3, 4) __cold -+int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...); ++#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) ++ ++__printf(4, 5) __cold ++int bch2_fsck_err(struct bch_fs *, ++ enum bch_fsck_flags, ++ enum bch_sb_error_id, ++ const char *, ...); +void bch2_flush_fsck_errs(struct bch_fs *); + -+#define __fsck_err(c, _flags, msg, ...) \ ++#define __fsck_err(c, _flags, _err_type, ...) \ +({ \ -+ int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__); \ ++ int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type, \ ++ __VA_ARGS__); \ + \ + if (_ret != -BCH_ERR_fsck_fix && \ + _ret != -BCH_ERR_fsck_ignore) { \ @@ -47556,26 +48042,53 @@ index 000000000000..7ce9540052e5 + +/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ + -+#define __fsck_err_on(cond, c, _flags, ...) \ -+ (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) ++#define __fsck_err_on(cond, c, _flags, _err_type, ...) \ ++ (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false) + -+#define need_fsck_err_on(cond, c, ...) \ -+ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) ++#define need_fsck_err_on(cond, c, _err_type, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) + -+#define need_fsck_err(c, ...) \ -+ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) ++#define need_fsck_err(c, _err_type, ...) \ ++ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) + -+#define mustfix_fsck_err(c, ...) \ -+ __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) ++#define mustfix_fsck_err(c, _err_type, ...) \ ++ __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) + -+#define mustfix_fsck_err_on(cond, c, ...) \ -+ __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) ++#define mustfix_fsck_err_on(cond, c, _err_type, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) + -+#define fsck_err(c, ...) \ -+ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) ++#define fsck_err(c, _err_type, ...) \ ++ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) + -+#define fsck_err_on(cond, c, ...) \ -+ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) ++#define fsck_err_on(cond, c, _err_type, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) ++ ++static inline void bch2_bkey_fsck_err(struct bch_fs *c, ++ struct printbuf *err_msg, ++ enum bch_sb_error_id err_type, ++ const char *fmt, ...) ++{ ++ va_list args; ++ ++ va_start(args, fmt); ++ prt_vprintf(err_msg, fmt, args); ++ va_end(args); ++ ++} ++ ++#define bkey_fsck_err(c, _err_msg, _err_type, ...) \ ++do { \ ++ prt_printf(_err_msg, __VA_ARGS__); \ ++ bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type); \ ++ ret = -BCH_ERR_invalid_bkey; \ ++ goto fsck_err; \ ++} while (0) ++ ++#define bkey_fsck_err_on(cond, ...) \ ++do { \ ++ if (unlikely(cond)) \ ++ bkey_fsck_err(__VA_ARGS__); \ ++} while (0) + +/* + * Fatal errors: these don't indicate a bug, but we can't continue running in RW @@ -47608,26 +48121,26 @@ index 000000000000..7ce9540052e5 +void bch2_io_error_work(struct work_struct *); + +/* Does the error handling without logging a message */ -+void bch2_io_error(struct bch_dev *); ++void bch2_io_error(struct bch_dev *, enum bch_member_error_type); + -+#define bch2_dev_io_err_on(cond, ca, ...) \ ++#define bch2_dev_io_err_on(cond, ca, _type, ...) \ +({ \ + bool _ret = (cond); \ + \ + if (_ret) { \ + bch_err_dev_ratelimited(ca, __VA_ARGS__); \ -+ bch2_io_error(ca); \ ++ bch2_io_error(ca, _type); \ + } \ + _ret; \ +}) + -+#define bch2_dev_inum_io_err_on(cond, ca, ...) \ ++#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \ +({ \ + bool _ret = (cond); \ + \ + if (_ret) { \ + bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \ -+ bch2_io_error(ca); \ ++ bch2_io_error(ca, _type); \ + } \ + _ret; \ +}) @@ -47832,10 +48345,10 @@ index 000000000000..6f5cf449361a +#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c new file mode 100644 -index 000000000000..1b25f84e4b9c +index 000000000000..a864de231b69 --- /dev/null +++ b/fs/bcachefs/extents.c -@@ -0,0 +1,1403 @@ +@@ -0,0 +1,1516 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet @@ -47851,6 +48364,7 @@ index 000000000000..1b25f84e4b9c +#include "btree_iter.h" +#include "buckets.h" +#include "checksum.h" ++#include "compress.h" +#include "debug.h" +#include "disk_groups.h" +#include "error.h" @@ -48000,17 +48514,19 @@ index 000000000000..1b25f84e4b9c + +/* KEY_TYPE_btree_ptr: */ + -+int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ -+ if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { -+ prt_printf(err, "value too big (%zu > %u)", -+ bkey_val_u64s(k.k), BCH_REPLICAS_MAX); -+ return -BCH_ERR_invalid_bkey; -+ } ++ int ret = 0; + -+ return bch2_bkey_ptrs_invalid(c, k, flags, err); ++ bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err, ++ btree_ptr_val_too_big, ++ "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); ++ ++ ret = bch2_bkey_ptrs_invalid(c, k, flags, err); ++fsck_err: ++ return ret; +} + +void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, @@ -48019,17 +48535,20 @@ index 000000000000..1b25f84e4b9c + bch2_bkey_ptrs_to_text(out, c, k); +} + -+int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ -+ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { -+ prt_printf(err, "value too big (%zu > %zu)", -+ bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); -+ return -BCH_ERR_invalid_bkey; -+ } ++ int ret = 0; + -+ return bch2_bkey_ptrs_invalid(c, k, flags, err); ++ bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err, ++ btree_ptr_v2_val_too_big, ++ "value too big (%zu > %zu)", ++ bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); ++ ++ ret = bch2_bkey_ptrs_invalid(c, k, flags, err); ++fsck_err: ++ return ret; +} + +void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, @@ -48210,19 +48729,18 @@ index 000000000000..1b25f84e4b9c + +/* KEY_TYPE_reservation: */ + -+int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); ++ int ret = 0; + -+ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { -+ prt_printf(err, "invalid nr_replicas (%u)", -+ r.v->nr_replicas); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; ++ bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err, ++ reservation_key_nr_replicas_invalid, ++ "invalid nr_replicas (%u)", r.v->nr_replicas); ++fsck_err: ++ return ret; +} + +void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, @@ -48595,18 +49113,6 @@ index 000000000000..1b25f84e4b9c + return i; +} + -+static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) -+{ -+ union bch_extent_entry *next = extent_entry_next(entry); -+ -+ /* stripes have ptrs, but their layout doesn't work with this code */ -+ BUG_ON(k.k->type == KEY_TYPE_stripe); -+ -+ memmove_u64s_down(entry, next, -+ (u64 *) bkey_val_end(k) - (u64 *) next); -+ k.k->u64s -= (u64 *) next - (u64 *) entry; -+} -+ +/* + * Returns pointer to the next entry after the one being dropped: + */ @@ -48830,10 +49336,6 @@ index 000000000000..1b25f84e4b9c +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; -+ struct bch_extent_crc_unpacked crc; -+ const struct bch_extent_ptr *ptr; -+ const struct bch_extent_stripe_ptr *ec; -+ struct bch_dev *ca; + bool first = true; + + if (c) @@ -48844,9 +49346,9 @@ index 000000000000..1b25f84e4b9c + prt_printf(out, " "); + + switch (__extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ ptr = entry_to_ptr(entry); -+ ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ++ case BCH_EXTENT_ENTRY_ptr: { ++ const struct bch_extent_ptr *ptr = entry_to_ptr(entry); ++ struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; + @@ -48868,10 +49370,12 @@ index 000000000000..1b25f84e4b9c + prt_printf(out, " stale"); + } + break; ++ } + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: -+ case BCH_EXTENT_ENTRY_crc128: -+ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); ++ case BCH_EXTENT_ENTRY_crc128: { ++ struct bch_extent_crc_unpacked crc = ++ bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", + crc.compressed_size, @@ -48880,12 +49384,26 @@ index 000000000000..1b25f84e4b9c + bch2_csum_types[crc.csum_type], + bch2_compression_types[crc.compression_type]); + break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ ec = &entry->stripe_ptr; ++ } ++ case BCH_EXTENT_ENTRY_stripe_ptr: { ++ const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr; + + prt_printf(out, "ec: idx %llu block %u", + (u64) ec->idx, ec->block); + break; ++ } ++ case BCH_EXTENT_ENTRY_rebalance: { ++ const struct bch_extent_rebalance *r = &entry->rebalance; ++ ++ prt_str(out, "rebalance: target "); ++ if (c) ++ bch2_target_to_text(out, c, r->target); ++ else ++ prt_printf(out, "%u", r->target); ++ prt_str(out, " compression "); ++ bch2_compression_opt_to_text(out, r->compression); ++ break; ++ } + default: + prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); + return; @@ -48895,7 +49413,7 @@ index 000000000000..1b25f84e4b9c + } +} + -+static int extent_ptr_invalid(const struct bch_fs *c, ++static int extent_ptr_invalid(struct bch_fs *c, + struct bkey_s_c k, + enum bkey_invalid_flags flags, + const struct bch_extent_ptr *ptr, @@ -48908,6 +49426,7 @@ index 000000000000..1b25f84e4b9c + u64 bucket; + u32 bucket_offset; + struct bch_dev *ca; ++ int ret = 0; + + if (!bch2_dev_exists2(c, ptr->dev)) { + /* @@ -48918,41 +49437,33 @@ index 000000000000..1b25f84e4b9c + if (flags & BKEY_INVALID_WRITE) + return 0; + -+ prt_printf(err, "pointer to invalid device (%u)", ptr->dev); -+ return -BCH_ERR_invalid_bkey; ++ bkey_fsck_err(c, err, ptr_to_invalid_device, ++ "pointer to invalid device (%u)", ptr->dev); + } + + ca = bch_dev_bkey_exists(c, ptr->dev); + bkey_for_each_ptr(ptrs, ptr2) -+ if (ptr != ptr2 && ptr->dev == ptr2->dev) { -+ prt_printf(err, "multiple pointers to same device (%u)", ptr->dev); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err, ++ ptr_to_duplicate_device, ++ "multiple pointers to same device (%u)", ptr->dev); + + bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); + -+ if (bucket >= ca->mi.nbuckets) { -+ prt_printf(err, "pointer past last bucket (%llu > %llu)", -+ bucket, ca->mi.nbuckets); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) { -+ prt_printf(err, "pointer before first bucket (%llu < %u)", -+ bucket, ca->mi.first_bucket); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (bucket_offset + size_ondisk > ca->mi.bucket_size) { -+ prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)", ++ bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err, ++ ptr_after_last_bucket, ++ "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets); ++ bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err, ++ ptr_before_first_bucket, ++ "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket); ++ bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err, ++ ptr_spans_multiple_buckets, ++ "pointer spans multiple buckets (%u + %u > %u)", + bucket_offset, size_ondisk, ca->mi.bucket_size); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; ++fsck_err: ++ return ret; +} + -+int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ @@ -48962,24 +49473,22 @@ index 000000000000..1b25f84e4b9c + unsigned size_ondisk = k.k->size; + unsigned nonce = UINT_MAX; + unsigned nr_ptrs = 0; -+ bool unwritten = false, have_ec = false, crc_since_last_ptr = false; -+ int ret; ++ bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false; ++ int ret = 0; + + if (bkey_is_btree_ptr(k.k)) + size_ondisk = btree_sectors(c); + + bkey_extent_entry_for_each(ptrs, entry) { -+ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) { -+ prt_printf(err, "invalid extent entry type (got %u, max %u)", -+ __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err, ++ extent_ptrs_invalid_entry, ++ "invalid extent entry type (got %u, max %u)", ++ __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); + -+ if (bkey_is_btree_ptr(k.k) && -+ !extent_entry_is_ptr(entry)) { -+ prt_printf(err, "has non ptr field"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && ++ !extent_entry_is_ptr(entry), c, err, ++ btree_ptr_has_non_ptr, ++ "has non ptr field"); + + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: @@ -48988,22 +49497,15 @@ index 000000000000..1b25f84e4b9c + if (ret) + return ret; + -+ if (nr_ptrs && unwritten != entry->ptr.unwritten) { -+ prt_printf(err, "extent with unwritten and written ptrs"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err, ++ ptr_cached_and_erasure_coded, ++ "cached, erasure coded ptr"); + -+ if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) { -+ prt_printf(err, "has unwritten ptrs"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ if (!entry->ptr.unwritten) ++ have_written = true; ++ else ++ have_unwritten = true; + -+ if (entry->ptr.cached && have_ec) { -+ prt_printf(err, "cached, erasure coded ptr"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ unwritten = entry->ptr.unwritten; + have_ec = false; + crc_since_last_ptr = false; + nr_ptrs++; @@ -49013,72 +49515,77 @@ index 000000000000..1b25f84e4b9c + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + -+ if (crc.offset + crc.live_size > -+ crc.uncompressed_size) { -+ prt_printf(err, "checksum offset + key size > uncompressed size"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ size_ondisk = crc.compressed_size; -+ -+ if (!bch2_checksum_type_valid(c, crc.csum_type)) { -+ prt_printf(err, "invalid checksum type"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) { -+ prt_printf(err, "invalid compression type"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err, ++ ptr_crc_uncompressed_size_too_small, ++ "checksum offset + key size > uncompressed size"); ++ bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err, ++ ptr_crc_csum_type_unknown, ++ "invalid checksum type"); ++ bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err, ++ ptr_crc_compression_type_unknown, ++ "invalid compression type"); + + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; -+ else if (nonce != crc.offset + crc.nonce) { -+ prt_printf(err, "incorrect nonce"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ else if (nonce != crc.offset + crc.nonce) ++ bkey_fsck_err(c, err, ptr_crc_nonce_mismatch, ++ "incorrect nonce"); + } + -+ if (crc_since_last_ptr) { -+ prt_printf(err, "redundant crc entry"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(crc_since_last_ptr, c, err, ++ ptr_crc_redundant, ++ "redundant crc entry"); + crc_since_last_ptr = true; ++ ++ bkey_fsck_err_on(crc_is_encoded(crc) && ++ (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && ++ (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err, ++ ptr_crc_uncompressed_size_too_big, ++ "too large encoded extent"); ++ ++ size_ondisk = crc.compressed_size; + break; + case BCH_EXTENT_ENTRY_stripe_ptr: -+ if (have_ec) { -+ prt_printf(err, "redundant stripe entry"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(have_ec, c, err, ++ ptr_stripe_redundant, ++ "redundant stripe entry"); + have_ec = true; + break; -+ case BCH_EXTENT_ENTRY_rebalance: ++ case BCH_EXTENT_ENTRY_rebalance: { ++ const struct bch_extent_rebalance *r = &entry->rebalance; ++ ++ if (!bch2_compression_opt_valid(r->compression)) { ++ struct bch_compression_opt opt = __bch2_compression_decode(r->compression); ++ prt_printf(err, "invalid compression opt %u:%u", ++ opt.type, opt.level); ++ return -BCH_ERR_invalid_bkey; ++ } + break; + } ++ } + } + -+ if (!nr_ptrs) { -+ prt_str(err, "no ptrs"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (nr_ptrs >= BCH_BKEY_PTRS_MAX) { -+ prt_str(err, "too many ptrs"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (crc_since_last_ptr) { -+ prt_printf(err, "redundant crc entry"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (have_ec) { -+ prt_printf(err, "redundant stripe entry"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; ++ bkey_fsck_err_on(!nr_ptrs, c, err, ++ extent_ptrs_no_ptrs, ++ "no ptrs"); ++ bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err, ++ extent_ptrs_too_many_ptrs, ++ "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX); ++ bkey_fsck_err_on(have_written && have_unwritten, c, err, ++ extent_ptrs_written_and_unwritten, ++ "extent with unwritten and written ptrs"); ++ bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err, ++ extent_ptrs_unwritten, ++ "has unwritten ptrs"); ++ bkey_fsck_err_on(crc_since_last_ptr, c, err, ++ extent_ptrs_redundant_crc, ++ "redundant crc entry"); ++ bkey_fsck_err_on(have_ec, c, err, ++ extent_ptrs_redundant_stripe, ++ "redundant stripe entry"); ++fsck_err: ++ return ret; +} + +void bch2_ptr_swab(struct bkey_s k) @@ -49119,6 +49626,125 @@ index 000000000000..1b25f84e4b9c + } +} + ++const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ ++ bkey_extent_entry_for_each(ptrs, entry) ++ if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) ++ return &entry->rebalance; ++ ++ return NULL; ++} ++ ++unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, ++ unsigned target, unsigned compression) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ unsigned rewrite_ptrs = 0; ++ ++ if (compression) { ++ unsigned compression_type = bch2_compression_opt_to_type(compression); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned i = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) { ++ rewrite_ptrs = 0; ++ goto incompressible; ++ } ++ ++ if (!p.ptr.cached && p.crc.compression_type != compression_type) ++ rewrite_ptrs |= 1U << i; ++ i++; ++ } ++ } ++incompressible: ++ if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { ++ const struct bch_extent_ptr *ptr; ++ unsigned i = 0; ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target)) ++ rewrite_ptrs |= 1U << i; ++ i++; ++ } ++ } ++ ++ return rewrite_ptrs; ++} ++ ++bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) ++{ ++ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); ++ ++ /* ++ * If it's an indirect extent, we don't delete the rebalance entry when ++ * done so that we know what options were applied - check if it still ++ * needs work done: ++ */ ++ if (r && ++ k.k->type == KEY_TYPE_reflink_v && ++ !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression)) ++ r = NULL; ++ ++ return r != NULL; ++} ++ ++int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, ++ unsigned target, unsigned compression) ++{ ++ struct bkey_s k = bkey_i_to_s(_k); ++ struct bch_extent_rebalance *r; ++ bool needs_rebalance; ++ ++ if (!bkey_extent_is_direct_data(k.k)) ++ return 0; ++ ++ /* get existing rebalance entry: */ ++ r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); ++ if (r) { ++ if (k.k->type == KEY_TYPE_reflink_v) { ++ /* ++ * indirect extents: existing options take precedence, ++ * so that we don't move extents back and forth if ++ * they're referenced by different inodes with different ++ * options: ++ */ ++ if (r->target) ++ target = r->target; ++ if (r->compression) ++ compression = r->compression; ++ } ++ ++ r->target = target; ++ r->compression = compression; ++ } ++ ++ needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression); ++ ++ if (needs_rebalance && !r) { ++ union bch_extent_entry *new = bkey_val_end(k); ++ ++ new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance; ++ new->rebalance.compression = compression; ++ new->rebalance.target = target; ++ new->rebalance.unused = 0; ++ k.k->u64s += extent_entry_u64s(new); ++ } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) { ++ /* ++ * For indirect extents, don't delete the rebalance entry when ++ * we're finished so that we know we specifically moved it or ++ * compressed it to its current location/compression type ++ */ ++ extent_entry_drop(k, (union bch_extent_entry *) r); ++ } ++ ++ return 0; ++} ++ +/* Generic extent code: */ + +int bch2_cut_front_s(struct bpos where, struct bkey_s k) @@ -49241,10 +49867,10 @@ index 000000000000..1b25f84e4b9c +} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h new file mode 100644 -index 000000000000..879e7d218b6a +index 000000000000..a2ce8a3be13c --- /dev/null +++ b/fs/bcachefs/extents.h -@@ -0,0 +1,758 @@ +@@ -0,0 +1,765 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_H +#define _BCACHEFS_EXTENTS_H @@ -49336,6 +49962,18 @@ index 000000000000..879e7d218b6a + memcpy_u64s_small(dst, new, extent_entry_u64s(new)); +} + ++static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) ++{ ++ union bch_extent_entry *next = extent_entry_next(entry); ++ ++ /* stripes have ptrs, but their layout doesn't work with this code */ ++ BUG_ON(k.k->type == KEY_TYPE_stripe); ++ ++ memmove_u64s_down(entry, next, ++ (u64 *) bkey_val_end(k) - (u64 *) next); ++ k.k->u64s -= (u64 *) next - (u64 *) entry; ++} ++ +static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) +{ + return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; @@ -49437,6 +50075,11 @@ index 000000000000..879e7d218b6a + crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); +} + ++static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc) ++{ ++ return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc); ++} ++ +/* bkey_ptrs: generically over any key type that has ptrs */ + +struct bkey_ptrs_c { @@ -49630,12 +50273,12 @@ index 000000000000..879e7d218b6a + +/* KEY_TYPE_btree_ptr: */ + -+int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + -+int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, @@ -49675,7 +50318,7 @@ index 000000000000..879e7d218b6a + +/* KEY_TYPE_reservation: */ + -+int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -49935,11 +50578,19 @@ index 000000000000..879e7d218b6a +bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); +void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); -+int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); + +void bch2_ptr_swab(struct bkey_s); + ++const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); ++unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, ++ unsigned, unsigned); ++bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); ++ ++int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, ++ unsigned, unsigned); ++ +/* Generic extent code: */ + +enum bch_extent_overlap { @@ -49984,24 +50635,6 @@ index 000000000000..879e7d218b6a + k->size = new_size; +} + -+/* -+ * In extent_sort_fix_overlapping(), insert_fixup_extent(), -+ * extent_merge_inline() - we're modifying keys in place that are packed. To do -+ * that we have to unpack the key, modify the unpacked key - then this -+ * copies/repacks the unpacked to the original as necessary. -+ */ -+static inline void extent_save(struct btree *b, struct bkey_packed *dst, -+ struct bkey *src) -+{ -+ struct bkey_format *f = &b->format; -+ struct bkey_i *dst_unpacked; -+ -+ if ((dst_unpacked = packed_to_bkey(dst))) -+ dst_unpacked->k = *src; -+ else -+ BUG_ON(!bch2_bkey_pack_key(dst, src, f)); -+} -+ +#endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h new file mode 100644 @@ -50471,7 +51104,7 @@ index 000000000000..66b945be10c2 +#endif /* _BCACHEFS_FIFO_H */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c new file mode 100644 -index 000000000000..bb5305441f27 +index 000000000000..4496cf91a4c1 --- /dev/null +++ b/fs/bcachefs/fs-common.c @@ -0,0 +1,501 @@ @@ -50528,7 +51161,7 @@ index 000000000000..bb5305441f27 + bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + + if (flags & BCH_CREATE_TMPFILE) -+ new_inode->bi_flags |= BCH_INODE_UNLINKED; ++ new_inode->bi_flags |= BCH_INODE_unlinked; + + ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); + if (ret) @@ -51027,10 +51660,10 @@ index 000000000000..dde237859514 +#endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c new file mode 100644 -index 000000000000..58ccc7b91ac7 +index 000000000000..52f0e7acda3d --- /dev/null +++ b/fs/bcachefs/fs-io-buffered.c -@@ -0,0 +1,1093 @@ +@@ -0,0 +1,1106 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -51422,6 +52055,21 @@ index 000000000000..58ccc7b91ac7 + return ret; +} + ++/* ++ * Determine when a writepage io is full. We have to limit writepage bios to a ++ * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to ++ * what the bounce path in bch2_write_extent() can handle. In theory we could ++ * loosen this restriction for non-bounce I/O, but we don't have that context ++ * here. Ideally, we can up this limit and make it configurable in the future ++ * when the bounce path can be enhanced to accommodate larger source bios. ++ */ ++static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len) ++{ ++ struct bio *bio = &io->op.wbio.bio; ++ return bio_full(bio, len) || ++ (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE); ++} ++ +static void bch2_writepage_io_done(struct bch_write_op *op) +{ + struct bch_writepage_io *io = @@ -51639,9 +52287,7 @@ index 000000000000..58ccc7b91ac7 + + if (w->io && + (w->io->op.res.nr_replicas != nr_replicas_this_write || -+ bio_full(&w->io->op.wbio.bio, sectors << 9) || -+ w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= -+ (BIO_MAX_VECS * PAGE_SIZE) || ++ bch_io_full(w->io, sectors << 9) || + bio_end_sector(&w->io->op.wbio.bio) != sector)) + bch2_writepage_do_io(w); + @@ -52159,10 +52805,10 @@ index 000000000000..a6126ff790e6 +#endif /* _BCACHEFS_FS_IO_BUFFERED_H */ diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c new file mode 100644 -index 000000000000..6a9557e7ecab +index 000000000000..5b42a76c4796 --- /dev/null +++ b/fs/bcachefs/fs-io-direct.c -@@ -0,0 +1,679 @@ +@@ -0,0 +1,680 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -52278,6 +52924,7 @@ index 000000000000..6a9557e7ecab + } else { + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER + 1); ++ dio->cl.closure_get_happened = true; + } + + dio->req = req; @@ -55113,7 +55760,7 @@ index 000000000000..ca70346e68dc +#endif /* _BCACHEFS_FS_IO_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c new file mode 100644 -index 000000000000..6040bd3f0778 +index 000000000000..5a39bcb597a3 --- /dev/null +++ b/fs/bcachefs/fs-ioctl.c @@ -0,0 +1,572 @@ @@ -55164,13 +55811,13 @@ index 000000000000..6040bd3f0778 + unsigned newflags = s->flags; + unsigned oldflags = bi->bi_flags & s->mask; + -+ if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) && ++ if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) && + !capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + if (!S_ISREG(bi->bi_mode) && + !S_ISDIR(bi->bi_mode) && -+ (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) ++ (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags) + return -EINVAL; + + if (s->set_projinherit) { @@ -55691,7 +56338,7 @@ index 000000000000..6040bd3f0778 +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h new file mode 100644 -index 000000000000..54a9c21a3b83 +index 000000000000..d30f9bb056fd --- /dev/null +++ b/fs/bcachefs/fs-ioctl.h @@ -0,0 +1,81 @@ @@ -55703,28 +56350,28 @@ index 000000000000..54a9c21a3b83 + +/* bcachefs inode flags -> vfs inode flags: */ +static const __maybe_unused unsigned bch_flags_to_vfs[] = { -+ [__BCH_INODE_SYNC] = S_SYNC, -+ [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, -+ [__BCH_INODE_APPEND] = S_APPEND, -+ [__BCH_INODE_NOATIME] = S_NOATIME, ++ [__BCH_INODE_sync] = S_SYNC, ++ [__BCH_INODE_immutable] = S_IMMUTABLE, ++ [__BCH_INODE_append] = S_APPEND, ++ [__BCH_INODE_noatime] = S_NOATIME, +}; + +/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ +static const __maybe_unused unsigned bch_flags_to_uflags[] = { -+ [__BCH_INODE_SYNC] = FS_SYNC_FL, -+ [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, -+ [__BCH_INODE_APPEND] = FS_APPEND_FL, -+ [__BCH_INODE_NODUMP] = FS_NODUMP_FL, -+ [__BCH_INODE_NOATIME] = FS_NOATIME_FL, ++ [__BCH_INODE_sync] = FS_SYNC_FL, ++ [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, ++ [__BCH_INODE_append] = FS_APPEND_FL, ++ [__BCH_INODE_nodump] = FS_NODUMP_FL, ++ [__BCH_INODE_noatime] = FS_NOATIME_FL, +}; + +/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ +static const __maybe_unused unsigned bch_flags_to_xflags[] = { -+ [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, -+ [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, -+ [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, -+ [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, -+ [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, ++ [__BCH_INODE_sync] = FS_XFLAG_SYNC, ++ [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE, ++ [__BCH_INODE_append] = FS_XFLAG_APPEND, ++ [__BCH_INODE_nodump] = FS_XFLAG_NODUMP, ++ [__BCH_INODE_noatime] = FS_XFLAG_NOATIME, + //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; +}; + @@ -55778,10 +56425,10 @@ index 000000000000..54a9c21a3b83 +#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c new file mode 100644 -index 000000000000..bc009ef497d0 +index 000000000000..82b668ea20aa --- /dev/null +++ b/fs/bcachefs/fs.c -@@ -0,0 +1,1980 @@ +@@ -0,0 +1,1977 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -56548,15 +57195,15 @@ index 000000000000..bc009ef497d0 + stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); + } + -+ if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) ++ if (inode->ei_inode.bi_flags & BCH_INODE_immutable) + stat->attributes |= STATX_ATTR_IMMUTABLE; + stat->attributes_mask |= STATX_ATTR_IMMUTABLE; + -+ if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) ++ if (inode->ei_inode.bi_flags & BCH_INODE_append) + stat->attributes |= STATX_ATTR_APPEND; + stat->attributes_mask |= STATX_ATTR_APPEND; + -+ if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) ++ if (inode->ei_inode.bi_flags & BCH_INODE_nodump) + stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= STATX_ATTR_NODUMP; + @@ -56997,9 +57644,6 @@ index 000000000000..bc009ef497d0 + .inum = inode->ei_inode.bi_dir, + }; + -+ if (!parent_inum.inum) -+ return NULL; -+ + return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); +} + @@ -57979,14 +58623,15 @@ index 000000000000..5edf1d4b9e6b +#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c new file mode 100644 -index 000000000000..b8f9e7475dc5 +index 000000000000..9f3e9bd3d767 --- /dev/null +++ b/fs/bcachefs/fsck.c -@@ -0,0 +1,2417 @@ +@@ -0,0 +1,2490 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_buf.h" ++#include "btree_cache.h" +#include "btree_update.h" +#include "buckets.h" +#include "darray.h" @@ -58429,9 +59074,10 @@ index 000000000000..b8f9e7475dc5 + if (i->equiv == n.equiv) { + bch_err(c, "snapshot deletion did not finish:\n" + " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", -+ bch2_btree_ids[btree_id], ++ bch2_btree_id_str(btree_id), + pos.inode, pos.offset, + i->id, n.id, n.equiv); ++ set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots); + } + } @@ -58704,8 +59350,9 @@ index 000000000000..b8f9e7475dc5 + int ret = 0; + + if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, -+ "key in missing snapshot: %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ++ bkey_in_missing_snapshot, ++ "key in missing snapshot: %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; +fsck_err: @@ -58774,6 +59421,7 @@ index 000000000000..b8f9e7475dc5 + + if (fsck_err_on(k.k->type == desc.key_type && + !desc.cmp_bkey(k, hash_k), c, ++ hash_table_key_duplicate, + "duplicate hash table keys:\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), @@ -58792,8 +59440,9 @@ index 000000000000..b8f9e7475dc5 + printbuf_exit(&buf); + return ret; +bad_hash: -+ if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", -+ bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, ++ if (fsck_err(c, hash_table_key_wrong_offset, ++ "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", ++ bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { + ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); @@ -58834,22 +59483,23 @@ index 000000000000..b8f9e7475dc5 + BUG_ON(bch2_inode_unpack(k, &u)); + + if (!full && -+ !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY| -+ BCH_INODE_I_SECTORS_DIRTY| -+ BCH_INODE_UNLINKED))) ++ !(u.bi_flags & (BCH_INODE_i_size_dirty| ++ BCH_INODE_i_sectors_dirty| ++ BCH_INODE_unlinked))) + return 0; + + if (prev->bi_inum != u.bi_inum) + *prev = u; + + if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || -+ inode_d_type(prev) != inode_d_type(&u), c, ++ inode_d_type(prev) != inode_d_type(&u), ++ c, inode_snapshot_mismatch, + "inodes in different snapshots don't match")) { + bch_err(c, "repair not implemented yet"); + return -EINVAL; + } + -+ if ((u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED)) && ++ if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) && + bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { + struct bpos new_min_pos; + @@ -58857,7 +59507,7 @@ index 000000000000..b8f9e7475dc5 + if (ret) + goto err; + -+ u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED; ++ u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; + + ret = __write_inode(trans, &u, iter->pos.snapshot); + bch_err_msg(c, ret, "in fsck updating inode"); @@ -58869,9 +59519,10 @@ index 000000000000..b8f9e7475dc5 + return 0; + } + -+ if (u.bi_flags & BCH_INODE_UNLINKED && ++ if (u.bi_flags & BCH_INODE_unlinked && + (!c->sb.clean || -+ fsck_err(c, "filesystem marked clean, but inode %llu unlinked", ++ fsck_err(c, inode_unlinked_but_clean, ++ "filesystem marked clean, but inode %llu unlinked", + u.bi_inum))) { + bch2_trans_unlock(trans); + bch2_fs_lazy_rw(c); @@ -58881,9 +59532,10 @@ index 000000000000..b8f9e7475dc5 + return ret; + } + -+ if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && ++ if (u.bi_flags & BCH_INODE_i_size_dirty && + (!c->sb.clean || -+ fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", ++ fsck_err(c, inode_i_size_dirty_but_clean, ++ "filesystem marked clean, but inode %llu has i_size dirty", + u.bi_inum))) { + bch_verbose(c, "truncating inode %llu", u.bi_inum); + @@ -58907,15 +59559,16 @@ index 000000000000..b8f9e7475dc5 + * We truncated without our normal sector accounting hook, just + * make sure we recalculate it: + */ -+ u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; ++ u.bi_flags |= BCH_INODE_i_sectors_dirty; + -+ u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; ++ u.bi_flags &= ~BCH_INODE_i_size_dirty; + do_update = true; + } + -+ if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && ++ if (u.bi_flags & BCH_INODE_i_sectors_dirty && + (!c->sb.clean || -+ fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", ++ fsck_err(c, inode_i_sectors_dirty_but_clean, ++ "filesystem marked clean, but inode %llu has i_sectors dirty", + u.bi_inum))) { + s64 sectors; + @@ -58929,14 +59582,14 @@ index 000000000000..b8f9e7475dc5 + } + + u.bi_sectors = sectors; -+ u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; ++ u.bi_flags &= ~BCH_INODE_i_sectors_dirty; + do_update = true; + } + -+ if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) { ++ if (u.bi_flags & BCH_INODE_backptr_untrusted) { + u.bi_dir = 0; + u.bi_dir_offset = 0; -+ u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED; ++ u.bi_flags &= ~BCH_INODE_backptr_untrusted; + do_update = true; + } + @@ -59041,10 +59694,11 @@ index 000000000000..b8f9e7475dc5 + return -BCH_ERR_internal_fsck_err; + } + -+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, -+ "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", -+ w->last_pos.inode, i->snapshot, -+ i->inode.bi_sectors, i->count)) { ++ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), ++ c, inode_i_sectors_wrong, ++ "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", ++ w->last_pos.inode, i->snapshot, ++ i->inode.bi_sectors, i->count)) { + i->inode.bi_sectors = i->count; + ret = fsck_write_inode(trans, &i->inode, i->snapshot); + if (ret) @@ -59185,7 +59839,8 @@ index 000000000000..b8f9e7475dc5 + prt_printf(&buf, "\n overwriting %s extent", + pos1.snapshot >= pos2.p.snapshot ? "first" : "second"); + -+ if (fsck_err(c, "overlapping extents%s", buf.buf)) { ++ if (fsck_err(c, extent_overlapping, ++ "overlapping extents%s", buf.buf)) { + struct btree_iter *old_iter = &iter1; + struct disk_reservation res = { 0 }; + @@ -59282,6 +59937,28 @@ index 000000000000..b8f9e7475dc5 + return ret; +} + ++static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct bch_extent_crc_unpacked crc; ++ const union bch_extent_entry *i; ++ unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, i) ++ if (crc_is_encoded(crc) && ++ crc.uncompressed_size > encoded_extent_max_sectors) { ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_bkey_val_to_text(&buf, c, k); ++ bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf); ++ printbuf_exit(&buf); ++ } ++ ++ return 0; ++} ++ +static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, + struct inode_walker *inode, @@ -59318,7 +59995,7 @@ index 000000000000..b8f9e7475dc5 + goto err; + + if (k.k->type != KEY_TYPE_whiteout) { -+ if (fsck_err_on(!i, c, ++ if (fsck_err_on(!i, c, extent_in_missing_inode, + "extent in missing inode:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) @@ -59326,7 +60003,8 @@ index 000000000000..b8f9e7475dc5 + + if (fsck_err_on(i && + !S_ISREG(i->inode.bi_mode) && -+ !S_ISLNK(i->inode.bi_mode), c, ++ !S_ISLNK(i->inode.bi_mode), ++ c, extent_in_non_reg_inode, + "extent in non regular inode mode %o:\n %s", + i->inode.bi_mode, + (printbuf_reset(&buf), @@ -59356,9 +60034,10 @@ index 000000000000..b8f9e7475dc5 + continue; + + if (k.k->type != KEY_TYPE_whiteout) { -+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && + k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && -+ !bkey_extent_is_reservation(k), c, ++ !bkey_extent_is_reservation(k), ++ c, extent_past_end_of_inode, + "extent type past end of inode %llu:%u, i_size %llu\n %s", + i->inode.bi_inum, i->snapshot, i->inode.bi_size, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -59417,7 +60096,8 @@ index 000000000000..b8f9e7475dc5 + &res, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ + bch2_disk_reservation_put(c, &res); -+ check_extent(trans, &iter, k, &w, &s, &extent_ends); ++ check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: ++ check_extent_overbig(trans, &iter, k); + })) ?: + check_i_sectors(trans, &w); + @@ -59431,6 +60111,30 @@ index 000000000000..b8f9e7475dc5 + return ret; +} + ++int bch2_check_indirect_extents(struct bch_fs *c) ++{ ++ struct btree_trans *trans = bch2_trans_get(c); ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct disk_reservation res = { 0 }; ++ int ret = 0; ++ ++ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, ++ POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ &res, NULL, ++ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ ++ bch2_disk_reservation_put(c, &res); ++ check_extent_overbig(trans, &iter, k); ++ })); ++ ++ bch2_disk_reservation_put(c, &res); ++ bch2_trans_put(trans); ++ ++ bch_err_fn(c, ret); ++ return ret; ++} ++ +static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) +{ + struct bch_fs *c = trans->c; @@ -59455,7 +60159,8 @@ index 000000000000..b8f9e7475dc5 + continue; + } + -+ if (fsck_err_on(i->inode.bi_nlink != i->count, c, ++ if (fsck_err_on(i->inode.bi_nlink != i->count, ++ c, inode_dir_wrong_nlink, + "directory %llu:%u with wrong i_nlink: got %u, should be %llu", + w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { + i->inode.bi_nlink = i->count; @@ -59499,27 +60204,28 @@ index 000000000000..b8f9e7475dc5 + backpointer_exists = ret; + ret = 0; + -+ if (fsck_err_on(S_ISDIR(target->bi_mode) && -+ backpointer_exists, c, ++ if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists, ++ c, inode_dir_multiple_links, + "directory %llu with multiple links", + target->bi_inum)) { + ret = __remove_dirent(trans, d.k->p); + goto out; + } + -+ if (fsck_err_on(backpointer_exists && -+ !target->bi_nlink, c, ++ if (fsck_err_on(backpointer_exists && !target->bi_nlink, ++ c, inode_multiple_links_but_nlink_0, + "inode %llu type %s has multiple links but i_nlink 0", + target->bi_inum, bch2_d_types[d.v->d_type])) { + target->bi_nlink++; -+ target->bi_flags &= ~BCH_INODE_UNLINKED; ++ target->bi_flags &= ~BCH_INODE_unlinked; + + ret = __write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } + -+ if (fsck_err_on(!backpointer_exists, c, ++ if (fsck_err_on(!backpointer_exists, ++ c, inode_wrong_backpointer, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", @@ -59537,7 +60243,8 @@ index 000000000000..b8f9e7475dc5 + } + } + -+ if (fsck_err_on(d.v->d_type != inode_d_type(target), c, ++ if (fsck_err_on(d.v->d_type != inode_d_type(target), ++ c, dirent_d_type_wrong, + "incorrect d_type: got %s, should be %s:\n%s", + bch2_d_type_str(d.v->d_type), + bch2_d_type_str(inode_d_type(target)), @@ -59561,7 +60268,8 @@ index 000000000000..b8f9e7475dc5 + if (d.v->d_type == DT_SUBVOL && + target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) && + (c->sb.version < bcachefs_metadata_version_subvol_dirent || -+ fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u", ++ fsck_err(c, dirent_d_parent_subvol_wrong, ++ "dirent has wrong d_parent_subvol field: got %u, should be %u", + le32_to_cpu(d.v->d_parent_subvol), + target->bi_parent_subvol))) { + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); @@ -59633,7 +60341,7 @@ index 000000000000..b8f9e7475dc5 + *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); + dir->first_this_inode = false; + -+ if (fsck_err_on(!i, c, ++ if (fsck_err_on(!i, c, dirent_in_missing_dir_inode, + "dirent in nonexisting directory:\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -59645,7 +60353,8 @@ index 000000000000..b8f9e7475dc5 + if (!i) + goto out; + -+ if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, ++ if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), ++ c, dirent_in_non_dir_inode, + "dirent in non directory inode type %s:\n%s", + bch2_d_type_str(inode_d_type(&i->inode)), + (printbuf_reset(&buf), @@ -59679,7 +60388,7 @@ index 000000000000..b8f9e7475dc5 + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + -+ if (fsck_err_on(ret, c, ++ if (fsck_err_on(ret, c, dirent_to_missing_subvol, + "dirent points to missing subvolume %u", + le32_to_cpu(d.v->d_child_subvol))) { + ret = __remove_dirent(trans, d.k->p); @@ -59691,7 +60400,7 @@ index 000000000000..b8f9e7475dc5 + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + -+ if (fsck_err_on(ret, c, ++ if (fsck_err_on(ret, c, subvol_to_missing_root, + "subvolume %u points to missing subvolume root %llu", + target_subvol, + target_inum)) { @@ -59700,7 +60409,8 @@ index 000000000000..b8f9e7475dc5 + goto err; + } + -+ if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, ++ if (fsck_err_on(subvol_root.bi_subvol != target_subvol, ++ c, subvol_root_wrong_bi_subvol, + "subvol root %llu has wrong bi_subvol field: got %u, should be %u", + target_inum, + subvol_root.bi_subvol, target_subvol)) { @@ -59719,7 +60429,8 @@ index 000000000000..b8f9e7475dc5 + if (ret) + goto err; + -+ if (fsck_err_on(!target->inodes.nr, c, ++ if (fsck_err_on(!target->inodes.nr, ++ c, dirent_to_missing_inode, + "dirent points to missing inode: (equiv %u)\n%s", + equiv.snapshot, + (printbuf_reset(&buf), @@ -59805,7 +60516,7 @@ index 000000000000..b8f9e7475dc5 + *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); + inode->first_this_inode = false; + -+ if (fsck_err_on(!i, c, ++ if (fsck_err_on(!i, c, xattr_in_missing_inode, + "xattr for missing inode %llu", + k.k->p.inode)) + return bch2_btree_delete_at(trans, iter, 0); @@ -59854,7 +60565,8 @@ index 000000000000..b8f9e7475dc5 + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + -+ if (mustfix_fsck_err_on(ret, c, "root subvol missing")) { ++ if (mustfix_fsck_err_on(ret, c, root_subvol_missing, ++ "root subvol missing")) { + struct bkey_i_subvolume root_subvol; + + snapshot = U32_MAX; @@ -59880,8 +60592,10 @@ index 000000000000..b8f9e7475dc5 + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + -+ if (mustfix_fsck_err_on(ret, c, "root directory missing") || -+ mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c, ++ if (mustfix_fsck_err_on(ret, c, root_dir_missing, ++ "root directory missing") || ++ mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), ++ c, root_inode_not_dir, + "root inode not a directory")) { + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, + 0, NULL); @@ -59985,7 +60699,8 @@ index 000000000000..b8f9e7475dc5 + } + + if (bch2_err_matches(ret, ENOENT)) { -+ if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", ++ if (fsck_err(c, inode_unreachable, ++ "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", + inode->bi_inum, snapshot, + bch2_d_type_str(inode_d_type(inode)), + inode->bi_nlink, @@ -60025,7 +60740,8 @@ index 000000000000..b8f9e7475dc5 + pr_err("%llu:%u", i->inum, i->snapshot); + pr_err("%llu:%u", inode->bi_inum, snapshot); + -+ if (!fsck_err(c, "directory structure loop")) ++ if (!fsck_err(c, dir_loop, ++ "directory structure loop")) + return 0; + + ret = commit_do(trans, NULL, NULL, @@ -60073,7 +60789,7 @@ index 000000000000..b8f9e7475dc5 + break; + } + -+ if (u.bi_flags & BCH_INODE_UNLINKED) ++ if (u.bi_flags & BCH_INODE_unlinked) + continue; + + ret = check_path(trans, &path, &u, iter.pos.snapshot); @@ -60285,7 +61001,8 @@ index 000000000000..b8f9e7475dc5 + link = &links->d[++*idx]; + } + -+ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, ++ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, ++ c, inode_wrong_nlink, + "inode %llu type %s has wrong i_nlink (%u, should be %u)", + u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], + bch2_inode_nlink_get(&u), link->count)) { @@ -60402,16 +61119,17 @@ index 000000000000..b8f9e7475dc5 +} diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h new file mode 100644 -index 000000000000..90c87b5089a0 +index 000000000000..da991e8cf27e --- /dev/null +++ b/fs/bcachefs/fsck.h -@@ -0,0 +1,14 @@ +@@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FSCK_H +#define _BCACHEFS_FSCK_H + +int bch2_check_inodes(struct bch_fs *); +int bch2_check_extents(struct bch_fs *); ++int bch2_check_indirect_extents(struct bch_fs *); +int bch2_check_dirents(struct bch_fs *); +int bch2_check_xattrs(struct bch_fs *); +int bch2_check_root(struct bch_fs *); @@ -60422,10 +61140,10 @@ index 000000000000..90c87b5089a0 +#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c new file mode 100644 -index 000000000000..bb3f443d8381 +index 000000000000..def77f2d8802 --- /dev/null +++ b/fs/bcachefs/inode.c -@@ -0,0 +1,1133 @@ +@@ -0,0 +1,1198 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -60434,6 +61152,7 @@ index 000000000000..bb3f443d8381 +#include "bkey_methods.h" +#include "btree_update.h" +#include "buckets.h" ++#include "compress.h" +#include "error.h" +#include "extents.h" +#include "extent_update.h" @@ -60447,13 +61166,18 @@ index 000000000000..bb3f443d8381 + +#include + -+const char * const bch2_inode_opts[] = { +#define x(name, ...) #name, ++const char * const bch2_inode_opts[] = { + BCH_INODE_OPTS() -+#undef x + NULL, +}; + ++static const char * const bch2_inode_flag_strs[] = { ++ BCH_INODE_FLAGS() ++ NULL ++}; ++#undef x ++ +static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; + +static int inode_decode_field(const u8 *in, const u8 *end, @@ -60789,9 +61513,10 @@ index 000000000000..bb3f443d8381 + return ret; +} + -+int bch2_inode_write(struct btree_trans *trans, ++int bch2_inode_write_flags(struct btree_trans *trans, + struct btree_iter *iter, -+ struct bch_inode_unpacked *inode) ++ struct bch_inode_unpacked *inode, ++ enum btree_update_flags flags) +{ + struct bkey_inode_buf *inode_p; + @@ -60801,7 +61526,7 @@ index 000000000000..bb3f443d8381 + + bch2_inode_pack_inlined(inode_p, inode); + inode_p->inode.k.p.snapshot = iter->snapshot; -+ return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); ++ return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags); +} + +struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) @@ -60825,117 +61550,121 @@ index 000000000000..bb3f443d8381 + return &inode_p->inode.k_i; +} + -+static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) ++static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) +{ + struct bch_inode_unpacked unpacked; ++ int ret = 0; + -+ if (k.k->p.inode) { -+ prt_printf(err, "nonzero k.p.inode"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(k.k->p.inode, c, err, ++ inode_pos_inode_nonzero, ++ "nonzero k.p.inode"); + -+ if (k.k->p.offset < BLOCKDEV_INODE_MAX) { -+ prt_printf(err, "fs inode in blockdev range"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err, ++ inode_pos_blockdev_range, ++ "fs inode in blockdev range"); + -+ if (bch2_inode_unpack(k, &unpacked)) { -+ prt_printf(err, "invalid variable length fields"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err, ++ inode_unpack_error, ++ "invalid variable length fields"); + -+ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) { -+ prt_printf(err, "invalid data checksum type (%u >= %u", -+ unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err, ++ inode_checksum_type_invalid, ++ "invalid data checksum type (%u >= %u", ++ unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); + -+ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) { -+ prt_printf(err, "invalid data checksum type (%u >= %u)", -+ unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(unpacked.bi_compression && ++ !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err, ++ inode_compression_type_invalid, ++ "invalid compression opt %u", unpacked.bi_compression - 1); + -+ if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && -+ unpacked.bi_nlink != 0) { -+ prt_printf(err, "flagged as unlinked but bi_nlink != 0"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) && ++ unpacked.bi_nlink != 0, c, err, ++ inode_unlinked_but_nlink_nonzero, ++ "flagged as unlinked but bi_nlink != 0"); + -+ if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) { -+ prt_printf(err, "subvolume root but not a directory"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; ++ bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err, ++ inode_subvol_root_but_not_dir, ++ "subvolume root but not a directory"); ++fsck_err: ++ return ret; +} + -+int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); ++ int ret = 0; + -+ if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { -+ prt_printf(err, "invalid str hash type (%llu >= %u)", -+ INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, ++ inode_str_hash_invalid, ++ "invalid str hash type (%llu >= %u)", ++ INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); + -+ return __bch2_inode_invalid(k, err); ++ ret = __bch2_inode_invalid(c, k, err); ++fsck_err: ++ return ret; +} + -+int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); ++ int ret = 0; + -+ if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { -+ prt_printf(err, "invalid str hash type (%llu >= %u)", -+ INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, ++ inode_str_hash_invalid, ++ "invalid str hash type (%llu >= %u)", ++ INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); + -+ return __bch2_inode_invalid(k, err); ++ ret = __bch2_inode_invalid(c, k, err); ++fsck_err: ++ return ret; +} + -+int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); ++ int ret = 0; + -+ if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || -+ INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) { -+ prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)", -+ INODEv3_FIELDS_START(inode.v), -+ INODEv3_FIELDS_START_INITIAL, -+ bkey_val_u64s(inode.k)); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || ++ INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err, ++ inode_v3_fields_start_bad, ++ "invalid fields_start (got %llu, min %u max %zu)", ++ INODEv3_FIELDS_START(inode.v), ++ INODEv3_FIELDS_START_INITIAL, ++ bkey_val_u64s(inode.k)); + -+ if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { -+ prt_printf(err, "invalid str hash type (%llu >= %u)", -+ INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, ++ inode_str_hash_invalid, ++ "invalid str hash type (%llu >= %u)", ++ INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); + -+ return __bch2_inode_invalid(k, err); ++ ret = __bch2_inode_invalid(c, k, err); ++fsck_err: ++ return ret; +} + +static void __bch2_inode_unpacked_to_text(struct printbuf *out, + struct bch_inode_unpacked *inode) +{ -+ prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu", -+ inode->bi_mode, inode->bi_flags, ++ prt_printf(out, "mode=%o ", inode->bi_mode); ++ ++ prt_str(out, "flags="); ++ prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); ++ prt_printf(out, " (%x)", inode->bi_flags); ++ ++ prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu", + inode->bi_journal_seq, + inode->bi_size, + inode->bi_sectors, + inode->bi_version); + +#define x(_name, _bits) \ -+ prt_printf(out, " "#_name " %llu", (u64) inode->_name); ++ prt_printf(out, " "#_name "=%llu", (u64) inode->_name); + BCH_INODE_FIELDS_v3() +#undef x +} @@ -60974,7 +61703,7 @@ index 000000000000..bb3f443d8381 + +static inline bool bkey_is_deleted_inode(struct bkey_s_c k) +{ -+ return bkey_inode_flags(k) & BCH_INODE_UNLINKED; ++ return bkey_inode_flags(k) & BCH_INODE_unlinked; +} + +int bch2_trans_mark_inode(struct btree_trans *trans, @@ -61038,16 +61767,17 @@ index 000000000000..bb3f443d8381 + return 0; +} + -+int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ -+ if (k.k->p.inode) { -+ prt_printf(err, "nonzero k.p.inode"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ int ret = 0; + -+ return 0; ++ bkey_fsck_err_on(k.k->p.inode, c, err, ++ inode_pos_inode_nonzero, ++ "nonzero k.p.inode"); ++fsck_err: ++ return ret; +} + +void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, @@ -61354,8 +62084,8 @@ index 000000000000..bb3f443d8381 + +int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) +{ -+ if (bi->bi_flags & BCH_INODE_UNLINKED) -+ bi->bi_flags &= ~BCH_INODE_UNLINKED; ++ if (bi->bi_flags & BCH_INODE_unlinked) ++ bi->bi_flags &= ~BCH_INODE_unlinked; + else { + if (bi->bi_nlink == U32_MAX) + return -EINVAL; @@ -61368,13 +62098,13 @@ index 000000000000..bb3f443d8381 + +void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi) +{ -+ if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) { ++ if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) { + bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero", + bi->bi_inum); + return; + } + -+ if (bi->bi_flags & BCH_INODE_UNLINKED) { ++ if (bi->bi_flags & BCH_INODE_unlinked) { + bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum); + return; + } @@ -61382,7 +62112,7 @@ index 000000000000..bb3f443d8381 + if (bi->bi_nlink) + bi->bi_nlink--; + else -+ bi->bi_flags |= BCH_INODE_UNLINKED; ++ bi->bi_flags |= BCH_INODE_unlinked; +} + +struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) @@ -61407,6 +62137,18 @@ index 000000000000..bb3f443d8381 + opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; +} + ++int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts) ++{ ++ struct bch_inode_unpacked inode; ++ int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode)); ++ ++ if (ret) ++ return ret; ++ ++ bch2_inode_opts_get(opts, trans->c, &inode); ++ return 0; ++} ++ +int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) +{ + struct bch_fs *c = trans->c; @@ -61470,53 +62212,85 @@ index 000000000000..bb3f443d8381 + return ret ?: -BCH_ERR_transaction_restart_nested; +} + -+static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos) ++static int may_delete_deleted_inode(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos pos, ++ bool *need_another_pass) +{ + struct bch_fs *c = trans->c; -+ struct btree_iter iter; ++ struct btree_iter inode_iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; + int ret; + -+ if (bch2_snapshot_is_internal_node(c, pos.snapshot)) -+ return 0; -+ -+ if (!fsck_err_on(c->sb.clean, c, -+ "filesystem marked as clean but have deleted inode %llu:%u", -+ pos.offset, pos.snapshot)) -+ return 0; -+ -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED); ++ k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED); + ret = bkey_err(k); + if (ret) + return ret; + + ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; + if (fsck_err_on(!bkey_is_inode(k.k), c, ++ deleted_inode_missing, + "nonexistent inode %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + + ret = bch2_inode_unpack(k, &inode); + if (ret) -+ goto err; ++ goto out; + + if (fsck_err_on(S_ISDIR(inode.bi_mode), c, ++ deleted_inode_is_dir, + "directory %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + -+ if (fsck_err_on(!(inode.bi_flags & BCH_INODE_UNLINKED), c, ++ if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c, ++ deleted_inode_not_unlinked, + "non-deleted inode %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + -+ return 1; -+err: ++ if (c->sb.clean && ++ !fsck_err(c, ++ deleted_inode_but_clean, ++ "filesystem marked as clean but have deleted inode %llu:%u", ++ pos.offset, pos.snapshot)) { ++ ret = 0; ++ goto out; ++ } ++ ++ if (bch2_snapshot_is_internal_node(c, pos.snapshot)) { ++ struct bpos new_min_pos; ++ ++ ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos); ++ if (ret) ++ goto out; ++ ++ inode.bi_flags &= ~BCH_INODE_unlinked; ++ ++ ret = bch2_inode_write_flags(trans, &inode_iter, &inode, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ bch_err_msg(c, ret, "clearing inode unlinked flag"); ++ if (ret) ++ goto out; ++ ++ /* ++ * We'll need another write buffer flush to pick up the new ++ * unlinked inodes in the snapshot leaves: ++ */ ++ *need_another_pass = true; ++ return 0; ++ } ++ ++ ret = 1; ++out: +fsck_err: ++ bch2_trans_iter_exit(trans, &inode_iter); + return ret; +delete: -+ return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); ++ ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); ++ goto out; +} + +int bch2_delete_dead_inodes(struct bch_fs *c) @@ -61524,7 +62298,10 @@ index 000000000000..bb3f443d8381 + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; ++ bool need_another_pass; + int ret; ++again: ++ need_another_pass = false; + + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) @@ -61538,7 +62315,8 @@ index 000000000000..bb3f443d8381 + */ + for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { -+ ret = lockrestart_do(trans, may_delete_deleted_inode(trans, k.k->p)); ++ ret = lockrestart_do(trans, may_delete_deleted_inode(trans, &iter, k.k->p, ++ &need_another_pass)); + if (ret < 0) + break; + @@ -61548,12 +62326,17 @@ index 000000000000..bb3f443d8381 + bch2_fs_lazy_rw(c); + } + ++ bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); ++ + ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + break; + } + } + bch2_trans_iter_exit(trans, &iter); ++ ++ if (!ret && need_another_pass) ++ goto again; +err: + bch2_trans_put(trans); + @@ -61561,25 +62344,26 @@ index 000000000000..bb3f443d8381 +} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h new file mode 100644 -index 000000000000..a7464e1b6960 +index 000000000000..88818a332b1e --- /dev/null +++ b/fs/bcachefs/inode.h -@@ -0,0 +1,207 @@ +@@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_H +#define _BCACHEFS_INODE_H + +#include "bkey.h" ++#include "bkey_methods.h" +#include "opts.h" + +enum bkey_invalid_flags; +extern const char * const bch2_inode_opts[]; + -+int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); -+int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); -+int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + @@ -61619,7 +62403,7 @@ index 000000000000..a7464e1b6960 + k->type == KEY_TYPE_inode_v3; +} + -+int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + @@ -61668,8 +62452,16 @@ index 000000000000..a7464e1b6960 + +int bch2_inode_peek(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, subvol_inum, unsigned); -+int bch2_inode_write(struct btree_trans *, struct btree_iter *, -+ struct bch_inode_unpacked *); ++ ++int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, ++ struct bch_inode_unpacked *, enum btree_update_flags); ++ ++static inline int bch2_inode_write(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bch_inode_unpacked *inode) ++{ ++ return bch2_inode_write_flags(trans, iter, inode, 0); ++} + +void bch2_inode_init_early(struct bch_fs *, + struct bch_inode_unpacked *); @@ -61744,7 +62536,7 @@ index 000000000000..a7464e1b6960 + +static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) +{ -+ return bi->bi_flags & BCH_INODE_UNLINKED ++ return bi->bi_flags & BCH_INODE_unlinked + ? 0 + : bi->bi_nlink + nlink_bias(bi->bi_mode); +} @@ -61754,10 +62546,10 @@ index 000000000000..a7464e1b6960 +{ + if (nlink) { + bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); -+ bi->bi_flags &= ~BCH_INODE_UNLINKED; ++ bi->bi_flags &= ~BCH_INODE_unlinked; + } else { + bi->bi_nlink = 0; -+ bi->bi_flags |= BCH_INODE_UNLINKED; ++ bi->bi_flags |= BCH_INODE_unlinked; + } +} + @@ -61767,6 +62559,7 @@ index 000000000000..a7464e1b6960 +struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); +void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, + struct bch_inode_unpacked *); ++int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); + +int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); +int bch2_delete_dead_inodes(struct bch_fs *); @@ -61774,10 +62567,10 @@ index 000000000000..a7464e1b6960 +#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c new file mode 100644 -index 000000000000..119834cb8f9e +index 000000000000..bebc11444ef5 --- /dev/null +++ b/fs/bcachefs/io_misc.c -@@ -0,0 +1,515 @@ +@@ -0,0 +1,524 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * io_misc.c - fallocate, fpunch, truncate: @@ -61796,13 +62589,14 @@ index 000000000000..119834cb8f9e +#include "io_misc.h" +#include "io_write.h" +#include "logged_ops.h" ++#include "rebalance.h" +#include "subvolume.h" + +/* Overwrites whatever was present with zeroes: */ +int bch2_extent_fallocate(struct btree_trans *trans, + subvol_inum inum, + struct btree_iter *iter, -+ unsigned sectors, ++ u64 sectors, + struct bch_io_opts opts, + s64 *i_sectors_delta, + struct write_point_specifier write_point) @@ -61884,7 +62678,7 @@ index 000000000000..119834cb8f9e + if (ret) + goto err; + -+ sectors = min(sectors, wp->sectors_free); ++ sectors = min_t(u64, sectors, wp->sectors_free); + sectors_allocated = sectors; + + bch2_key_resize(&e->k, sectors); @@ -62135,6 +62929,7 @@ index 000000000000..119834cb8f9e + struct btree_iter iter; + struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k); + subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; ++ struct bch_io_opts opts; + u64 dst_offset = le64_to_cpu(op->v.dst_offset); + u64 src_offset = le64_to_cpu(op->v.src_offset); + s64 shift = dst_offset - src_offset; @@ -62143,6 +62938,10 @@ index 000000000000..119834cb8f9e + bool insert = shift > 0; + int ret = 0; + ++ ret = bch2_inum_opts_get(trans, inum, &opts); ++ if (ret) ++ return ret; ++ + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + POS(inum.inum, 0), + BTREE_ITER_INTENT); @@ -62223,7 +63022,10 @@ index 000000000000..119834cb8f9e + + op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); + -+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: ++ ret = bch2_bkey_set_needs_rebalance(c, copy, ++ opts.background_target, ++ opts.background_compression) ?: ++ bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: + bch2_logged_op_update(trans, &op->k_i) ?: + bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL); @@ -62295,7 +63097,7 @@ index 000000000000..119834cb8f9e +} diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h new file mode 100644 -index 000000000000..c9e6ed40e1b8 +index 000000000000..9cb44a7c43c1 --- /dev/null +++ b/fs/bcachefs/io_misc.h @@ -0,0 +1,34 @@ @@ -62304,7 +63106,7 @@ index 000000000000..c9e6ed40e1b8 +#define _BCACHEFS_IO_MISC_H + +int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, -+ unsigned, struct bch_io_opts, s64 *, ++ u64, struct bch_io_opts, s64 *, + struct write_point_specifier); +int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + subvol_inum, u64, s64 *); @@ -62335,7 +63137,7 @@ index 000000000000..c9e6ed40e1b8 +#endif /* _BCACHEFS_IO_MISC_H */ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c new file mode 100644 -index 000000000000..443c3ea65527 +index 000000000000..a56ed553dc15 --- /dev/null +++ b/fs/bcachefs/io_read.c @@ -0,0 +1,1210 @@ @@ -62984,7 +63786,7 @@ index 000000000000..443c3ea65527 + "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", + rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, + csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); -+ bch2_io_error(ca); ++ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; +decompression_err: @@ -63018,7 +63820,7 @@ index 000000000000..443c3ea65527 + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + -+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, ++ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, + rbio->read_pos.inode, + rbio->read_pos.offset, + "data read error: %s", @@ -63366,7 +64168,7 @@ index 000000000000..443c3ea65527 + trans->notrace_relock_fail = true; + } else { + /* Attempting reconstruct read: */ -+ if (bch2_ec_read_extent(c, rbio)) { ++ if (bch2_ec_read_extent(trans, rbio)) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } @@ -63715,10 +64517,10 @@ index 000000000000..d9c18bb7d403 +#endif /* _BCACHEFS_IO_READ_H */ diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c new file mode 100644 -index 000000000000..6e4f85eb6ec8 +index 000000000000..f02b3f7d26a0 --- /dev/null +++ b/fs/bcachefs/io_write.c -@@ -0,0 +1,1671 @@ +@@ -0,0 +1,1675 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2010, 2011 Kent Overstreet @@ -63923,6 +64725,17 @@ index 000000000000..6e4f85eb6ec8 + struct btree_iter iter; + struct bkey_i *k; + struct bkey_i_inode_v3 *inode; ++ /* ++ * Crazy performance optimization: ++ * Every extent update needs to also update the inode: the inode trigger ++ * will set bi->journal_seq to the journal sequence number of this ++ * transaction - for fsync. ++ * ++ * But if that's the only reason we're updating the inode (we're not ++ * updating bi_size or bi_sectors), then we don't need the inode update ++ * to be journalled - if we crash, the bi_journal_seq update will be ++ * lost, but that's fine. ++ */ + unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; + int ret; + @@ -63944,7 +64757,7 @@ index 000000000000..6e4f85eb6ec8 + + inode = bkey_i_to_inode_v3(k); + -+ if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) && ++ if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) && + new_i_size > le64_to_cpu(inode->v.bi_size)) { + inode->v.bi_size = cpu_to_le64(new_i_size); + inode_update_flags = 0; @@ -64072,10 +64885,13 @@ index 000000000000..6e4f85eb6ec8 + bkey_start_pos(&sk.k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + -+ ret = bch2_extent_update(trans, inum, &iter, sk.k, -+ &op->res, -+ op->new_i_size, &op->i_sectors_delta, -+ op->flags & BCH_WRITE_CHECK_ENOSPC); ++ ret = bch2_bkey_set_needs_rebalance(c, sk.k, ++ op->opts.background_target, ++ op->opts.background_compression) ?: ++ bch2_extent_update(trans, inum, &iter, sk.k, ++ &op->res, ++ op->new_i_size, &op->i_sectors_delta, ++ op->flags & BCH_WRITE_CHECK_ENOSPC); + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -64216,7 +65032,6 @@ index 000000000000..6e4f85eb6ec8 +{ + struct bch_fs *c = op->c; + struct keylist *keys = &op->insert_keys; -+ struct bkey_i *k; + unsigned dev; + int ret = 0; + @@ -64226,14 +65041,6 @@ index 000000000000..6e4f85eb6ec8 + goto err; + } + -+ /* -+ * probably not the ideal place to hook this in, but I don't -+ * particularly want to plumb io_opts all the way through the btree -+ * update stack right now -+ */ -+ for_each_keylist_key(keys, k) -+ bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); -+ + if (!bch2_keylist_empty(keys)) { + u64 sectors_start = keylist_sectors(keys); + @@ -64364,7 +65171,7 @@ index 000000000000..6e4f85eb6ec8 + struct bch_fs *c = wbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + -+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, ++ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + op->pos.inode, + wbio->inode_offset << 9, + "data write error: %s", @@ -64537,6 +65344,7 @@ index 000000000000..6e4f85eb6ec8 + + /* Can we just write the entire extent as is? */ + if (op->crc.uncompressed_size == op->crc.live_size && ++ op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 && + op->crc.compressed_size <= wp->sectors_free && + (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || + op->incompressible)) { @@ -64812,9 +65620,7 @@ index 000000000000..6e4f85eb6ec8 + + e = bkey_s_c_to_extent(k); + extent_for_each_ptr_decode(e, p, entry) { -+ if (p.crc.csum_type || -+ crc_is_compressed(p.crc) || -+ p.has_ec) ++ if (crc_is_encoded(p.crc) || p.has_ec) + return false; + + replicas += bch2_extent_ptr_durability(c, &p); @@ -65610,10 +66416,10 @@ index 000000000000..c7f97c2c4805 +#endif /* _BCACHEFS_IO_WRITE_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c new file mode 100644 -index 000000000000..0e7a9ffa3671 +index 000000000000..5b5d69f2316b --- /dev/null +++ b/fs/bcachefs/journal.c -@@ -0,0 +1,1449 @@ +@@ -0,0 +1,1468 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs journalling code, for btree insertions @@ -66635,6 +67441,25 @@ index 000000000000..0e7a9ffa3671 + return ret; +} + ++int bch2_fs_journal_alloc(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) { ++ if (ca->journal.nr) ++ continue; ++ ++ int ret = bch2_dev_journal_alloc(ca); ++ if (ret) { ++ percpu_ref_put(&ca->io_ref); ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ +/* startup/shutdown: */ + +static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) @@ -67065,10 +67890,10 @@ index 000000000000..0e7a9ffa3671 +} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h new file mode 100644 -index 000000000000..491133cc52f3 +index 000000000000..011711e99c8d --- /dev/null +++ b/fs/bcachefs/journal.h -@@ -0,0 +1,548 @@ +@@ -0,0 +1,549 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_H +#define _BCACHEFS_JOURNAL_H @@ -67605,6 +68430,7 @@ index 000000000000..491133cc52f3 +int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, + unsigned nr); +int bch2_dev_journal_alloc(struct bch_dev *); ++int bch2_fs_journal_alloc(struct bch_fs *); + +void bch2_dev_journal_stop(struct journal *, struct bch_dev *); + @@ -67619,10 +68445,10 @@ index 000000000000..491133cc52f3 +#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c new file mode 100644 -index 000000000000..6a3d6a374e9c +index 000000000000..f4bc2cdbfdd7 --- /dev/null +++ b/fs/bcachefs/journal_io.c -@@ -0,0 +1,1894 @@ +@@ -0,0 +1,1947 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -67765,7 +68591,8 @@ index 000000000000..6a3d6a374e9c + if (!dup->csum_good) + goto replace; + -+ fsck_err(c, "found duplicate but non identical journal entries (seq %llu)", ++ fsck_err(c, journal_entry_replicas_data_mismatch, ++ "found duplicate but non identical journal entries (seq %llu)", + le64_to_cpu(j->seq)); + i = dup; + goto found; @@ -67860,7 +68687,7 @@ index 000000000000..6a3d6a374e9c + prt_str(out, ": "); +} + -+#define journal_entry_err(c, version, jset, entry, msg, ...) \ ++#define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ +({ \ + struct printbuf _buf = PRINTBUF; \ + \ @@ -67869,9 +68696,10 @@ index 000000000000..6a3d6a374e9c + \ + switch (flags & BKEY_INVALID_WRITE) { \ + case READ: \ -+ mustfix_fsck_err(c, "%s", _buf.buf); \ ++ mustfix_fsck_err(c, _err, "%s", _buf.buf); \ + break; \ + case WRITE: \ ++ bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ + bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ + if (bch2_fs_inconsistent(c)) { \ + ret = -BCH_ERR_fsck_errors_not_fixed; \ @@ -67884,8 +68712,8 @@ index 000000000000..6a3d6a374e9c + true; \ +}) + -+#define journal_entry_err_on(cond, c, version, jset, entry, msg, ...) \ -+ ((cond) ? journal_entry_err(c, version, jset, entry, msg, ##__VA_ARGS__) : false) ++#define journal_entry_err_on(cond, ...) \ ++ ((cond) ? journal_entry_err(__VA_ARGS__) : false) + +#define FSCK_DELETED_KEY 5 + @@ -67902,7 +68730,10 @@ index 000000000000..6a3d6a374e9c + struct printbuf buf = PRINTBUF; + int ret = 0; + -+ if (journal_entry_err_on(!k->k.u64s, c, version, jset, entry, "k->u64s 0")) { ++ if (journal_entry_err_on(!k->k.u64s, ++ c, version, jset, entry, ++ journal_entry_bkey_u64s_0, ++ "k->u64s 0")) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return FSCK_DELETED_KEY; @@ -67911,6 +68742,7 @@ index 000000000000..6a3d6a374e9c + if (journal_entry_err_on((void *) bkey_next(k) > + (void *) vstruct_next(entry), + c, version, jset, entry, ++ journal_entry_bkey_past_end, + "extends past end of journal entry")) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); @@ -67919,6 +68751,7 @@ index 000000000000..6a3d6a374e9c + + if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, + c, version, jset, entry, ++ journal_entry_bkey_bad_format, + "bad format %u", k->k.format)) { + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -67942,7 +68775,8 @@ index 000000000000..6a3d6a374e9c + bch2_bkey_invalid(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id), write, &buf); + -+ mustfix_fsck_err(c, "%s", buf.buf); ++ mustfix_fsck_err(c, journal_entry_bkey_invalid, ++ "%s", buf.buf); + + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -67994,7 +68828,7 @@ index 000000000000..6a3d6a374e9c + prt_newline(out); + prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); + } -+ prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); ++ prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); + first = false; + } @@ -68012,6 +68846,7 @@ index 000000000000..6a3d6a374e9c + if (journal_entry_err_on(!entry->u64s || + le16_to_cpu(entry->u64s) != k->k.u64s, + c, version, jset, entry, ++ journal_entry_btree_root_bad_size, + "invalid btree root journal entry: wrong number of keys")) { + void *next = vstruct_next(entry); + /* @@ -68061,6 +68896,7 @@ index 000000000000..6a3d6a374e9c + + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, + c, version, jset, entry, ++ journal_entry_blacklist_bad_size, + "invalid journal seq blacklist entry: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } @@ -68088,6 +68924,7 @@ index 000000000000..6a3d6a374e9c + + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, + c, version, jset, entry, ++ journal_entry_blacklist_v2_bad_size, + "invalid journal seq blacklist entry: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + goto out; @@ -68098,6 +68935,7 @@ index 000000000000..6a3d6a374e9c + if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > + le64_to_cpu(bl_entry->end), + c, version, jset, entry, ++ journal_entry_blacklist_v2_start_past_end, + "invalid journal seq blacklist entry: start > end")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } @@ -68130,6 +68968,7 @@ index 000000000000..6a3d6a374e9c + + if (journal_entry_err_on(bytes < sizeof(*u), + c, version, jset, entry, ++ journal_entry_usage_bad_size, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; @@ -68164,6 +69003,7 @@ index 000000000000..6a3d6a374e9c + if (journal_entry_err_on(bytes < sizeof(*u) || + bytes < sizeof(*u) + u->r.nr_devs, + c, version, jset, entry, ++ journal_entry_data_usage_bad_size, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; @@ -68195,13 +69035,17 @@ index 000000000000..6a3d6a374e9c + int ret = 0; + + if (journal_entry_err_on(bytes != sizeof(*clock), -+ c, version, jset, entry, "bad size")) { ++ c, version, jset, entry, ++ journal_entry_clock_bad_size, ++ "bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(clock->rw > 1, -+ c, version, jset, entry, "bad rw")) { ++ c, version, jset, entry, ++ journal_entry_clock_bad_rw, ++ "bad rw")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } @@ -68233,7 +69077,9 @@ index 000000000000..6a3d6a374e9c + int ret = 0; + + if (journal_entry_err_on(bytes < expected, -+ c, version, jset, entry, "bad size (%u < %u)", ++ c, version, jset, entry, ++ journal_entry_dev_usage_bad_size, ++ "bad size (%u < %u)", + bytes, expected)) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; @@ -68242,13 +69088,17 @@ index 000000000000..6a3d6a374e9c + dev = le32_to_cpu(u->dev); + + if (journal_entry_err_on(!bch2_dev_exists2(c, dev), -+ c, version, jset, entry, "bad dev")) { ++ c, version, jset, entry, ++ journal_entry_dev_usage_bad_dev, ++ "bad dev")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(u->pad, -+ c, version, jset, entry, "bad pad")) { ++ c, version, jset, entry, ++ journal_entry_dev_usage_bad_pad, ++ "bad pad")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } @@ -68363,7 +69213,8 @@ index 000000000000..6a3d6a374e9c + + vstruct_for_each(jset, entry) { + if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), -+ c, version, jset, entry, ++ c, version, jset, entry, ++ journal_entry_past_jset_end, + "journal entry extends past end of jset")) { + jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); + break; @@ -68392,6 +69243,7 @@ index 000000000000..6a3d6a374e9c + version = le32_to_cpu(jset->version); + if (journal_entry_err_on(!bch2_version_compatible(version), + c, version, jset, NULL, ++ jset_unsupported_version, + "%s sector %llu seq %llu: incompatible journal entry version %u.%u", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), @@ -68402,7 +69254,8 @@ index 000000000000..6a3d6a374e9c + } + + if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), -+ c, version, jset, NULL, ++ c, version, jset, NULL, ++ jset_unknown_csum, + "%s sector %llu seq %llu: journal entry with unknown csum type %llu", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), @@ -68413,6 +69266,7 @@ index 000000000000..6a3d6a374e9c + if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && + le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), + c, version, jset, NULL, ++ jset_last_seq_newer_than_seq, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(jset->last_seq), + le64_to_cpu(jset->seq))) { @@ -68441,7 +69295,8 @@ index 000000000000..6a3d6a374e9c + + version = le32_to_cpu(jset->version); + if (journal_entry_err_on(!bch2_version_compatible(version), -+ c, version, jset, NULL, ++ c, version, jset, NULL, ++ jset_unsupported_version, + "%s sector %llu seq %llu: unknown journal entry version %u.%u", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), @@ -68456,7 +69311,8 @@ index 000000000000..6a3d6a374e9c + return JOURNAL_ENTRY_REREAD; + + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, -+ c, version, jset, NULL, ++ c, version, jset, NULL, ++ jset_past_bucket_end, + "%s sector %llu seq %llu: journal entry too big (%zu bytes)", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), bytes)) @@ -68525,7 +69381,7 @@ index 000000000000..6a3d6a374e9c + ret = submit_bio_wait(bio); + kfree(bio); + -+ if (bch2_dev_io_err_on(ret, ca, ++ if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, + "journal read error: sector %llu", + offset) || + bch2_meta_read_fault("journal")) { @@ -68581,7 +69437,8 @@ index 000000000000..6a3d6a374e9c + ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + + csum_good = jset_csum_good(c, j); -+ if (!csum_good) ++ if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, ++ "journal checksum error")) + saw_bad = true; + + ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), @@ -68797,6 +69654,7 @@ index 000000000000..6a3d6a374e9c + + if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), + c, le32_to_cpu(i->j.version), &i->j, NULL, ++ jset_last_seq_newer_than_seq, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(i->j.last_seq), + le64_to_cpu(i->j.seq))) @@ -68813,7 +69671,8 @@ index 000000000000..6a3d6a374e9c + } + + if (!*last_seq) { -+ fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); ++ fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, ++ "journal read done, but no entries found after dropping non-flushes"); + return 0; + } + @@ -68839,6 +69698,7 @@ index 000000000000..6a3d6a374e9c + + if (bch2_journal_seq_is_blacklisted(c, seq, true)) { + fsck_err_on(!JSET_NO_FLUSH(&i->j), c, ++ jset_seq_blacklisted, + "found blacklisted journal entry %llu", seq); + i->ignore = true; + } @@ -68879,7 +69739,8 @@ index 000000000000..6a3d6a374e9c + bch2_journal_ptrs_to_text(&buf2, c, i); + + missing_end = seq - 1; -+ fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" ++ fsck_err(c, journal_entries_missing, ++ "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" + " prev at %s\n" + " next at %s", + missing_start, missing_end, @@ -68934,7 +69795,8 @@ index 000000000000..6a3d6a374e9c + if (!degraded && + !bch2_replicas_marked(c, &replicas.e) && + (le64_to_cpu(i->j.seq) == *last_seq || -+ fsck_err(c, "superblock not marked as containing replicas for journal entry %llu\n %s", ++ fsck_err(c, journal_entry_replicas_not_marked, ++ "superblock not marked as containing replicas for journal entry %llu\n %s", + le64_to_cpu(i->j.seq), buf.buf))) { + ret = bch2_mark_replicas(c, &replicas.e); + if (ret) @@ -69206,7 +70068,8 @@ index 000000000000..6a3d6a374e9c + struct journal_buf *w = journal_last_unwritten_buf(j); + unsigned long flags; + -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s", ++ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, ++ "error writing journal entry %llu: %s", + le64_to_cpu(w->data->seq), + bch2_blk_status_to_str(bio->bi_status)) || + bch2_meta_write_fault("journal")) { @@ -69266,9 +70129,15 @@ index 000000000000..6a3d6a374e9c + continue_at(cl, journal_write_done, c->io_complete_wq); +} + -+static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset) ++static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) +{ -+ struct jset_entry *i, *next, *prev = NULL; ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct jset_entry *start, *end, *i, *next, *prev = NULL; ++ struct jset *jset = w->data; ++ unsigned sectors, bytes, u64s; ++ bool validate_before_checksum = false; ++ unsigned long btree_roots_have = 0; ++ int ret; + + /* + * Simple compaction, dropping empty jset_entries (from journal @@ -69285,8 +70154,20 @@ index 000000000000..6a3d6a374e9c + if (!u64s) + continue; + -+ if (i->type == BCH_JSET_ENTRY_btree_root) ++ /* ++ * New btree roots are set by journalling them; when the journal ++ * entry gets written we have to propagate them to ++ * c->btree_roots ++ * ++ * But, every journal entry we write has to contain all the ++ * btree roots (at least for now); so after we copy btree roots ++ * to c->btree_roots we have to get any missing btree roots and ++ * add them to this journal entry: ++ */ ++ if (i->type == BCH_JSET_ENTRY_btree_root) { + bch2_journal_entry_to_btree_root(c, i); ++ __set_bit(i->btree_id, &btree_roots_have); ++ } + + /* Can we merge with previous entry? */ + if (prev && @@ -69310,85 +70191,10 @@ index 000000000000..6a3d6a374e9c + + prev = prev ? vstruct_next(prev) : jset->start; + jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); -+} -+ -+void bch2_journal_write(struct closure *cl) -+{ -+ struct journal *j = container_of(cl, struct journal, io); -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ struct journal_buf *w = journal_last_unwritten_buf(j); -+ struct bch_replicas_padded replicas; -+ struct jset_entry *start, *end; -+ struct jset *jset; -+ struct bio *bio; -+ struct printbuf journal_debug_buf = PRINTBUF; -+ bool validate_before_checksum = false; -+ unsigned i, sectors, bytes, u64s, nr_rw_members = 0; -+ int ret; -+ -+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); -+ -+ journal_buf_realloc(j, w); -+ jset = w->data; -+ -+ j->write_start_time = local_clock(); -+ -+ spin_lock(&j->lock); -+ -+ /* -+ * If the journal is in an error state - we did an emergency shutdown - -+ * we prefer to continue doing journal writes. We just mark them as -+ * noflush so they'll never be used, but they'll still be visible by the -+ * list_journal tool - this helps in debugging. -+ * -+ * There's a caveat: the first journal write after marking the -+ * superblock dirty must always be a flush write, because on startup -+ * from a clean shutdown we didn't necessarily read the journal and the -+ * new journal write might overwrite whatever was in the journal -+ * previously - we can't leave the journal without any flush writes in -+ * it. -+ * -+ * So if we're in an error state, and we're still starting up, we don't -+ * write anything at all. -+ */ -+ if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) && -+ (bch2_journal_error(j) || -+ w->noflush || -+ (!w->must_flush && -+ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && -+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) { -+ w->noflush = true; -+ SET_JSET_NO_FLUSH(jset, true); -+ jset->last_seq = 0; -+ w->last_seq = 0; -+ -+ j->nr_noflush_writes++; -+ } else if (!bch2_journal_error(j)) { -+ j->last_flush_write = jiffies; -+ j->nr_flush_writes++; -+ clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); -+ } else { -+ spin_unlock(&j->lock); -+ goto err; -+ } -+ spin_unlock(&j->lock); -+ -+ /* -+ * New btree roots are set by journalling them; when the journal entry -+ * gets written we have to propagate them to c->btree_roots -+ * -+ * But, every journal entry we write has to contain all the btree roots -+ * (at least for now); so after we copy btree roots to c->btree_roots we -+ * have to get any missing btree roots and add them to this journal -+ * entry: -+ */ -+ -+ bch2_journal_entries_postprocess(c, jset); + + start = end = vstruct_last(jset); + -+ end = bch2_btree_roots_to_journal_entries(c, jset->start, end); ++ end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); + + bch2_journal_super_entries_add_common(c, &end, + le64_to_cpu(jset->seq)); @@ -69404,7 +70210,7 @@ index 000000000000..6a3d6a374e9c + bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", + vstruct_bytes(jset), w->sectors << 9, + u64s, w->u64s_reserved, j->entry_u64s_reserved); -+ goto err; ++ return -EINVAL; + } + + jset->magic = cpu_to_le64(jset_magic(c)); @@ -69423,37 +70229,117 @@ index 000000000000..6a3d6a374e9c + validate_before_checksum = true; + + if (validate_before_checksum && -+ jset_validate(c, NULL, jset, 0, WRITE)) -+ goto err; ++ (ret = jset_validate(c, NULL, jset, 0, WRITE))) ++ return ret; + + ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); + if (bch2_fs_fatal_err_on(ret, c, + "error decrypting journal entry: %i", ret)) -+ goto err; ++ return ret; + + jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), + journal_nonce(jset), jset); + + if (!validate_before_checksum && -+ jset_validate(c, NULL, jset, 0, WRITE)) -+ goto err; ++ (ret = jset_validate(c, NULL, jset, 0, WRITE))) ++ return ret; + + memset((void *) jset + bytes, 0, (sectors << 9) - bytes); ++ return 0; ++} + -+retry_alloc: -+ spin_lock(&j->lock); -+ ret = journal_write_alloc(j, w); ++static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ int error = bch2_journal_error(j); + -+ if (ret && j->can_discard) { -+ spin_unlock(&j->lock); -+ bch2_journal_do_discards(j); -+ goto retry_alloc; ++ /* ++ * If the journal is in an error state - we did an emergency shutdown - ++ * we prefer to continue doing journal writes. We just mark them as ++ * noflush so they'll never be used, but they'll still be visible by the ++ * list_journal tool - this helps in debugging. ++ * ++ * There's a caveat: the first journal write after marking the ++ * superblock dirty must always be a flush write, because on startup ++ * from a clean shutdown we didn't necessarily read the journal and the ++ * new journal write might overwrite whatever was in the journal ++ * previously - we can't leave the journal without any flush writes in ++ * it. ++ * ++ * So if we're in an error state, and we're still starting up, we don't ++ * write anything at all. ++ */ ++ if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags)) ++ return -EIO; ++ ++ if (error || ++ w->noflush || ++ (!w->must_flush && ++ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && ++ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { ++ w->noflush = true; ++ SET_JSET_NO_FLUSH(w->data, true); ++ w->data->last_seq = 0; ++ w->last_seq = 0; ++ ++ j->nr_noflush_writes++; ++ } else { ++ j->last_flush_write = jiffies; ++ j->nr_flush_writes++; ++ clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); + } + ++ return 0; ++} ++ ++void bch2_journal_write(struct closure *cl) ++{ ++ struct journal *j = container_of(cl, struct journal, io); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ struct journal_buf *w = journal_last_unwritten_buf(j); ++ struct bch_replicas_padded replicas; ++ struct bio *bio; ++ struct printbuf journal_debug_buf = PRINTBUF; ++ unsigned i, nr_rw_members = 0; ++ int ret; ++ ++ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); ++ ++ j->write_start_time = local_clock(); ++ ++ spin_lock(&j->lock); ++ ret = bch2_journal_write_pick_flush(j, w); ++ spin_unlock(&j->lock); + if (ret) ++ goto err; ++ ++ journal_buf_realloc(j, w); ++ ++ ret = bch2_journal_write_prep(j, w); ++ if (ret) ++ goto err; ++ ++ while (1) { ++ spin_lock(&j->lock); ++ ret = journal_write_alloc(j, w); ++ if (!ret || !j->can_discard) ++ break; ++ ++ spin_unlock(&j->lock); ++ bch2_journal_do_discards(j); ++ } ++ ++ if (ret) { + __bch2_journal_debug_to_text(&journal_debug_buf, j); ++ spin_unlock(&j->lock); ++ bch_err(c, "Unable to allocate journal write:\n%s", ++ journal_debug_buf.buf); ++ printbuf_exit(&journal_debug_buf); ++ goto err; ++ } + + /* + * write is allocated, no longer need to account for it in @@ -69468,13 +70354,6 @@ index 000000000000..6a3d6a374e9c + bch2_journal_space_available(j); + spin_unlock(&j->lock); + -+ if (ret) { -+ bch_err(c, "Unable to allocate journal write:\n%s", -+ journal_debug_buf.buf); -+ printbuf_exit(&journal_debug_buf); -+ goto err; -+ } -+ + w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + + if (c->opts.nochanges) @@ -69496,7 +70375,7 @@ index 000000000000..6a3d6a374e9c + if (ret) + goto err; + -+ if (!JSET_NO_FLUSH(jset) && w->separate_flush) { ++ if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { + for_each_rw_member(ca, c, i) { + percpu_ref_get(&ca->io_ref); + @@ -71829,10 +72708,10 @@ index 000000000000..4d1e786a27a8 +#endif /* _BCACHEFS_LOGGED_OPS_H */ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c new file mode 100644 -index 000000000000..215a653322f3 +index 000000000000..a5cc0ed195d6 --- /dev/null +++ b/fs/bcachefs/lru.c -@@ -0,0 +1,162 @@ +@@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -71845,17 +72724,17 @@ index 000000000000..215a653322f3 +#include "recovery.h" + +/* KEY_TYPE_lru is obsolete: */ -+int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ -+ if (!lru_pos_time(k.k->p)) { -+ prt_printf(err, "lru entry at time=0"); -+ return -BCH_ERR_invalid_bkey; ++ int ret = 0; + -+ } -+ -+ return 0; ++ bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err, ++ lru_entry_at_time_0, ++ "lru entry at time=0"); ++fsck_err: ++ return ret; +} + +void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, @@ -71930,6 +72809,7 @@ index 000000000000..215a653322f3 + int ret; + + if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c, ++ lru_entry_to_invalid_bucket, + "lru key points to nonexistent device:bucket %llu:%llu", + alloc_pos.inode, alloc_pos.offset)) + return bch2_btree_delete_at(trans, lru_iter, 0); @@ -71960,7 +72840,8 @@ index 000000000000..215a653322f3 + } + + if (c->opts.reconstruct_alloc || -+ fsck_err(c, "incorrect lru entry: lru %s time %llu\n" ++ fsck_err(c, lru_entry_bad, ++ "incorrect lru entry: lru %s time %llu\n" + " %s\n" + " for %s", + bch2_lru_types[type], @@ -71997,7 +72878,7 @@ index 000000000000..215a653322f3 +} diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h new file mode 100644 -index 000000000000..be66bf9ad809 +index 000000000000..429dca816df5 --- /dev/null +++ b/fs/bcachefs/lru.h @@ -0,0 +1,69 @@ @@ -72051,7 +72932,7 @@ index 000000000000..be66bf9ad809 + return BCH_LRU_read; +} + -+int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + @@ -72885,10 +73766,10 @@ index 000000000000..027efaa0d575 +#endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c new file mode 100644 -index 000000000000..39a14e321680 +index 000000000000..ab749bf2fcbc --- /dev/null +++ b/fs/bcachefs/move.c -@@ -0,0 +1,1159 @@ +@@ -0,0 +1,1198 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -72911,6 +73792,7 @@ index 000000000000..39a14e321680 +#include "keylist.h" +#include "move.h" +#include "replicas.h" ++#include "snapshot.h" +#include "super-io.h" +#include "trace.h" + @@ -72950,20 +73832,6 @@ index 000000000000..39a14e321680 + } +} + -+static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats) -+{ -+ mutex_lock(&c->data_progress_lock); -+ list_add(&stats->list, &c->data_progress_list); -+ mutex_unlock(&c->data_progress_lock); -+} -+ -+static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats) -+{ -+ mutex_lock(&c->data_progress_lock); -+ list_del(&stats->list); -+ mutex_unlock(&c->data_progress_lock); -+} -+ +struct moving_io { + struct list_head read_list; + struct list_head io_list; @@ -73047,35 +73915,31 @@ index 000000000000..39a14e321680 + closure_put(&ctxt->cl); +} + -+void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt, -+ struct btree_trans *trans) ++void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt) +{ + struct moving_io *io; + -+ if (trans) -+ bch2_trans_unlock(trans); -+ + while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { ++ bch2_trans_unlock_long(ctxt->trans); + list_del(&io->read_list); + move_write(io); + } +} + -+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, -+ struct btree_trans *trans) ++void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) +{ + unsigned sectors_pending = atomic_read(&ctxt->write_sectors); + -+ move_ctxt_wait_event(ctxt, trans, ++ move_ctxt_wait_event(ctxt, + !atomic_read(&ctxt->write_sectors) || + atomic_read(&ctxt->write_sectors) != sectors_pending); +} + +void bch2_moving_ctxt_exit(struct moving_context *ctxt) +{ -+ struct bch_fs *c = ctxt->c; ++ struct bch_fs *c = ctxt->trans->c; + -+ move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); ++ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); + closure_sync(&ctxt->cl); + + EBUG_ON(atomic_read(&ctxt->write_sectors)); @@ -73083,16 +73947,12 @@ index 000000000000..39a14e321680 + EBUG_ON(atomic_read(&ctxt->read_sectors)); + EBUG_ON(atomic_read(&ctxt->read_ios)); + -+ if (ctxt->stats) { -+ progress_list_del(c, ctxt->stats); -+ trace_move_data(c, -+ atomic64_read(&ctxt->stats->sectors_moved), -+ atomic64_read(&ctxt->stats->keys_moved)); -+ } -+ + mutex_lock(&c->moving_context_lock); + list_del(&ctxt->list); + mutex_unlock(&c->moving_context_lock); ++ ++ bch2_trans_put(ctxt->trans); ++ memset(ctxt, 0, sizeof(*ctxt)); +} + +void bch2_moving_ctxt_init(struct moving_context *ctxt, @@ -73104,7 +73964,7 @@ index 000000000000..39a14e321680 +{ + memset(ctxt, 0, sizeof(*ctxt)); + -+ ctxt->c = c; ++ ctxt->trans = bch2_trans_get(c); + ctxt->fn = (void *) _RET_IP_; + ctxt->rate = rate; + ctxt->stats = stats; @@ -73121,16 +73981,17 @@ index 000000000000..39a14e321680 + mutex_lock(&c->moving_context_lock); + list_add(&ctxt->list, &c->moving_context_list); + mutex_unlock(&c->moving_context_lock); ++} + -+ if (stats) { -+ progress_list_add(c, stats); -+ stats->data_type = BCH_DATA_user; -+ } ++void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c) ++{ ++ trace_move_data(c, stats); +} + +void bch2_move_stats_init(struct bch_move_stats *stats, char *name) +{ + memset(stats, 0, sizeof(*stats)); ++ stats->data_type = BCH_DATA_user; + scnprintf(stats->name, sizeof(stats->name), "%s", name); +} + @@ -73177,15 +74038,14 @@ index 000000000000..39a14e321680 + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); +} + -+static int bch2_move_extent(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct moving_context *ctxt, -+ struct move_bucket_in_flight *bucket_in_flight, -+ struct bch_io_opts io_opts, -+ enum btree_id btree_id, -+ struct bkey_s_c k, -+ struct data_update_opts data_opts) ++int bch2_move_extent(struct moving_context *ctxt, ++ struct move_bucket_in_flight *bucket_in_flight, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct bch_io_opts io_opts, ++ struct data_update_opts data_opts) +{ ++ struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct moving_io *io; @@ -73194,6 +74054,8 @@ index 000000000000..39a14e321680 + unsigned sectors = k.k->size, pages; + int ret = -ENOMEM; + ++ if (ctxt->stats) ++ ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); + trace_move_extent2(c, k); + + bch2_data_update_opts_normalize(k, &data_opts); @@ -73246,7 +74108,7 @@ index 000000000000..39a14e321680 + io->rbio.bio.bi_end_io = move_read_endio; + + ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp, -+ io_opts, data_opts, btree_id, k); ++ io_opts, data_opts, iter->btree_id, k); + if (ret && ret != -BCH_ERR_unwritten_extent_update) + goto err_free_pages; + @@ -73258,9 +74120,11 @@ index 000000000000..39a14e321680 + + BUG_ON(ret); + -+ io->write.ctxt = ctxt; + io->write.op.end_io = move_write_done; + ++ if (ctxt->rate) ++ bch2_ratelimit_increment(ctxt->rate, k.k->size); ++ + if (ctxt->stats) { + atomic64_inc(&ctxt->stats->keys_moved); + atomic64_add(k.k->size, &ctxt->stats->sectors_moved); @@ -73290,7 +74154,7 @@ index 000000000000..39a14e321680 + closure_get(&ctxt->cl); + bch2_read_extent(trans, &io->rbio, + bkey_start_pos(k.k), -+ btree_id, k, 0, ++ iter->btree_id, k, 0, + BCH_READ_NODECODE| + BCH_READ_LAST_FRAGMENT); + return 0; @@ -73304,45 +74168,96 @@ index 000000000000..39a14e321680 + return ret; +} + -+static int lookup_inode(struct btree_trans *trans, struct bpos pos, -+ struct bch_inode_unpacked *inode) ++struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, ++ struct per_snapshot_io_opts *io_opts, ++ struct bkey_s_c extent_k) ++{ ++ struct bch_fs *c = trans->c; ++ u32 restart_count = trans->restart_count; ++ int ret = 0; ++ ++ if (io_opts->cur_inum != extent_k.k->p.inode) { ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ ++ io_opts->d.nr = 0; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ if (k.k->p.offset != extent_k.k->p.inode) ++ break; ++ ++ if (!bkey_is_inode(k.k)) ++ continue; ++ ++ struct bch_inode_unpacked inode; ++ BUG_ON(bch2_inode_unpack(k, &inode)); ++ ++ struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; ++ bch2_inode_opts_get(&e.io_opts, trans->c, &inode); ++ ++ ret = darray_push(&io_opts->d, e); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ io_opts->cur_inum = extent_k.k->p.inode; ++ } ++ ++ ret = ret ?: trans_was_restarted(trans, restart_count); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ if (extent_k.k->p.snapshot) { ++ struct snapshot_io_opts_entry *i; ++ darray_for_each(io_opts->d, i) ++ if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) ++ return &i->io_opts; ++ } ++ ++ return &io_opts->fs_io_opts; ++} ++ ++int bch2_move_get_io_opts_one(struct btree_trans *trans, ++ struct bch_io_opts *io_opts, ++ struct bkey_s_c extent_k) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, -+ BTREE_ITER_ALL_SNAPSHOTS); -+ k = bch2_btree_iter_peek(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (!k.k || !bkey_eq(k.k->p, pos)) { -+ ret = -BCH_ERR_ENOENT_inode; -+ goto err; ++ /* reflink btree? */ ++ if (!extent_k.k->p.inode) { ++ *io_opts = bch2_opts_to_inode_opts(trans->c->opts); ++ return 0; + } + -+ ret = bkey_is_inode(k.k) ? 0 : -EIO; -+ if (ret) -+ goto err; ++ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, ++ SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), ++ BTREE_ITER_CACHED); ++ ret = bkey_err(k); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ return ret; ++ ++ if (!ret && bkey_is_inode(k.k)) { ++ struct bch_inode_unpacked inode; ++ bch2_inode_unpack(k, &inode); ++ bch2_inode_opts_get(io_opts, trans->c, &inode); ++ } else { ++ *io_opts = bch2_opts_to_inode_opts(trans->c->opts); ++ } + -+ ret = bch2_inode_unpack(k, inode); -+ if (ret) -+ goto err; -+err: + bch2_trans_iter_exit(trans, &iter); -+ return ret; ++ return 0; +} + -+static int move_ratelimit(struct btree_trans *trans, -+ struct moving_context *ctxt) ++int bch2_move_ratelimit(struct moving_context *ctxt) +{ -+ struct bch_fs *c = trans->c; ++ struct bch_fs *c = ctxt->trans->c; + u64 delay; + -+ if (ctxt->wait_on_copygc) { -+ bch2_trans_unlock(trans); ++ if (ctxt->wait_on_copygc && !c->copygc_running) { ++ bch2_trans_unlock_long(ctxt->trans); + wait_event_killable(c->copygc_running_wq, + !c->copygc_running || + kthread_should_stop()); @@ -73351,8 +74266,12 @@ index 000000000000..39a14e321680 + do { + delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; + ++ + if (delay) { -+ bch2_trans_unlock(trans); ++ if (delay > HZ / 10) ++ bch2_trans_unlock_long(ctxt->trans); ++ else ++ bch2_trans_unlock(ctxt->trans); + set_current_state(TASK_INTERRUPTIBLE); + } + @@ -73365,7 +74284,7 @@ index 000000000000..39a14e321680 + schedule_timeout(delay); + + if (unlikely(freezing(current))) { -+ move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads)); ++ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); + try_to_freeze(); + } + } while (delay); @@ -73374,7 +74293,7 @@ index 000000000000..39a14e321680 + * XXX: these limits really ought to be per device, SSDs and hard drives + * will want different limits + */ -+ move_ctxt_wait_event(ctxt, trans, ++ move_ctxt_wait_event(ctxt, + atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 && + atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 && + atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight && @@ -73383,52 +74302,28 @@ index 000000000000..39a14e321680 + return 0; +} + -+static int move_get_io_opts(struct btree_trans *trans, -+ struct bch_io_opts *io_opts, -+ struct bkey_s_c k, u64 *cur_inum) ++static int bch2_move_data_btree(struct moving_context *ctxt, ++ struct bpos start, ++ struct bpos end, ++ move_pred_fn pred, void *arg, ++ enum btree_id btree_id) +{ -+ struct bch_inode_unpacked inode; -+ int ret; -+ -+ if (*cur_inum == k.k->p.inode) -+ return 0; -+ -+ ret = lookup_inode(trans, -+ SPOS(0, k.k->p.inode, k.k->p.snapshot), -+ &inode); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ return ret; -+ -+ if (!ret) -+ bch2_inode_opts_get(io_opts, trans->c, &inode); -+ else -+ *io_opts = bch2_opts_to_inode_opts(trans->c->opts); -+ *cur_inum = k.k->p.inode; -+ return 0; -+} -+ -+static int __bch2_move_data(struct moving_context *ctxt, -+ struct bpos start, -+ struct bpos end, -+ move_pred_fn pred, void *arg, -+ enum btree_id btree_id) -+{ -+ struct bch_fs *c = ctxt->c; -+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); ++ struct btree_trans *trans = ctxt->trans; ++ struct bch_fs *c = trans->c; ++ struct per_snapshot_io_opts snapshot_io_opts; ++ struct bch_io_opts *io_opts; + struct bkey_buf sk; -+ struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + struct data_update_opts data_opts; -+ u64 cur_inum = U64_MAX; + int ret = 0, ret2; + ++ per_snapshot_io_opts_init(&snapshot_io_opts, c); + bch2_bkey_buf_init(&sk); + + if (ctxt->stats) { + ctxt->stats->data_type = BCH_DATA_user; -+ ctxt->stats->btree_id = btree_id; -+ ctxt->stats->pos = start; ++ ctxt->stats->pos = BBPOS(btree_id, start); + } + + bch2_trans_iter_init(trans, &iter, btree_id, start, @@ -73438,7 +74333,7 @@ index 000000000000..39a14e321680 + if (ctxt->rate) + bch2_ratelimit_reset(ctxt->rate); + -+ while (!move_ratelimit(trans, ctxt)) { ++ while (!bch2_move_ratelimit(ctxt)) { + bch2_trans_begin(trans); + + k = bch2_btree_iter_peek(&iter); @@ -73455,17 +74350,18 @@ index 000000000000..39a14e321680 + break; + + if (ctxt->stats) -+ ctxt->stats->pos = iter.pos; ++ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); + + if (!bkey_extent_is_direct_data(k.k)) + goto next_nondata; + -+ ret = move_get_io_opts(trans, &io_opts, k, &cur_inum); ++ io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k); ++ ret = PTR_ERR_OR_ZERO(io_opts); + if (ret) + continue; + + memset(&data_opts, 0, sizeof(data_opts)); -+ if (!pred(c, arg, k, &io_opts, &data_opts)) ++ if (!pred(c, arg, k, io_opts, &data_opts)) + goto next; + + /* @@ -73475,24 +74371,20 @@ index 000000000000..39a14e321680 + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + -+ ret2 = bch2_move_extent(trans, &iter, ctxt, NULL, -+ io_opts, btree_id, k, data_opts); ++ ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts); + if (ret2) { + if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) + continue; + + if (ret2 == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ -+ bch2_move_ctxt_wait_for_io(ctxt, trans); ++ bch2_move_ctxt_wait_for_io(ctxt); + continue; + } + + /* XXX signal failure */ + goto next; + } -+ -+ if (ctxt->rate) -+ bch2_ratelimit_increment(ctxt->rate, k.k->size); +next: + if (ctxt->stats) + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); @@ -73501,59 +74393,68 @@ index 000000000000..39a14e321680 + } + + bch2_trans_iter_exit(trans, &iter); -+ bch2_trans_put(trans); + bch2_bkey_buf_exit(&sk, c); ++ per_snapshot_io_opts_exit(&snapshot_io_opts); ++ ++ return ret; ++} ++ ++int __bch2_move_data(struct moving_context *ctxt, ++ struct bbpos start, ++ struct bbpos end, ++ move_pred_fn pred, void *arg) ++{ ++ struct bch_fs *c = ctxt->trans->c; ++ enum btree_id id; ++ int ret = 0; ++ ++ for (id = start.btree; ++ id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); ++ id++) { ++ ctxt->stats->pos = BBPOS(id, POS_MIN); ++ ++ if (!btree_type_has_ptrs(id) || ++ !bch2_btree_id_root(c, id)->b) ++ continue; ++ ++ ret = bch2_move_data_btree(ctxt, ++ id == start.btree ? start.pos : POS_MIN, ++ id == end.btree ? end.pos : POS_MAX, ++ pred, arg, id); ++ if (ret) ++ break; ++ } + + return ret; +} + +int bch2_move_data(struct bch_fs *c, -+ enum btree_id start_btree_id, struct bpos start_pos, -+ enum btree_id end_btree_id, struct bpos end_pos, ++ struct bbpos start, ++ struct bbpos end, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) +{ ++ + struct moving_context ctxt; -+ enum btree_id id; -+ int ret = 0; ++ int ret; + + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); -+ -+ for (id = start_btree_id; -+ id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); -+ id++) { -+ stats->btree_id = id; -+ -+ if (id != BTREE_ID_extents && -+ id != BTREE_ID_reflink) -+ continue; -+ -+ if (!bch2_btree_id_root(c, id)->b) -+ continue; -+ -+ ret = __bch2_move_data(&ctxt, -+ id == start_btree_id ? start_pos : POS_MIN, -+ id == end_btree_id ? end_pos : POS_MAX, -+ pred, arg, id); -+ if (ret) -+ break; -+ } -+ ++ ret = __bch2_move_data(&ctxt, start, end, pred, arg); + bch2_moving_ctxt_exit(&ctxt); + + return ret; +} + -+int __bch2_evacuate_bucket(struct btree_trans *trans, -+ struct moving_context *ctxt, ++int __bch2_evacuate_bucket(struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, + struct bpos bucket, int gen, + struct data_update_opts _data_opts) +{ -+ struct bch_fs *c = ctxt->c; ++ struct btree_trans *trans = ctxt->trans; ++ struct bch_fs *c = trans->c; + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct btree_iter iter; + struct bkey_buf sk; @@ -73564,7 +74465,6 @@ index 000000000000..39a14e321680 + struct data_update_opts data_opts; + unsigned dirty_sectors, bucket_size; + u64 fragmentation; -+ u64 cur_inum = U64_MAX; + struct bpos bp_pos = POS_MIN; + int ret = 0; + @@ -73599,7 +74499,7 @@ index 000000000000..39a14e321680 + goto err; + } + -+ while (!(ret = move_ratelimit(trans, ctxt))) { ++ while (!(ret = bch2_move_ratelimit(ctxt))) { + bch2_trans_begin(trans); + + ret = bch2_get_next_backpointer(trans, bucket, gen, @@ -73628,7 +74528,7 @@ index 000000000000..39a14e321680 + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + -+ ret = move_get_io_opts(trans, &io_opts, k, &cur_inum); ++ ret = bch2_move_get_io_opts_one(trans, &io_opts, k); + if (ret) { + bch2_trans_iter_exit(trans, &iter); + continue; @@ -73649,23 +74549,20 @@ index 000000000000..39a14e321680 + i++; + } + -+ ret = bch2_move_extent(trans, &iter, ctxt, -+ bucket_in_flight, -+ io_opts, bp.btree_id, k, data_opts); ++ ret = bch2_move_extent(ctxt, bucket_in_flight, ++ &iter, k, io_opts, data_opts); + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ -+ bch2_move_ctxt_wait_for_io(ctxt, trans); ++ bch2_move_ctxt_wait_for_io(ctxt); + continue; + } + if (ret) + goto err; + -+ if (ctxt->rate) -+ bch2_ratelimit_increment(ctxt->rate, k.k->size); + if (ctxt->stats) + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + } else { @@ -73716,14 +74613,12 @@ index 000000000000..39a14e321680 + struct write_point_specifier wp, + bool wait_on_copygc) +{ -+ struct btree_trans *trans = bch2_trans_get(c); + struct moving_context ctxt; + int ret; + + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); -+ ret = __bch2_evacuate_bucket(trans, &ctxt, NULL, bucket, gen, data_opts); ++ ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts); + bch2_moving_ctxt_exit(&ctxt); -+ bch2_trans_put(trans); + + return ret; +} @@ -73740,21 +74635,25 @@ index 000000000000..39a14e321680 +{ + bool kthread = (current->flags & PF_KTHREAD) != 0; + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); -+ struct btree_trans *trans = bch2_trans_get(c); ++ struct moving_context ctxt; ++ struct btree_trans *trans; + struct btree_iter iter; + struct btree *b; + enum btree_id id; + struct data_update_opts data_opts; + int ret = 0; + -+ progress_list_add(c, stats); ++ bch2_moving_ctxt_init(&ctxt, c, NULL, stats, ++ writepoint_ptr(&c->btree_write_point), ++ true); ++ trans = ctxt.trans; + + stats->data_type = BCH_DATA_btree; + + for (id = start_btree_id; + id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); + id++) { -+ stats->btree_id = id; ++ stats->pos = BBPOS(id, POS_MIN); + + if (!bch2_btree_id_root(c, id)->b) + continue; @@ -73773,7 +74672,7 @@ index 000000000000..39a14e321680 + bpos_cmp(b->key.k.p, end_pos)) > 0) + break; + -+ stats->pos = iter.pos; ++ stats->pos = BBPOS(iter.btree_id, iter.pos); + + if (!pred(c, arg, b, &io_opts, &data_opts)) + goto next; @@ -73795,14 +74694,10 @@ index 000000000000..39a14e321680 + break; + } + -+ bch2_trans_put(trans); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ ++ bch_err_fn(c, ret); ++ bch2_moving_ctxt_exit(&ctxt); + bch2_btree_interior_updates_flush(c); + -+ progress_list_del(c, stats); + return ret; +} + @@ -73923,8 +74818,7 @@ index 000000000000..39a14e321680 + mutex_unlock(&c->sb_lock); + } + -+ if (ret) -+ bch_err_fn(c, ret); ++ bch_err_fn(c, ret); + return ret; +} + @@ -73947,14 +74841,16 @@ index 000000000000..39a14e321680 + ret = bch2_replicas_gc2(c) ?: ret; + + ret = bch2_move_data(c, -+ op.start_btree, op.start_pos, -+ op.end_btree, op.end_pos, ++ (struct bbpos) { op.start_btree, op.start_pos }, ++ (struct bbpos) { op.end_btree, op.end_pos }, + NULL, + stats, + writepoint_hashed((unsigned long) current), + true, + rereplicate_pred, c) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; ++ ++ bch2_move_stats_exit(stats, c); + break; + case BCH_DATA_OP_MIGRATE: + if (op.migrate.dev >= c->sb.nr_devices) @@ -73971,18 +74867,21 @@ index 000000000000..39a14e321680 + ret = bch2_replicas_gc2(c) ?: ret; + + ret = bch2_move_data(c, -+ op.start_btree, op.start_pos, -+ op.end_btree, op.end_pos, ++ (struct bbpos) { op.start_btree, op.start_pos }, ++ (struct bbpos) { op.end_btree, op.end_pos }, + NULL, + stats, + writepoint_hashed((unsigned long) current), + true, + migrate_pred, &op) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; ++ ++ bch2_move_stats_exit(stats, c); + break; + case BCH_DATA_OP_REWRITE_OLD_NODES: + bch2_move_stats_init(stats, "rewrite_old_nodes"); + ret = bch2_scan_old_btree_nodes(c, stats); ++ bch2_move_stats_exit(stats, c); + break; + default: + ret = -EINVAL; @@ -73991,19 +74890,43 @@ index 000000000000..39a14e321680 + return ret; +} + ++void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) ++{ ++ prt_printf(out, "%s: data type=%s pos=", ++ stats->name, ++ bch2_data_types[stats->data_type]); ++ bch2_bbpos_to_text(out, stats->pos); ++ prt_newline(out); ++ printbuf_indent_add(out, 2); ++ ++ prt_str(out, "keys moved: "); ++ prt_u64(out, atomic64_read(&stats->keys_moved)); ++ prt_newline(out); ++ ++ prt_str(out, "keys raced: "); ++ prt_u64(out, atomic64_read(&stats->keys_raced)); ++ prt_newline(out); ++ ++ prt_str(out, "bytes seen: "); ++ prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); ++ prt_newline(out); ++ ++ prt_str(out, "bytes moved: "); ++ prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); ++ prt_newline(out); ++ ++ prt_str(out, "bytes raced: "); ++ prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); ++ prt_newline(out); ++ ++ printbuf_indent_sub(out, 2); ++} ++ +static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) +{ -+ struct bch_move_stats *stats = ctxt->stats; + struct moving_io *io; + -+ prt_printf(out, "%s (%ps):", stats->name, ctxt->fn); -+ prt_newline(out); -+ -+ prt_printf(out, " data type %s btree_id %s position: ", -+ bch2_data_types[stats->data_type], -+ bch2_btree_ids[stats->btree_id]); -+ bch2_bpos_to_text(out, stats->pos); -+ prt_newline(out); ++ bch2_move_stats_to_text(out, ctxt->stats); + printbuf_indent_add(out, 2); + + prt_printf(out, "reads: ios %u/%u sectors %u/%u", @@ -74044,20 +74967,18 @@ index 000000000000..39a14e321680 +{ + INIT_LIST_HEAD(&c->moving_context_list); + mutex_init(&c->moving_context_lock); -+ -+ INIT_LIST_HEAD(&c->data_progress_list); -+ mutex_init(&c->data_progress_lock); +} diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h new file mode 100644 -index 000000000000..cbdd58db8782 +index 000000000000..07cf9d42643b --- /dev/null +++ b/fs/bcachefs/move.h -@@ -0,0 +1,96 @@ +@@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MOVE_H +#define _BCACHEFS_MOVE_H + ++#include "bbpos.h" +#include "bcachefs_ioctl.h" +#include "btree_iter.h" +#include "buckets.h" @@ -74067,7 +74988,7 @@ index 000000000000..cbdd58db8782 +struct bch_read_bio; + +struct moving_context { -+ struct bch_fs *c; ++ struct btree_trans *trans; + struct list_head list; + void *fn; + @@ -74093,13 +75014,14 @@ index 000000000000..cbdd58db8782 + wait_queue_head_t wait; +}; + -+#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ ++#define move_ctxt_wait_event(_ctxt, _cond) \ +do { \ + bool cond_finished = false; \ -+ bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \ ++ bch2_moving_ctxt_do_pending_writes(_ctxt); \ + \ + if (_cond) \ + break; \ ++ bch2_trans_unlock_long((_ctxt)->trans); \ + __wait_event((_ctxt)->wait, \ + bch2_moving_ctxt_next_pending_write(_ctxt) || \ + (cond_finished = (_cond))); \ @@ -74115,22 +75037,60 @@ index 000000000000..cbdd58db8782 + struct bch_ratelimit *, struct bch_move_stats *, + struct write_point_specifier, bool); +struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *); -+void bch2_moving_ctxt_do_pending_writes(struct moving_context *, -+ struct btree_trans *); ++void bch2_moving_ctxt_do_pending_writes(struct moving_context *); ++void bch2_move_ctxt_wait_for_io(struct moving_context *); ++int bch2_move_ratelimit(struct moving_context *); ++ ++/* Inodes in different snapshots may have different IO options: */ ++struct snapshot_io_opts_entry { ++ u32 snapshot; ++ struct bch_io_opts io_opts; ++}; ++ ++struct per_snapshot_io_opts { ++ u64 cur_inum; ++ struct bch_io_opts fs_io_opts; ++ DARRAY(struct snapshot_io_opts_entry) d; ++}; ++ ++static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c) ++{ ++ memset(io_opts, 0, sizeof(*io_opts)); ++ io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts); ++} ++ ++static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts) ++{ ++ darray_exit(&io_opts->d); ++} ++ ++struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, ++ struct per_snapshot_io_opts *, struct bkey_s_c); ++int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c); + +int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); + ++int bch2_move_extent(struct moving_context *, ++ struct move_bucket_in_flight *, ++ struct btree_iter *, ++ struct bkey_s_c, ++ struct bch_io_opts, ++ struct data_update_opts); ++ ++int __bch2_move_data(struct moving_context *, ++ struct bbpos, ++ struct bbpos, ++ move_pred_fn, void *); +int bch2_move_data(struct bch_fs *, -+ enum btree_id, struct bpos, -+ enum btree_id, struct bpos, ++ struct bbpos start, ++ struct bbpos end, + struct bch_ratelimit *, + struct bch_move_stats *, + struct write_point_specifier, + bool, + move_pred_fn, void *); + -+int __bch2_evacuate_bucket(struct btree_trans *, -+ struct moving_context *, ++int __bch2_evacuate_bucket(struct moving_context *, + struct move_bucket_in_flight *, + struct bpos, int, + struct data_update_opts); @@ -74144,7 +75104,10 @@ index 000000000000..cbdd58db8782 + struct bch_move_stats *, + struct bch_ioctl_data); + -+void bch2_move_stats_init(struct bch_move_stats *stats, char *name); ++void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *); ++void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *); ++void bch2_move_stats_init(struct bch_move_stats *, char *); ++ +void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); + +void bch2_fs_move_init(struct bch_fs *); @@ -74152,7 +75115,7 @@ index 000000000000..cbdd58db8782 +#endif /* _BCACHEFS_MOVE_H */ diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h new file mode 100644 -index 000000000000..baf1f8570b3f +index 000000000000..e22841ef31e4 --- /dev/null +++ b/fs/bcachefs/move_types.h @@ -0,0 +1,36 @@ @@ -74160,17 +75123,17 @@ index 000000000000..baf1f8570b3f +#ifndef _BCACHEFS_MOVE_TYPES_H +#define _BCACHEFS_MOVE_TYPES_H + ++#include "bbpos_types.h" ++ +struct bch_move_stats { + enum bch_data_type data_type; -+ enum btree_id btree_id; -+ struct bpos pos; -+ struct list_head list; ++ struct bbpos pos; + char name[32]; + + atomic64_t keys_moved; + atomic64_t keys_raced; -+ atomic64_t sectors_moved; + atomic64_t sectors_seen; ++ atomic64_t sectors_moved; + atomic64_t sectors_raced; +}; + @@ -74194,10 +75157,10 @@ index 000000000000..baf1f8570b3f +#endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c new file mode 100644 -index 000000000000..4017120baeee +index 000000000000..0a0576326c5b --- /dev/null +++ b/fs/bcachefs/movinggc.c -@@ -0,0 +1,414 @@ +@@ -0,0 +1,431 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Moving/copying garbage collector @@ -74301,8 +75264,7 @@ index 000000000000..4017120baeee + return ret; +} + -+static void move_buckets_wait(struct btree_trans *trans, -+ struct moving_context *ctxt, ++static void move_buckets_wait(struct moving_context *ctxt, + struct buckets_in_flight *list, + bool flush) +{ @@ -74311,7 +75273,7 @@ index 000000000000..4017120baeee + + while ((i = list->first)) { + if (flush) -+ move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count)); ++ move_ctxt_wait_event(ctxt, !atomic_read(&i->count)); + + if (atomic_read(&i->count)) + break; @@ -74329,7 +75291,7 @@ index 000000000000..4017120baeee + kfree(i); + } + -+ bch2_trans_unlock(trans); ++ bch2_trans_unlock_long(ctxt->trans); +} + +static bool bucket_in_flight(struct buckets_in_flight *list, @@ -74340,11 +75302,11 @@ index 000000000000..4017120baeee + +typedef DARRAY(struct move_bucket) move_buckets; + -+static int bch2_copygc_get_buckets(struct btree_trans *trans, -+ struct moving_context *ctxt, ++static int bch2_copygc_get_buckets(struct moving_context *ctxt, + struct buckets_in_flight *buckets_in_flight, + move_buckets *buckets) +{ ++ struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; @@ -74352,7 +75314,7 @@ index 000000000000..4017120baeee + size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; + int ret; + -+ move_buckets_wait(trans, ctxt, buckets_in_flight, false); ++ move_buckets_wait(ctxt, buckets_in_flight, false); + + ret = bch2_btree_write_buffer_flush(trans); + if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()", @@ -74388,10 +75350,11 @@ index 000000000000..4017120baeee +} + +noinline -+static int bch2_copygc(struct btree_trans *trans, -+ struct moving_context *ctxt, -+ struct buckets_in_flight *buckets_in_flight) ++static int bch2_copygc(struct moving_context *ctxt, ++ struct buckets_in_flight *buckets_in_flight, ++ bool *did_work) +{ ++ struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct data_update_opts data_opts = { + .btree_insert_flags = BCH_WATERMARK_copygc, @@ -74402,7 +75365,7 @@ index 000000000000..4017120baeee + u64 moved = atomic64_read(&ctxt->stats->sectors_moved); + int ret = 0; + -+ ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets); ++ ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); + if (ret) + goto err; + @@ -74421,10 +75384,12 @@ index 000000000000..4017120baeee + break; + } + -+ ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket, ++ ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket, + f->bucket.k.gen, data_opts); + if (ret) + goto err; ++ ++ *did_work = true; + } +err: + darray_exit(&buckets); @@ -74500,24 +75465,24 @@ index 000000000000..4017120baeee +static int bch2_copygc_thread(void *arg) +{ + struct bch_fs *c = arg; -+ struct btree_trans *trans; + struct moving_context ctxt; + struct bch_move_stats move_stats; + struct io_clock *clock = &c->io_clock[WRITE]; -+ struct buckets_in_flight buckets; ++ struct buckets_in_flight *buckets; + u64 last, wait; + int ret = 0; + -+ memset(&buckets, 0, sizeof(buckets)); -+ -+ ret = rhashtable_init(&buckets.table, &bch_move_bucket_params); ++ buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL); ++ if (!buckets) ++ return -ENOMEM; ++ ret = rhashtable_init(&buckets->table, &bch_move_bucket_params); + if (ret) { ++ kfree(buckets); + bch_err_msg(c, ret, "allocating copygc buckets in flight"); + return ret; + } + + set_freezable(); -+ trans = bch2_trans_get(c); + + bch2_move_stats_init(&move_stats, "copygc"); + bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, @@ -74525,16 +75490,18 @@ index 000000000000..4017120baeee + false); + + while (!ret && !kthread_should_stop()) { -+ bch2_trans_unlock(trans); ++ bool did_work = false; ++ ++ bch2_trans_unlock_long(ctxt.trans); + cond_resched(); + + if (!c->copy_gc_enabled) { -+ move_buckets_wait(trans, &ctxt, &buckets, true); ++ move_buckets_wait(&ctxt, buckets, true); + kthread_wait_freezable(c->copy_gc_enabled); + } + + if (unlikely(freezing(current))) { -+ move_buckets_wait(trans, &ctxt, &buckets, true); ++ move_buckets_wait(&ctxt, buckets, true); + __refrigerator(false); + continue; + } @@ -74545,7 +75512,7 @@ index 000000000000..4017120baeee + if (wait > clock->max_slop) { + c->copygc_wait_at = last; + c->copygc_wait = last + wait; -+ move_buckets_wait(trans, &ctxt, &buckets, true); ++ move_buckets_wait(&ctxt, buckets, true); + trace_and_count(c, copygc_wait, c, wait, last + wait); + bch2_kthread_io_clock_wait(clock, last + wait, + MAX_SCHEDULE_TIMEOUT); @@ -74555,16 +75522,29 @@ index 000000000000..4017120baeee + c->copygc_wait = 0; + + c->copygc_running = true; -+ ret = bch2_copygc(trans, &ctxt, &buckets); ++ ret = bch2_copygc(&ctxt, buckets, &did_work); + c->copygc_running = false; + + wake_up(&c->copygc_running_wq); ++ ++ if (!wait && !did_work) { ++ u64 min_member_capacity = bch2_min_rw_member_capacity(c); ++ ++ if (min_member_capacity == U64_MAX) ++ min_member_capacity = 128 * 2048; ++ ++ bch2_trans_unlock_long(ctxt.trans); ++ bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), ++ MAX_SCHEDULE_TIMEOUT); ++ } + } + -+ move_buckets_wait(trans, &ctxt, &buckets, true); -+ rhashtable_destroy(&buckets.table); -+ bch2_trans_put(trans); ++ move_buckets_wait(&ctxt, buckets, true); ++ ++ rhashtable_destroy(&buckets->table); ++ kfree(buckets); + bch2_moving_ctxt_exit(&ctxt); ++ bch2_move_stats_exit(&move_stats, c); + + return 0; +} @@ -74864,10 +75844,10 @@ index 000000000000..bd12bf677924 + diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c new file mode 100644 -index 000000000000..232f50c73a94 +index 000000000000..8dd4046cca41 --- /dev/null +++ b/fs/bcachefs/opts.c -@@ -0,0 +1,605 @@ +@@ -0,0 +1,602 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include @@ -74882,11 +75862,6 @@ index 000000000000..232f50c73a94 + +#define x(t, n, ...) [n] = #t, + -+const char * const bch2_iops_measurements[] = { -+ BCH_IOPS_MEASUREMENTS() -+ NULL -+}; -+ +const char * const bch2_error_actions[] = { + BCH_ERROR_ACTIONS() + NULL @@ -74912,9 +75887,8 @@ index 000000000000..232f50c73a94 + NULL +}; + -+const char * const bch2_btree_ids[] = { ++const char * const __bch2_btree_ids[] = { + BCH_BTREE_IDS() -+ "interior btree node", + NULL +}; + @@ -75141,14 +76115,14 @@ index 000000000000..232f50c73a94 + if (err) + prt_printf(err, "%s: too small (min %llu)", + opt->attr.name, opt->min); -+ return -ERANGE; ++ return -BCH_ERR_ERANGE_option_too_small; + } + + if (opt->max && v >= opt->max) { + if (err) + prt_printf(err, "%s: too big (max %llu)", + opt->attr.name, opt->max); -+ return -ERANGE; ++ return -BCH_ERR_ERANGE_option_too_big; + } + + if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { @@ -75165,6 +76139,9 @@ index 000000000000..232f50c73a94 + return -EINVAL; + } + ++ if (opt->fn.validate) ++ return opt->fn.validate(v, err); ++ + return 0; +} + @@ -75475,7 +76452,7 @@ index 000000000000..232f50c73a94 +} diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h new file mode 100644 -index 000000000000..55014336c5f7 +index 000000000000..8526f177450a --- /dev/null +++ b/fs/bcachefs/opts.h @@ -0,0 +1,564 @@ @@ -75491,13 +76468,12 @@ index 000000000000..55014336c5f7 + +struct bch_fs; + -+extern const char * const bch2_iops_measurements[]; +extern const char * const bch2_error_actions[]; +extern const char * const bch2_fsck_fix_opts[]; +extern const char * const bch2_version_upgrade_opts[]; +extern const char * const bch2_sb_features[]; +extern const char * const bch2_sb_compat[]; -+extern const char * const bch2_btree_ids[]; ++extern const char * const __bch2_btree_ids[]; +extern const char * const bch2_csum_types[]; +extern const char * const bch2_csum_opts[]; +extern const char * const bch2_compression_types[]; @@ -75555,6 +76531,7 @@ index 000000000000..55014336c5f7 +struct bch_opt_fn { + int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *); + void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); ++ int (*validate)(u64, struct printbuf *); +}; + +/** @@ -76045,7 +77022,7 @@ index 000000000000..55014336c5f7 +#endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c new file mode 100644 -index 000000000000..de41f9a14492 +index 000000000000..5e653eb81d54 --- /dev/null +++ b/fs/bcachefs/printbuf.c @@ -0,0 +1,425 @@ @@ -76466,12 +77443,12 @@ index 000000000000..de41f9a14492 + while (list[nr]) + nr++; + -+ while (flags && (bit = __ffs(flags)) < nr) { ++ while (flags && (bit = __ffs64(flags)) < nr) { + if (!first) + bch2_prt_printf(out, ","); + first = false; + bch2_prt_printf(out, "%s", list[bit]); -+ flags ^= 1 << bit; ++ flags ^= BIT_ULL(bit); + } +} diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h @@ -76766,10 +77743,10 @@ index 000000000000..2191423d9f22 +#endif /* _BCACHEFS_PRINTBUF_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c new file mode 100644 -index 000000000000..cb68ae44d597 +index 000000000000..a54647c36b85 --- /dev/null +++ b/fs/bcachefs/quota.c -@@ -0,0 +1,978 @@ +@@ -0,0 +1,979 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" @@ -76831,17 +77808,18 @@ index 000000000000..cb68ae44d597 + .to_text = bch2_sb_quota_to_text, +}; + -+int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ -+ if (k.k->p.inode >= QTYP_NR) { -+ prt_printf(err, "invalid quota type (%llu >= %u)", -+ k.k->p.inode, QTYP_NR); -+ return -BCH_ERR_invalid_bkey; -+ } ++ int ret = 0; + -+ return 0; ++ bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err, ++ quota_type_invalid, ++ "invalid quota type (%llu >= %u)", ++ k.k->p.inode, QTYP_NR); ++fsck_err: ++ return ret; +} + +void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, @@ -77750,7 +78728,7 @@ index 000000000000..cb68ae44d597 +#endif /* CONFIG_BCACHEFS_QUOTA */ diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h new file mode 100644 -index 000000000000..2f463874a362 +index 000000000000..884f601f41c4 --- /dev/null +++ b/fs/bcachefs/quota.h @@ -0,0 +1,74 @@ @@ -77764,7 +78742,7 @@ index 000000000000..2f463874a362 +enum bkey_invalid_flags; +extern const struct bch_sb_field_ops bch_sb_field_ops_quota; + -+int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + @@ -77879,22 +78857,28 @@ index 000000000000..6a136083d389 +#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c new file mode 100644 -index 000000000000..568f1e8e7507 +index 000000000000..3319190b8d9c --- /dev/null +++ b/fs/bcachefs/rebalance.c -@@ -0,0 +1,366 @@ +@@ -0,0 +1,464 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" ++#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_iter.h" ++#include "btree_update.h" ++#include "btree_write_buffer.h" +#include "buckets.h" +#include "clock.h" +#include "compress.h" +#include "disk_groups.h" +#include "errcode.h" ++#include "error.h" ++#include "inode.h" +#include "move.h" +#include "rebalance.h" ++#include "subvolume.h" +#include "super-io.h" +#include "trace.h" + @@ -77902,302 +78886,396 @@ index 000000000000..568f1e8e7507 +#include +#include + -+/* -+ * Check if an extent should be moved: -+ * returns -1 if it should not be moved, or -+ * device of pointer that should be moved, if known, or INT_MAX if unknown -+ */ ++#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) ++ ++static const char * const bch2_rebalance_state_strs[] = { ++#define x(t) #t, ++ BCH_REBALANCE_STATES() ++ NULL ++#undef x ++}; ++ ++static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_cookie *cookie; ++ u64 v; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, ++ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ v = k.k->type == KEY_TYPE_cookie ++ ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) ++ : 0; ++ ++ cookie = bch2_trans_kmalloc(trans, sizeof(*cookie)); ++ ret = PTR_ERR_OR_ZERO(cookie); ++ if (ret) ++ goto err; ++ ++ bkey_cookie_init(&cookie->k_i); ++ cookie->k.p = iter.pos; ++ cookie->v.cookie = cpu_to_le64(v + 1); ++ ++ ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) ++{ ++ int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ __bch2_set_rebalance_needs_scan(trans, inum)); ++ rebalance_wakeup(c); ++ return ret; ++} ++ ++int bch2_set_fs_needs_rebalance(struct bch_fs *c) ++{ ++ return bch2_set_rebalance_needs_scan(c, 0); ++} ++ ++static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 v; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, ++ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ v = k.k->type == KEY_TYPE_cookie ++ ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) ++ : 0; ++ ++ if (v == cookie) ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, ++ struct btree_iter *work_iter) ++{ ++ return !kthread_should_stop() ++ ? bch2_btree_iter_peek(work_iter) ++ : bkey_s_c_null; ++} ++ ++static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); ++ int ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ return ret; ++ ++ extent_entry_drop(bkey_i_to_s(n), ++ (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); ++ return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); ++} ++ ++static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, ++ struct bpos work_pos, ++ struct btree_iter *extent_iter, ++ struct data_update_opts *data_opts) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ ++ bch2_trans_iter_exit(trans, extent_iter); ++ bch2_trans_iter_init(trans, extent_iter, ++ work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, ++ work_pos, ++ BTREE_ITER_ALL_SNAPSHOTS); ++ k = bch2_btree_iter_peek_slot(extent_iter); ++ if (bkey_err(k)) ++ return k; ++ ++ const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL; ++ if (!r) { ++ /* raced due to btree write buffer, nothing to do */ ++ return bkey_s_c_null; ++ } ++ ++ memset(data_opts, 0, sizeof(*data_opts)); ++ ++ data_opts->rewrite_ptrs = ++ bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression); ++ data_opts->target = r->target; ++ ++ if (!data_opts->rewrite_ptrs) { ++ /* ++ * device we would want to write to offline? devices in target ++ * changed? ++ * ++ * We'll now need a full scan before this extent is picked up ++ * again: ++ */ ++ int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k); ++ if (ret) ++ return bkey_s_c_err(ret); ++ return bkey_s_c_null; ++ } ++ ++ return k; ++} ++ ++noinline_for_stack ++static int do_rebalance_extent(struct moving_context *ctxt, ++ struct bpos work_pos, ++ struct btree_iter *extent_iter) ++{ ++ struct btree_trans *trans = ctxt->trans; ++ struct bch_fs *c = trans->c; ++ struct bch_fs_rebalance *r = &trans->c->rebalance; ++ struct data_update_opts data_opts; ++ struct bch_io_opts io_opts; ++ struct bkey_s_c k; ++ struct bkey_buf sk; ++ int ret; ++ ++ ctxt->stats = &r->work_stats; ++ r->state = BCH_REBALANCE_working; ++ ++ bch2_bkey_buf_init(&sk); ++ ++ ret = bkey_err(k = next_rebalance_extent(trans, work_pos, ++ extent_iter, &data_opts)); ++ if (ret || !k.k) ++ goto out; ++ ++ ret = bch2_move_get_io_opts_one(trans, &io_opts, k); ++ if (ret) ++ goto out; ++ ++ atomic64_add(k.k->size, &ctxt->stats->sectors_seen); ++ ++ /* ++ * The iterator gets unlocked by __bch2_read_extent - need to ++ * save a copy of @k elsewhere: ++ */ ++ bch2_bkey_buf_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ ++ ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts); ++ if (ret) { ++ if (bch2_err_matches(ret, ENOMEM)) { ++ /* memory allocation failure, wait for some IO to finish */ ++ bch2_move_ctxt_wait_for_io(ctxt); ++ ret = -BCH_ERR_transaction_restart_nested; ++ } ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto out; ++ ++ /* skip it and continue, XXX signal failure */ ++ ret = 0; ++ } ++out: ++ bch2_bkey_buf_exit(&sk, c); ++ return ret; ++} ++ +static bool rebalance_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ unsigned i; ++ unsigned target, compression; + -+ data_opts->rewrite_ptrs = 0; -+ data_opts->target = io_opts->background_target; -+ data_opts->extra_replicas = 0; -+ data_opts->btree_insert_flags = 0; ++ if (k.k->p.inode) { ++ target = io_opts->background_target; ++ compression = io_opts->background_compression ?: io_opts->compression; ++ } else { ++ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + -+ if (io_opts->background_compression && -+ !bch2_bkey_is_incompressible(k)) { -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ i = 0; -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ if (!p.ptr.cached && -+ p.crc.compression_type != -+ bch2_compression_opt_to_type(io_opts->background_compression)) -+ data_opts->rewrite_ptrs |= 1U << i; -+ i++; -+ } -+ } -+ -+ if (io_opts->background_target) { -+ const struct bch_extent_ptr *ptr; -+ -+ i = 0; -+ bkey_for_each_ptr(ptrs, ptr) { -+ if (!ptr->cached && -+ !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) && -+ bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target)) -+ data_opts->rewrite_ptrs |= 1U << i; -+ i++; -+ } ++ target = r ? r->target : io_opts->background_target; ++ compression = r ? r->compression : ++ (io_opts->background_compression ?: io_opts->compression); + } + ++ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); ++ data_opts->target = target; + return data_opts->rewrite_ptrs != 0; +} + -+void bch2_rebalance_add_key(struct bch_fs *c, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts) ++static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) +{ -+ struct data_update_opts update_opts = { 0 }; -+ struct bkey_ptrs_c ptrs; -+ const struct bch_extent_ptr *ptr; -+ unsigned i; ++ struct btree_trans *trans = ctxt->trans; ++ struct bch_fs_rebalance *r = &trans->c->rebalance; ++ int ret; + -+ if (!rebalance_pred(c, NULL, k, io_opts, &update_opts)) -+ return; ++ bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); ++ ctxt->stats = &r->scan_stats; + -+ i = 0; -+ ptrs = bch2_bkey_ptrs_c(k); -+ bkey_for_each_ptr(ptrs, ptr) { -+ if ((1U << i) && update_opts.rewrite_ptrs) -+ if (atomic64_add_return(k.k->size, -+ &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) == -+ k.k->size) -+ rebalance_wakeup(c); -+ i++; -+ } -+} -+ -+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) -+{ -+ if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == -+ sectors) -+ rebalance_wakeup(c); -+} -+ -+struct rebalance_work { -+ int dev_most_full_idx; -+ unsigned dev_most_full_percent; -+ u64 dev_most_full_work; -+ u64 dev_most_full_capacity; -+ u64 total_work; -+}; -+ -+static void rebalance_work_accumulate(struct rebalance_work *w, -+ u64 dev_work, u64 unknown_dev, u64 capacity, int idx) -+{ -+ unsigned percent_full; -+ u64 work = dev_work + unknown_dev; -+ -+ /* avoid divide by 0 */ -+ if (!capacity) -+ return; -+ -+ if (work < dev_work || work < unknown_dev) -+ work = U64_MAX; -+ work = min(work, capacity); -+ -+ percent_full = div64_u64(work * 100, capacity); -+ -+ if (percent_full >= w->dev_most_full_percent) { -+ w->dev_most_full_idx = idx; -+ w->dev_most_full_percent = percent_full; -+ w->dev_most_full_work = work; -+ w->dev_most_full_capacity = capacity; ++ if (!inum) { ++ r->scan_start = BBPOS_MIN; ++ r->scan_end = BBPOS_MAX; ++ } else { ++ r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0)); ++ r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX)); + } + -+ if (w->total_work + dev_work >= w->total_work && -+ w->total_work + dev_work >= dev_work) -+ w->total_work += dev_work; -+} ++ r->state = BCH_REBALANCE_scanning; + -+static struct rebalance_work rebalance_work(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ struct rebalance_work ret = { .dev_most_full_idx = -1 }; -+ u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); -+ unsigned i; -+ -+ for_each_online_member(ca, c, i) -+ rebalance_work_accumulate(&ret, -+ atomic64_read(&ca->rebalance_work), -+ unknown_dev, -+ bucket_to_sector(ca, ca->mi.nbuckets - -+ ca->mi.first_bucket), -+ i); -+ -+ rebalance_work_accumulate(&ret, -+ unknown_dev, 0, c->capacity, -1); ++ ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: ++ commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_clear_rebalance_needs_scan(trans, inum, cookie)); + ++ bch2_move_stats_exit(&r->scan_stats, trans->c); + return ret; +} + -+static void rebalance_work_reset(struct bch_fs *c) ++static void rebalance_wait(struct bch_fs *c) +{ -+ struct bch_dev *ca; -+ unsigned i; ++ struct bch_fs_rebalance *r = &c->rebalance; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ u64 now = atomic64_read(&clock->now); ++ u64 min_member_capacity = bch2_min_rw_member_capacity(c); + -+ for_each_online_member(ca, c, i) -+ atomic64_set(&ca->rebalance_work, 0); ++ if (min_member_capacity == U64_MAX) ++ min_member_capacity = 128 * 2048; + -+ atomic64_set(&c->rebalance.work_unknown_dev, 0); ++ r->wait_iotime_end = now + (min_member_capacity >> 6); ++ ++ if (r->state != BCH_REBALANCE_waiting) { ++ r->wait_iotime_start = now; ++ r->wait_wallclock_start = ktime_get_real_ns(); ++ r->state = BCH_REBALANCE_waiting; ++ } ++ ++ bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); +} + -+static unsigned long curr_cputime(void) ++static int do_rebalance(struct moving_context *ctxt) +{ -+ u64 utime, stime; ++ struct btree_trans *trans = ctxt->trans; ++ struct bch_fs *c = trans->c; ++ struct bch_fs_rebalance *r = &c->rebalance; ++ struct btree_iter rebalance_work_iter, extent_iter = { NULL }; ++ struct bkey_s_c k; ++ int ret = 0; + -+ task_cputime_adjusted(current, &utime, &stime); -+ return nsecs_to_jiffies(utime + stime); ++ bch2_move_stats_init(&r->work_stats, "rebalance_work"); ++ bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); ++ ++ bch2_trans_iter_init(trans, &rebalance_work_iter, ++ BTREE_ID_rebalance_work, POS_MIN, ++ BTREE_ITER_ALL_SNAPSHOTS); ++ ++ while (!bch2_move_ratelimit(ctxt) && ++ !kthread_wait_freezable(r->enabled)) { ++ bch2_trans_begin(trans); ++ ++ ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter)); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret || !k.k) ++ break; ++ ++ ret = k.k->type == KEY_TYPE_cookie ++ ? do_rebalance_scan(ctxt, k.k->p.inode, ++ le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)) ++ : do_rebalance_extent(ctxt, k.k->p, &extent_iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ break; ++ ++ bch2_btree_iter_advance(&rebalance_work_iter); ++ } ++ ++ bch2_trans_iter_exit(trans, &extent_iter); ++ bch2_trans_iter_exit(trans, &rebalance_work_iter); ++ bch2_move_stats_exit(&r->scan_stats, c); ++ ++ if (!ret && ++ !kthread_should_stop() && ++ !atomic64_read(&r->work_stats.sectors_seen) && ++ !atomic64_read(&r->scan_stats.sectors_seen)) { ++ bch2_trans_unlock_long(trans); ++ rebalance_wait(c); ++ } ++ ++ if (!bch2_err_matches(ret, EROFS)) ++ bch_err_fn(c, ret); ++ return ret; +} + +static int bch2_rebalance_thread(void *arg) +{ + struct bch_fs *c = arg; + struct bch_fs_rebalance *r = &c->rebalance; -+ struct io_clock *clock = &c->io_clock[WRITE]; -+ struct rebalance_work w, p; -+ struct bch_move_stats move_stats; -+ unsigned long start, prev_start; -+ unsigned long prev_run_time, prev_run_cputime; -+ unsigned long cputime, prev_cputime; -+ u64 io_start; -+ long throttle; ++ struct moving_context ctxt; ++ int ret; + + set_freezable(); + -+ io_start = atomic64_read(&clock->now); -+ p = rebalance_work(c); -+ prev_start = jiffies; -+ prev_cputime = curr_cputime(); ++ bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, ++ writepoint_ptr(&c->rebalance_write_point), ++ true); + -+ bch2_move_stats_init(&move_stats, "rebalance"); -+ while (!kthread_wait_freezable(r->enabled)) { -+ cond_resched(); ++ while (!kthread_should_stop() && ++ !(ret = do_rebalance(&ctxt))) ++ ; + -+ start = jiffies; -+ cputime = curr_cputime(); -+ -+ prev_run_time = start - prev_start; -+ prev_run_cputime = cputime - prev_cputime; -+ -+ w = rebalance_work(c); -+ BUG_ON(!w.dev_most_full_capacity); -+ -+ if (!w.total_work) { -+ r->state = REBALANCE_WAITING; -+ kthread_wait_freezable(rebalance_work(c).total_work); -+ continue; -+ } -+ -+ /* -+ * If there isn't much work to do, throttle cpu usage: -+ */ -+ throttle = prev_run_cputime * 100 / -+ max(1U, w.dev_most_full_percent) - -+ prev_run_time; -+ -+ if (w.dev_most_full_percent < 20 && throttle > 0) { -+ r->throttled_until_iotime = io_start + -+ div_u64(w.dev_most_full_capacity * -+ (20 - w.dev_most_full_percent), -+ 50); -+ -+ if (atomic64_read(&clock->now) + clock->max_slop < -+ r->throttled_until_iotime) { -+ r->throttled_until_cputime = start + throttle; -+ r->state = REBALANCE_THROTTLED; -+ -+ bch2_kthread_io_clock_wait(clock, -+ r->throttled_until_iotime, -+ throttle); -+ continue; -+ } -+ } -+ -+ /* minimum 1 mb/sec: */ -+ r->pd.rate.rate = -+ max_t(u64, 1 << 11, -+ r->pd.rate.rate * -+ max(p.dev_most_full_percent, 1U) / -+ max(w.dev_most_full_percent, 1U)); -+ -+ io_start = atomic64_read(&clock->now); -+ p = w; -+ prev_start = start; -+ prev_cputime = cputime; -+ -+ r->state = REBALANCE_RUNNING; -+ memset(&move_stats, 0, sizeof(move_stats)); -+ rebalance_work_reset(c); -+ -+ bch2_move_data(c, -+ 0, POS_MIN, -+ BTREE_ID_NR, POS_MAX, -+ /* ratelimiting disabled for now */ -+ NULL, /* &r->pd.rate, */ -+ &move_stats, -+ writepoint_ptr(&c->rebalance_write_point), -+ true, -+ rebalance_pred, NULL); -+ } ++ bch2_moving_ctxt_exit(&ctxt); + + return 0; +} + -+void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) ++void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_fs_rebalance *r = &c->rebalance; -+ struct rebalance_work w = rebalance_work(c); + -+ if (!out->nr_tabstops) -+ printbuf_tabstop_push(out, 20); -+ -+ prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx); -+ prt_tab(out); -+ -+ prt_human_readable_u64(out, w.dev_most_full_work << 9); -+ prt_printf(out, "/"); -+ prt_human_readable_u64(out, w.dev_most_full_capacity << 9); -+ prt_newline(out); -+ -+ prt_printf(out, "total work:"); -+ prt_tab(out); -+ -+ prt_human_readable_u64(out, w.total_work << 9); -+ prt_printf(out, "/"); -+ prt_human_readable_u64(out, c->capacity << 9); -+ prt_newline(out); -+ -+ prt_printf(out, "rate:"); -+ prt_tab(out); -+ prt_printf(out, "%u", r->pd.rate.rate); ++ prt_str(out, bch2_rebalance_state_strs[r->state]); + prt_newline(out); ++ printbuf_indent_add(out, 2); + + switch (r->state) { -+ case REBALANCE_WAITING: -+ prt_printf(out, "waiting"); ++ case BCH_REBALANCE_waiting: { ++ u64 now = atomic64_read(&c->io_clock[WRITE].now); ++ ++ prt_str(out, "io wait duration: "); ++ bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start); ++ prt_newline(out); ++ ++ prt_str(out, "io wait remaining: "); ++ bch2_prt_human_readable_s64(out, r->wait_iotime_end - now); ++ prt_newline(out); ++ ++ prt_str(out, "duration waited: "); ++ bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start); ++ prt_newline(out); + break; -+ case REBALANCE_THROTTLED: -+ prt_printf(out, "throttled for %lu sec or ", -+ (r->throttled_until_cputime - jiffies) / HZ); -+ prt_human_readable_u64(out, -+ (r->throttled_until_iotime - -+ atomic64_read(&c->io_clock[WRITE].now)) << 9); -+ prt_printf(out, " io"); ++ } ++ case BCH_REBALANCE_working: ++ bch2_move_stats_to_text(out, &r->work_stats); + break; -+ case REBALANCE_RUNNING: -+ prt_printf(out, "running"); ++ case BCH_REBALANCE_scanning: ++ bch2_move_stats_to_text(out, &r->scan_stats); + break; + } + prt_newline(out); ++ printbuf_indent_sub(out, 2); +} + +void bch2_rebalance_stop(struct bch_fs *c) @@ -78246,21 +79324,22 @@ index 000000000000..568f1e8e7507 +void bch2_fs_rebalance_init(struct bch_fs *c) +{ + bch2_pd_controller_init(&c->rebalance.pd); -+ -+ atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); +} diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h new file mode 100644 -index 000000000000..7ade0bb81cce +index 000000000000..28a52638f16c --- /dev/null +++ b/fs/bcachefs/rebalance.h -@@ -0,0 +1,28 @@ +@@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REBALANCE_H +#define _BCACHEFS_REBALANCE_H + +#include "rebalance_types.h" + ++int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); ++int bch2_set_fs_needs_rebalance(struct bch_fs *); ++ +static inline void rebalance_wakeup(struct bch_fs *c) +{ + struct task_struct *p; @@ -78272,11 +79351,7 @@ index 000000000000..7ade0bb81cce + rcu_read_unlock(); +} + -+void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, -+ struct bch_io_opts *); -+void bch2_rebalance_add_work(struct bch_fs *, u64); -+ -+void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *); ++void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); + +void bch2_rebalance_stop(struct bch_fs *); +int bch2_rebalance_start(struct bch_fs *); @@ -78285,42 +79360,53 @@ index 000000000000..7ade0bb81cce +#endif /* _BCACHEFS_REBALANCE_H */ diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h new file mode 100644 -index 000000000000..7462a92e9598 +index 000000000000..0fffb536c1d0 --- /dev/null +++ b/fs/bcachefs/rebalance_types.h -@@ -0,0 +1,26 @@ +@@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REBALANCE_TYPES_H +#define _BCACHEFS_REBALANCE_TYPES_H + ++#include "bbpos_types.h" +#include "move_types.h" + -+enum rebalance_state { -+ REBALANCE_WAITING, -+ REBALANCE_THROTTLED, -+ REBALANCE_RUNNING, ++#define BCH_REBALANCE_STATES() \ ++ x(waiting) \ ++ x(working) \ ++ x(scanning) ++ ++enum bch_rebalance_states { ++#define x(t) BCH_REBALANCE_##t, ++ BCH_REBALANCE_STATES() ++#undef x +}; + +struct bch_fs_rebalance { -+ struct task_struct __rcu *thread; ++ struct task_struct __rcu *thread; + struct bch_pd_controller pd; + -+ atomic64_t work_unknown_dev; ++ enum bch_rebalance_states state; ++ u64 wait_iotime_start; ++ u64 wait_iotime_end; ++ u64 wait_wallclock_start; + -+ enum rebalance_state state; -+ u64 throttled_until_iotime; -+ unsigned long throttled_until_cputime; ++ struct bch_move_stats work_stats; + -+ unsigned enabled:1; ++ struct bbpos scan_start; ++ struct bbpos scan_end; ++ struct bch_move_stats scan_stats; ++ ++ unsigned enabled:1; +}; + +#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c new file mode 100644 -index 000000000000..4cd660650e5b +index 000000000000..9c30500ce920 --- /dev/null +++ b/fs/bcachefs/recovery.c -@@ -0,0 +1,1049 @@ +@@ -0,0 +1,1057 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -78346,6 +79432,7 @@ index 000000000000..4cd660650e5b +#include "logged_ops.h" +#include "move.h" +#include "quota.h" ++#include "rebalance.h" +#include "recovery.h" +#include "replicas.h" +#include "sb-clean.h" @@ -78505,7 +79592,7 @@ index 000000000000..4cd660650e5b + bch2_journal_replay_key(trans, k)); + if (ret) { + bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s", -+ bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret)); ++ bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret)); + goto err; + } + } @@ -78548,7 +79635,7 @@ index 000000000000..4cd660650e5b + + if (entry->u64s) { + r->level = entry->level; -+ bkey_copy(&r->key, &entry->start[0]); ++ bkey_copy(&r->key, (struct bkey_i *) entry->start); + r->error = 0; + } else { + r->error = -EIO; @@ -78687,10 +79774,12 @@ index 000000000000..4cd660650e5b + } + + if (r->error) { -+ __fsck_err(c, btree_id_is_alloc(i) ++ __fsck_err(c, ++ btree_id_is_alloc(i) + ? FSCK_CAN_IGNORE : 0, ++ btree_root_bkey_invalid, + "invalid btree root %s", -+ bch2_btree_ids[i]); ++ bch2_btree_id_str(i)); + if (i == BTREE_ID_alloc) + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + } @@ -78698,8 +79787,9 @@ index 000000000000..4cd660650e5b + ret = bch2_btree_root_read(c, i, &r->key, r->level); + if (ret) { + fsck_err(c, ++ btree_root_read_error, + "error reading btree root %s", -+ bch2_btree_ids[i]); ++ bch2_btree_id_str(i)); + if (btree_id_is_alloc(i)) + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + ret = 0; @@ -79036,6 +80126,7 @@ index 000000000000..4cd660650e5b + if (mustfix_fsck_err_on(c->sb.clean && + last_journal_entry && + !journal_entry_empty(last_journal_entry), c, ++ clean_but_journal_not_empty, + "filesystem marked clean but journal not empty")) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); @@ -79043,7 +80134,9 @@ index 000000000000..4cd660650e5b + } + + if (!last_journal_entry) { -+ fsck_err_on(!c->sb.clean, c, "no journal entries found"); ++ fsck_err_on(!c->sb.clean, c, ++ dirty_but_no_journal_entries, ++ "no journal entries found"); + if (clean) + goto use_clean; + @@ -79051,6 +80144,13 @@ index 000000000000..4cd660650e5b + if (*i) { + last_journal_entry = &(*i)->j; + (*i)->ignore = false; ++ /* ++ * This was probably a NO_FLUSH entry, ++ * so last_seq was garbage - but we know ++ * we're only using a single journal ++ * entry, set it here: ++ */ ++ (*i)->j.last_seq = (*i)->j.seq; + break; + } + } @@ -79224,7 +80324,7 @@ index 000000000000..4cd660650e5b + } + kfree(clean); + -+ if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) { ++ if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) { + bch2_fs_read_write_early(c); + bch2_delete_dead_snapshots_async(c); + } @@ -79269,16 +80369,12 @@ index 000000000000..4cd660650e5b + for (i = 0; i < BTREE_ID_NR; i++) + bch2_btree_root_alloc(c, i); + -+ for_each_online_member(ca, c, i) ++ for_each_member_device(ca, c, i) + bch2_dev_usage_init(ca); + -+ for_each_online_member(ca, c, i) { -+ ret = bch2_dev_journal_alloc(ca); -+ if (ret) { -+ percpu_ref_put(&ca->io_ref); -+ goto err; -+ } -+ } ++ ret = bch2_fs_journal_alloc(c); ++ if (ret) ++ goto err; + + /* + * journal_res_get() will crash if called before this has @@ -79296,15 +80392,13 @@ index 000000000000..4cd660650e5b + * btree updates + */ + bch_verbose(c, "marking superblocks"); -+ for_each_member_device(ca, c, i) { -+ ret = bch2_trans_mark_dev_sb(c, ca); -+ if (ret) { -+ percpu_ref_put(&ca->ref); -+ goto err; -+ } ++ ret = bch2_trans_mark_dev_sbs(c); ++ bch_err_msg(c, ret, "marking superblocks"); ++ if (ret) ++ goto err; + ++ for_each_online_member(ca, c, i) + ca->new_fs_bucket_idx = 0; -+ } + + ret = bch2_fs_freespace_init(c); + if (ret) @@ -79411,10 +80505,10 @@ index 000000000000..852d30567da9 +#endif /* _BCACHEFS_RECOVERY_H */ diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h new file mode 100644 -index 000000000000..fbfa9d831d6f +index 000000000000..515e3d62c2ac --- /dev/null +++ b/fs/bcachefs/recovery_types.h -@@ -0,0 +1,49 @@ +@@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_TYPES_H +#define _BCACHEFS_RECOVERY_TYPES_H @@ -79431,6 +80525,8 @@ index 000000000000..fbfa9d831d6f + x(snapshots_read, PASS_ALWAYS) \ + x(check_topology, 0) \ + x(check_allocations, PASS_FSCK) \ ++ x(trans_mark_dev_sbs, PASS_ALWAYS|PASS_SILENT) \ ++ x(fs_journal_alloc, PASS_ALWAYS|PASS_SILENT) \ + x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ + x(journal_replay, PASS_ALWAYS) \ + x(check_alloc_info, PASS_FSCK) \ @@ -79444,11 +80540,12 @@ index 000000000000..fbfa9d831d6f + x(check_snapshot_trees, PASS_FSCK) \ + x(check_snapshots, PASS_FSCK) \ + x(check_subvols, PASS_FSCK) \ -+ x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \ ++ x(delete_dead_snapshots, PASS_FSCK) \ + x(fs_upgrade_for_subvolumes, 0) \ + x(resume_logged_ops, PASS_ALWAYS) \ + x(check_inodes, PASS_FSCK) \ + x(check_extents, PASS_FSCK) \ ++ x(check_indirect_extents, PASS_FSCK) \ + x(check_dirents, PASS_FSCK) \ + x(check_xattrs, PASS_FSCK) \ + x(check_root, PASS_FSCK) \ @@ -79456,6 +80553,7 @@ index 000000000000..fbfa9d831d6f + x(check_nlinks, PASS_FSCK) \ + x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \ + x(fix_reflink_p, 0) \ ++ x(set_fs_needs_rebalance, 0) \ + +enum bch_recovery_pass { +#define x(n, when) BCH_RECOVERY_PASS_##n, @@ -79466,10 +80564,10 @@ index 000000000000..fbfa9d831d6f +#endif /* _BCACHEFS_RECOVERY_TYPES_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c new file mode 100644 -index 000000000000..d77d0ea9afff +index 000000000000..6e1bfe9feb59 --- /dev/null +++ b/fs/bcachefs/reflink.c -@@ -0,0 +1,405 @@ +@@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_buf.h" @@ -79479,6 +80577,7 @@ index 000000000000..d77d0ea9afff +#include "inode.h" +#include "io_misc.h" +#include "io_write.h" ++#include "rebalance.h" +#include "reflink.h" +#include "subvolume.h" +#include "super-io.h" @@ -79499,7 +80598,7 @@ index 000000000000..d77d0ea9afff + +/* reflink pointers */ + -+int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ @@ -79546,7 +80645,7 @@ index 000000000000..d77d0ea9afff + +/* indirect extents */ + -+int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ @@ -79575,28 +80674,29 @@ index 000000000000..d77d0ea9afff +} +#endif + ++static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags) ++{ ++ if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) { ++ new->k.type = KEY_TYPE_deleted; ++ new->k.size = 0; ++ set_bkey_val_u64s(&new->k, 0);; ++ *flags &= ~BTREE_TRIGGER_INSERT; ++ } ++} ++ +int bch2_trans_mark_reflink_v(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ -+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { -+ struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new); -+ -+ if (!r->v.refcount) { -+ r->k.type = KEY_TYPE_deleted; -+ r->k.size = 0; -+ set_bkey_val_u64s(&r->k, 0); -+ return 0; -+ } -+ } ++ check_indirect_extent_deleting(new, &flags); + + return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags); +} + +/* indirect inline data */ + -+int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ @@ -79604,7 +80704,7 @@ index 000000000000..d77d0ea9afff +} + +void bch2_indirect_inline_data_to_text(struct printbuf *out, -+ struct bch_fs *c, struct bkey_s_c k) ++ struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); + unsigned datalen = bkey_inline_data_bytes(k.k); @@ -79619,16 +80719,7 @@ index 000000000000..d77d0ea9afff + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ -+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { -+ struct bkey_i_indirect_inline_data *r = -+ bkey_i_to_indirect_inline_data(new); -+ -+ if (!r->v.refcount) { -+ r->k.type = KEY_TYPE_deleted; -+ r->k.size = 0; -+ set_bkey_val_u64s(&r->k, 0); -+ } -+ } ++ check_indirect_extent_deleting(new, &flags); + + return 0; +} @@ -79732,8 +80823,9 @@ index 000000000000..d77d0ea9afff + struct bpos dst_start = POS(dst_inum.inum, dst_offset); + struct bpos src_start = POS(src_inum.inum, src_offset); + struct bpos dst_end = dst_start, src_end = src_start; ++ struct bch_io_opts opts; + struct bpos src_want; -+ u64 dst_done; ++ u64 dst_done = 0; + u32 dst_snapshot, src_snapshot; + int ret = 0, ret2 = 0; + @@ -79749,6 +80841,10 @@ index 000000000000..d77d0ea9afff + bch2_bkey_buf_init(&new_src); + trans = bch2_trans_get(c); + ++ ret = bch2_inum_opts_get(trans, src_inum, &opts); ++ if (ret) ++ goto err; ++ + bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start, + BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start, @@ -79832,10 +80928,13 @@ index 000000000000..d77d0ea9afff + min(src_k.k->p.offset - src_want.offset, + dst_end.offset - dst_iter.pos.offset)); + -+ ret = bch2_extent_update(trans, dst_inum, &dst_iter, -+ new_dst.k, &disk_res, -+ new_i_size, i_sectors_delta, -+ true); ++ ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, ++ opts.background_target, ++ opts.background_compression) ?: ++ bch2_extent_update(trans, dst_inum, &dst_iter, ++ new_dst.k, &disk_res, ++ new_i_size, i_sectors_delta, ++ true); + bch2_disk_reservation_put(c, &disk_res); + } + bch2_trans_iter_exit(trans, &dst_iter); @@ -79866,7 +80965,7 @@ index 000000000000..d77d0ea9afff + + bch2_trans_iter_exit(trans, &inode_iter); + } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); -+ ++err: + bch2_trans_put(trans); + bch2_bkey_buf_exit(&new_src, c); + bch2_bkey_buf_exit(&new_dst, c); @@ -79877,7 +80976,7 @@ index 000000000000..d77d0ea9afff +} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h new file mode 100644 -index 000000000000..fe52538efb52 +index 000000000000..8ccf3f9c4939 --- /dev/null +++ b/fs/bcachefs/reflink.h @@ -0,0 +1,81 @@ @@ -79887,7 +80986,7 @@ index 000000000000..fe52538efb52 + +enum bkey_invalid_flags; + -+int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); @@ -79902,7 +81001,7 @@ index 000000000000..fe52538efb52 + .min_val_size = 16, \ +}) + -+int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); @@ -79918,7 +81017,7 @@ index 000000000000..fe52538efb52 + .min_val_size = 8, \ +}) + -+int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_indirect_inline_data_to_text(struct printbuf *, + struct bch_fs *, struct bkey_s_c); @@ -79964,10 +81063,10 @@ index 000000000000..fe52538efb52 +#endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c new file mode 100644 -index 000000000000..cef2a0447b86 +index 000000000000..1c3ae13bfced --- /dev/null +++ b/fs/bcachefs/replicas.c -@@ -0,0 +1,1058 @@ +@@ -0,0 +1,1050 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -80432,18 +81531,13 @@ index 000000000000..cef2a0447b86 +{ + lockdep_assert_held(&c->replicas_gc_lock); + -+ if (ret) -+ goto err; -+ + mutex_lock(&c->sb_lock); + percpu_down_write(&c->mark_lock); + -+ ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); -+ if (ret) -+ goto err; ++ ret = ret ?: ++ bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?: ++ replicas_table_update(c, &c->replicas_gc); + -+ ret = replicas_table_update(c, &c->replicas_gc); -+err: + kfree(c->replicas_gc.entries); + c->replicas_gc.entries = NULL; + @@ -80549,12 +81643,9 @@ index 000000000000..cef2a0447b86 + + bch2_cpu_replicas_sort(&new); + -+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new); -+ if (ret) -+ goto err; ++ ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?: ++ replicas_table_update(c, &new); + -+ ret = replicas_table_update(c, &new); -+err: + kfree(new.entries); + + percpu_up_write(&c->mark_lock); @@ -81158,10 +82249,10 @@ index 000000000000..5cfff489bbc3 +#endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c new file mode 100644 -index 000000000000..61203d7c8d36 +index 000000000000..e151ada1c8bd --- /dev/null +++ b/fs/bcachefs/sb-clean.c -@@ -0,0 +1,395 @@ +@@ -0,0 +1,398 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -81246,6 +82337,7 @@ index 000000000000..61203d7c8d36 + int ret = 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, ++ sb_clean_journal_seq_mismatch, + "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", + le64_to_cpu(clean->journal_seq), + le64_to_cpu(j->seq))) { @@ -81283,6 +82375,7 @@ index 000000000000..61203d7c8d36 + k1->k.u64s != k2->k.u64s || + memcmp(k1, k2, bkey_bytes(&k1->k)) || + l1 != l2, c, ++ sb_clean_btree_root_mismatch, + "superblock btree root %u doesn't match journal after clean shutdown\n" + "sb: l=%u %s\n" + "journal: l=%u %s\n", i, @@ -81304,6 +82397,7 @@ index 000000000000..61203d7c8d36 + sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean); + + if (fsck_err_on(!sb_clean, c, ++ sb_clean_missing, + "superblock marked clean but clean section not present")) { + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; @@ -81537,7 +82631,7 @@ index 000000000000..61203d7c8d36 + + entry = sb_clean->start; + bch2_journal_super_entries_add_common(c, &entry, 0); -+ entry = bch2_btree_roots_to_journal_entries(c, entry, entry); ++ entry = bch2_btree_roots_to_journal_entries(c, entry, 0); + BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); + + memset(entry, 0, @@ -81579,12 +82673,488 @@ index 000000000000..71caef281239 +void bch2_fs_mark_clean(struct bch_fs *); + +#endif /* _BCACHEFS_SB_CLEAN_H */ +diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c +new file mode 100644 +index 000000000000..f0930ab7f036 +--- /dev/null ++++ b/fs/bcachefs/sb-errors.c +@@ -0,0 +1,172 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "sb-errors.h" ++#include "super-io.h" ++ ++static const char * const bch2_sb_error_strs[] = { ++#define x(t, n, ...) [n] = #t, ++ BCH_SB_ERRS() ++ NULL ++}; ++ ++static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id) ++{ ++ if (id < BCH_SB_ERR_MAX) ++ prt_str(out, bch2_sb_error_strs[id]); ++ else ++ prt_printf(out, "(unknown error %u)", id); ++} ++ ++static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e) ++{ ++ return e ++ ? (bch2_sb_field_bytes(&e->field) - sizeof(*e)) / sizeof(e->entries[0]) ++ : 0; ++} ++ ++static inline unsigned bch2_sb_field_errors_u64s(unsigned nr) ++{ ++ return (sizeof(struct bch_sb_field_errors) + ++ sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64); ++} ++ ++static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_errors *e = field_to_type(f, errors); ++ unsigned i, nr = bch2_sb_field_errors_nr_entries(e); ++ ++ for (i = 0; i < nr; i++) { ++ if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) { ++ prt_printf(err, "entry with count 0 (id "); ++ bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); ++ prt_printf(err, ")"); ++ return -BCH_ERR_invalid_sb_errors; ++ } ++ ++ if (i + 1 < nr && ++ BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >= ++ BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) { ++ prt_printf(err, "entries out of order"); ++ return -BCH_ERR_invalid_sb_errors; ++ } ++ } ++ ++ return 0; ++} ++ ++static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_errors *e = field_to_type(f, errors); ++ unsigned i, nr = bch2_sb_field_errors_nr_entries(e); ++ ++ if (out->nr_tabstops <= 1) ++ printbuf_tabstop_push(out, 16); ++ ++ for (i = 0; i < nr; i++) { ++ bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); ++ prt_tab(out); ++ prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i])); ++ prt_tab(out); ++ bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time)); ++ prt_newline(out); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_errors = { ++ .validate = bch2_sb_errors_validate, ++ .to_text = bch2_sb_errors_to_text, ++}; ++ ++void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err) ++{ ++ bch_sb_errors_cpu *e = &c->fsck_error_counts; ++ struct bch_sb_error_entry_cpu n = { ++ .id = err, ++ .nr = 1, ++ .last_error_time = ktime_get_real_seconds() ++ }; ++ unsigned i; ++ ++ mutex_lock(&c->fsck_error_counts_lock); ++ for (i = 0; i < e->nr; i++) { ++ if (err == e->data[i].id) { ++ e->data[i].nr++; ++ e->data[i].last_error_time = n.last_error_time; ++ goto out; ++ } ++ if (err < e->data[i].id) ++ break; ++ } ++ ++ if (darray_make_room(e, 1)) ++ goto out; ++ ++ darray_insert_item(e, i, n); ++out: ++ mutex_unlock(&c->fsck_error_counts_lock); ++} ++ ++void bch2_sb_errors_from_cpu(struct bch_fs *c) ++{ ++ bch_sb_errors_cpu *src = &c->fsck_error_counts; ++ struct bch_sb_field_errors *dst = ++ bch2_sb_field_resize(&c->disk_sb, errors, ++ bch2_sb_field_errors_u64s(src->nr)); ++ unsigned i; ++ ++ if (!dst) ++ return; ++ ++ for (i = 0; i < src->nr; i++) { ++ SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id); ++ SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr); ++ dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time); ++ } ++} ++ ++static int bch2_sb_errors_to_cpu(struct bch_fs *c) ++{ ++ struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors); ++ bch_sb_errors_cpu *dst = &c->fsck_error_counts; ++ unsigned i, nr = bch2_sb_field_errors_nr_entries(src); ++ int ret; ++ ++ if (!nr) ++ return 0; ++ ++ mutex_lock(&c->fsck_error_counts_lock); ++ ret = darray_make_room(dst, nr); ++ if (ret) ++ goto err; ++ ++ dst->nr = nr; ++ ++ for (i = 0; i < nr; i++) { ++ dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]); ++ dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]); ++ dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time); ++ } ++err: ++ mutex_unlock(&c->fsck_error_counts_lock); ++ ++ return ret; ++} ++ ++void bch2_fs_sb_errors_exit(struct bch_fs *c) ++{ ++ darray_exit(&c->fsck_error_counts); ++} ++ ++void bch2_fs_sb_errors_init_early(struct bch_fs *c) ++{ ++ mutex_init(&c->fsck_error_counts_lock); ++ darray_init(&c->fsck_error_counts); ++} ++ ++int bch2_fs_sb_errors_init(struct bch_fs *c) ++{ ++ return bch2_sb_errors_to_cpu(c); ++} +diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h +new file mode 100644 +index 000000000000..5a09a53966be +--- /dev/null ++++ b/fs/bcachefs/sb-errors.h +@@ -0,0 +1,270 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SB_ERRORS_H ++#define _BCACHEFS_SB_ERRORS_H ++ ++#include "sb-errors_types.h" ++ ++#define BCH_SB_ERRS() \ ++ x(clean_but_journal_not_empty, 0) \ ++ x(dirty_but_no_journal_entries, 1) \ ++ x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ ++ x(sb_clean_journal_seq_mismatch, 3) \ ++ x(sb_clean_btree_root_mismatch, 4) \ ++ x(sb_clean_missing, 5) \ ++ x(jset_unsupported_version, 6) \ ++ x(jset_unknown_csum, 7) \ ++ x(jset_last_seq_newer_than_seq, 8) \ ++ x(jset_past_bucket_end, 9) \ ++ x(jset_seq_blacklisted, 10) \ ++ x(journal_entries_missing, 11) \ ++ x(journal_entry_replicas_not_marked, 12) \ ++ x(journal_entry_past_jset_end, 13) \ ++ x(journal_entry_replicas_data_mismatch, 14) \ ++ x(journal_entry_bkey_u64s_0, 15) \ ++ x(journal_entry_bkey_past_end, 16) \ ++ x(journal_entry_bkey_bad_format, 17) \ ++ x(journal_entry_bkey_invalid, 18) \ ++ x(journal_entry_btree_root_bad_size, 19) \ ++ x(journal_entry_blacklist_bad_size, 20) \ ++ x(journal_entry_blacklist_v2_bad_size, 21) \ ++ x(journal_entry_blacklist_v2_start_past_end, 22) \ ++ x(journal_entry_usage_bad_size, 23) \ ++ x(journal_entry_data_usage_bad_size, 24) \ ++ x(journal_entry_clock_bad_size, 25) \ ++ x(journal_entry_clock_bad_rw, 26) \ ++ x(journal_entry_dev_usage_bad_size, 27) \ ++ x(journal_entry_dev_usage_bad_dev, 28) \ ++ x(journal_entry_dev_usage_bad_pad, 29) \ ++ x(btree_node_unreadable, 30) \ ++ x(btree_node_fault_injected, 31) \ ++ x(btree_node_bad_magic, 32) \ ++ x(btree_node_bad_seq, 33) \ ++ x(btree_node_unsupported_version, 34) \ ++ x(btree_node_bset_older_than_sb_min, 35) \ ++ x(btree_node_bset_newer_than_sb, 36) \ ++ x(btree_node_data_missing, 37) \ ++ x(btree_node_bset_after_end, 38) \ ++ x(btree_node_replicas_sectors_written_mismatch, 39) \ ++ x(btree_node_replicas_data_mismatch, 40) \ ++ x(bset_unknown_csum, 41) \ ++ x(bset_bad_csum, 42) \ ++ x(bset_past_end_of_btree_node, 43) \ ++ x(bset_wrong_sector_offset, 44) \ ++ x(bset_empty, 45) \ ++ x(bset_bad_seq, 46) \ ++ x(bset_blacklisted_journal_seq, 47) \ ++ x(first_bset_blacklisted_journal_seq, 48) \ ++ x(btree_node_bad_btree, 49) \ ++ x(btree_node_bad_level, 50) \ ++ x(btree_node_bad_min_key, 51) \ ++ x(btree_node_bad_max_key, 52) \ ++ x(btree_node_bad_format, 53) \ ++ x(btree_node_bkey_past_bset_end, 54) \ ++ x(btree_node_bkey_bad_format, 55) \ ++ x(btree_node_bad_bkey, 56) \ ++ x(btree_node_bkey_out_of_order, 57) \ ++ x(btree_root_bkey_invalid, 58) \ ++ x(btree_root_read_error, 59) \ ++ x(btree_root_bad_min_key, 50) \ ++ x(btree_root_bad_max_key, 61) \ ++ x(btree_node_read_error, 62) \ ++ x(btree_node_topology_bad_min_key, 63) \ ++ x(btree_node_topology_bad_max_key, 64) \ ++ x(btree_node_topology_overwritten_by_prev_node, 65) \ ++ x(btree_node_topology_overwritten_by_next_node, 66) \ ++ x(btree_node_topology_interior_node_empty, 67) \ ++ x(fs_usage_hidden_wrong, 68) \ ++ x(fs_usage_btree_wrong, 69) \ ++ x(fs_usage_data_wrong, 70) \ ++ x(fs_usage_cached_wrong, 71) \ ++ x(fs_usage_reserved_wrong, 72) \ ++ x(fs_usage_persistent_reserved_wrong, 73) \ ++ x(fs_usage_nr_inodes_wrong, 74) \ ++ x(fs_usage_replicas_wrong, 75) \ ++ x(dev_usage_buckets_wrong, 76) \ ++ x(dev_usage_sectors_wrong, 77) \ ++ x(dev_usage_fragmented_wrong, 78) \ ++ x(dev_usage_buckets_ec_wrong, 79) \ ++ x(bkey_version_in_future, 80) \ ++ x(bkey_u64s_too_small, 81) \ ++ x(bkey_invalid_type_for_btree, 82) \ ++ x(bkey_extent_size_zero, 83) \ ++ x(bkey_extent_size_greater_than_offset, 84) \ ++ x(bkey_size_nonzero, 85) \ ++ x(bkey_snapshot_nonzero, 86) \ ++ x(bkey_snapshot_zero, 87) \ ++ x(bkey_at_pos_max, 88) \ ++ x(bkey_before_start_of_btree_node, 89) \ ++ x(bkey_after_end_of_btree_node, 90) \ ++ x(bkey_val_size_nonzero, 91) \ ++ x(bkey_val_size_too_small, 92) \ ++ x(alloc_v1_val_size_bad, 93) \ ++ x(alloc_v2_unpack_error, 94) \ ++ x(alloc_v3_unpack_error, 95) \ ++ x(alloc_v4_val_size_bad, 96) \ ++ x(alloc_v4_backpointers_start_bad, 97) \ ++ x(alloc_key_data_type_bad, 98) \ ++ x(alloc_key_empty_but_have_data, 99) \ ++ x(alloc_key_dirty_sectors_0, 100) \ ++ x(alloc_key_data_type_inconsistency, 101) \ ++ x(alloc_key_to_missing_dev_bucket, 102) \ ++ x(alloc_key_cached_inconsistency, 103) \ ++ x(alloc_key_cached_but_read_time_zero, 104) \ ++ x(alloc_key_to_missing_lru_entry, 105) \ ++ x(alloc_key_data_type_wrong, 106) \ ++ x(alloc_key_gen_wrong, 107) \ ++ x(alloc_key_dirty_sectors_wrong, 108) \ ++ x(alloc_key_cached_sectors_wrong, 109) \ ++ x(alloc_key_stripe_wrong, 110) \ ++ x(alloc_key_stripe_redundancy_wrong, 111) \ ++ x(bucket_sector_count_overflow, 112) \ ++ x(bucket_metadata_type_mismatch, 113) \ ++ x(need_discard_key_wrong, 114) \ ++ x(freespace_key_wrong, 115) \ ++ x(freespace_hole_missing, 116) \ ++ x(bucket_gens_val_size_bad, 117) \ ++ x(bucket_gens_key_wrong, 118) \ ++ x(bucket_gens_hole_wrong, 119) \ ++ x(bucket_gens_to_invalid_dev, 120) \ ++ x(bucket_gens_to_invalid_buckets, 121) \ ++ x(bucket_gens_nonzero_for_invalid_buckets, 122) \ ++ x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ ++ x(need_discard_freespace_key_bad, 124) \ ++ x(backpointer_pos_wrong, 125) \ ++ x(backpointer_to_missing_device, 126) \ ++ x(backpointer_to_missing_alloc, 127) \ ++ x(backpointer_to_missing_ptr, 128) \ ++ x(lru_entry_at_time_0, 129) \ ++ x(lru_entry_to_invalid_bucket, 130) \ ++ x(lru_entry_bad, 131) \ ++ x(btree_ptr_val_too_big, 132) \ ++ x(btree_ptr_v2_val_too_big, 133) \ ++ x(btree_ptr_has_non_ptr, 134) \ ++ x(extent_ptrs_invalid_entry, 135) \ ++ x(extent_ptrs_no_ptrs, 136) \ ++ x(extent_ptrs_too_many_ptrs, 137) \ ++ x(extent_ptrs_redundant_crc, 138) \ ++ x(extent_ptrs_redundant_stripe, 139) \ ++ x(extent_ptrs_unwritten, 140) \ ++ x(extent_ptrs_written_and_unwritten, 141) \ ++ x(ptr_to_invalid_device, 142) \ ++ x(ptr_to_duplicate_device, 143) \ ++ x(ptr_after_last_bucket, 144) \ ++ x(ptr_before_first_bucket, 145) \ ++ x(ptr_spans_multiple_buckets, 146) \ ++ x(ptr_to_missing_backpointer, 147) \ ++ x(ptr_to_missing_alloc_key, 148) \ ++ x(ptr_to_missing_replicas_entry, 149) \ ++ x(ptr_to_missing_stripe, 150) \ ++ x(ptr_to_incorrect_stripe, 151) \ ++ x(ptr_gen_newer_than_bucket_gen, 152) \ ++ x(ptr_too_stale, 153) \ ++ x(stale_dirty_ptr, 154) \ ++ x(ptr_bucket_data_type_mismatch, 155) \ ++ x(ptr_cached_and_erasure_coded, 156) \ ++ x(ptr_crc_uncompressed_size_too_small, 157) \ ++ x(ptr_crc_csum_type_unknown, 158) \ ++ x(ptr_crc_compression_type_unknown, 159) \ ++ x(ptr_crc_redundant, 160) \ ++ x(ptr_crc_uncompressed_size_too_big, 161) \ ++ x(ptr_crc_nonce_mismatch, 162) \ ++ x(ptr_stripe_redundant, 163) \ ++ x(reservation_key_nr_replicas_invalid, 164) \ ++ x(reflink_v_refcount_wrong, 165) \ ++ x(reflink_p_to_missing_reflink_v, 166) \ ++ x(stripe_pos_bad, 167) \ ++ x(stripe_val_size_bad, 168) \ ++ x(stripe_sector_count_wrong, 169) \ ++ x(snapshot_tree_pos_bad, 170) \ ++ x(snapshot_tree_to_missing_snapshot, 171) \ ++ x(snapshot_tree_to_missing_subvol, 172) \ ++ x(snapshot_tree_to_wrong_subvol, 173) \ ++ x(snapshot_tree_to_snapshot_subvol, 174) \ ++ x(snapshot_pos_bad, 175) \ ++ x(snapshot_parent_bad, 176) \ ++ x(snapshot_children_not_normalized, 177) \ ++ x(snapshot_child_duplicate, 178) \ ++ x(snapshot_child_bad, 179) \ ++ x(snapshot_skiplist_not_normalized, 180) \ ++ x(snapshot_skiplist_bad, 181) \ ++ x(snapshot_should_not_have_subvol, 182) \ ++ x(snapshot_to_bad_snapshot_tree, 183) \ ++ x(snapshot_bad_depth, 184) \ ++ x(snapshot_bad_skiplist, 185) \ ++ x(subvol_pos_bad, 186) \ ++ x(subvol_not_master_and_not_snapshot, 187) \ ++ x(subvol_to_missing_root, 188) \ ++ x(subvol_root_wrong_bi_subvol, 189) \ ++ x(bkey_in_missing_snapshot, 190) \ ++ x(inode_pos_inode_nonzero, 191) \ ++ x(inode_pos_blockdev_range, 192) \ ++ x(inode_unpack_error, 193) \ ++ x(inode_str_hash_invalid, 194) \ ++ x(inode_v3_fields_start_bad, 195) \ ++ x(inode_snapshot_mismatch, 196) \ ++ x(inode_unlinked_but_clean, 197) \ ++ x(inode_unlinked_but_nlink_nonzero, 198) \ ++ x(inode_checksum_type_invalid, 199) \ ++ x(inode_compression_type_invalid, 200) \ ++ x(inode_subvol_root_but_not_dir, 201) \ ++ x(inode_i_size_dirty_but_clean, 202) \ ++ x(inode_i_sectors_dirty_but_clean, 203) \ ++ x(inode_i_sectors_wrong, 204) \ ++ x(inode_dir_wrong_nlink, 205) \ ++ x(inode_dir_multiple_links, 206) \ ++ x(inode_multiple_links_but_nlink_0, 207) \ ++ x(inode_wrong_backpointer, 208) \ ++ x(inode_wrong_nlink, 209) \ ++ x(inode_unreachable, 210) \ ++ x(deleted_inode_but_clean, 211) \ ++ x(deleted_inode_missing, 212) \ ++ x(deleted_inode_is_dir, 213) \ ++ x(deleted_inode_not_unlinked, 214) \ ++ x(extent_overlapping, 215) \ ++ x(extent_in_missing_inode, 216) \ ++ x(extent_in_non_reg_inode, 217) \ ++ x(extent_past_end_of_inode, 218) \ ++ x(dirent_empty_name, 219) \ ++ x(dirent_val_too_big, 220) \ ++ x(dirent_name_too_long, 221) \ ++ x(dirent_name_embedded_nul, 222) \ ++ x(dirent_name_dot_or_dotdot, 223) \ ++ x(dirent_name_has_slash, 224) \ ++ x(dirent_d_type_wrong, 225) \ ++ x(dirent_d_parent_subvol_wrong, 226) \ ++ x(dirent_in_missing_dir_inode, 227) \ ++ x(dirent_in_non_dir_inode, 228) \ ++ x(dirent_to_missing_inode, 229) \ ++ x(dirent_to_missing_subvol, 230) \ ++ x(dirent_to_itself, 231) \ ++ x(quota_type_invalid, 232) \ ++ x(xattr_val_size_too_small, 233) \ ++ x(xattr_val_size_too_big, 234) \ ++ x(xattr_invalid_type, 235) \ ++ x(xattr_name_invalid_chars, 236) \ ++ x(xattr_in_missing_inode, 237) \ ++ x(root_subvol_missing, 238) \ ++ x(root_dir_missing, 239) \ ++ x(root_inode_not_dir, 240) \ ++ x(dir_loop, 241) \ ++ x(hash_table_key_duplicate, 242) \ ++ x(hash_table_key_wrong_offset, 243) ++ ++enum bch_sb_error_id { ++#define x(t, n) BCH_FSCK_ERR_##t = n, ++ BCH_SB_ERRS() ++#undef x ++ BCH_SB_ERR_MAX ++}; ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_errors; ++ ++void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id); ++ ++void bch2_sb_errors_from_cpu(struct bch_fs *); ++ ++void bch2_fs_sb_errors_exit(struct bch_fs *); ++void bch2_fs_sb_errors_init_early(struct bch_fs *); ++int bch2_fs_sb_errors_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_SB_ERRORS_H */ +diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h +new file mode 100644 +index 000000000000..b1c099843a39 +--- /dev/null ++++ b/fs/bcachefs/sb-errors_types.h +@@ -0,0 +1,16 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SB_ERRORS_TYPES_H ++#define _BCACHEFS_SB_ERRORS_TYPES_H ++ ++#include "darray.h" ++ ++struct bch_sb_error_entry_cpu { ++ u64 id:16, ++ nr:48; ++ u64 last_error_time; ++}; ++ ++typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu; ++ ++#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */ ++ diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c new file mode 100644 -index 000000000000..6dd85bb996fe +index 000000000000..bed0f857fe5b --- /dev/null +++ b/fs/bcachefs/sb-members.c -@@ -0,0 +1,339 @@ +@@ -0,0 +1,420 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -81594,21 +83164,28 @@ index 000000000000..6dd85bb996fe +#include "sb-members.h" +#include "super-io.h" + -+/* Code for bch_sb_field_members_v1: */ ++#define x(t, n, ...) [n] = #t, ++static const char * const bch2_iops_measurements[] = { ++ BCH_IOPS_MEASUREMENTS() ++ NULL ++}; + -+static struct bch_member *members_v2_get_mut(struct bch_sb_field_members_v2 *mi, int i) -+{ -+ return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes)); -+} ++char * const bch2_member_error_strs[] = { ++ BCH_MEMBER_ERROR_TYPES() ++ NULL ++}; ++#undef x ++ ++/* Code for bch_sb_field_members_v1: */ + +struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i) +{ -+ return members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i); ++ return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i); +} + +static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i) +{ -+ struct bch_member ret, *p = members_v2_get_mut(mi, i); ++ struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i); + memset(&ret, 0, sizeof(ret)); + memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret))); + return ret; @@ -81623,7 +83200,8 @@ index 000000000000..6dd85bb996fe +{ + struct bch_member ret, *p = members_v1_get_mut(mi, i); + memset(&ret, 0, sizeof(ret)); -+ memcpy(&ret, p, min_t(size_t, sizeof(struct bch_member), sizeof(ret))); return ret; ++ memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret))); ++ return ret; +} + +struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i) @@ -81649,7 +83227,7 @@ index 000000000000..6dd85bb996fe + + for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) { + void *dst = (void *) mi->_members + (i * sizeof(struct bch_member)); -+ memmove(dst, members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes)); ++ memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes)); + memset(dst + le16_to_cpu(mi->member_bytes), + 0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes))); + } @@ -81658,7 +83236,7 @@ index 000000000000..6dd85bb996fe + return 0; +} + -+int bch2_members_v2_init(struct bch_fs *c) ++int bch2_sb_members_v2_init(struct bch_fs *c) +{ + struct bch_sb_field_members_v1 *mi1; + struct bch_sb_field_members_v2 *mi2; @@ -81678,7 +83256,7 @@ index 000000000000..6dd85bb996fe + return sb_members_v2_resize_entries(c); +} + -+int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) ++int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) +{ + struct bch_sb_field_members_v1 *mi1; + struct bch_sb_field_members_v2 *mi2; @@ -81692,7 +83270,7 @@ index 000000000000..6dd85bb996fe + mi2 = bch2_sb_field_get(disk_sb->sb, members_v2); + + for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++) -+ memcpy(members_v1_get_mut(mi1, i), members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES); ++ memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES); + + return 0; +} @@ -81742,6 +83320,8 @@ index 000000000000..6dd85bb996fe + u64 bucket_size = le16_to_cpu(m.bucket_size); + u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size; + ++ if (!bch2_member_exists(&m)) ++ return; + + prt_printf(out, "Device:"); + prt_tab(out); @@ -81750,6 +83330,21 @@ index 000000000000..6dd85bb996fe + + printbuf_indent_add(out, 2); + ++ prt_printf(out, "Label:"); ++ prt_tab(out); ++ if (BCH_MEMBER_GROUP(&m)) { ++ unsigned idx = BCH_MEMBER_GROUP(&m) - 1; ++ ++ if (idx < disk_groups_nr(gi)) ++ prt_printf(out, "%s (%u)", ++ gi->entries[idx].label, idx); ++ else ++ prt_printf(out, "(bad disk labels section)"); ++ } else { ++ prt_printf(out, "(none)"); ++ } ++ prt_newline(out); ++ + prt_printf(out, "UUID:"); + prt_tab(out); + pr_uuid(out, m.uuid.b); @@ -81760,6 +83355,13 @@ index 000000000000..6dd85bb996fe + prt_units_u64(out, device_size << 9); + prt_newline(out); + ++ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { ++ prt_printf(out, "%s errors:", bch2_member_error_strs[i]); ++ prt_tab(out); ++ prt_u64(out, le64_to_cpu(m.errors[i])); ++ prt_newline(out); ++ } ++ + for (unsigned i = 0; i < BCH_IOPS_NR; i++) { + prt_printf(out, "%s iops:", bch2_iops_measurements[i]); + prt_tab(out); @@ -81785,7 +83387,7 @@ index 000000000000..6dd85bb996fe + prt_printf(out, "Last mount:"); + prt_tab(out); + if (m.last_mount) -+ pr_time(out, le64_to_cpu(m.last_mount)); ++ bch2_prt_datetime(out, le64_to_cpu(m.last_mount)); + else + prt_printf(out, "(never)"); + prt_newline(out); @@ -81798,21 +83400,6 @@ index 000000000000..6dd85bb996fe + : "unknown"); + prt_newline(out); + -+ prt_printf(out, "Label:"); -+ prt_tab(out); -+ if (BCH_MEMBER_GROUP(&m)) { -+ unsigned idx = BCH_MEMBER_GROUP(&m) - 1; -+ -+ if (idx < disk_groups_nr(gi)) -+ prt_printf(out, "%s (%u)", -+ gi->entries[idx].label, idx); -+ else -+ prt_printf(out, "(bad disk labels section)"); -+ } else { -+ prt_printf(out, "(none)"); -+ } -+ prt_newline(out); -+ + prt_printf(out, "Data allowed:"); + prt_tab(out); + if (BCH_MEMBER_DATA_ALLOWED(&m)) @@ -81849,8 +83436,7 @@ index 000000000000..6dd85bb996fe + struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); + unsigned i; + -+ if ((void *) members_v1_get_mut(mi, sb->nr_devices) > -+ vstruct_end(&mi->field)) { ++ if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) { + prt_printf(err, "too many devices for section size"); + return -BCH_ERR_invalid_sb_members; + } @@ -81873,10 +83459,8 @@ index 000000000000..6dd85bb996fe + struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); + unsigned i; + -+ for (i = 0; i < sb->nr_devices; i++) { -+ struct bch_member m = members_v1_get(mi, i); -+ member_to_text(out, m, gi, sb, i); -+ } ++ for (i = 0; i < sb->nr_devices; i++) ++ member_to_text(out, members_v1_get(mi, i), gi, sb, i); +} + +const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = { @@ -81891,10 +83475,8 @@ index 000000000000..6dd85bb996fe + struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); + unsigned i; + -+ for (i = 0; i < sb->nr_devices; i++) { -+ struct bch_member m = members_v2_get(mi, i); -+ member_to_text(out, m, gi, sb, i); -+ } ++ for (i = 0; i < sb->nr_devices; i++) ++ member_to_text(out, members_v2_get(mi, i), gi, sb, i); +} + +static int bch2_sb_members_v2_validate(struct bch_sb *sb, @@ -81902,7 +83484,7 @@ index 000000000000..6dd85bb996fe + struct printbuf *err) +{ + struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); -+ size_t mi_bytes = (void *) members_v2_get_mut(mi, sb->nr_devices) - ++ size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) - + (void *) mi; + + if (mi_bytes > vstruct_bytes(&mi->field)) { @@ -81924,18 +83506,95 @@ index 000000000000..6dd85bb996fe + .validate = bch2_sb_members_v2_validate, + .to_text = bch2_sb_members_v2_to_text, +}; ++ ++void bch2_sb_members_from_cpu(struct bch_fs *c) ++{ ++ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); ++ struct bch_dev *ca; ++ unsigned i, e; ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, NULL) { ++ struct bch_member *m = __bch2_members_v2_get_mut(mi, i); ++ ++ for (e = 0; e < BCH_MEMBER_ERROR_NR; e++) ++ m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e])); ++ } ++ rcu_read_unlock(); ++} ++ ++void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) ++{ ++ struct bch_fs *c = ca->fs; ++ struct bch_member m; ++ ++ mutex_lock(&ca->fs->sb_lock); ++ m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); ++ mutex_unlock(&ca->fs->sb_lock); ++ ++ printbuf_tabstop_push(out, 12); ++ ++ prt_str(out, "IO errors since filesystem creation"); ++ prt_newline(out); ++ ++ printbuf_indent_add(out, 2); ++ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { ++ prt_printf(out, "%s:", bch2_member_error_strs[i]); ++ prt_tab(out); ++ prt_u64(out, atomic64_read(&ca->errors[i])); ++ prt_newline(out); ++ } ++ printbuf_indent_sub(out, 2); ++ ++ prt_str(out, "IO errors since "); ++ bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC); ++ prt_str(out, " ago"); ++ prt_newline(out); ++ ++ printbuf_indent_add(out, 2); ++ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { ++ prt_printf(out, "%s:", bch2_member_error_strs[i]); ++ prt_tab(out); ++ prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); ++ prt_newline(out); ++ } ++ printbuf_indent_sub(out, 2); ++} ++ ++void bch2_dev_errors_reset(struct bch_dev *ca) ++{ ++ struct bch_fs *c = ca->fs; ++ struct bch_member *m; ++ ++ mutex_lock(&c->sb_lock); ++ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); ++ for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++) ++ m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i])); ++ m->errors_reset_time = ktime_get_real_seconds(); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h new file mode 100644 -index 000000000000..430f3457bfd4 +index 000000000000..03613e3eb8e3 --- /dev/null +++ b/fs/bcachefs/sb-members.h -@@ -0,0 +1,182 @@ +@@ -0,0 +1,227 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_MEMBERS_H +#define _BCACHEFS_SB_MEMBERS_H + -+int bch2_members_v2_init(struct bch_fs *c); -+int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb); ++extern char * const bch2_member_error_strs[]; ++ ++static inline struct bch_member * ++__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i) ++{ ++ return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes)); ++} ++ ++int bch2_sb_members_v2_init(struct bch_fs *c); ++int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb); +struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i); +struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); + @@ -82111,6 +83770,43 @@ index 000000000000..430f3457bfd4 +extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; +extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; + ++static inline bool bch2_member_exists(struct bch_member *m) ++{ ++ return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); ++} ++ ++static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev) ++{ ++ if (dev < sb->nr_devices) { ++ struct bch_member m = bch2_sb_member_get(sb, dev); ++ return bch2_member_exists(&m); ++ } ++ return false; ++} ++ ++static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ++{ ++ return (struct bch_member_cpu) { ++ .nbuckets = le64_to_cpu(mi->nbuckets), ++ .first_bucket = le16_to_cpu(mi->first_bucket), ++ .bucket_size = le16_to_cpu(mi->bucket_size), ++ .group = BCH_MEMBER_GROUP(mi), ++ .state = BCH_MEMBER_STATE(mi), ++ .discard = BCH_MEMBER_DISCARD(mi), ++ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), ++ .durability = BCH_MEMBER_DURABILITY(mi) ++ ? BCH_MEMBER_DURABILITY(mi) - 1 ++ : 1, ++ .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), ++ .valid = bch2_member_exists(mi), ++ }; ++} ++ ++void bch2_sb_members_from_cpu(struct bch_fs *); ++ ++void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *); ++void bch2_dev_errors_reset(struct bch_dev *); ++ +#endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h new file mode 100644 @@ -82440,10 +84136,10 @@ index 000000000000..3dfaf34a43b2 +#endif /* _SIPHASH_H_ */ diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c new file mode 100644 -index 000000000000..b684b9f00c1b +index 000000000000..b775cf0fb7cb --- /dev/null +++ b/fs/bcachefs/six.c -@@ -0,0 +1,913 @@ +@@ -0,0 +1,917 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include @@ -82457,6 +84153,8 @@ index 000000000000..b684b9f00c1b +#include +#include + ++#include ++ +#include "six.h" + +#ifdef DEBUG @@ -82908,11 +84606,12 @@ index 000000000000..b684b9f00c1b + smp_mb__after_atomic(); + } + ++ trace_contention_begin(lock, 0); ++ lock_contended(&lock->dep_map, ip); ++ + if (six_optimistic_spin(lock, type)) + goto out; + -+ lock_contended(&lock->dep_map, ip); -+ + wait->task = current; + wait->lock_want = type; + wait->lock_acquired = false; @@ -82992,6 +84691,7 @@ index 000000000000..b684b9f00c1b + six_clear_bitmask(lock, SIX_LOCK_HELD_write); + six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); + } ++ trace_contention_end(lock, 0); + + return ret; +} @@ -83758,10 +85458,10 @@ index 000000000000..4c268b0b8316 +#endif /* _LINUX_SIX_H */ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c new file mode 100644 -index 000000000000..4982468bfe11 +index 000000000000..e9af77b384c7 --- /dev/null +++ b/fs/bcachefs/snapshot.c -@@ -0,0 +1,1689 @@ +@@ -0,0 +1,1713 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -83794,17 +85494,18 @@ index 000000000000..4982468bfe11 + le32_to_cpu(t.v->root_snapshot)); +} + -+int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ -+ if (bkey_gt(k.k->p, POS(0, U32_MAX)) || -+ bkey_lt(k.k->p, POS(0, 1))) { -+ prt_printf(err, "bad pos"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ int ret = 0; + -+ return 0; ++ bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || ++ bkey_lt(k.k->p, POS(0, 1)), c, err, ++ snapshot_tree_pos_bad, ++ "bad pos"); ++fsck_err: ++ return ret; +} + +int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, @@ -83966,68 +85667,60 @@ index 000000000000..4982468bfe11 + le32_to_cpu(s.v->skip[2])); +} + -+int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_snapshot s; + u32 i, id; ++ int ret = 0; + -+ if (bkey_gt(k.k->p, POS(0, U32_MAX)) || -+ bkey_lt(k.k->p, POS(0, 1))) { -+ prt_printf(err, "bad pos"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || ++ bkey_lt(k.k->p, POS(0, 1)), c, err, ++ snapshot_pos_bad, ++ "bad pos"); + + s = bkey_s_c_to_snapshot(k); + + id = le32_to_cpu(s.v->parent); -+ if (id && id <= k.k->p.offset) { -+ prt_printf(err, "bad parent node (%u <= %llu)", -+ id, k.k->p.offset); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(id && id <= k.k->p.offset, c, err, ++ snapshot_parent_bad, ++ "bad parent node (%u <= %llu)", ++ id, k.k->p.offset); + -+ if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { -+ prt_printf(err, "children not normalized"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err, ++ snapshot_children_not_normalized, ++ "children not normalized"); + -+ if (s.v->children[0] && -+ s.v->children[0] == s.v->children[1]) { -+ prt_printf(err, "duplicate child nodes"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err, ++ snapshot_child_duplicate, ++ "duplicate child nodes"); + + for (i = 0; i < 2; i++) { + id = le32_to_cpu(s.v->children[i]); + -+ if (id >= k.k->p.offset) { -+ prt_printf(err, "bad child node (%u >= %llu)", -+ id, k.k->p.offset); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(id >= k.k->p.offset, c, err, ++ snapshot_child_bad, ++ "bad child node (%u >= %llu)", ++ id, k.k->p.offset); + } + + if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { -+ if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || -+ le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) { -+ prt_printf(err, "skiplist not normalized"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || ++ le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err, ++ snapshot_skiplist_not_normalized, ++ "skiplist not normalized"); + + for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { + id = le32_to_cpu(s.v->skip[i]); + -+ if ((id && !s.v->parent) || -+ (id && id <= k.k->p.offset)) { -+ prt_printf(err, "bad skiplist node %u", id); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err, ++ snapshot_skiplist_bad, ++ "bad skiplist node %u", id); + } + } -+ -+ return 0; ++fsck_err: ++ return ret; +} + +static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) @@ -84089,8 +85782,9 @@ index 000000000000..4982468bfe11 + __set_is_ancestor_bitmap(c, id); + + if (BCH_SNAPSHOT_DELETED(s.v)) { -+ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); -+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots); ++ set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); ++ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) ++ bch2_delete_dead_snapshots_async(c); + } + } else { + memset(t, 0, sizeof(*t)); @@ -84293,7 +85987,7 @@ index 000000000000..4982468bfe11 + if (fsck_err_on(ret || + root_id != bch2_snapshot_root(c, root_id) || + st.k->p.offset != le32_to_cpu(s.tree), -+ c, ++ c, snapshot_tree_to_missing_snapshot, + "snapshot tree points to missing/incorrect snapshot:\n %s", + (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); @@ -84305,17 +85999,20 @@ index 000000000000..4982468bfe11 + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + -+ if (fsck_err_on(ret, c, ++ if (fsck_err_on(ret, ++ c, snapshot_tree_to_missing_subvol, + "snapshot tree points to missing subvolume:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || + fsck_err_on(!bch2_snapshot_is_ancestor_early(c, + le32_to_cpu(subvol.snapshot), -+ root_id), c, ++ root_id), ++ c, snapshot_tree_to_wrong_subvol, + "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || -+ fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c, ++ fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), ++ c, snapshot_tree_to_snapshot_subvol, + "snapshot tree points to snapshot subvolume:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { @@ -84551,7 +86248,9 @@ index 000000000000..4982468bfe11 + goto err; + } + } else { -+ if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s", ++ if (fsck_err_on(s.subvol, ++ c, snapshot_should_not_have_subvol, ++ "snapshot should not point to subvol:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); @@ -84567,7 +86266,8 @@ index 000000000000..4982468bfe11 + if (ret < 0) + goto err; + -+ if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s", ++ if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree, ++ "snapshot points to missing/incorrect tree:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = snapshot_tree_ptr_repair(trans, iter, k, &s); + if (ret) @@ -84579,7 +86279,8 @@ index 000000000000..4982468bfe11 + + if (le32_to_cpu(s.depth) != real_depth && + (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || -+ fsck_err(c, "snapshot with incorrect depth field, should be %u:\n %s", ++ fsck_err(c, snapshot_bad_depth, ++ "snapshot with incorrect depth field, should be %u:\n %s", + real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); @@ -84596,7 +86297,8 @@ index 000000000000..4982468bfe11 + + if (!ret && + (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || -+ fsck_err(c, "snapshot with bad skiplist field:\n %s", ++ fsck_err(c, snapshot_bad_skiplist, ++ "snapshot with bad skiplist field:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); @@ -85015,13 +86717,7 @@ index 000000000000..4982468bfe11 + return 0; +} + -+/* -+ * For a given snapshot, if it doesn't have a subvolume that points to it, and -+ * it doesn't have child snapshot nodes - it's now redundant and we can mark it -+ * as deleted. -+ */ -+static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_s_c k) ++static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bkey_s_c_snapshot snap; + u32 children[2]; @@ -85042,10 +86738,21 @@ index 000000000000..4982468bfe11 + bch2_snapshot_live(trans, children[1]); + if (ret < 0) + return ret; ++ return !ret; ++} + -+ if (!ret) -+ return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); -+ return 0; ++/* ++ * For a given snapshot, if it doesn't have a subvolume that points to it, and ++ * it doesn't have child snapshot nodes - it's now redundant and we can mark it ++ * as deleted. ++ */ ++static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k) ++{ ++ int ret = bch2_snapshot_needs_delete(trans, k); ++ ++ return ret <= 0 ++ ? ret ++ : bch2_snapshot_node_set_deleted(trans, k.k->p.offset); +} + +static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, @@ -85106,12 +86813,12 @@ index 000000000000..4982468bfe11 + u32 id = le32_to_cpu(s->v.skip[j]); + + if (snapshot_list_has_id(deleted, id)) { -+ id = depth > 1 -+ ? bch2_snapshot_nth_parent_skip(c, ++ id = bch2_snapshot_nth_parent_skip(c, + parent, -+ get_random_u32_below(depth - 1), -+ deleted) -+ : parent; ++ depth > 1 ++ ? get_random_u32_below(depth - 1) ++ : 0, ++ deleted); + s->v.skip[j] = cpu_to_le32(id); + } + } @@ -85133,6 +86840,9 @@ index 000000000000..4982468bfe11 + u32 *i, id; + int ret = 0; + ++ if (!test_and_clear_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) ++ return 0; ++ + if (!test_bit(BCH_FS_STARTED, &c->flags)) { + ret = bch2_fs_read_write_early(c); + if (ret) { @@ -85150,7 +86860,7 @@ index 000000000000..4982468bfe11 + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + NULL, NULL, 0, -+ bch2_delete_redundant_snapshot(trans, &iter, k)); ++ bch2_delete_redundant_snapshot(trans, k)); + if (ret) { + bch_err_msg(c, ret, "deleting redundant snapshots"); + goto err; @@ -85191,6 +86901,15 @@ index 000000000000..4982468bfe11 + if (!btree_type_has_snapshots(id)) + continue; + ++ /* ++ * deleted inodes btree is maintained by a trigger on the inodes ++ * btree - no work for us to do here, and it's not safe to scan ++ * it because we'll see out of date keys due to the btree write ++ * buffer: ++ */ ++ if (id == BTREE_ID_deleted_inodes) ++ continue; ++ + ret = for_each_btree_key_commit(trans, iter, + id, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, @@ -85211,6 +86930,7 @@ index 000000000000..4982468bfe11 + } + } + ++ bch2_trans_unlock(trans); + down_write(&c->snapshot_create_lock); + + for_each_btree_key(trans, iter, BTREE_ID_snapshots, @@ -85255,8 +86975,6 @@ index 000000000000..4982468bfe11 + goto err_create_lock; + } + } -+ -+ clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); +err_create_lock: + up_write(&c->snapshot_create_lock); +err: @@ -85272,8 +86990,7 @@ index 000000000000..4982468bfe11 +{ + struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + -+ if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) -+ bch2_delete_dead_snapshots(c); ++ bch2_delete_dead_snapshots(c); + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); +} + @@ -85284,20 +87001,6 @@ index 000000000000..4982468bfe11 + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); +} + -+int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, -+ struct btree_trans_commit_hook *h) -+{ -+ struct bch_fs *c = trans->c; -+ -+ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); -+ -+ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots) -+ return 0; -+ -+ bch2_delete_dead_snapshots_async(c); -+ return 0; -+} -+ +int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) @@ -85428,6 +87131,26 @@ index 000000000000..4982468bfe11 + return ret ?: trans_was_restarted(trans, restart_count); +} + ++static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c_snapshot snap; ++ int ret = 0; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ if (BCH_SNAPSHOT_DELETED(snap.v) || ++ bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset || ++ (ret = bch2_snapshot_needs_delete(trans, k)) > 0) { ++ set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); ++ return 0; ++ } ++ ++ return ret; ++} ++ +int bch2_snapshots_read(struct bch_fs *c) +{ + struct btree_iter iter; @@ -85438,7 +87161,8 @@ index 000000000000..4982468bfe11 + for_each_btree_key2(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: -+ bch2_snapshot_set_equiv(trans, k)) ?: ++ bch2_snapshot_set_equiv(trans, k) ?: ++ bch2_check_snapshot_needs_deletion(trans, k)) ?: + for_each_btree_key2(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); @@ -85453,10 +87177,10 @@ index 000000000000..4982468bfe11 +} diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h new file mode 100644 -index 000000000000..de215d9d1252 +index 000000000000..f09a22f44239 --- /dev/null +++ b/fs/bcachefs/snapshot.h -@@ -0,0 +1,270 @@ +@@ -0,0 +1,268 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SNAPSHOT_H +#define _BCACHEFS_SNAPSHOT_H @@ -85464,7 +87188,7 @@ index 000000000000..de215d9d1252 +enum bkey_invalid_flags; + +void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); + +#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ @@ -85478,7 +87202,7 @@ index 000000000000..de215d9d1252 +int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); + +void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); @@ -85703,8 +87427,6 @@ index 000000000000..de215d9d1252 +int bch2_check_snapshots(struct bch_fs *); + +int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); -+int bch2_delete_dead_snapshots_hook(struct btree_trans *, -+ struct btree_trans_commit_hook *); +void bch2_delete_dead_snapshots_work(struct work_struct *); + +int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); @@ -86105,10 +87827,10 @@ index 000000000000..ae21a8cca1b4 +#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c new file mode 100644 -index 000000000000..caf2dd7dafff +index 000000000000..fccd25aa3242 --- /dev/null +++ b/fs/bcachefs/subvolume.c -@@ -0,0 +1,450 @@ +@@ -0,0 +1,437 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -86173,7 +87895,8 @@ index 000000000000..caf2dd7dafff + if (ret) + return ret; + -+ if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, c, ++ if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, ++ c, subvol_not_master_and_not_snapshot, + "subvolume %llu is not set as snapshot but is not master subvolume", + k.k->p.offset)) { + struct bkey_i_subvolume *s = @@ -86208,16 +87931,17 @@ index 000000000000..caf2dd7dafff + +/* Subvolumes: */ + -+int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) +{ -+ if (bkey_lt(k.k->p, SUBVOL_POS_MIN) || -+ bkey_gt(k.k->p, SUBVOL_POS_MAX)) { -+ prt_printf(err, "invalid pos"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ int ret = 0; + -+ return 0; ++ bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) || ++ bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err, ++ subvol_pos_bad, ++ "invalid pos"); ++fsck_err: ++ return ret; +} + +void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, @@ -86341,7 +88065,6 @@ index 000000000000..caf2dd7dafff +{ + struct btree_iter iter; + struct bkey_s_c_subvolume subvol; -+ struct btree_trans_commit_hook *h; + u32 snapid; + int ret = 0; + @@ -86357,22 +88080,8 @@ index 000000000000..caf2dd7dafff + + snapid = le32_to_cpu(subvol.v->snapshot); + -+ ret = bch2_btree_delete_at(trans, &iter, 0); -+ if (ret) -+ goto err; -+ -+ ret = bch2_snapshot_node_set_deleted(trans, snapid); -+ if (ret) -+ goto err; -+ -+ h = bch2_trans_kmalloc(trans, sizeof(*h)); -+ ret = PTR_ERR_OR_ZERO(h); -+ if (ret) -+ goto err; -+ -+ h->fn = bch2_delete_dead_snapshots_hook; -+ bch2_trans_commit_hook(trans, h); -+err: ++ ret = bch2_btree_delete_at(trans, &iter, 0) ?: ++ bch2_snapshot_node_set_deleted(trans, snapid); + bch2_trans_iter_exit(trans, &iter); + return ret; +} @@ -86561,7 +88270,7 @@ index 000000000000..caf2dd7dafff +} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h new file mode 100644 -index 000000000000..bb14f92e8687 +index 000000000000..a1003d30ab0a --- /dev/null +++ b/fs/bcachefs/subvolume.h @@ -0,0 +1,35 @@ @@ -86576,7 +88285,7 @@ index 000000000000..bb14f92e8687 + +int bch2_check_subvols(struct bch_fs *); + -+int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + @@ -86639,10 +88348,10 @@ index 000000000000..86833445af20 +#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c new file mode 100644 -index 000000000000..332d41e1c0a3 +index 000000000000..f4cad903f4d6 --- /dev/null +++ b/fs/bcachefs/super-io.c -@@ -0,0 +1,1258 @@ +@@ -0,0 +1,1266 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -86658,6 +88367,7 @@ index 000000000000..332d41e1c0a3 +#include "replicas.h" +#include "quota.h" +#include "sb-clean.h" ++#include "sb-errors.h" +#include "sb-members.h" +#include "super-io.h" +#include "super.h" @@ -87365,7 +89075,7 @@ index 000000000000..332d41e1c0a3 + if (opt_defined(*opts, sb)) + goto err; + -+ printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", ++ printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n", + path, err.buf); + printbuf_reset(&err); + @@ -87427,7 +89137,7 @@ index 000000000000..332d41e1c0a3 + + ret = bch2_sb_validate(sb, &err, READ); + if (ret) { -+ printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", ++ printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n", + path, err.buf); + goto err_no_print; + } @@ -87435,7 +89145,7 @@ index 000000000000..332d41e1c0a3 + printbuf_exit(&err); + return ret; +err: -+ printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", ++ printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n", + path, err.buf); +err_no_print: + bch2_free_super(sb); @@ -87450,7 +89160,12 @@ index 000000000000..332d41e1c0a3 + + /* XXX: return errors directly */ + -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s", ++ if (bch2_dev_io_err_on(bio->bi_status, ca, ++ bio_data_dir(bio) ++ ? BCH_MEMBER_ERROR_write ++ : BCH_MEMBER_ERROR_read, ++ "superblock %s error: %s", ++ bio_data_dir(bio) ? "write" : "read", + bch2_blk_status_to_str(bio->bi_status))) + ca->sb_write_error = 1; + @@ -87537,7 +89252,9 @@ index 000000000000..332d41e1c0a3 + SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); + + bch2_sb_counters_from_cpu(c); -+ bch_members_cpy_v2_v1(&c->disk_sb); ++ bch2_sb_members_from_cpu(c); ++ bch2_sb_members_cpy_v2_v1(&c->disk_sb); ++ bch2_sb_errors_from_cpu(c); + + for_each_online_member(ca, c, i) + bch2_sb_from_fs(c, ca); @@ -87820,7 +89537,7 @@ index 000000000000..332d41e1c0a3 + prt_printf(out, "Created:"); + prt_tab(out); + if (sb->time_base_lo) -+ pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); ++ bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); + else + prt_printf(out, "(not set)"); + prt_newline(out); @@ -87903,10 +89620,10 @@ index 000000000000..332d41e1c0a3 +} diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h new file mode 100644 -index 000000000000..b0d8584f475f +index 000000000000..f5abd102bff7 --- /dev/null +++ b/fs/bcachefs/super-io.h -@@ -0,0 +1,124 @@ +@@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_IO_H +#define _BCACHEFS_SUPER_IO_H @@ -87932,6 +89649,11 @@ index 000000000000..b0d8584f475f + unsigned, + unsigned); + ++static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) ++{ ++ return le32_to_cpu(f->u64s) * sizeof(u64); ++} ++ +#define field_to_type(_f, _name) \ + container_of_or_null(_f, struct bch_sb_field_##_name, field) + @@ -87987,41 +89709,6 @@ index 000000000000..b0d8584f475f + __bch2_check_set_feature(c, feat); +} + -+/* BCH_SB_FIELD_members_v1: */ -+ -+static inline bool bch2_member_exists(struct bch_member *m) -+{ -+ return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); -+} -+ -+static inline bool bch2_dev_exists(struct bch_sb *sb, -+ unsigned dev) -+{ -+ if (dev < sb->nr_devices) { -+ struct bch_member m = bch2_sb_member_get(sb, dev); -+ return bch2_member_exists(&m); -+ } -+ return false; -+} -+ -+static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) -+{ -+ return (struct bch_member_cpu) { -+ .nbuckets = le64_to_cpu(mi->nbuckets), -+ .first_bucket = le16_to_cpu(mi->first_bucket), -+ .bucket_size = le16_to_cpu(mi->bucket_size), -+ .group = BCH_MEMBER_GROUP(mi), -+ .state = BCH_MEMBER_STATE(mi), -+ .discard = BCH_MEMBER_DISCARD(mi), -+ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), -+ .durability = BCH_MEMBER_DURABILITY(mi) -+ ? BCH_MEMBER_DURABILITY(mi) - 1 -+ : 1, -+ .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), -+ .valid = bch2_member_exists(mi), -+ }; -+} -+ +void bch2_sb_maybe_downgrade(struct bch_fs *); +void bch2_sb_upgrade(struct bch_fs *, unsigned); + @@ -88033,10 +89720,10 @@ index 000000000000..b0d8584f475f +#endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c new file mode 100644 -index 000000000000..0e85c22672be +index 000000000000..24672bb31cbe --- /dev/null +++ b/fs/bcachefs/super.c -@@ -0,0 +1,2022 @@ +@@ -0,0 +1,2017 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs setup/teardown code, and some metadata io - read a superblock and @@ -88088,6 +89775,7 @@ index 000000000000..0e85c22672be +#include "recovery.h" +#include "replicas.h" +#include "sb-clean.h" ++#include "sb-errors.h" +#include "sb-members.h" +#include "snapshot.h" +#include "subvolume.h" @@ -88439,7 +90127,7 @@ index 000000000000..0e85c22672be + + bch_info(c, "going read-write"); + -+ ret = bch2_members_v2_init(c); ++ ret = bch2_sb_members_v2_init(c); + if (ret) + goto err; + @@ -88520,6 +90208,7 @@ index 000000000000..0e85c22672be + bch2_time_stats_exit(&c->times[i]); + + bch2_free_pending_node_rewrites(c); ++ bch2_fs_sb_errors_exit(c); + bch2_fs_counters_exit(c); + bch2_fs_snapshots_exit(c); + bch2_fs_quota_exit(c); @@ -88752,6 +90441,7 @@ index 000000000000..0e85c22672be + bch2_fs_quota_init(c); + bch2_fs_ec_init_early(c); + bch2_fs_move_init(c); ++ bch2_fs_sb_errors_init_early(c); + + INIT_LIST_HEAD(&c->list); + @@ -88768,8 +90458,8 @@ index 000000000000..0e85c22672be + + INIT_LIST_HEAD(&c->journal_iters); + -+ INIT_LIST_HEAD(&c->fsck_errors); -+ mutex_init(&c->fsck_error_lock); ++ INIT_LIST_HEAD(&c->fsck_error_msgs); ++ mutex_init(&c->fsck_error_msgs_lock); + + seqcount_init(&c->gc_pos_lock); + @@ -88879,6 +90569,7 @@ index 000000000000..0e85c22672be + } + + ret = bch2_fs_counters_init(c) ?: ++ bch2_fs_sb_errors_init(c) ?: + bch2_io_clock_init(&c->io_clock[READ]) ?: + bch2_io_clock_init(&c->io_clock[WRITE]) ?: + bch2_fs_journal_init(&c->journal) ?: @@ -88981,16 +90672,13 @@ index 000000000000..0e85c22672be + + mutex_lock(&c->sb_lock); + -+ ret = bch2_members_v2_init(c); ++ ret = bch2_sb_members_v2_init(c); + if (ret) { + mutex_unlock(&c->sb_lock); + goto err; + } + + for_each_online_member(ca, c, i) -+ bch2_sb_from_fs(c, ca); -+ -+ for_each_online_member(ca, c, i) + bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now); + + mutex_unlock(&c->sb_lock); @@ -88999,12 +90687,6 @@ index 000000000000..0e85c22672be + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + -+ for (i = 0; i < BCH_TRANSACTIONS_NR; i++) { -+ mutex_lock(&c->btree_transaction_stats[i].lock); -+ bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times); -+ mutex_unlock(&c->btree_transaction_stats[i].lock); -+ } -+ + ret = BCH_SB_INITIALIZED(c->disk_sb.sb) + ? bch2_fs_recovery(c) + : bch2_fs_initialize(c); @@ -89179,6 +90861,7 @@ index 000000000000..0e85c22672be + struct bch_member *member) +{ + struct bch_dev *ca; ++ unsigned i; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) @@ -89196,6 +90879,10 @@ index 000000000000..0e85c22672be + bch2_time_stats_init(&ca->io_latency[WRITE]); + + ca->mi = bch2_mi_to_cpu(member); ++ ++ for (i = 0; i < ARRAY_SIZE(member->errors); i++) ++ atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i])); ++ + ca->uuid = member->uuid; + + ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, @@ -89630,7 +91317,7 @@ index 000000000000..0e85c22672be + dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); + + if (BCH_MEMBER_GROUP(&dev_mi)) { -+ bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); ++ bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); + if (label.allocation_failure) { + ret = -ENOMEM; + goto err; @@ -89670,16 +91357,6 @@ index 000000000000..0e85c22672be + goto err_unlock; + } + -+ mi = bch2_sb_field_get(ca->disk_sb.sb, members_v2); -+ -+ if (!bch2_sb_field_resize(&ca->disk_sb, members_v2, -+ le32_to_cpu(mi->field.u64s) + -+ sizeof(dev_mi) / sizeof(u64))) { -+ ret = -BCH_ERR_ENOSPC_sb_members; -+ bch_err_msg(c, ret, "setting up new superblock"); -+ goto err_unlock; -+ } -+ + if (dynamic_fault("bcachefs:add:no_slot")) + goto no_slot; + @@ -89693,6 +91370,8 @@ index 000000000000..0e85c22672be + +have_slot: + nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); ++ ++ mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) + + le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64)); + @@ -89728,13 +91407,13 @@ index 000000000000..0e85c22672be + + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) { -+ bch_err_msg(c, ret, "marking new superblock"); ++ bch_err_msg(ca, ret, "marking new superblock"); + goto err_late; + } + + ret = bch2_fs_freespace_init(c); + if (ret) { -+ bch_err_msg(c, ret, "initializing free space"); ++ bch_err_msg(ca, ret, "initializing free space"); + goto err_late; + } + @@ -89802,19 +91481,26 @@ index 000000000000..0e85c22672be + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); + ++ if (!ca->mi.freespace_initialized) { ++ ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); ++ bch_err_msg(ca, ret, "initializing free space"); ++ if (ret) ++ goto err; ++ } ++ ++ if (!ca->journal.nr) { ++ ret = bch2_dev_journal_alloc(ca); ++ bch_err_msg(ca, ret, "allocating journal"); ++ if (ret) ++ goto err; ++ } ++ + mutex_lock(&c->sb_lock); -+ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); -+ -+ m->last_mount = ++ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = + cpu_to_le64(ktime_get_real_seconds()); -+ + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + -+ ret = bch2_fs_freespace_init(c); -+ if (ret) -+ bch_err_msg(c, ret, "initializing free space"); -+ + up_write(&c->state_lock); + return 0; +err: @@ -89925,9 +91611,9 @@ index 000000000000..0e85c22672be +struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + struct bch_opts opts) +{ -+ struct bch_sb_handle *sb = NULL; ++ DARRAY(struct bch_sb_handle) sbs = { 0 }; + struct bch_fs *c = NULL; -+ unsigned i, best_sb = 0; ++ struct bch_sb_handle *sb, *best = NULL; + struct printbuf errbuf = PRINTBUF; + int ret = 0; + @@ -89939,49 +91625,46 @@ index 000000000000..0e85c22672be + goto err; + } + -+ sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); -+ if (!sb) { -+ ret = -ENOMEM; ++ ret = darray_make_room(&sbs, nr_devices); ++ if (ret) + goto err; -+ } + -+ for (i = 0; i < nr_devices; i++) { -+ ret = bch2_read_super(devices[i], &opts, &sb[i]); ++ for (unsigned i = 0; i < nr_devices; i++) { ++ struct bch_sb_handle sb = { NULL }; ++ ++ ret = bch2_read_super(devices[i], &opts, &sb); + if (ret) + goto err; + ++ BUG_ON(darray_push(&sbs, sb)); + } + -+ for (i = 1; i < nr_devices; i++) -+ if (le64_to_cpu(sb[i].sb->seq) > -+ le64_to_cpu(sb[best_sb].sb->seq)) -+ best_sb = i; ++ darray_for_each(sbs, sb) ++ if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq)) ++ best = sb; + -+ i = 0; -+ while (i < nr_devices) { -+ if (i != best_sb && -+ !bch2_dev_exists(sb[best_sb].sb, sb[i].sb->dev_idx)) { -+ pr_info("%pg has been removed, skipping", sb[i].bdev); -+ bch2_free_super(&sb[i]); -+ array_remove_item(sb, nr_devices, i); ++ darray_for_each_reverse(sbs, sb) { ++ if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) { ++ pr_info("%pg has been removed, skipping", sb->bdev); ++ bch2_free_super(sb); ++ darray_remove_item(&sbs, sb); ++ best -= best > sb; + continue; + } + -+ ret = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); ++ ret = bch2_dev_in_fs(best->sb, sb->sb); + if (ret) + goto err_print; -+ i++; + } + -+ c = bch2_fs_alloc(sb[best_sb].sb, opts); -+ if (IS_ERR(c)) { -+ ret = PTR_ERR(c); ++ c = bch2_fs_alloc(best->sb, opts); ++ ret = PTR_ERR_OR_ZERO(c); ++ if (ret) + goto err; -+ } + + down_write(&c->state_lock); -+ for (i = 0; i < nr_devices; i++) { -+ ret = bch2_dev_attach_bdev(c, &sb[i]); ++ darray_for_each(sbs, sb) { ++ ret = bch2_dev_attach_bdev(c, sb); + if (ret) { + up_write(&c->state_lock); + goto err; @@ -90000,7 +91683,9 @@ index 000000000000..0e85c22672be + goto err; + } +out: -+ kfree(sb); ++ darray_for_each(sbs, sb) ++ bch2_free_super(sb); ++ darray_exit(&sbs); + printbuf_exit(&errbuf); + module_put(THIS_MODULE); + return c; @@ -90010,9 +91695,6 @@ index 000000000000..0e85c22672be +err: + if (!IS_ERR_OR_NULL(c)) + bch2_fs_stop(c); -+ if (sb) -+ for (i = 0; i < nr_devices; i++) -+ bch2_free_super(&sb[i]); + c = ERR_PTR(ret); + goto out; +} @@ -90119,10 +91801,10 @@ index 000000000000..bf762df18012 +#endif /* _BCACHEFS_SUPER_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h new file mode 100644 -index 000000000000..78d6138db62d +index 000000000000..7dda4985b99f --- /dev/null +++ b/fs/bcachefs/super_types.h -@@ -0,0 +1,52 @@ +@@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_TYPES_H +#define _BCACHEFS_SUPER_TYPES_H @@ -90162,25 +91844,13 @@ index 000000000000..78d6138db62d + u8 valid; +}; + -+struct bch_disk_group_cpu { -+ bool deleted; -+ u16 parent; -+ struct bch_devs_mask devs; -+}; -+ -+struct bch_disk_groups_cpu { -+ struct rcu_head rcu; -+ unsigned nr; -+ struct bch_disk_group_cpu entries[] __counted_by(nr); -+}; -+ +#endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c new file mode 100644 -index 000000000000..eb764b9a4629 +index 000000000000..662366ce9e00 --- /dev/null +++ b/fs/bcachefs/sysfs.c -@@ -0,0 +1,1031 @@ +@@ -0,0 +1,1034 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcache sysfs interfaces @@ -90332,7 +92002,9 @@ index 000000000000..eb764b9a4629 +read_attribute(first_bucket); +read_attribute(nbuckets); +rw_attribute(durability); -+read_attribute(iodone); ++read_attribute(io_done); ++read_attribute(io_errors); ++write_attribute(io_errors_reset); + +read_attribute(io_latency_read); +read_attribute(io_latency_write); @@ -90395,7 +92067,7 @@ index 000000000000..eb764b9a4629 + +rw_attribute(rebalance_enabled); +sysfs_pd_controller_attribute(rebalance); -+read_attribute(rebalance_work); ++read_attribute(rebalance_status); +rw_attribute(promote_whole_extents); + +read_attribute(new_stripes); @@ -90524,7 +92196,7 @@ index 000000000000..eb764b9a4629 + +static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) +{ -+ prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]); ++ prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree)); + bch2_bpos_to_text(out, c->gc_gens_pos); + prt_printf(out, "\n"); +} @@ -90569,8 +92241,8 @@ index 000000000000..eb764b9a4629 + if (attr == &sysfs_copy_gc_wait) + bch2_copygc_wait_to_text(out, c); + -+ if (attr == &sysfs_rebalance_work) -+ bch2_rebalance_work_to_text(out, c); ++ if (attr == &sysfs_rebalance_status) ++ bch2_rebalance_status_to_text(out, c); + + sysfs_print(promote_whole_extents, c->promote_whole_extents); + @@ -90829,7 +92501,7 @@ index 000000000000..eb764b9a4629 + &sysfs_copy_gc_wait, + + &sysfs_rebalance_enabled, -+ &sysfs_rebalance_work, ++ &sysfs_rebalance_status, + sysfs_pd_controller_files(rebalance), + + &sysfs_moving_ctxts, @@ -90890,10 +92562,8 @@ index 000000000000..eb764b9a4629 + bch2_opt_set_by_id(&c->opts, id, v); + + if ((id == Opt_background_target || -+ id == Opt_background_compression) && v) { -+ bch2_rebalance_add_work(c, S64_MAX); -+ rebalance_wakeup(c); -+ } ++ id == Opt_background_compression) && v) ++ bch2_set_rebalance_needs_scan(c, 0); + + ret = size; +err: @@ -91065,7 +92735,7 @@ index 000000000000..eb764b9a4629 + NULL +}; + -+static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) ++static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca) +{ + int rw, i; + @@ -91093,13 +92763,8 @@ index 000000000000..eb764b9a4629 + sysfs_print(discard, ca->mi.discard); + + if (attr == &sysfs_label) { -+ if (ca->mi.group) { -+ mutex_lock(&c->sb_lock); -+ bch2_disk_path_to_text(out, c->disk_sb.sb, -+ ca->mi.group - 1); -+ mutex_unlock(&c->sb_lock); -+ } -+ ++ if (ca->mi.group) ++ bch2_disk_path_to_text(out, c, ca->mi.group - 1); + prt_char(out, '\n'); + } + @@ -91113,8 +92778,11 @@ index 000000000000..eb764b9a4629 + prt_char(out, '\n'); + } + -+ if (attr == &sysfs_iodone) -+ dev_iodone_to_text(out, ca); ++ if (attr == &sysfs_io_done) ++ dev_io_done_to_text(out, ca); ++ ++ if (attr == &sysfs_io_errors) ++ bch2_dev_io_errors_to_text(out, ca); + + sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); + sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); @@ -91181,6 +92849,9 @@ index 000000000000..eb764b9a4629 + return ret; + } + ++ if (attr == &sysfs_io_errors_reset) ++ bch2_dev_errors_reset(ca); ++ + return size; +} +SYSFS_OPS(bch2_dev); @@ -91198,7 +92869,9 @@ index 000000000000..eb764b9a4629 + &sysfs_label, + + &sysfs_has_data, -+ &sysfs_iodone, ++ &sysfs_io_done, ++ &sysfs_io_errors, ++ &sysfs_io_errors_reset, + + &sysfs_io_latency_read, + &sysfs_io_latency_write, @@ -92214,10 +93887,10 @@ index 000000000000..c73b18aea7e0 +#endif /* _BCACHEFS_TEST_H */ diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c new file mode 100644 -index 000000000000..33efa6005c6f +index 000000000000..dc48b52b01b4 --- /dev/null +++ b/fs/bcachefs/trace.c -@@ -0,0 +1,16 @@ +@@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_types.h" @@ -92227,6 +93900,7 @@ index 000000000000..33efa6005c6f +#include "btree_locking.h" +#include "btree_update_interior.h" +#include "keylist.h" ++#include "move_types.h" +#include "opts.h" +#include "six.h" + @@ -92236,10 +93910,10 @@ index 000000000000..33efa6005c6f +#include "trace.h" diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h new file mode 100644 -index 000000000000..19264492151b +index 000000000000..893304a1f06e --- /dev/null +++ b/fs/bcachefs/trace.h -@@ -0,0 +1,1284 @@ +@@ -0,0 +1,1334 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bcachefs @@ -92310,7 +93984,7 @@ index 000000000000..19264492151b + TP_printk("%d,%d %u %s %llu:%llu:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->level, -+ bch2_btree_ids[__entry->btree_id], ++ bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) +); + @@ -92703,7 +94377,7 @@ index 000000000000..19264492151b + TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", + __entry->trans_fn, + (void *) __entry->caller_ip, -+ bch2_btree_ids[__entry->btree_id], ++ bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, @@ -92764,7 +94438,7 @@ index 000000000000..19264492151b + TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", + __entry->trans_fn, + (void *) __entry->caller_ip, -+ bch2_btree_ids[__entry->btree_id], ++ bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, @@ -93009,25 +94683,36 @@ index 000000000000..19264492151b +); + +TRACE_EVENT(move_data, -+ TP_PROTO(struct bch_fs *c, u64 sectors_moved, -+ u64 keys_moved), -+ TP_ARGS(c, sectors_moved, keys_moved), ++ TP_PROTO(struct bch_fs *c, ++ struct bch_move_stats *stats), ++ TP_ARGS(c, stats), + + TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __field(u64, sectors_moved ) ++ __field(dev_t, dev ) + __field(u64, keys_moved ) ++ __field(u64, keys_raced ) ++ __field(u64, sectors_seen ) ++ __field(u64, sectors_moved ) ++ __field(u64, sectors_raced ) + ), + + TP_fast_assign( -+ __entry->dev = c->dev; -+ __entry->sectors_moved = sectors_moved; -+ __entry->keys_moved = keys_moved; ++ __entry->dev = c->dev; ++ __entry->keys_moved = atomic64_read(&stats->keys_moved); ++ __entry->keys_raced = atomic64_read(&stats->keys_raced); ++ __entry->sectors_seen = atomic64_read(&stats->sectors_seen); ++ __entry->sectors_moved = atomic64_read(&stats->sectors_moved); ++ __entry->sectors_raced = atomic64_read(&stats->sectors_raced); + ), + -+ TP_printk("%d,%d sectors_moved %llu keys_moved %llu", ++ TP_printk("%d,%d keys moved %llu raced %llu" ++ "sectors seen %llu moved %llu raced %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), -+ __entry->sectors_moved, __entry->keys_moved) ++ __entry->keys_moved, ++ __entry->keys_raced, ++ __entry->sectors_seen, ++ __entry->sectors_moved, ++ __entry->sectors_raced) +); + +TRACE_EVENT(evacuate_bucket, @@ -93254,7 +94939,7 @@ index 000000000000..19264492151b + TP_printk("%s %pS btree %s pos %llu:%llu:%u", + __entry->trans_fn, + (void *) __entry->caller_ip, -+ bch2_btree_ids[__entry->btree_id], ++ bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot) @@ -93274,13 +94959,16 @@ index 000000000000..19264492151b + TP_ARGS(trans, caller_ip, path) +); + ++struct get_locks_fail; ++ +TRACE_EVENT(trans_restart_upgrade, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path, + unsigned old_locks_want, -+ unsigned new_locks_want), -+ TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want), ++ unsigned new_locks_want, ++ struct get_locks_fail *f), ++ TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) @@ -93288,6 +94976,11 @@ index 000000000000..19264492151b + __field(u8, btree_id ) + __field(u8, old_locks_want ) + __field(u8, new_locks_want ) ++ __field(u8, level ) ++ __field(u32, path_seq ) ++ __field(u32, node_seq ) ++ __field(u32, path_alloc_seq ) ++ __field(u32, downgrade_seq) + TRACE_BPOS_entries(pos) + ), + @@ -93297,18 +94990,28 @@ index 000000000000..19264492151b + __entry->btree_id = path->btree_id; + __entry->old_locks_want = old_locks_want; + __entry->new_locks_want = new_locks_want; ++ __entry->level = f->l; ++ __entry->path_seq = path->l[f->l].lock_seq; ++ __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq; ++ __entry->path_alloc_seq = path->alloc_seq; ++ __entry->downgrade_seq = path->downgrade_seq; + TRACE_BPOS_assign(pos, path->pos) + ), + -+ TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u", ++ TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u", + __entry->trans_fn, + (void *) __entry->caller_ip, -+ bch2_btree_ids[__entry->btree_id], ++ bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->old_locks_want, -+ __entry->new_locks_want) ++ __entry->new_locks_want, ++ __entry->level, ++ __entry->path_seq, ++ __entry->node_seq, ++ __entry->path_alloc_seq, ++ __entry->downgrade_seq) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, @@ -93461,7 +95164,7 @@ index 000000000000..19264492151b + TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", + __entry->trans_fn, + (void *) __entry->caller_ip, -+ bch2_btree_ids[__entry->btree_id], ++ bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, @@ -93469,6 +95172,27 @@ index 000000000000..19264492151b + __entry->new_u64s) +); + ++TRACE_EVENT(path_downgrade, ++ TP_PROTO(struct btree_trans *trans, ++ unsigned long caller_ip, ++ struct btree_path *path), ++ TP_ARGS(trans, caller_ip, path), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 32 ) ++ __field(unsigned long, caller_ip ) ++ ), ++ ++ TP_fast_assign( ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ ), ++ ++ TP_printk("%s %pS", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip) ++); ++ +DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), @@ -93605,10 +95329,10 @@ index 000000000000..905801772002 +#endif /* _BCACHEFS_TWO_STATE_LOCK_H */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c new file mode 100644 -index 000000000000..08bac0ba8d0b +index 000000000000..84b142fcc3df --- /dev/null +++ b/fs/bcachefs/util.c -@@ -0,0 +1,1141 @@ +@@ -0,0 +1,1159 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * random utiility code, for bcache but in theory not specific to bcache @@ -94078,6 +95802,24 @@ index 000000000000..08bac0ba8d0b + prt_printf(out, "%s", u->name); +} + ++#ifndef __KERNEL__ ++#include ++void bch2_prt_datetime(struct printbuf *out, time64_t sec) ++{ ++ time_t t = sec; ++ char buf[64]; ++ ctime_r(&t, buf); ++ prt_str(out, buf); ++} ++#else ++void bch2_prt_datetime(struct printbuf *out, time64_t sec) ++{ ++ char buf[64]; ++ snprintf(buf, sizeof(buf), "%ptT", &sec); ++ prt_u64(out, sec); ++} ++#endif ++ +#define TABSTOP_SIZE 12 + +static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) @@ -94752,10 +96494,10 @@ index 000000000000..08bac0ba8d0b +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h new file mode 100644 -index 000000000000..849a37ae497c +index 000000000000..2984b57b2958 --- /dev/null +++ b/fs/bcachefs/util.h -@@ -0,0 +1,852 @@ +@@ -0,0 +1,833 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_UTIL_H +#define _BCACHEFS_UTIL_H @@ -95003,26 +96745,7 @@ index 000000000000..849a37ae497c +#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__) + +void bch2_pr_time_units(struct printbuf *, u64); -+ -+#ifdef __KERNEL__ -+static inline void pr_time(struct printbuf *out, u64 time) -+{ -+ prt_printf(out, "%llu", time); -+} -+#else -+#include -+static inline void pr_time(struct printbuf *out, u64 _time) -+{ -+ char time_str[64]; -+ time_t time = _time; -+ struct tm *tm = localtime(&time); -+ size_t err = strftime(time_str, sizeof(time_str), "%c", tm); -+ if (!err) -+ prt_printf(out, "(formatting error)"); -+ else -+ prt_printf(out, "%s", time_str); -+} -+#endif ++void bch2_prt_datetime(struct printbuf *, time64_t); + +#ifdef __KERNEL__ +static inline void uuid_unparse_lower(u8 *uuid, char *out) @@ -95831,10 +97554,10 @@ index 000000000000..a6561b4b36a6 +#endif /* _VSTRUCTS_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c new file mode 100644 -index 000000000000..b069b1a62e25 +index 000000000000..a39ff0c296ec --- /dev/null +++ b/fs/bcachefs/xattr.c -@@ -0,0 +1,651 @@ +@@ -0,0 +1,643 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -95907,46 +97630,38 @@ index 000000000000..b069b1a62e25 + .cmp_bkey = xattr_cmp_bkey, +}; + -+int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, ++int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ -+ const struct xattr_handler *handler; + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); ++ unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len)); ++ int ret = 0; + -+ if (bkey_val_u64s(k.k) < -+ xattr_val_u64s(xattr.v->x_name_len, -+ le16_to_cpu(xattr.v->x_val_len))) { -+ prt_printf(err, "value too small (%zu < %u)", -+ bkey_val_u64s(k.k), -+ xattr_val_u64s(xattr.v->x_name_len, -+ le16_to_cpu(xattr.v->x_val_len))); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, c, err, ++ xattr_val_size_too_small, ++ "value too small (%zu < %u)", ++ bkey_val_u64s(k.k), val_u64s); + + /* XXX why +4 ? */ -+ if (bkey_val_u64s(k.k) > -+ xattr_val_u64s(xattr.v->x_name_len, -+ le16_to_cpu(xattr.v->x_val_len) + 4)) { -+ prt_printf(err, "value too big (%zu > %u)", -+ bkey_val_u64s(k.k), -+ xattr_val_u64s(xattr.v->x_name_len, -+ le16_to_cpu(xattr.v->x_val_len) + 4)); -+ return -BCH_ERR_invalid_bkey; -+ } ++ val_u64s = xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len) + 4); + -+ handler = bch2_xattr_type_to_handler(xattr.v->x_type); -+ if (!handler) { -+ prt_printf(err, "invalid type (%u)", xattr.v->x_type); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, c, err, ++ xattr_val_size_too_big, ++ "value too big (%zu > %u)", ++ bkey_val_u64s(k.k), val_u64s); + -+ if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) { -+ prt_printf(err, "xattr name has invalid characters"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), c, err, ++ xattr_invalid_type, ++ "invalid type (%u)", xattr.v->x_type); + -+ return 0; ++ bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), c, err, ++ xattr_name_invalid_chars, ++ "xattr name has invalid characters"); ++fsck_err: ++ return ret; +} + +void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, @@ -96427,7 +98142,7 @@ index 000000000000..b069b1a62e25 + if (value && + (opt_id == Opt_background_compression || + opt_id == Opt_background_target)) -+ bch2_rebalance_add_work(c, inode->v.i_blocks); ++ bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); + + return bch2_err_class(ret); +} @@ -96488,7 +98203,7 @@ index 000000000000..b069b1a62e25 +} diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h new file mode 100644 -index 000000000000..f5a52e3a6016 +index 000000000000..1337f31a5c49 --- /dev/null +++ b/fs/bcachefs/xattr.h @@ -0,0 +1,50 @@ @@ -96500,7 +98215,7 @@ index 000000000000..f5a52e3a6016 + +extern const struct bch_hash_desc bch2_xattr_hash_desc; + -+int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, ++int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + @@ -96576,22 +98291,24 @@ index 25ac74d30bff..796e23761ba0 100644 } EXPORT_SYMBOL(d_tmpfile); diff --git a/drivers/md/bcache/closure.h b/include/linux/closure.h -similarity index 93% +similarity index 91% rename from drivers/md/bcache/closure.h rename to include/linux/closure.h -index c88cdc4ae4ec..722a586bb224 100644 +index c88cdc4ae4ec..de7bb47d8a46 100644 --- a/drivers/md/bcache/closure.h +++ b/include/linux/closure.h -@@ -155,7 +155,7 @@ struct closure { +@@ -154,8 +154,9 @@ struct closure { + struct closure *parent; atomic_t remaining; ++ bool closure_get_happened; -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +#ifdef CONFIG_DEBUG_CLOSURES #define CLOSURE_MAGIC_DEAD 0xc054dead #define CLOSURE_MAGIC_ALIVE 0xc054a11e -@@ -172,6 +172,11 @@ void __closure_wake_up(struct closure_waitlist *list); +@@ -172,6 +173,11 @@ void __closure_wake_up(struct closure_waitlist *list); bool closure_wait(struct closure_waitlist *list, struct closure *cl); void __closure_sync(struct closure *cl); @@ -96603,12 +98320,16 @@ index c88cdc4ae4ec..722a586bb224 100644 /** * closure_sync - sleep until a closure a closure has nothing left to wait on * -@@ -180,19 +185,17 @@ void __closure_sync(struct closure *cl); +@@ -180,19 +186,21 @@ void __closure_sync(struct closure *cl); */ static inline void closure_sync(struct closure *cl) { - if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) -+ if (closure_nr_remaining(cl) != 1) ++#ifdef CONFIG_DEBUG_CLOSURES ++ BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened); ++#endif ++ ++ if (cl->closure_get_happened) __closure_sync(cl); } @@ -96625,7 +98346,7 @@ index c88cdc4ae4ec..722a586bb224 100644 static inline void closure_debug_create(struct closure *cl) {} static inline void closure_debug_destroy(struct closure *cl) {} -@@ -200,21 +203,21 @@ static inline void closure_debug_destroy(struct closure *cl) {} +@@ -200,21 +208,21 @@ static inline void closure_debug_destroy(struct closure *cl) {} static inline void closure_set_ip(struct closure *cl) { @@ -96650,7 +98371,16 @@ index c88cdc4ae4ec..722a586bb224 100644 cl->waiting_on = f; #endif } -@@ -243,6 +246,7 @@ static inline void closure_queue(struct closure *cl) +@@ -230,8 +238,6 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn, + closure_set_ip(cl); + cl->fn = fn; + cl->wq = wq; +- /* between atomic_dec() in closure_put() */ +- smp_mb__before_atomic(); + } + + static inline void closure_queue(struct closure *cl) +@@ -243,6 +249,7 @@ static inline void closure_queue(struct closure *cl) */ BUILD_BUG_ON(offsetof(struct closure, fn) != offsetof(struct work_struct, func)); @@ -96658,16 +98388,18 @@ index c88cdc4ae4ec..722a586bb224 100644 if (wq) { INIT_WORK(&cl->work, cl->work.func); BUG_ON(!queue_work(wq, &cl->work)); -@@ -255,7 +259,7 @@ static inline void closure_queue(struct closure *cl) +@@ -255,7 +262,9 @@ static inline void closure_queue(struct closure *cl) */ static inline void closure_get(struct closure *cl) { -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG ++ cl->closure_get_happened = true; ++ +#ifdef CONFIG_DEBUG_CLOSURES BUG_ON((atomic_inc_return(&cl->remaining) & CLOSURE_REMAINING_MASK) <= 1); #else -@@ -271,7 +275,7 @@ static inline void closure_get(struct closure *cl) +@@ -271,12 +280,13 @@ static inline void closure_get(struct closure *cl) */ static inline void closure_init(struct closure *cl, struct closure *parent) { @@ -96676,7 +98408,13 @@ index c88cdc4ae4ec..722a586bb224 100644 cl->parent = parent; if (parent) closure_get(parent); -@@ -375,4 +379,26 @@ static inline void closure_call(struct closure *cl, closure_fn fn, + + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); ++ cl->closure_get_happened = false; + + closure_debug_create(cl); + closure_set_ip(cl); +@@ -375,4 +385,26 @@ static inline void closure_call(struct closure *cl, closure_fn fn, continue_at_nobarrier(cl, fn, wq); } @@ -96964,10 +98702,10 @@ index 740109b6e2c8..57d394575919 100644 obj-$(CONFIG_GLOB) += glob.o diff --git a/drivers/md/bcache/closure.c b/lib/closure.c -similarity index 88% +similarity index 83% rename from drivers/md/bcache/closure.c rename to lib/closure.c -index d8d9394a6beb..0855e698ced1 100644 +index d8d9394a6beb..f86c9eeafb35 100644 --- a/drivers/md/bcache/closure.c +++ b/lib/closure.c @@ -6,13 +6,13 @@ @@ -96987,23 +98725,39 @@ index d8d9394a6beb..0855e698ced1 100644 static inline void closure_put_after_sub(struct closure *cl, int flags) { int r = flags & CLOSURE_REMAINING_MASK; -@@ -45,6 +45,7 @@ void closure_sub(struct closure *cl, int v) +@@ -21,6 +21,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) + BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); + + if (!r) { ++ smp_acquire__after_ctrl_dep(); ++ ++ cl->closure_get_happened = false; ++ + if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { + atomic_set(&cl->remaining, + CLOSURE_REMAINING_INITIALIZER); +@@ -43,16 +47,18 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) + /* For clearing flags with the same atomic op as a put */ + void closure_sub(struct closure *cl, int v) { - closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); +- closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); ++ closure_put_after_sub(cl, atomic_sub_return_release(v, &cl->remaining)); } +EXPORT_SYMBOL(closure_sub); /* * closure_put - decrement a closure's refcount -@@ -53,6 +54,7 @@ void closure_put(struct closure *cl) + */ + void closure_put(struct closure *cl) { - closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); +- closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); ++ closure_put_after_sub(cl, atomic_dec_return_release(&cl->remaining)); } +EXPORT_SYMBOL(closure_put); /* * closure_wake_up - wake up all closures on a wait list, without memory barrier -@@ -74,6 +76,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) +@@ -74,6 +80,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) closure_sub(cl, CLOSURE_WAITING + 1); } } @@ -97011,7 +98765,14 @@ index d8d9394a6beb..0855e698ced1 100644 /** * closure_wait - add a closure to a waitlist -@@ -93,6 +96,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) +@@ -87,12 +94,14 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) + if (atomic_read(&cl->remaining) & CLOSURE_WAITING) + return false; + ++ cl->closure_get_happened = true; + closure_set_waiting(cl, _RET_IP_); + atomic_add(CLOSURE_WAITING + 1, &cl->remaining); + llist_add(&cl->list, &waitlist->list); return true; } @@ -97019,7 +98780,7 @@ index d8d9394a6beb..0855e698ced1 100644 struct closure_syncer { struct task_struct *task; -@@ -127,8 +131,9 @@ void __sched __closure_sync(struct closure *cl) +@@ -127,8 +136,9 @@ void __sched __closure_sync(struct closure *cl) __set_current_state(TASK_RUNNING); } @@ -97030,7 +98791,7 @@ index d8d9394a6beb..0855e698ced1 100644 static LIST_HEAD(closure_list); static DEFINE_SPINLOCK(closure_list_lock); -@@ -144,6 +149,7 @@ void closure_debug_create(struct closure *cl) +@@ -144,6 +154,7 @@ void closure_debug_create(struct closure *cl) list_add(&cl->all, &closure_list); spin_unlock_irqrestore(&closure_list_lock, flags); } @@ -97038,7 +98799,7 @@ index d8d9394a6beb..0855e698ced1 100644 void closure_debug_destroy(struct closure *cl) { -@@ -156,8 +162,7 @@ void closure_debug_destroy(struct closure *cl) +@@ -156,8 +167,7 @@ void closure_debug_destroy(struct closure *cl) list_del(&cl->all); spin_unlock_irqrestore(&closure_list_lock, flags); } @@ -97048,7 +98809,7 @@ index d8d9394a6beb..0855e698ced1 100644 static int debug_show(struct seq_file *f, void *data) { -@@ -181,7 +186,7 @@ static int debug_show(struct seq_file *f, void *data) +@@ -181,7 +191,7 @@ static int debug_show(struct seq_file *f, void *data) seq_printf(f, " W %pS\n", (void *) cl->waiting_on); @@ -97057,7 +98818,7 @@ index d8d9394a6beb..0855e698ced1 100644 } spin_unlock_irq(&closure_list_lock); -@@ -190,18 +195,11 @@ static int debug_show(struct seq_file *f, void *data) +@@ -190,18 +200,11 @@ static int debug_show(struct seq_file *f, void *data) DEFINE_SHOW_ATTRIBUTE(debug);