diff --git a/linux-tkg-patches/5.18/0008-5.18-bcachefs.patch b/linux-tkg-patches/5.18/0008-5.18-bcachefs.patch index 8cb13cf..fc7456c 100644 --- a/linux-tkg-patches/5.18/0008-5.18-bcachefs.patch +++ b/linux-tkg-patches/5.18/0008-5.18-bcachefs.patch @@ -1,10 +1,11 @@ -From e2bc97d02026d17fad53c5b34ff4ca9aacf45080 Mon Sep 17 00:00:00 2001 -From: Piotr Gorski -Date: Sat, 2 Jul 2022 02:47:15 +0200 -Subject: [PATCH] bcachefs-5.18: introduce bcachefs patchset +From 98a9e9a069c0619986de099d587dae0158d82eac Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 22 Jul 2022 14:22:01 +0200 +Subject: [PATCH] 5.18-bcachefs -Signed-off-by: Piotr Gorski +Signed-off-by: Peter Jung --- + .github/ISSUE_TEMPLATE/bug_report.md | 61 + Documentation/core-api/printk-formats.rst | 22 + arch/powerpc/kernel/process.c | 16 +- arch/powerpc/kernel/security.c | 75 +- @@ -25,18 +26,18 @@ Signed-off-by: Piotr Gorski drivers/pci/p2pdma.c | 21 +- fs/Kconfig | 1 + fs/Makefile | 1 + - fs/bcachefs/Kconfig | 52 + - fs/bcachefs/Makefile | 68 + + fs/bcachefs/Kconfig | 59 + + fs/bcachefs/Makefile | 69 + fs/bcachefs/acl.c | 406 ++ fs/bcachefs/acl.h | 58 + - fs/bcachefs/alloc_background.c | 1600 ++++++++ - fs/bcachefs/alloc_background.h | 181 + - fs/bcachefs/alloc_foreground.c | 1282 ++++++ - fs/bcachefs/alloc_foreground.h | 173 + + fs/bcachefs/alloc_background.c | 1552 ++++++++ + fs/bcachefs/alloc_background.h | 183 + + fs/bcachefs/alloc_foreground.c | 1380 +++++++ + fs/bcachefs/alloc_foreground.h | 181 + fs/bcachefs/alloc_types.h | 87 + - fs/bcachefs/backpointers.c | 891 +++++ + fs/bcachefs/backpointers.c | 875 ++++ fs/bcachefs/backpointers.h | 38 + - fs/bcachefs/bcachefs.h | 988 +++++ + fs/bcachefs/bcachefs.h | 1000 +++++ fs/bcachefs/bcachefs_format.h | 2052 ++++++++++ fs/bcachefs/bcachefs_ioctl.h | 368 ++ fs/bcachefs/bkey.c | 1175 ++++++ @@ -48,23 +49,23 @@ Signed-off-by: Piotr Gorski fs/bcachefs/bkey_sort.h | 44 + fs/bcachefs/bset.c | 1598 ++++++++ fs/bcachefs/bset.h | 615 +++ - fs/bcachefs/btree_cache.c | 1162 ++++++ + fs/bcachefs/btree_cache.c | 1170 ++++++ fs/bcachefs/btree_cache.h | 107 + - fs/bcachefs/btree_gc.c | 2128 ++++++++++ + fs/bcachefs/btree_gc.c | 2098 ++++++++++ fs/bcachefs/btree_gc.h | 112 + fs/bcachefs/btree_io.c | 2150 ++++++++++ fs/bcachefs/btree_io.h | 222 ++ - fs/bcachefs/btree_iter.c | 3471 ++++++++++++++++ - fs/bcachefs/btree_iter.h | 411 ++ - fs/bcachefs/btree_key_cache.c | 850 ++++ + fs/bcachefs/btree_iter.c | 3515 +++++++++++++++++ + fs/bcachefs/btree_iter.h | 556 +++ + fs/bcachefs/btree_key_cache.c | 855 ++++ fs/bcachefs/btree_key_cache.h | 47 + - fs/bcachefs/btree_locking.h | 259 ++ - fs/bcachefs/btree_types.h | 687 ++++ - fs/bcachefs/btree_update.h | 156 + - fs/bcachefs/btree_update_interior.c | 2253 +++++++++++ + fs/bcachefs/btree_locking.h | 289 ++ + fs/bcachefs/btree_types.h | 697 ++++ + fs/bcachefs/btree_update.h | 158 + + fs/bcachefs/btree_update_interior.c | 2266 +++++++++++ fs/bcachefs/btree_update_interior.h | 321 ++ - fs/bcachefs/btree_update_leaf.c | 1815 +++++++++ - fs/bcachefs/buckets.c | 2114 ++++++++++ + fs/bcachefs/btree_update_leaf.c | 1800 +++++++++ + fs/bcachefs/buckets.c | 2113 ++++++++++ fs/bcachefs/buckets.h | 300 ++ fs/bcachefs/buckets_types.h | 103 + fs/bcachefs/buckets_waiting_for_journal.c | 167 + @@ -72,7 +73,7 @@ Signed-off-by: Piotr Gorski .../buckets_waiting_for_journal_types.h | 23 + fs/bcachefs/chardev.c | 760 ++++ fs/bcachefs/chardev.h | 31 + - fs/bcachefs/checksum.c | 707 ++++ + fs/bcachefs/checksum.c | 712 ++++ fs/bcachefs/checksum.h | 204 + fs/bcachefs/clock.c | 
191 + fs/bcachefs/clock.h | 38 + @@ -82,20 +83,21 @@ Signed-off-by: Piotr Gorski fs/bcachefs/counters.c | 107 + fs/bcachefs/counters.h | 17 + fs/bcachefs/darray.h | 77 + - fs/bcachefs/data_update.c | 379 ++ + fs/bcachefs/data_update.c | 376 ++ fs/bcachefs/data_update.h | 38 + - fs/bcachefs/debug.c | 707 ++++ + fs/bcachefs/debug.c | 764 ++++ fs/bcachefs/debug.h | 30 + fs/bcachefs/dirent.c | 565 +++ fs/bcachefs/dirent.h | 67 + fs/bcachefs/disk_groups.c | 506 +++ fs/bcachefs/disk_groups.h | 90 + - fs/bcachefs/ec.c | 1695 ++++++++ + fs/bcachefs/ec.c | 1673 ++++++++ fs/bcachefs/ec.h | 230 ++ fs/bcachefs/ec_types.h | 46 + - fs/bcachefs/errcode.h | 12 + - fs/bcachefs/error.c | 185 + - fs/bcachefs/error.h | 238 ++ + fs/bcachefs/errcode.c | 51 + + fs/bcachefs/errcode.h | 64 + + fs/bcachefs/error.c | 184 + + fs/bcachefs/error.h | 223 ++ fs/bcachefs/extent_update.c | 178 + fs/bcachefs/extent_update.h | 12 + fs/bcachefs/extents.c | 1324 +++++++ @@ -105,24 +107,24 @@ Signed-off-by: Piotr Gorski fs/bcachefs/fifo.h | 127 + fs/bcachefs/fs-common.c | 496 +++ fs/bcachefs/fs-common.h | 43 + - fs/bcachefs/fs-io.c | 3496 +++++++++++++++++ + fs/bcachefs/fs-io.c | 3496 ++++++++++++++++ fs/bcachefs/fs-io.h | 56 + fs/bcachefs/fs-ioctl.c | 523 +++ fs/bcachefs/fs-ioctl.h | 81 + fs/bcachefs/fs.c | 1939 +++++++++ fs/bcachefs/fs.h | 208 + - fs/bcachefs/fsck.c | 2413 ++++++++++++ + fs/bcachefs/fsck.c | 2390 +++++++++++ fs/bcachefs/fsck.h | 8 + fs/bcachefs/inode.c | 771 ++++ fs/bcachefs/inode.h | 189 + - fs/bcachefs/io.c | 2417 ++++++++++++ + fs/bcachefs/io.c | 2422 ++++++++++++ fs/bcachefs/io.h | 189 + fs/bcachefs/io_types.h | 161 + fs/bcachefs/journal.c | 1429 +++++++ fs/bcachefs/journal.h | 521 +++ fs/bcachefs/journal_io.c | 1735 ++++++++ fs/bcachefs/journal_io.h | 59 + - fs/bcachefs/journal_reclaim.c | 849 ++++ + fs/bcachefs/journal_reclaim.c | 852 ++++ fs/bcachefs/journal_reclaim.h | 86 + fs/bcachefs/journal_sb.c | 220 ++ fs/bcachefs/journal_sb.h | 24 + @@ -132,26 +134,26 @@ Signed-off-by: Piotr Gorski fs/bcachefs/keylist.c | 67 + fs/bcachefs/keylist.h | 76 + fs/bcachefs/keylist_types.h | 16 + - fs/bcachefs/lru.c | 219 ++ + fs/bcachefs/lru.c | 206 + fs/bcachefs/lru.h | 19 + - fs/bcachefs/migrate.c | 193 + + fs/bcachefs/migrate.c | 186 + fs/bcachefs/migrate.h | 7 + - fs/bcachefs/move.c | 951 +++++ + fs/bcachefs/move.c | 952 +++++ fs/bcachefs/move.h | 67 + fs/bcachefs/move_types.h | 19 + - fs/bcachefs/movinggc.c | 282 ++ - fs/bcachefs/movinggc.h | 9 + + fs/bcachefs/movinggc.c | 285 ++ + fs/bcachefs/movinggc.h | 10 + fs/bcachefs/opts.c | 578 +++ - fs/bcachefs/opts.h | 504 +++ - fs/bcachefs/quota.c | 859 ++++ + fs/bcachefs/opts.h | 509 +++ + fs/bcachefs/quota.c | 823 ++++ fs/bcachefs/quota.h | 71 + fs/bcachefs/quota_types.h | 43 + - fs/bcachefs/rebalance.c | 358 ++ + fs/bcachefs/rebalance.c | 361 ++ fs/bcachefs/rebalance.h | 28 + fs/bcachefs/rebalance_types.h | 26 + - fs/bcachefs/recovery.c | 1584 ++++++++ + fs/bcachefs/recovery.c | 1597 ++++++++ fs/bcachefs/recovery.h | 58 + - fs/bcachefs/reflink.c | 421 ++ + fs/bcachefs/reflink.c | 422 ++ fs/bcachefs/reflink.h | 76 + fs/bcachefs/replicas.c | 1073 +++++ fs/bcachefs/replicas.h | 106 + @@ -159,20 +161,20 @@ Signed-off-by: Piotr Gorski fs/bcachefs/siphash.c | 173 + fs/bcachefs/siphash.h | 87 + fs/bcachefs/str_hash.h | 351 ++ - fs/bcachefs/subvolume.c | 1095 ++++++ - fs/bcachefs/subvolume.h | 126 + + fs/bcachefs/subvolume.c | 1108 ++++++ + fs/bcachefs/subvolume.h | 137 + fs/bcachefs/subvolume_types.h | 9 + fs/bcachefs/super-io.c | 1602 ++++++++ fs/bcachefs/super-io.h | 126 
+ - fs/bcachefs/super.c | 1970 ++++++++++ + fs/bcachefs/super.c | 1950 +++++++++ fs/bcachefs/super.h | 264 ++ fs/bcachefs/super_types.h | 51 + fs/bcachefs/sysfs.c | 943 +++++ fs/bcachefs/sysfs.h | 48 + - fs/bcachefs/tests.c | 947 +++++ + fs/bcachefs/tests.c | 976 +++++ fs/bcachefs/tests.h | 15 + fs/bcachefs/trace.c | 12 + - fs/bcachefs/util.c | 958 +++++ + fs/bcachefs/util.c | 964 +++++ fs/bcachefs/util.h | 783 ++++ fs/bcachefs/varint.c | 121 + fs/bcachefs/varint.h | 11 + @@ -204,13 +206,16 @@ Signed-off-by: Piotr Gorski include/linux/trace_events.h | 2 +- include/linux/trace_seq.h | 17 +- include/linux/vmalloc.h | 1 + - include/trace/events/bcachefs.h | 1020 +++++ + include/net/9p/9p.h | 2 +- + include/net/9p/client.h | 20 +- + include/trace/events/bcachefs.h | 1048 +++++ init/init_task.c | 1 + kernel/Kconfig.locks | 3 + kernel/locking/Makefile | 1 + kernel/locking/lockdep.c | 20 + kernel/locking/six.c | 759 ++++ kernel/module.c | 4 +- + kernel/stacktrace.c | 2 + kernel/trace/trace.c | 45 +- kernel/trace/trace_dynevent.c | 34 +- kernel/trace/trace_events_filter.c | 2 +- @@ -222,6 +227,7 @@ Signed-off-by: Piotr Gorski lib/Kconfig.debug | 9 + lib/Makefile | 8 +- {drivers/md/bcache => lib}/closure.c | 35 +- + lib/errname.c | 1 + lib/generic-radix-tree.c | 76 +- lib/hexdump.c | 246 +- lib/pretty-printers.c | 60 + @@ -241,8 +247,14 @@ Signed-off-by: Piotr Gorski mm/slab_common.c | 53 +- mm/vmalloc.c | 21 + mm/vmscan.c | 88 + + net/9p/client.c | 97 +- + net/9p/trans_fd.c | 12 +- + net/9p/trans_rdma.c | 4 +- + net/9p/trans_virtio.c | 4 +- + net/9p/trans_xen.c | 2 +- tools/testing/nvdimm/test/ndtest.c | 22 +- - 237 files changed, 83816 insertions(+), 2162 deletions(-) + 248 files changed, 84382 insertions(+), 2223 deletions(-) + create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 fs/bcachefs/Kconfig create mode 100644 fs/bcachefs/Makefile create mode 100644 fs/bcachefs/acl.c @@ -311,6 +323,7 @@ Signed-off-by: Piotr Gorski create mode 100644 fs/bcachefs/ec.c create mode 100644 fs/bcachefs/ec.h create mode 100644 fs/bcachefs/ec_types.h + create mode 100644 fs/bcachefs/errcode.c create mode 100644 fs/bcachefs/errcode.h create mode 100644 fs/bcachefs/error.c create mode 100644 fs/bcachefs/error.h @@ -410,8 +423,75 @@ Signed-off-by: Piotr Gorski delete mode 100644 lib/seq_buf.c rename {lib => mm}/show_mem.c (83%) +diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md +new file mode 100644 +index 000000000000..8af34357dd98 +--- /dev/null ++++ b/.github/ISSUE_TEMPLATE/bug_report.md +@@ -0,0 +1,61 @@ ++--- ++name: Bug report ++about: Create a report to help us improve ++title: " [short commit id]" ++labels: bug ++assignees: YellowOnion ++ ++--- ++ ++**Please search for duplicates** ++ ++**Version** ++ ++Make sure you're using a reasonably new version. ++ ++Provide the commit hash from the kernel version (preferable) or tools, don't say "I'm using the latest master" as that will very quickly become out of date. ++ ++**Generic info** ++Provide the output of: ++``` ++bcachefs fs usage ++bcachefs show-super ++``` ++**Tools bugs** ++ ++* pull the latest version, compile it, do not strip the binary. ++* provide the exact commands you used to run. ++* run with gdb: `gdb -ex run --args ./bcacehfs ` ++ ++If you get an assert/segfault etc: ++* type `bt` in to and provide the output here. ++ ++If the tools lockup: ++* run `perf top -p $(pidof bcachefs)` and provide a screenshot. ++* press ctrl+c to interrupt the process and provide the output of `bt`. 
++ ++**Kernel bugs** ++Compile the kernel with these flags: ++ ++``` ++CONFIG_PREEMPT=y ++CONFIG_BCACHEFS_DEBUG=y ++CONFIG_KALLSYMS=y ++CONFIG_KALLSYMS_ALL=y ++CONFIG_DEBUG_FS=y ++CONFIG_DYNAMIC_FTRACE=y ++CONFIG_FTRACE=y ++``` ++Provide the output of `dmesg` either in a paste-bin or as attachment, if less than 30~ lines just provide inline here. ++ ++ ++**Optional Advanced** ++ ++If lockup or performance issues: ++* run `perf record` and `perf record -e 'bcachefs:*' -o events.data` both during the window of issue and then ctrl+c. ++* run `perf archive` to dump symbols. ++* archive, compress and upload the files: `perf.data`, `events.data` and `perf.data.tar.bz2`. ++ ++Upload large files to a file storage provider: ++* provide the output of `bcachefs list_journal -a | zstd -f -T0 -o ../journal.log.zst` ++*compress & upload all the `metdata.dump.*` files from: bcachefs dump -o metadata.dump diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst -index 5e89497ba..4f4a35b3a 100644 +index 5e89497ba314..4f4a35b3aadc 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -625,6 +625,28 @@ Examples:: @@ -444,7 +524,7 @@ index 5e89497ba..4f4a35b3a 100644 ====== diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c -index 9be279469..4212864c8 100644 +index 9be279469a85..4212864c81d5 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -39,7 +39,7 @@ @@ -496,7 +576,7 @@ index 9be279469..4212864c8 100644 } diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c -index d96fd14bd..b34de62e6 100644 +index d96fd14bd7c9..b34de62e65ce 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -10,7 +10,7 @@ @@ -645,7 +725,7 @@ index d96fd14bd..b34de62e6 100644 #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c -index 82cae0897..fe2b41858 100644 +index 82cae08976bc..fe2b41858b5f 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -12,7 +12,7 @@ @@ -738,7 +818,7 @@ index 82cae0897..fe2b41858 100644 DEVICE_ATTR_RO(flags); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c -index 83f901e2c..5b6720b6a 100644 +index 83f901e2c2df..5b6720b6a417 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -19,7 +19,7 @@ @@ -805,7 +885,7 @@ index 83f901e2c..5b6720b6a 100644 ret = rdtgroup_setup_root(); if (ret) diff --git a/block/bio.c b/block/bio.c -index d3ca79c3e..8779a80f8 100644 +index d3ca79c3ebdf..8779a80f8156 100644 --- a/block/bio.c +++ b/block/bio.c @@ -553,15 +553,15 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) @@ -880,7 +960,7 @@ index d3ca79c3e..8779a80f8 100644 static inline bool bio_remaining_done(struct bio *bio) { diff --git a/block/blk-core.c b/block/blk-core.c -index a7329475a..a0929889c 100644 +index a7329475aba2..a0929889cf27 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -207,6 +207,7 @@ const char *blk_status_to_str(blk_status_t status) @@ -892,7 +972,7 @@ index a7329475a..a0929889c 100644 /** * blk_sync_queue - cancel any pending callbacks on a queue diff --git a/block/blk.h b/block/blk.h -index 8ccbc6e07..16067c4ac 100644 +index 8ccbc6e07636..16067c4ac775 100644 --- a/block/blk.h +++ b/block/blk.h @@ -240,7 +240,6 @@ static inline void blk_integrity_del(struct 
gendisk *disk) @@ -904,7 +984,7 @@ index 8ccbc6e07..16067c4ac 100644 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); diff --git a/drivers/acpi/apei/erst-dbg.c b/drivers/acpi/apei/erst-dbg.c -index c740f0faa..90aa034dc 100644 +index c740f0faad39..90aa034dceb0 100644 --- a/drivers/acpi/apei/erst-dbg.c +++ b/drivers/acpi/apei/erst-dbg.c @@ -11,6 +11,7 @@ @@ -916,7 +996,7 @@ index c740f0faa..90aa034dc 100644 #include #include diff --git a/drivers/block/loop.c b/drivers/block/loop.c -index 4e1dce3be..0e822f3ef 100644 +index 4e1dce3beab0..0e822f3ef912 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1153,8 +1153,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) @@ -929,7 +1009,7 @@ index 4e1dce3be..0e822f3ef 100644 int err; diff --git a/drivers/clk/tegra/clk-bpmp.c b/drivers/clk/tegra/clk-bpmp.c -index 6ecf18f71..301551174 100644 +index 6ecf18f71c32..301551174c13 100644 --- a/drivers/clk/tegra/clk-bpmp.c +++ b/drivers/clk/tegra/clk-bpmp.c @@ -5,7 +5,7 @@ @@ -991,7 +1071,7 @@ index 6ecf18f71..301551174 100644 static int tegra_bpmp_probe_clocks(struct tegra_bpmp *bpmp, diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c -index 3088c5b82..a8c5f90e8 100644 +index 3088c5b829f0..a8c5f90e8208 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -19,7 +19,7 @@ @@ -1038,7 +1118,7 @@ index 3088c5b82..a8c5f90e8 100644 /* diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig -index cf3e80969..f1a1f0c4a 100644 +index cf3e8096942a..f1a1f0c4a0ea 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -4,6 +4,7 @@ config BCACHE @@ -1066,7 +1146,7 @@ index cf3e80969..f1a1f0c4a 100644 bool "Asynchronous device registration (EXPERIMENTAL)" depends on BCACHE diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile -index 5b87e5967..054e8a33a 100644 +index 5b87e59676b8..054e8a33a7ab 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -2,6 +2,6 @@ @@ -1079,7 +1159,7 @@ index 5b87e5967..054e8a33a 100644 + journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ util.o writeback.o features.o diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 9ed9c955a..dbb72beb0 100644 +index 9ed9c955add7..dbb72beb036c 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -179,6 +179,7 @@ @@ -1099,7 +1179,7 @@ index 9ed9c955a..dbb72beb0 100644 struct bucket { atomic_t pin; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 2bb55278d..4a517301d 100644 +index 2bb55278d22d..4a517301db08 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2914,7 +2914,6 @@ static int __init bcache_init(void) @@ -1111,7 +1191,7 @@ index 2bb55278d..4a517301d 100644 bcache_is_reboot = false; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h -index 6f3cb7c92..f61ab1bad 100644 +index 6f3cb7c92130..f61ab1bada6c 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -4,6 +4,7 @@ @@ -1132,7 +1212,7 @@ index 6f3cb7c92..f61ab1bad 100644 #ifdef CONFIG_BCACHE_DEBUG diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c -index 30b1df3c9..3b7a6ca44 100644 +index 30b1df3c9d2f..3b7a6ca44668 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -17,7 +17,7 @@ @@ -1207,7 +1287,7 @@ index 30b1df3c9..3b7a6ca44 100644 acs_redirects = true; diff --git a/fs/Kconfig b/fs/Kconfig -index 30b751c7f..1160311af 100644 +index 
30b751c7f11a..1160311af303 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -40,6 +40,7 @@ source "fs/ocfs2/Kconfig" @@ -1219,7 +1299,7 @@ index 30b751c7f..1160311af 100644 endif # BLOCK diff --git a/fs/Makefile b/fs/Makefile -index 208a74e0b..5d5c8c792 100644 +index 208a74e0b00e..5d5c8c792058 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -134,6 +134,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ @@ -1232,10 +1312,10 @@ index 208a74e0b..5d5c8c792 100644 obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig new file mode 100644 -index 000000000..27742ce27 +index 000000000000..008886967841 --- /dev/null +++ b/fs/bcachefs/Kconfig -@@ -0,0 +1,52 @@ +@@ -0,0 +1,59 @@ + +config BCACHEFS_FS + tristate "bcachefs filesystem support" @@ -1260,6 +1340,7 @@ index 000000000..27742ce27 + select XOR_BLOCKS + select XXHASH + select SRCU ++ select SYMBOLIC_ERRNAME + help + The bcachefs filesystem - a modern, copy on write filesystem, with + support for multiple devices, compression, checksumming, etc. @@ -1288,12 +1369,18 @@ index 000000000..27742ce27 + depends on BCACHEFS_FS + help + Include some unit and performance tests for the core btree code ++ ++config BCACHEFS_LOCK_TIME_STATS ++ bool "bcachefs lock time statistics" ++ depends on BCACHEFS_FS ++ help ++ Expose statistics for how long we held a lock in debugfs diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile new file mode 100644 -index 000000000..d68aaf1a2 +index 000000000000..5dad8ed03a20 --- /dev/null +++ b/fs/bcachefs/Makefile -@@ -0,0 +1,68 @@ +@@ -0,0 +1,69 @@ + +obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o + @@ -1324,6 +1411,7 @@ index 000000000..d68aaf1a2 + disk_groups.o \ + data_update.o \ + ec.o \ ++ errcode.o \ + error.o \ + extents.o \ + extent_update.o \ @@ -1364,7 +1452,7 @@ index 000000000..d68aaf1a2 +bcachefs-$(CONFIG_BCACHEFS_POSIX_ACL) += acl.o diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c new file mode 100644 -index 000000000..5070caf8f +index 000000000000..5c6ccf685094 --- /dev/null +++ b/fs/bcachefs/acl.c @@ -0,0 +1,406 @@ @@ -1606,7 +1694,7 @@ index 000000000..5070caf8f + &X_SEARCH(acl_to_xattr_type(type), "", 0), + 0); + if (ret) { -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + if (ret != -ENOENT) + acl = ERR_PTR(ret); @@ -1705,7 +1793,7 @@ index 000000000..5070caf8f +btree_err: + bch2_trans_iter_exit(&trans, &inode_iter); + -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + if (unlikely(ret)) + goto err; @@ -1776,7 +1864,7 @@ index 000000000..5070caf8f +#endif /* CONFIG_BCACHEFS_POSIX_ACL */ diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h new file mode 100644 -index 000000000..2d76a4897 +index 000000000000..2d76a4897ba8 --- /dev/null +++ b/fs/bcachefs/acl.h @@ -0,0 +1,58 @@ @@ -1840,10 +1928,10 @@ index 000000000..2d76a4897 +#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c new file mode 100644 -index 000000000..738567173 +index 000000000000..cd6cbd2064ee --- /dev/null +++ b/fs/bcachefs/alloc_background.c -@@ -0,0 +1,1600 @@ +@@ -0,0 +1,1552 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -2389,7 +2477,7 @@ index 000000000..738567173 + bch2_trans_exit(&trans); + + if (ret) -+ bch_err(c, "error reading alloc info: %i", ret); ++ bch_err(c, "error reading alloc info: %s", bch2_err_str(ret)); + + return ret; +} @@ -2577,12 +2665,13 @@ index 000000000..738567173 + if (ret) + goto err; + -+ if 
(fsck_err_on(k.k->type != discard_key_type, c, -+ "incorrect key in need_discard btree (got %s should be %s)\n" -+ " %s", -+ bch2_bkey_types[k.k->type], -+ bch2_bkey_types[discard_key_type], -+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ if (k.k->type != discard_key_type && ++ (c->opts.reconstruct_alloc || ++ fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n" ++ " %s", ++ bch2_bkey_types[k.k->type], ++ bch2_bkey_types[discard_key_type], ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + @@ -2604,13 +2693,14 @@ index 000000000..738567173 + if (ret) + goto err; + -+ if (fsck_err_on(k.k->type != freespace_key_type, c, -+ "incorrect key in freespace btree (got %s should be %s)\n" -+ " %s", -+ bch2_bkey_types[k.k->type], -+ bch2_bkey_types[freespace_key_type], -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ if (k.k->type != freespace_key_type && ++ (c->opts.reconstruct_alloc || ++ fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n" ++ " %s", ++ bch2_bkey_types[k.k->type], ++ bch2_bkey_types[freespace_key_type], ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + @@ -2638,7 +2728,7 @@ index 000000000..738567173 +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter; -+ struct bkey_s_c k, freespace_k; ++ struct bkey_s_c alloc_k; + struct bch_alloc_v4 a; + u64 genbits; + struct bpos pos; @@ -2648,14 +2738,6 @@ index 000000000..738567173 + struct printbuf buf = PRINTBUF; + int ret; + -+ freespace_k = bch2_btree_iter_peek(iter); -+ if (!freespace_k.k) -+ return 1; -+ -+ ret = bkey_err(freespace_k); -+ if (ret) -+ return ret; -+ + pos = iter->pos; + pos.offset &= ~(~0ULL << 56); + genbits = iter->pos.offset & (~0ULL << 56); @@ -2667,18 +2749,18 @@ index 000000000..738567173 + bch2_btree_ids[iter->btree_id], pos.inode, pos.offset)) + goto delete; + -+ k = bch2_btree_iter_peek_slot(&alloc_iter); -+ ret = bkey_err(k); ++ alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); ++ ret = bkey_err(alloc_k); + if (ret) + goto err; + -+ bch2_alloc_to_v4(k, &a); ++ bch2_alloc_to_v4(alloc_k, &a); + + if (fsck_err_on(a.data_type != state || + (state == BCH_DATA_free && + genbits != alloc_freespace_genbits(a)), c, + "%s\n incorrectly set in %s index (free %u, genbits %llu should be %llu)", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), + bch2_btree_ids[iter->btree_id], + a.data_type == state, + genbits >> 56, alloc_freespace_genbits(a) >> 56)) @@ -2699,6 +2781,7 @@ index 000000000..738567173 +{ + struct btree_trans trans; + struct btree_iter iter, discard_iter, freespace_iter; ++ struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); @@ -2710,7 +2793,7 @@ index 000000000..738567173 + bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, + BTREE_ITER_PREFETCH); + while (1) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, ++ ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_check_alloc_key(&trans, &iter, @@ -2728,36 +2811,16 @@ index 000000000..738567173 + if (ret < 0) + goto err; + -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_need_discard, POS_MIN, -+ BTREE_ITER_PREFETCH); -+ while (1) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ 
bch2_check_discard_freespace_key(&trans, &iter)); -+ if (ret) -+ break; -+ -+ bch2_btree_iter_advance(&iter); -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ret < 0) -+ goto err; -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN, -+ BTREE_ITER_PREFETCH); -+ while (1) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ bch2_check_discard_freespace_key(&trans, &iter)); -+ if (ret) -+ break; -+ -+ bch2_btree_iter_advance(&iter); -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_need_discard, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ bch2_check_discard_freespace_key(&trans, &iter)) ?: ++ for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_freespace, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ bch2_check_discard_freespace_key(&trans, &iter)); +err: + bch2_trans_exit(&trans); + return ret < 0 ? ret : 0; @@ -2851,32 +2914,53 @@ index 000000000..738567173 + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ bch2_check_alloc_to_lru_ref(&trans, &iter)); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, ++ POS_MIN, BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ bch2_check_alloc_to_lru_ref(&trans, &iter)); + + bch2_trans_exit(&trans); + return ret < 0 ? ret : 0; +} + -+static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, -+ struct bch_dev *ca, bool *discard_done) ++static int bch2_discard_one_bucket(struct btree_trans *trans, ++ struct btree_iter *need_discard_iter, ++ struct bpos *discard_pos_done, ++ u64 *seen, ++ u64 *open, ++ u64 *need_journal_commit, ++ u64 *discarded) +{ + struct bch_fs *c = trans->c; -+ struct btree_iter iter; ++ struct bpos pos = need_discard_iter->pos; ++ struct btree_iter iter = { NULL }; + struct bkey_s_c k; ++ struct bch_dev *ca; + struct bkey_i_alloc_v4 *a; + struct printbuf buf = PRINTBUF; -+ int ret; ++ bool did_discard = false; ++ int ret = 0; + -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, pos, ++ ca = bch_dev_bkey_exists(c, pos.inode); ++ if (!percpu_ref_tryget(&ca->io_ref)) { ++ bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); ++ return 0; ++ } ++ ++ if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { ++ (*open)++; ++ goto out; ++ } ++ ++ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ++ pos.inode, pos.offset)) { ++ (*need_journal_commit)++; ++ goto out; ++ } ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ need_discard_iter->pos, + BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); @@ -2912,7 +2996,8 @@ index 000000000..738567173 + goto out; + } + -+ if (!*discard_done && ca->mi.discard && !c->opts.nochanges) { ++ if (bkey_cmp(*discard_pos_done, iter.pos) && ++ ca->mi.discard && !c->opts.nochanges) { + /* + * This works without any other locks because this is the only + * thread that removes items from the need_discard tree @@ -2922,19 +3007,31 @@ index 000000000..738567173 + k.k->p.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL, 0); -+ *discard_done = true; + -+ ret = 
bch2_trans_relock(trans) ? 0 : -EINTR; ++ ret = bch2_trans_relock(trans); + if (ret) + goto out; + } + ++ *discard_pos_done = iter.pos; ++ did_discard = true; ++ + SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); + a->v.data_type = alloc_data_type(a->v, a->v.data_type); +write: -+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); ++ if (ret) ++ goto out; ++ ++ if (did_discard) { ++ this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); ++ (*discarded)++; ++ } +out: + bch2_trans_iter_exit(trans, &iter); ++ percpu_ref_put(&ca->io_ref); + printbuf_exit(&buf); + return ret; +} @@ -2942,61 +3039,27 @@ index 000000000..738567173 +static void bch2_do_discards_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, discard_work); -+ struct bch_dev *ca = NULL; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; ++ struct bpos discard_pos_done = POS_MAX; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_need_discard, -+ POS_MIN, 0, k, ret) { -+ bool discard_done = false; -+ -+ if (ca && k.k->p.inode != ca->dev_idx) { -+ percpu_ref_put(&ca->io_ref); -+ ca = NULL; -+ } -+ -+ if (!ca) { -+ ca = bch_dev_bkey_exists(c, k.k->p.inode); -+ if (!percpu_ref_tryget(&ca->io_ref)) { -+ ca = NULL; -+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); -+ continue; -+ } -+ } -+ -+ seen++; -+ -+ if (bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset)) { -+ open++; -+ continue; -+ } -+ -+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, -+ c->journal.flushed_seq_ondisk, -+ k.k->p.inode, k.k->p.offset)) { -+ need_journal_commit++; -+ continue; -+ } -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_NOFAIL, -+ bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done)); -+ if (ret) -+ break; -+ -+ this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); -+ discarded++; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ca) -+ percpu_ref_put(&ca->io_ref); ++ /* ++ * We're doing the commit in bch2_discard_one_bucket instead of using ++ * for_each_btree_key_commit() so that we can increment counters after ++ * successful commit: ++ */ ++ ret = for_each_btree_key2(&trans, iter, ++ BTREE_ID_need_discard, POS_MIN, 0, k, ++ bch2_discard_one_bucket(&trans, &iter, &discard_pos_done, ++ &seen, ++ &open, ++ &need_journal_commit, ++ &discarded)); + + bch2_trans_exit(&trans); + @@ -3005,7 +3068,8 @@ index 000000000..738567173 + + percpu_ref_put(&c->writes); + -+ trace_discard_buckets(c, seen, open, need_journal_commit, discarded, ret); ++ trace_discard_buckets(c, seen, open, need_journal_commit, discarded, ++ bch2_err_str(ret)); +} + +void bch2_do_discards(struct bch_fs *c) @@ -3015,29 +3079,20 @@ index 000000000..738567173 + percpu_ref_put(&c->writes); +} + -+static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca, -+ struct bpos *bucket_pos, unsigned *cached_sectors) ++static int invalidate_one_bucket(struct btree_trans *trans, ++ struct btree_iter *lru_iter, struct bkey_s_c k, ++ unsigned dev_idx, s64 *nr_to_invalidate) +{ + struct bch_fs *c = trans->c; -+ struct btree_iter lru_iter, alloc_iter = { NULL }; -+ struct bkey_s_c k; ++ struct btree_iter alloc_iter = { NULL }; + struct bkey_i_alloc_v4 *a; -+ u64 bucket, idx; ++ 
struct bpos bucket; + struct printbuf buf = PRINTBUF; -+ int ret; ++ unsigned cached_sectors; ++ int ret = 0; + -+ bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, -+ POS(ca->dev_idx, 0), 0); -+next_lru: -+ k = bch2_btree_iter_peek(&lru_iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto out; -+ -+ if (!k.k || k.k->p.inode != ca->dev_idx) { -+ ret = 1; -+ goto out; -+ } ++ if (*nr_to_invalidate <= 0 || k.k->p.inode != dev_idx) ++ return 1; + + if (k.k->type != KEY_TYPE_lru) { + prt_printf(&buf, "non lru key in lru btree:\n "); @@ -3045,26 +3100,22 @@ index 000000000..738567173 + + if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { + bch_err(c, "%s", buf.buf); -+ bch2_btree_iter_advance(&lru_iter); -+ goto next_lru; + } else { + bch2_trans_inconsistent(trans, "%s", buf.buf); + ret = -EINVAL; -+ goto out; + } ++ ++ goto out; + } + -+ idx = k.k->p.offset; -+ bucket = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); ++ bucket = POS(dev_idx, le64_to_cpu(bkey_s_c_to_lru(k).v->idx)); + -+ *bucket_pos = POS(ca->dev_idx, bucket); -+ -+ a = bch2_trans_start_alloc_update(trans, &alloc_iter, *bucket_pos); ++ a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto out; + -+ if (idx != alloc_lru_idx(a->v)) { ++ if (k.k->p.offset != alloc_lru_idx(a->v)) { + prt_printf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + prt_printf(&buf, "\n "); @@ -3072,19 +3123,18 @@ index 000000000..738567173 + + if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { + bch_err(c, "%s", buf.buf); -+ bch2_btree_iter_advance(&lru_iter); -+ goto next_lru; + } else { + bch2_trans_inconsistent(trans, "%s", buf.buf); + ret = -EINVAL; -+ goto out; + } ++ ++ goto out; + } + + if (!a->v.cached_sectors) + bch_err(c, "invalidating empty bucket, confused"); + -+ *cached_sectors = a->v.cached_sectors; ++ cached_sectors = a->v.cached_sectors; + + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + a->v.gen++; @@ -3094,13 +3144,18 @@ index 000000000..738567173 + a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); + -+ ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, -+ BTREE_TRIGGER_BUCKET_INVALIDATE); ++ ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, ++ BTREE_TRIGGER_BUCKET_INVALIDATE) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); + if (ret) + goto out; ++ ++ trace_invalidate_bucket(c, bucket.inode, bucket.offset, cached_sectors); ++ this_cpu_inc(c->counters[BCH_COUNTER_bucket_invalidate]); ++ --*nr_to_invalidate; +out: + bch2_trans_iter_exit(trans, &alloc_iter); -+ bch2_trans_iter_exit(trans, &lru_iter); + printbuf_exit(&buf); + return ret; +} @@ -3110,8 +3165,9 @@ index 000000000..738567173 + struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); + struct bch_dev *ca; + struct btree_trans trans; -+ struct bpos bucket; -+ unsigned i, sectors; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ unsigned i; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); @@ -3120,17 +3176,13 @@ index 000000000..738567173 + s64 nr_to_invalidate = + should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + -+ while (nr_to_invalidate-- >= 0) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_NOFAIL, -+ invalidate_one_bucket(&trans, ca, &bucket, -+ §ors)); -+ if (ret) -+ break; ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_lru, ++ 
POS(ca->dev_idx, 0), BTREE_ITER_INTENT, k, ++ invalidate_one_bucket(&trans, &iter, k, ca->dev_idx, &nr_to_invalidate)); + -+ trace_invalidate_bucket(c, bucket.inode, bucket.offset, sectors); -+ this_cpu_inc(c->counters[BCH_COUNTER_bucket_invalidate]); ++ if (ret < 0) { ++ percpu_ref_put(&ca->ref); ++ break; + } + } + @@ -3145,16 +3197,13 @@ index 000000000..738567173 + percpu_ref_put(&c->writes); +} + -+static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter) ++static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, struct bch_dev *ca) +{ + struct bch_alloc_v4 a; -+ struct bkey_s_c k; -+ int ret; + -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; ++ if (iter->pos.offset >= ca->mi.nbuckets) ++ return 1; + + bch2_alloc_to_v4(k, &a); + return bch2_bucket_do_index(trans, k, &a, true); @@ -3170,25 +3219,16 @@ index 000000000..738567173 + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, -+ POS(ca->dev_idx, ca->mi.first_bucket), -+ BTREE_ITER_SLOTS| -+ BTREE_ITER_PREFETCH, k, ret) { -+ if (iter.pos.offset >= ca->mi.nbuckets) -+ break; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW, -+ bucket_freespace_init(&trans, &iter)); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, ++ POS(ca->dev_idx, ca->mi.first_bucket), ++ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW, ++ bucket_freespace_init(&trans, &iter, k, ca)); + + bch2_trans_exit(&trans); + -+ if (ret) { -+ bch_err(ca, "error initializing free space: %i", ret); ++ if (ret < 0) { ++ bch_err(ca, "error initializing free space: %s", bch2_err_str(ret)); + return ret; + } + @@ -3197,7 +3237,7 @@ index 000000000..738567173 + SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); + mutex_unlock(&c->sb_lock); + -+ return ret; ++ return 0; +} + +int bch2_fs_freespace_init(struct bch_fs *c) @@ -3446,10 +3486,10 @@ index 000000000..738567173 +} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h new file mode 100644 -index 000000000..2ac6b5046 +index 000000000000..044bc72992d4 --- /dev/null +++ b/fs/bcachefs/alloc_background.h -@@ -0,0 +1,181 @@ +@@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_BACKGROUND_H +#define _BCACHEFS_ALLOC_BACKGROUND_H @@ -3602,11 +3642,13 @@ index 000000000..2ac6b5046 +static inline u64 should_invalidate_buckets(struct bch_dev *ca, + struct bch_dev_usage u) +{ -+ u64 free = u.d[BCH_DATA_free].buckets + -+ u.d[BCH_DATA_need_discard].buckets; ++ u64 want_free = ca->mi.nbuckets >> 7; ++ u64 free = max_t(s64, 0, ++ u.d[BCH_DATA_free].buckets ++ + u.d[BCH_DATA_need_discard].buckets ++ - bch2_dev_buckets_reserved(ca, RESERVE_none)); + -+ return clamp_t(s64, (ca->mi.nbuckets >> 7) - free, -+ 0, u.d[BCH_DATA_cached].buckets); ++ return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); +} + +void bch2_do_invalidates(struct bch_fs *); @@ -3633,10 +3675,10 @@ index 000000000..2ac6b5046 +#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c new file mode 100644 -index 000000000..7a878a690 +index 000000000000..6e52230e69e1 --- /dev/null +++ b/fs/bcachefs/alloc_foreground.c -@@ -0,0 +1,1282 @@ +@@ -0,0 +1,1380 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2012 Google, Inc. 
@@ -3665,6 +3707,7 @@ index 000000000..7a878a690 +#include "error.h" +#include "io.h" +#include "journal.h" ++#include "movinggc.h" + +#include +#include @@ -3865,7 +3908,7 @@ index 000000000..7a878a690 + c->blocked_allocate_open_bucket = local_clock(); + + spin_unlock(&c->freelist_lock); -+ return ERR_PTR(-OPEN_BUCKETS_EMPTY); ++ return ERR_PTR(-BCH_ERR_open_buckets_empty); + } + + /* Recheck under lock: */ @@ -3977,7 +4020,10 @@ index 000000000..7a878a690 + skipped_need_journal_commit, + skipped_nouse, + cl); ++ if (!ob) ++ iter.path->preserve = false; +err: ++ set_btree_iter_dontneed(&iter); + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ob; @@ -4016,15 +4062,15 @@ index 000000000..7a878a690 + * journal buckets - journal buckets will be < ca->new_fs_bucket_idx + */ +static noinline struct open_bucket * -+bch2_bucket_alloc_trans_early(struct btree_trans *trans, -+ struct bch_dev *ca, -+ enum alloc_reserve reserve, -+ u64 *cur_bucket, -+ u64 *buckets_seen, -+ u64 *skipped_open, -+ u64 *skipped_need_journal_commit, -+ u64 *skipped_nouse, -+ struct closure *cl) ++bch2_bucket_alloc_early(struct btree_trans *trans, ++ struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ u64 *cur_bucket, ++ u64 *buckets_seen, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, ++ struct closure *cl) +{ + struct btree_iter iter; + struct bkey_s_c k; @@ -4034,7 +4080,7 @@ index 000000000..7a878a690 + *cur_bucket = max_t(u64, *cur_bucket, ca->mi.first_bucket); + *cur_bucket = max_t(u64, *cur_bucket, ca->new_fs_bucket_idx); + -+ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket), ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket), + BTREE_ITER_SLOTS, k, ret) { + struct bch_alloc_v4 a; + @@ -4064,10 +4110,10 @@ index 000000000..7a878a690 + + *cur_bucket = iter.pos.offset; + -+ return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY); ++ return ob ?: ERR_PTR(ret ?: -BCH_ERR_no_buckets_found); +} + -+static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, ++static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, + struct bch_dev *ca, + enum alloc_reserve reserve, + u64 *cur_bucket, @@ -4082,29 +4128,24 @@ index 000000000..7a878a690 + struct open_bucket *ob = NULL; + int ret; + -+ if (unlikely(!ca->mi.freespace_initialized)) -+ return bch2_bucket_alloc_trans_early(trans, ca, reserve, -+ cur_bucket, -+ buckets_seen, -+ skipped_open, -+ skipped_need_journal_commit, -+ skipped_nouse, -+ cl); -+ + BUG_ON(ca->new_fs_bucket_idx); + ++ /* ++ * XXX: ++ * On transaction restart, we'd like to restart from the bucket we were ++ * at previously ++ */ + for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, + POS(ca->dev_idx, *cur_bucket), 0, k, ret) { + if (k.k->p.inode != ca->dev_idx) + break; + + for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k)); -+ *cur_bucket < k.k->p.offset && !ob; ++ *cur_bucket < k.k->p.offset; + (*cur_bucket)++) { -+ if (btree_trans_too_many_iters(trans)) { -+ ob = ERR_PTR(-EINTR); ++ ret = btree_trans_too_many_iters(trans); ++ if (ret) + break; -+ } + + (*buckets_seen)++; + @@ -4114,8 +4155,11 @@ index 000000000..7a878a690 + skipped_need_journal_commit, + skipped_nouse, + k, cl); ++ if (ob) ++ break; + } -+ if (ob) ++ ++ if (ob || ret) + break; + } + bch2_trans_iter_exit(trans, &iter); @@ -4128,15 +4172,19 @@ index 000000000..7a878a690 + * + * Returns index of bucket on success, 0 on failure + * */ -+struct open_bucket 
*bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, ++static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, ++ struct bch_dev *ca, + enum alloc_reserve reserve, + bool may_alloc_partial, + struct closure *cl) +{ ++ struct bch_fs *c = trans->c; + struct open_bucket *ob = NULL; + struct bch_dev_usage usage; ++ bool freespace_initialized = READ_ONCE(ca->mi.freespace_initialized); ++ u64 start = freespace_initialized ? 0 : ca->bucket_alloc_trans_early_cursor; + u64 avail; -+ u64 cur_bucket = 0; ++ u64 cur_bucket = start; + u64 buckets_seen = 0; + u64 skipped_open = 0; + u64 skipped_need_journal_commit = 0; @@ -4145,7 +4193,7 @@ index 000000000..7a878a690 + int ret; +again: + usage = bch2_dev_usage_read(ca); -+ avail = dev_buckets_free(ca, usage,reserve); ++ avail = dev_buckets_free(ca, usage, reserve); + + if (usage.d[BCH_DATA_need_discard].buckets > avail) + bch2_do_discards(c); @@ -4166,7 +4214,7 @@ index 000000000..7a878a690 + if (!c->blocked_allocate) + c->blocked_allocate = local_clock(); + -+ ob = ERR_PTR(-FREELIST_EMPTY); ++ ob = ERR_PTR(-BCH_ERR_freelist_empty); + goto err; + } + @@ -4179,34 +4227,67 @@ index 000000000..7a878a690 + return ob; + } + -+ ret = bch2_trans_do(c, NULL, NULL, 0, -+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, -+ &cur_bucket, -+ &buckets_seen, -+ &skipped_open, -+ &skipped_need_journal_commit, -+ &skipped_nouse, -+ cl))); ++ ob = likely(ca->mi.freespace_initialized) ++ ? bch2_bucket_alloc_freelist(trans, ca, reserve, ++ &cur_bucket, ++ &buckets_seen, ++ &skipped_open, ++ &skipped_need_journal_commit, ++ &skipped_nouse, ++ cl) ++ : bch2_bucket_alloc_early(trans, ca, reserve, ++ &cur_bucket, ++ &buckets_seen, ++ &skipped_open, ++ &skipped_need_journal_commit, ++ &skipped_nouse, ++ cl); + + if (skipped_need_journal_commit * 2 > avail) + bch2_journal_flush_async(&c->journal, NULL); ++ ++ if (!ob && !ret && !freespace_initialized && start) { ++ start = cur_bucket = 0; ++ goto again; ++ } ++ ++ if (!freespace_initialized) ++ ca->bucket_alloc_trans_early_cursor = cur_bucket; +err: + if (!ob) -+ ob = ERR_PTR(ret ?: -FREELIST_EMPTY); ++ ob = ERR_PTR(ret ?: -BCH_ERR_no_buckets_found); + + if (IS_ERR(ob)) { -+ trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], avail, ++ trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], ++ usage.d[BCH_DATA_free].buckets, ++ avail, ++ bch2_copygc_wait_amount(c), ++ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), + buckets_seen, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, -+ cl == NULL, PTR_ERR(ob)); ++ cl == NULL, ++ bch2_err_str(PTR_ERR(ob))); + atomic_long_inc(&c->bucket_alloc_fail); + } + + return ob; +} + ++struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ bool may_alloc_partial, ++ struct closure *cl) ++{ ++ struct open_bucket *ob; ++ ++ bch2_trans_do(c, NULL, NULL, 0, ++ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, ++ may_alloc_partial, cl))); ++ return ob; ++} ++ +static int __dev_stripe_cmp(struct dev_stripe_state *stripe, + unsigned l, unsigned r) +{ @@ -4272,7 +4353,7 @@ index 000000000..7a878a690 + ob_push(c, ptrs, ob); +} + -+int bch2_bucket_alloc_set(struct bch_fs *c, ++static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, + struct open_buckets *ptrs, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs_may_alloc, @@ -4283,11 +4364,12 @@ index 000000000..7a878a690 + unsigned flags, + struct closure *cl) +{ ++ struct bch_fs *c = 
trans->c; + struct dev_alloc_list devs_sorted = + bch2_dev_alloc_list(c, stripe, devs_may_alloc); + unsigned dev; + struct bch_dev *ca; -+ int ret = -INSUFFICIENT_DEVICES; ++ int ret = -BCH_ERR_insufficient_devices; + unsigned i; + + BUG_ON(*nr_effective >= nr_replicas); @@ -4311,7 +4393,7 @@ index 000000000..7a878a690 + continue; + } + -+ ob = bch2_bucket_alloc(c, ca, reserve, ++ ob = bch2_bucket_alloc_trans(trans, ca, reserve, + flags & BUCKET_MAY_ALLOC_PARTIAL, cl); + if (!IS_ERR(ob)) + bch2_dev_stripe_increment(ca, stripe); @@ -4319,8 +4401,7 @@ index 000000000..7a878a690 + + if (IS_ERR(ob)) { + ret = PTR_ERR(ob); -+ -+ if (cl) ++ if (ret == -EINTR || cl) + break; + continue; + } @@ -4337,6 +4418,24 @@ index 000000000..7a878a690 + return ret; +} + ++int bch2_bucket_alloc_set(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ return bch2_trans_do(c, NULL, NULL, 0, ++ bch2_bucket_alloc_set_trans(&trans, ptrs, stripe, ++ devs_may_alloc, nr_replicas, ++ nr_effective, have_cache, reserve, ++ flags, cl)); ++} ++ +/* Allocate from stripes: */ + +/* @@ -4441,7 +4540,7 @@ index 000000000..7a878a690 + wp->ptrs = ptrs_skip; +} + -+static int open_bucket_add_buckets(struct bch_fs *c, ++static int open_bucket_add_buckets(struct btree_trans *trans, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_list *devs_have, @@ -4454,6 +4553,7 @@ index 000000000..7a878a690 + unsigned flags, + struct closure *_cl) +{ ++ struct bch_fs *c = trans->c; + struct bch_devs_mask devs; + struct open_bucket *ob; + struct closure *cl = NULL; @@ -4485,8 +4585,9 @@ index 000000000..7a878a690 + target, erasure_code, + nr_replicas, nr_effective, + have_cache, flags, _cl); -+ if (ret == -FREELIST_EMPTY || -+ ret == -OPEN_BUCKETS_EMPTY) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || ++ bch2_err_matches(ret, BCH_ERR_freelist_empty) || ++ bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + return ret; + if (*nr_effective >= nr_replicas) + return 0; @@ -4504,10 +4605,13 @@ index 000000000..7a878a690 + * Try nonblocking first, so that if one device is full we'll try from + * other devices: + */ -+ ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, ++ ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, + nr_replicas, nr_effective, have_cache, + reserve, flags, cl); -+ if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) { ++ if (ret && ++ !bch2_err_matches(ret, BCH_ERR_transaction_restart) && ++ !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && ++ !cl && _cl) { + cl = _cl; + goto retry_blocking; + } @@ -4625,15 +4729,25 @@ index 000000000..7a878a690 + return true; +} + -+static struct write_point *writepoint_find(struct bch_fs *c, ++static void bch2_trans_mutex_lock(struct btree_trans *trans, ++ struct mutex *lock) ++{ ++ if (!mutex_trylock(lock)) { ++ bch2_trans_unlock(trans); ++ mutex_lock(lock); ++ } ++} ++ ++static struct write_point *writepoint_find(struct btree_trans *trans, + unsigned long write_point) +{ ++ struct bch_fs *c = trans->c; + struct write_point *wp, *oldest; + struct hlist_head *head; + + if (!(write_point & 1UL)) { + wp = (struct write_point *) write_point; -+ mutex_lock(&wp->lock); ++ bch2_trans_mutex_lock(trans, &wp->lock); + return wp; + } + @@ -4642,7 +4756,7 @@ index 000000000..7a878a690 + wp = __writepoint_find(head, 
write_point); + if (wp) { +lock_wp: -+ mutex_lock(&wp->lock); ++ bch2_trans_mutex_lock(trans, &wp->lock); + if (wp->write_point == write_point) + goto out; + mutex_unlock(&wp->lock); @@ -4655,8 +4769,8 @@ index 000000000..7a878a690 + if (!oldest || time_before64(wp->last_used, oldest->last_used)) + oldest = wp; + -+ mutex_lock(&oldest->lock); -+ mutex_lock(&c->write_points_hash_lock); ++ bch2_trans_mutex_lock(trans, &oldest->lock); ++ bch2_trans_mutex_lock(trans, &c->write_points_hash_lock); + if (oldest >= c->write_points + c->write_points_nr || + try_increase_writepoints(c)) { + mutex_unlock(&c->write_points_hash_lock); @@ -4684,7 +4798,7 @@ index 000000000..7a878a690 +/* + * Get us an open_bucket we can allocate from, return with it locked: + */ -+struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, ++struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *trans, + unsigned target, + unsigned erasure_code, + struct write_point_specifier write_point, @@ -4695,6 +4809,7 @@ index 000000000..7a878a690 + unsigned flags, + struct closure *cl) +{ ++ struct bch_fs *c = trans->c; + struct write_point *wp; + struct open_bucket *ob; + struct open_buckets ptrs; @@ -4714,7 +4829,7 @@ index 000000000..7a878a690 + write_points_nr = c->write_points_nr; + have_cache = false; + -+ wp = writepoint_find(c, write_point.v); ++ wp = writepoint_find(trans, write_point.v); + + if (wp->data_type == BCH_DATA_user) + ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; @@ -4724,21 +4839,21 @@ index 000000000..7a878a690 + have_cache = true; + + if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { -+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, + &have_cache, reserve, + ob_flags, cl); + } else { -+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, + &have_cache, reserve, + ob_flags, NULL); -+ if (!ret) ++ if (!ret || ret == -EINTR) + goto alloc_done; + -+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + 0, erasure_code, + nr_replicas, &nr_effective, + &have_cache, reserve, @@ -4750,7 +4865,7 @@ index 000000000..7a878a690 + if (erasure_code && !ec_open_bucket(c, &ptrs)) + pr_debug("failed to get ec bucket: ret %u", ret); + -+ if (ret == -INSUFFICIENT_DEVICES && ++ if (ret == -BCH_ERR_insufficient_devices && + nr_effective >= nr_replicas_required) + ret = 0; + @@ -4781,19 +4896,44 @@ index 000000000..7a878a690 + + mutex_unlock(&wp->lock); + -+ if (ret == -FREELIST_EMPTY && ++ if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && + try_decrease_writepoints(c, write_points_nr)) + goto retry; + -+ switch (ret) { -+ case -OPEN_BUCKETS_EMPTY: -+ case -FREELIST_EMPTY: ++ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || ++ bch2_err_matches(ret, BCH_ERR_freelist_empty)) + return cl ? 
ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); -+ case -INSUFFICIENT_DEVICES: ++ ++ if (bch2_err_matches(ret, BCH_ERR_insufficient_devices)) + return ERR_PTR(-EROFS); -+ default: -+ return ERR_PTR(ret); -+ } ++ ++ return ERR_PTR(ret); ++} ++ ++struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, ++ unsigned target, ++ unsigned erasure_code, ++ struct write_point_specifier write_point, ++ struct bch_devs_list *devs_have, ++ unsigned nr_replicas, ++ unsigned nr_replicas_required, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct write_point *wp; ++ ++ bch2_trans_do(c, NULL, NULL, 0, ++ PTR_ERR_OR_ZERO(wp = bch2_alloc_sectors_start_trans(&trans, target, ++ erasure_code, ++ write_point, ++ devs_have, ++ nr_replicas, ++ nr_replicas_required, ++ reserve, ++ flags, cl))); ++ return wp; ++ +} + +struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) @@ -4921,10 +5061,10 @@ index 000000000..7a878a690 +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h new file mode 100644 -index 000000000..8bc78877f +index 000000000000..6de63a351fa8 --- /dev/null +++ b/fs/bcachefs/alloc_foreground.h -@@ -0,0 +1,173 @@ +@@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_FOREGROUND_H +#define _BCACHEFS_ALLOC_FOREGROUND_H @@ -5063,6 +5203,14 @@ index 000000000..8bc78877f + unsigned, unsigned *, bool *, enum alloc_reserve, + unsigned, struct closure *); + ++struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *, ++ unsigned, unsigned, ++ struct write_point_specifier, ++ struct bch_devs_list *, ++ unsigned, unsigned, ++ enum alloc_reserve, ++ unsigned, ++ struct closure *); +struct write_point *bch2_alloc_sectors_start(struct bch_fs *, + unsigned, unsigned, + struct write_point_specifier, @@ -5100,7 +5248,7 @@ index 000000000..8bc78877f +#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h new file mode 100644 -index 000000000..e078584d4 +index 000000000000..e078584d46f6 --- /dev/null +++ b/fs/bcachefs/alloc_types.h @@ -0,0 +1,87 @@ @@ -5193,10 +5341,10 @@ index 000000000..e078584d4 +#endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c new file mode 100644 -index 000000000..f3260bbef +index 000000000000..5a46b25b0587 --- /dev/null +++ b/fs/bcachefs/backpointers.c -@@ -0,0 +1,891 @@ +@@ -0,0 +1,875 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -5642,8 +5790,8 @@ index 000000000..f3260bbef + goto out; + } + -+ for_each_btree_key(trans, bp_iter, BTREE_ID_backpointers, -+ bp_pos, 0, k, ret) { ++ for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, ++ bp_pos, 0, k, ret) { + if (bpos_cmp(k.k->p, bp_end_pos) >= 0) + break; + @@ -5768,22 +5916,16 @@ index 000000000..f3260bbef + return NULL; +} + -+static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter) ++static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter, ++ struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter = { NULL }; + struct bch_dev *ca; -+ struct bkey_s_c k, alloc_k; ++ struct bkey_s_c alloc_k; + struct printbuf buf = PRINTBUF; + int ret = 0; + -+ k = bch2_btree_iter_peek(bp_iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ if (!k.k) -+ return 0; -+ + if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, + "backpointer for mising device:\n%s", + 
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -5818,25 +5960,14 @@ index 000000000..f3260bbef +/* verify that every backpointer has a corresponding alloc key */ +int bch2_check_btree_backpointers(struct bch_fs *c) +{ -+ struct btree_trans trans; + struct btree_iter iter; -+ int ret = 0; ++ struct bkey_s_c k; + -+ bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_backpointers, POS_MIN, 0); -+ -+ do { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ bch2_check_btree_backpointer(&trans, &iter)); -+ if (ret) -+ break; -+ } while (bch2_btree_iter_advance(&iter)); -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ return ret; ++ return bch2_trans_run(c, ++ for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_backpointers, POS_MIN, 0, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ bch2_check_btree_backpointer(&trans, &iter, k))); +} + +static int check_bp_exists(struct btree_trans *trans, @@ -5899,6 +6030,7 @@ index 000000000..f3260bbef + bch2_bkey_val_to_text(&buf, c, alloc_k); + + if (c->sb.version < bcachefs_metadata_version_backpointers || ++ c->opts.reconstruct_alloc || + fsck_err(c, "%s", buf.buf)) { + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, alloc_k); + @@ -6003,7 +6135,7 @@ index 000000000..f3260bbef + BTREE_ITER_PREFETCH); + + do { -+ ret = __bch2_trans_do(&trans, NULL, NULL, ++ ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_extent_to_backpointers(&trans, &iter)); @@ -6016,7 +6148,7 @@ index 000000000..f3260bbef + if (ret) + break; + -+ ret = __bch2_trans_do(&trans, NULL, NULL, ++ ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_btree_root_to_backpointers(&trans, btree_id)); @@ -6074,7 +6206,7 @@ index 000000000..f3260bbef + BTREE_ITER_PREFETCH, k, ret) { + u64 bp_offset = 0; + -+ while (!(ret = __bch2_trans_do(&trans, NULL, NULL, ++ while (!(ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_one_backpointer(&trans, iter.pos, &bp_offset))) && @@ -6090,7 +6222,7 @@ index 000000000..f3260bbef +} diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h new file mode 100644 -index 000000000..fe42af296 +index 000000000000..fe42af296e9c --- /dev/null +++ b/fs/bcachefs/backpointers.h @@ -0,0 +1,38 @@ @@ -6134,10 +6266,10 @@ index 000000000..fe42af296 +#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h new file mode 100644 -index 000000000..8b4d0eb5c +index 000000000000..8ffdb4dee47a --- /dev/null +++ b/fs/bcachefs/bcachefs.h -@@ -0,0 +1,988 @@ +@@ -0,0 +1,1000 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_H +#define _BCACHEFS_H @@ -6459,6 +6591,8 @@ index 000000000..8b4d0eb5c +#undef BCH_DEBUG_PARAM +#endif + ++#define BCH_LOCK_TIME_NR 128 ++ +#define BCH_TIME_STATS() \ + x(btree_node_mem_alloc) \ + x(btree_node_split) \ @@ -6603,6 +6737,7 @@ index 000000000..8b4d0eb5c + + /* Allocator: */ + u64 new_fs_bucket_idx; ++ u64 bucket_alloc_trans_early_cursor; + + unsigned nr_open_buckets; + unsigned nr_btree_reserve; @@ -6655,6 +6790,8 @@ index 000000000..8b4d0eb5c + BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ + BCH_FS_NEED_ANOTHER_GC, + ++ BCH_FS_HAVE_DELETED_SNAPSHOTS, ++ + /* errors: */ + BCH_FS_ERROR, + BCH_FS_TOPOLOGY_ERROR, @@ -6666,6 +6803,11 @@ index 000000000..8b4d0eb5c + unsigned id; +}; + ++struct lock_held_stats { ++ 
struct time_stats times[BCH_LOCK_TIME_NR]; ++ const char *names[BCH_LOCK_TIME_NR]; ++}; ++ +struct bch_fs_pcpu { + u64 sectors_available; +}; @@ -7059,6 +7201,8 @@ index 000000000..8b4d0eb5c + bool promote_whole_extents; + + struct time_stats times[BCH_TIME_STAT_NR]; ++ ++ struct lock_held_stats lock_held_stats; +}; + +static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) @@ -7128,7 +7272,7 @@ index 000000000..8b4d0eb5c +#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h new file mode 100644 -index 000000000..147fde141 +index 000000000000..147fde1417b0 --- /dev/null +++ b/fs/bcachefs/bcachefs_format.h @@ -0,0 +1,2052 @@ @@ -9186,7 +9330,7 @@ index 000000000..147fde141 +#endif /* _BCACHEFS_FORMAT_H */ diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h new file mode 100644 -index 000000000..b2edabf58 +index 000000000000..b2edabf58260 --- /dev/null +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -0,0 +1,368 @@ @@ -9560,7 +9704,7 @@ index 000000000..b2edabf58 +#endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c new file mode 100644 -index 000000000..cc0689635 +index 000000000000..cc0689635164 --- /dev/null +++ b/fs/bcachefs/bkey.c @@ -0,0 +1,1175 @@ @@ -10741,7 +10885,7 @@ index 000000000..cc0689635 +#endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h new file mode 100644 -index 000000000..7dee3d8e0 +index 000000000000..7dee3d8e0a3d --- /dev/null +++ b/fs/bcachefs/bkey.h @@ -0,0 +1,566 @@ @@ -11313,7 +11457,7 @@ index 000000000..7dee3d8e0 +#endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h new file mode 100644 -index 000000000..0d7c67a95 +index 000000000000..0d7c67a959af --- /dev/null +++ b/fs/bcachefs/bkey_buf.h @@ -0,0 +1,60 @@ @@ -11379,7 +11523,7 @@ index 000000000..0d7c67a95 +#endif /* _BCACHEFS_BKEY_BUF_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c new file mode 100644 -index 000000000..e0cbac881 +index 000000000000..e0cbac8811af --- /dev/null +++ b/fs/bcachefs/bkey_methods.c @@ -0,0 +1,503 @@ @@ -11888,7 +12032,7 @@ index 000000000..e0cbac881 +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h new file mode 100644 -index 000000000..db894b40d +index 000000000000..db894b40d2ca --- /dev/null +++ b/fs/bcachefs/bkey_methods.h @@ -0,0 +1,175 @@ @@ -12069,7 +12213,7 @@ index 000000000..db894b40d +#endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c new file mode 100644 -index 000000000..b1385a77d +index 000000000000..b1385a77da11 --- /dev/null +++ b/fs/bcachefs/bkey_sort.c @@ -0,0 +1,198 @@ @@ -12273,7 +12417,7 @@ index 000000000..b1385a77d +} diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h new file mode 100644 -index 000000000..79cf11d1b +index 000000000000..79cf11d1b4e7 --- /dev/null +++ b/fs/bcachefs/bkey_sort.h @@ -0,0 +1,44 @@ @@ -12323,7 +12467,7 @@ index 000000000..79cf11d1b +#endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c new file mode 100644 -index 000000000..fa60ef84e +index 000000000000..fa60ef84e4ef --- /dev/null +++ b/fs/bcachefs/bset.c @@ -0,0 +1,1598 @@ @@ -13927,7 +14071,7 @@ index 000000000..fa60ef84e +} diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h new file mode 100644 -index 000000000..0d46534c3 +index 000000000000..0d46534c3dcd --- /dev/null +++ b/fs/bcachefs/bset.h @@ -0,0 +1,615 @@ @@ -14548,10 +14692,10 @@ index 000000000..0d46534c3 +#endif /* 
_BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c new file mode 100644 -index 000000000..4d032ae3b +index 000000000000..579a8f8c6a65 --- /dev/null +++ b/fs/bcachefs/btree_cache.c -@@ -0,0 +1,1162 @@ +@@ -0,0 +1,1170 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -14561,6 +14705,7 @@ index 000000000..4d032ae3b +#include "btree_iter.h" +#include "btree_locking.h" +#include "debug.h" ++#include "errcode.h" +#include "error.h" + +#include @@ -15256,8 +15401,7 @@ index 000000000..4d032ae3b + if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { + trace_trans_restart_relock_parent_for_fill(trans->fn, + _THIS_IP_, btree_id, &path->pos); -+ btree_trans_restart(trans); -+ return ERR_PTR(-EINTR); ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); + } + + b = bch2_btree_node_mem_alloc(c, level != 0); @@ -15266,8 +15410,8 @@ index 000000000..4d032ae3b + trans->memory_allocation_failure = true; + trace_trans_restart_memory_allocation_failure(trans->fn, + _THIS_IP_, btree_id, &path->pos); -+ btree_trans_restart(trans); -+ return ERR_PTR(-EINTR); ++ ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); + } + + if (IS_ERR(b)) @@ -15304,18 +15448,19 @@ index 000000000..4d032ae3b + if (!sync) + return NULL; + -+ if (trans && -+ (!bch2_trans_relock(trans) || -+ !bch2_btree_path_relock_intent(trans, path))) { -+ BUG_ON(!trans->restarted); -+ return ERR_PTR(-EINTR); ++ if (trans) { ++ int ret = bch2_trans_relock(trans) ?: ++ bch2_btree_path_relock_intent(trans, path); ++ if (ret) { ++ BUG_ON(!trans->restarted); ++ return ERR_PTR(ret); ++ } + } + + if (!six_relock_type(&b->c.lock, lock_type, seq)) { + trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_, + btree_id, &path->pos); -+ btree_trans_restart(trans); -+ return ERR_PTR(-EINTR); ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); + } + + return b; @@ -15326,7 +15471,9 @@ index 000000000..4d032ae3b + struct btree *b = container_of(lock, struct btree, c.lock); + const struct bkey_i *k = p; + -+ return b->hash_val == btree_ptr_hash_val(k) ? 
0 : -1; ++ if (b->hash_val != btree_ptr_hash_val(k)) ++ return BCH_ERR_lock_fail_node_reused; ++ return 0; +} + +static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) @@ -15385,6 +15532,7 @@ index 000000000..4d032ae3b + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + struct bset_tree *t; ++ int ret; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + @@ -15447,13 +15595,16 @@ index 000000000..4d032ae3b + * was removed - and we'll bail out: + */ + if (btree_node_read_locked(path, level + 1)) -+ btree_node_unlock(path, level + 1); ++ btree_node_unlock(trans, path, level + 1); + -+ if (!btree_node_lock(trans, path, b, k->k.p, level, lock_type, -+ lock_node_check_fn, (void *) k, trace_ip)) { -+ if (!trans->restarted) ++ ret = btree_node_lock(trans, path, b, k->k.p, level, lock_type, ++ lock_node_check_fn, (void *) k, trace_ip); ++ if (unlikely(ret)) { ++ if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) + goto retry; -+ return ERR_PTR(-EINTR); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ return ERR_PTR(ret); ++ BUG(); + } + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || @@ -15467,8 +15618,7 @@ index 000000000..4d032ae3b + trace_ip, + path->btree_id, + &path->pos); -+ btree_trans_restart(trans); -+ return ERR_PTR(-EINTR); ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); + } + } + @@ -15484,11 +15634,13 @@ index 000000000..4d032ae3b + * should_be_locked is not set on this path yet, so we need to + * relock it specifically: + */ -+ if (trans && -+ (!bch2_trans_relock(trans) || -+ !bch2_btree_path_relock_intent(trans, path))) { -+ BUG_ON(!trans->restarted); -+ return ERR_PTR(-EINTR); ++ if (trans) { ++ int ret = bch2_trans_relock(trans) ?: ++ bch2_btree_path_relock_intent(trans, path); ++ if (ret) { ++ BUG_ON(!trans->restarted); ++ return ERR_PTR(ret); ++ } + } + + if (!six_relock_type(&b->c.lock, lock_type, seq)) @@ -15716,7 +15868,7 @@ index 000000000..4d032ae3b +} diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h new file mode 100644 -index 000000000..25906127c +index 000000000000..25906127c023 --- /dev/null +++ b/fs/bcachefs/btree_cache.h @@ -0,0 +1,107 @@ @@ -15829,10 +15981,10 @@ index 000000000..25906127c +#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c new file mode 100644 -index 000000000..0e2c8745c +index 000000000000..2f563365ea4c --- /dev/null +++ b/fs/bcachefs/btree_gc.c -@@ -0,0 +1,2128 @@ +@@ -0,0 +1,2098 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet @@ -15933,7 +16085,7 @@ index 000000000..0e2c8745c + buf1.buf, buf2.buf) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); -+ ret = FSCK_ERR_START_TOPOLOGY_REPAIR; ++ ret = -BCH_ERR_need_topology_repair; + goto err; + } else { + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); @@ -15961,7 +16113,7 @@ index 000000000..0e2c8745c + buf1.buf, buf2.buf) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); -+ ret = FSCK_ERR_START_TOPOLOGY_REPAIR; ++ ret = -BCH_ERR_need_topology_repair; + goto err; + } else { + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); @@ -16237,8 +16389,8 @@ index 000000000..0e2c8745c + } + + if (ret) { -+ bch_err(c, "%s: error %i getting btree node", -+ __func__, ret); ++ bch_err(c, "%s: error getting btree node: %s", ++ __func__, bch2_err_str(ret)); + break; + } + @@ 
-16306,8 +16458,8 @@ index 000000000..0e2c8745c + ret = PTR_ERR_OR_ZERO(cur); + + if (ret) { -+ bch_err(c, "%s: error %i getting btree node", -+ __func__, ret); ++ bch_err(c, "%s: error getting btree node: %s", ++ __func__, bch2_err_str(ret)); + goto err; + } + @@ -16372,7 +16524,7 @@ index 000000000..0e2c8745c + + if (ret == DROP_THIS_NODE) { + bch_err(c, "empty btree root - repair unimplemented"); -+ ret = FSCK_ERR_EXIT; ++ ret = -BCH_ERR_fsck_repair_unimplemented; + } + } + @@ -16399,7 +16551,8 @@ index 000000000..0e2c8745c + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); + -+ if (fsck_err_on(!g->gen_valid, c, ++ if (c->opts.reconstruct_alloc || ++ fsck_err_on(!g->gen_valid, c, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), @@ -16583,13 +16736,15 @@ index 000000000..0e2c8745c + if (level) + bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new); + -+ printbuf_reset(&buf); -+ bch2_bkey_val_to_text(&buf, c, *k); -+ bch_info(c, "updated %s", buf.buf); ++ if (c->opts.verbose) { ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, *k); ++ bch_info(c, "updated %s", buf.buf); + -+ printbuf_reset(&buf); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); -+ bch_info(c, "new key %s", buf.buf); ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); ++ bch_info(c, "new key %s", buf.buf); ++ } + + *k = bkey_i_to_s_c(new); + } @@ -16631,12 +16786,12 @@ index 000000000..0e2c8745c + atomic64_set(&c->key_version, k->k->version.lo); + } + -+ ret = __bch2_trans_do(trans, NULL, NULL, 0, ++ ret = commit_do(trans, NULL, NULL, 0, + bch2_mark_key(trans, old, *k, flags)); +fsck_err: +err: + if (ret) -+ bch_err(c, "%s: ret %i", __func__, ret); ++ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -16742,7 +16897,8 @@ index 000000000..0e2c8745c + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, + false, &k, true); + if (ret) { -+ bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); ++ bch_err(c, "%s: error from bch2_gc_mark_key: %s", ++ __func__, bch2_err_str(ret)); + goto fsck_err; + } + @@ -16791,7 +16947,7 @@ index 000000000..0e2c8745c + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { -+ ret = FSCK_ERR_START_TOPOLOGY_REPAIR; ++ ret = -BCH_ERR_need_topology_repair; + bch_info(c, "Halting mark and sweep to start topology repair pass"); + goto fsck_err; + } else { @@ -16802,8 +16958,8 @@ index 000000000..0e2c8745c + continue; + } + } else if (ret) { -+ bch_err(c, "%s: error %i getting btree node", -+ __func__, ret); ++ bch_err(c, "%s: error getting btree node: %s", ++ __func__, bch2_err_str(ret)); + break; + } + @@ -16844,7 +17000,7 @@ index 000000000..0e2c8745c + if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, + "btree root with incorrect min_key: %s", buf.buf)) { + bch_err(c, "repair unimplemented"); -+ ret = FSCK_ERR_EXIT; ++ ret = -BCH_ERR_fsck_repair_unimplemented; + goto fsck_err; + } + @@ -16853,7 +17009,7 @@ index 000000000..0e2c8745c + if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c, + "btree root with incorrect max_key: %s", buf.buf)) { + bch_err(c, "repair unimplemented"); -+ ret = FSCK_ERR_EXIT; ++ ret = -BCH_ERR_fsck_repair_unimplemented; + goto fsck_err; + } + @@ -16870,7 +17026,7 @@ index 000000000..0e2c8745c + 
six_unlock_read(&b->c.lock); + + if (ret < 0) -+ bch_err(c, "%s: ret %i", __func__, ret); ++ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + printbuf_exit(&buf); + return ret; +} @@ -16903,7 +17059,7 @@ index 000000000..0e2c8745c + : bch2_gc_btree(&trans, ids[i], initial, metadata_only); + + if (ret < 0) -+ bch_err(c, "%s: ret %i", __func__, ret); ++ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + + bch2_trans_exit(&trans); + return ret; @@ -17012,29 +17168,28 @@ index 000000000..0e2c8745c +{ + struct bch_dev *ca = NULL; + struct printbuf buf = PRINTBUF; -+ bool verify = !metadata_only && (!initial || -+ (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); ++ bool verify = !metadata_only && ++ !c->opts.reconstruct_alloc && ++ (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); + unsigned i, dev; + int ret = 0; + + percpu_down_write(&c->mark_lock); + +#define copy_field(_f, _msg, ...) \ -+ if (dst->_f != src->_f) { \ -+ if (verify) \ -+ fsck_err(c, _msg ": got %llu, should be %llu" \ -+ , ##__VA_ARGS__, dst->_f, src->_f); \ -+ dst->_f = src->_f; \ -+ } ++ if (dst->_f != src->_f && \ ++ (!verify || \ ++ fsck_err(c, _msg ": got %llu, should be %llu" \ ++ , ##__VA_ARGS__, dst->_f, src->_f))) \ ++ dst->_f = src->_f +#define copy_stripe_field(_f, _msg, ...) \ -+ if (dst->_f != src->_f) { \ -+ if (verify) \ -+ fsck_err(c, "stripe %zu has wrong "_msg \ -+ ": got %u, should be %u", \ -+ iter.pos, ##__VA_ARGS__, \ -+ dst->_f, src->_f); \ -+ dst->_f = src->_f; \ -+ } ++ if (dst->_f != src->_f && \ ++ (!verify || \ ++ fsck_err(c, "stripe %zu has wrong "_msg \ ++ ": got %u, should be %u", \ ++ iter.pos, ##__VA_ARGS__, \ ++ dst->_f, src->_f))) \ ++ dst->_f = src->_f +#define copy_dev_field(_f, _msg, ...) \ + copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) +#define copy_fs_field(_f, _msg, ...) 
\ @@ -17102,7 +17257,7 @@ index 000000000..0e2c8745c + if (ca) + percpu_ref_put(&ca->ref); + if (ret) -+ bch_err(c, "%s: ret %i", __func__, ret); ++ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + + percpu_up_write(&c->mark_lock); + printbuf_exit(&buf); @@ -17157,21 +17312,19 @@ index 000000000..0e2c8745c + +static int bch2_alloc_write_key(struct btree_trans *trans, + struct btree_iter *iter, ++ struct bkey_s_c k, + bool metadata_only) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); + struct bucket gc, *b; -+ struct bkey_s_c k; + struct bkey_i_alloc_v4 *a; + struct bch_alloc_v4 old, new; + enum bch_data_type type; + int ret; + -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; ++ if (bkey_cmp(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) ++ return 1; + + bch2_alloc_to_v4(k, &old); + new = old; @@ -17212,7 +17365,8 @@ index 000000000..0e2c8745c + return 0; + +#define copy_bucket_field(_f) \ -+ if (fsck_err_on(new._f != gc._f, c, \ ++ if (c->opts.reconstruct_alloc || \ ++ fsck_err_on(new._f != gc._f, c, \ + "bucket %llu:%llu gen %u data type %s has wrong " #_f \ + ": got %u, should be %u", \ + iter->pos.inode, iter->pos.offset, \ @@ -17263,31 +17417,21 @@ index 000000000..0e2c8745c + bch2_trans_init(&trans, c, 0, 0); + + for_each_member_device(ca, c, i) { -+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, -+ POS(ca->dev_idx, ca->mi.first_bucket), -+ BTREE_ITER_SLOTS| -+ BTREE_ITER_PREFETCH, k, ret) { -+ if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) -+ break; ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, ++ POS(ca->dev_idx, ca->mi.first_bucket), ++ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW, ++ bch2_alloc_write_key(&trans, &iter, k, metadata_only)); + -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW, -+ bch2_alloc_write_key(&trans, &iter, -+ metadata_only)); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ret) { -+ bch_err(c, "error writing alloc info: %i", ret); ++ if (ret < 0) { ++ bch_err(c, "error writing alloc info: %s", bch2_err_str(ret)); + percpu_ref_put(&ca->ref); + break; + } + } + + bch2_trans_exit(&trans); -+ return ret; ++ return ret < 0 ? 
ret : 0; +} + +static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) @@ -17344,7 +17488,7 @@ index 000000000..0e2c8745c + bch2_trans_exit(&trans); + + if (ret) -+ bch_err(c, "error reading alloc info at gc start: %i", ret); ++ bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret)); + + return ret; +} @@ -17371,14 +17515,64 @@ index 000000000..0e2c8745c + }; +} + ++static int bch2_gc_write_reflink_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ size_t *idx) ++{ ++ struct bch_fs *c = trans->c; ++ const __le64 *refcount = bkey_refcount_c(k); ++ struct printbuf buf = PRINTBUF; ++ struct reflink_gc *r; ++ int ret = 0; ++ ++ if (!refcount) ++ return 0; ++ ++ while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && ++ r->offset < k.k->p.offset) ++ ++*idx; ++ ++ if (!r || ++ r->offset != k.k->p.offset || ++ r->size != k.k->size) { ++ bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); ++ return -EINVAL; ++ } ++ ++ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, ++ "reflink key has wrong refcount:\n" ++ " %s\n" ++ " should be %u", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ r->refcount)) { ++ struct bkey_i *new; ++ ++ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(new, k); ++ ++ if (!r->refcount) ++ new->k.type = KEY_TYPE_deleted; ++ else ++ *bkey_refcount(new) = cpu_to_le64(r->refcount); ++ ++ ret = bch2_trans_update(trans, iter, new, 0); ++ } ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ +static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ struct reflink_gc *r; + size_t idx = 0; -+ struct printbuf buf = PRINTBUF; + int ret = 0; + + if (metadata_only) @@ -17386,57 +17580,14 @@ index 000000000..0e2c8745c + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ const __le64 *refcount = bkey_refcount_c(k); ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_reflink, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_gc_write_reflink_key(&trans, &iter, k, &idx)); + -+ if (!refcount) -+ continue; -+ -+ r = genradix_ptr(&c->reflink_gc_table, idx++); -+ if (!r || -+ r->offset != k.k->p.offset || -+ r->size != k.k->size) { -+ bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); -+ ret = -EINVAL; -+ break; -+ } -+ -+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, -+ "reflink key has wrong refcount:\n" -+ " %s\n" -+ " should be %u", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), buf.buf), -+ r->refcount)) { -+ struct bkey_i *new; -+ -+ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); -+ if (!new) { -+ ret = -ENOMEM; -+ break; -+ } -+ -+ bkey_reassemble(new, k); -+ -+ if (!r->refcount) -+ new->k.type = KEY_TYPE_deleted; -+ else -+ *bkey_refcount(new) = cpu_to_le64(r->refcount); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); -+ kfree(new); -+ -+ if (ret) -+ break; -+ } -+ } -+fsck_err: -+ bch2_trans_iter_exit(&trans, &iter); + c->reflink_gc_nr = 0; + bch2_trans_exit(&trans); -+ printbuf_exit(&buf); + return ret; +} + @@ -17488,15 +17639,59 @@ index 000000000..0e2c8745c + r->refcount = 0; +} + ++static int bch2_gc_write_stripes_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ 
struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct printbuf buf = PRINTBUF; ++ const struct bch_stripe *s; ++ struct gc_stripe *m; ++ unsigned i; ++ int ret = 0; ++ ++ if (k.k->type != KEY_TYPE_stripe) ++ return 0; ++ ++ s = bkey_s_c_to_stripe(k).v; ++ m = genradix_ptr(&c->gc_stripes, k.k->p.offset); ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) ++ goto inconsistent; ++ return 0; ++inconsistent: ++ if (fsck_err_on(true, c, ++ "stripe has wrong block sector count %u:\n" ++ " %s\n" ++ " should be %u", i, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ m ? m->block_sectors[i] : 0)) { ++ struct bkey_i_stripe *new; ++ ++ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(&new->k_i, k); ++ ++ for (i = 0; i < new->v.nr_blocks; i++) ++ stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); ++ ++ ret = bch2_trans_update(trans, iter, &new->k_i, 0); ++ } ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ +static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ struct gc_stripe *m; -+ const struct bch_stripe *s; -+ struct printbuf buf = PRINTBUF; -+ unsigned i; + int ret = 0; + + if (metadata_only) @@ -17504,50 +17699,13 @@ index 000000000..0e2c8745c + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ if (k.k->type != KEY_TYPE_stripe) -+ continue; -+ -+ s = bkey_s_c_to_stripe(k).v; -+ m = genradix_ptr(&c->gc_stripes, k.k->p.offset); -+ -+ for (i = 0; i < s->nr_blocks; i++) -+ if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) -+ goto inconsistent; -+ continue; -+inconsistent: -+ if (fsck_err_on(true, c, -+ "stripe has wrong block sector count %u:\n" -+ " %s\n" -+ " should be %u", i, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), buf.buf), -+ m ? m->block_sectors[i] : 0)) { -+ struct bkey_i_stripe *new; -+ -+ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); -+ if (!new) { -+ ret = -ENOMEM; -+ break; -+ } -+ -+ bkey_reassemble(&new->k_i, k); -+ -+ for (i = 0; i < new->v.nr_blocks; i++) -+ stripe_blockcount_set(&new->v, i, m ? 
m->block_sectors[i] : 0); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i)); -+ kfree(new); -+ } -+ } -+fsck_err: -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_stripes, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_gc_write_stripes_key(&trans, &iter, k)); + + bch2_trans_exit(&trans); -+ -+ printbuf_exit(&buf); + return ret; +} + @@ -17609,7 +17767,7 @@ index 000000000..0e2c8745c + + ret = bch2_gc_btrees(c, initial, metadata_only); + -+ if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR && ++ if (ret == -BCH_ERR_need_topology_repair && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) && + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); @@ -17617,8 +17775,8 @@ index 000000000..0e2c8745c + ret = 0; + } + -+ if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR) -+ ret = FSCK_ERR_EXIT; ++ if (ret == -BCH_ERR_need_topology_repair) ++ ret = -BCH_ERR_fsck_errors_not_fixed; + + if (ret) + goto out; @@ -17680,10 +17838,15 @@ index 000000000..0e2c8745c + return ret; +} + -+static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) ++static int gc_btree_gens_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) +{ ++ struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; ++ struct bkey_i *u; ++ int ret; + + percpu_down_read(&c->mark_lock); + bkey_for_each_ptr(ptrs, ptr) { @@ -17691,7 +17854,7 @@ index 000000000..0e2c8745c + + if (ptr_stale(ca, ptr) > 16) { + percpu_up_read(&c->mark_lock); -+ return true; ++ goto update; + } + } + @@ -17703,77 +17866,27 @@ index 000000000..0e2c8745c + *gen = ptr->gen; + } + percpu_up_read(&c->mark_lock); ++ return 0; ++update: ++ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ return ret; + -+ return false; ++ bkey_reassemble(u, k); ++ ++ bch2_extent_normalize(c, bkey_i_to_s(u)); ++ return bch2_trans_update(trans, iter, u, 0); +} + -+/* -+ * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree -+ * node pointers currently never have cached pointers that can become stale: -+ */ -+static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_buf sk; -+ int ret = 0, commit_err = 0; -+ -+ bch2_bkey_buf_init(&sk); -+ -+ bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ -+ while ((bch2_trans_begin(trans), -+ k = bch2_btree_iter_peek(&iter)).k) { -+ ret = bkey_err(k); -+ -+ if (ret == -EINTR) -+ continue; -+ if (ret) -+ break; -+ -+ c->gc_gens_pos = iter.pos; -+ -+ if (gc_btree_gens_key(c, k) && !commit_err) { -+ bch2_bkey_buf_reassemble(&sk, c, k); -+ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); -+ -+ commit_err = -+ bch2_trans_update(trans, &iter, sk.k, 0) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOWAIT| -+ BTREE_INSERT_NOFAIL); -+ if (commit_err == -EINTR) { -+ commit_err = 0; -+ continue; -+ } -+ } -+ -+ bch2_btree_iter_advance(&iter); -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ bch2_bkey_buf_exit(&sk, c); -+ -+ return ret; -+} -+ -+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter) ++static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter 
*iter, ++ struct bkey_s_c k) +{ + struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); -+ struct bkey_s_c k; + struct bch_alloc_v4 a; + struct bkey_i_alloc_v4 *a_mut; + int ret; + -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ + bch2_alloc_to_v4(k, &a); + + if (a.oldest_gen == ca->oldest_gen[iter->pos.offset]) @@ -17833,26 +17946,35 @@ index 000000000..0e2c8745c + + for (i = 0; i < BTREE_ID_NR; i++) + if ((1 << i) & BTREE_ID_HAS_PTRS) { ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ + c->gc_gens_btree = i; + c->gc_gens_pos = POS_MIN; -+ ret = bch2_gc_btree_gens(&trans, i); ++ ret = for_each_btree_key_commit(&trans, iter, i, ++ POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, ++ k, ++ NULL, NULL, ++ BTREE_INSERT_NOFAIL, ++ gc_btree_gens_key(&trans, &iter, k)); + if (ret) { -+ bch_err(c, "error recalculating oldest_gen: %i", ret); ++ bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); + goto err; + } + } + -+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL, -+ bch2_alloc_write_oldest_gen(&trans, &iter)); -+ if (ret) { -+ bch_err(c, "error writing oldest_gen: %i", ret); -+ break; -+ } ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, ++ POS_MIN, ++ BTREE_ITER_PREFETCH, ++ k, ++ NULL, NULL, ++ BTREE_INSERT_NOFAIL, ++ bch2_alloc_write_oldest_gen(&trans, &iter, k)); ++ if (ret) { ++ bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret)); ++ goto err; + } -+ bch2_trans_iter_exit(&trans, &iter); + + c->gc_gens_btree = 0; + c->gc_gens_pos = POS_MIN; @@ -17922,7 +18044,7 @@ index 000000000..0e2c8745c + ret = bch2_gc_gens(c); +#endif + if (ret < 0) -+ bch_err(c, "btree gc failed: %i", ret); ++ bch_err(c, "btree gc failed: %s", bch2_err_str(ret)); + + debug_check_no_locks_held(); + } @@ -17952,7 +18074,7 @@ index 000000000..0e2c8745c + + p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); + if (IS_ERR(p)) { -+ bch_err(c, "error creating gc thread: %li", PTR_ERR(p)); ++ bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p))); + return PTR_ERR(p); + } + @@ -17963,7 +18085,7 @@ index 000000000..0e2c8745c +} diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h new file mode 100644 -index 000000000..95d803b57 +index 000000000000..95d803b5743d --- /dev/null +++ b/fs/bcachefs/btree_gc.h @@ -0,0 +1,112 @@ @@ -18081,7 +18203,7 @@ index 000000000..95d803b57 +#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c new file mode 100644 -index 000000000..9bf3f77bc +index 000000000000..ae731b3a3908 --- /dev/null +++ b/fs/bcachefs/btree_io.c @@ -0,0 +1,2150 @@ @@ -18630,7 +18752,7 @@ index 000000000..9bf3f77bc + struct printbuf out = PRINTBUF; \ + \ + btree_err_msg(&out, c, ca, b, i, b->written, write); \ -+ prt_printf(&out, ": " msg, ##__VA_ARGS__); \ ++ prt_printf(&out, ": " msg, ##__VA_ARGS__); \ + \ + if (type == BTREE_ERR_FIXABLE && \ + write == READ && \ @@ -18645,7 +18767,7 @@ index 000000000..9bf3f77bc + \ + switch (type) { \ + case BTREE_ERR_FIXABLE: \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ ret = -BCH_ERR_fsck_errors_not_fixed; \ + goto fsck_err; \ + case BTREE_ERR_WANT_RETRY: \ + if (have_retry) { \ @@ -18657,7 +18779,7 @@ index 000000000..9bf3f77bc + ret = BTREE_RETRY_READ; \ + goto fsck_err; \ + case BTREE_ERR_FATAL: \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ ret = -BCH_ERR_fsck_errors_not_fixed; \ + goto 
fsck_err; \ + } \ + break; \ @@ -18665,7 +18787,7 @@ index 000000000..9bf3f77bc + bch_err(c, "corrupt metadata before write: %s", out.buf);\ + \ + if (bch2_fs_inconsistent(c)) { \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ ret = -BCH_ERR_fsck_errors_not_fixed; \ + goto fsck_err; \ + } \ + break; \ @@ -20237,7 +20359,7 @@ index 000000000..9bf3f77bc +} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h new file mode 100644 -index 000000000..8af853642 +index 000000000000..8af853642123 --- /dev/null +++ b/fs/bcachefs/btree_io.h @@ -0,0 +1,222 @@ @@ -20465,10 +20587,10 @@ index 000000000..8af853642 +#endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c new file mode 100644 -index 000000000..a1512eb06 +index 000000000000..a90a45939aa3 --- /dev/null +++ b/fs/bcachefs/btree_iter.c -@@ -0,0 +1,3471 @@ +@@ -0,0 +1,3515 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -20487,6 +20609,7 @@ index 000000000..a1512eb06 +#include "replicas.h" +#include "subvolume.h" + ++#include +#include +#include + @@ -20517,7 +20640,7 @@ index 000000000..a1512eb06 + if (need_resched() || race_fault()) { + bch2_trans_unlock(trans); + schedule(); -+ return bch2_trans_relock(trans) ? 0 : -EINTR; ++ return bch2_trans_relock(trans); + } else { + return 0; + } @@ -20664,12 +20787,14 @@ index 000000000..a1512eb06 + return true; + } +fail: -+ trace_btree_node_relock_fail(trans->fn, _RET_IP_, -+ path->btree_id, -+ &path->pos, -+ (unsigned long) b, -+ path->l[level].lock_seq, -+ is_btree_node(path, level) ? b->c.lock.state.seq : 0); ++ if (b != BTREE_ITER_NO_NODE_CACHED && ++ b != BTREE_ITER_NO_NODE_INIT) ++ trace_btree_node_relock_fail(trans->fn, _RET_IP_, ++ path->btree_id, ++ &path->pos, ++ (unsigned long) b, ++ path->l[level].lock_seq, ++ is_btree_node(path, level) ? 
b->c.lock.state.seq : 0); + return false; +} + @@ -20705,7 +20830,7 @@ index 000000000..a1512eb06 + + if (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { -+ btree_node_unlock(path, level); ++ btree_node_unlock(trans, path, level); + goto success; + } + @@ -20740,7 +20865,7 @@ index 000000000..a1512eb06 + * the node that we failed to relock: + */ + if (fail_idx >= 0) { -+ __bch2_btree_path_unlock(path); ++ __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + + do { @@ -20766,13 +20891,13 @@ index 000000000..a1512eb06 +} + +/* Slowpath: */ -+bool __bch2_btree_node_lock(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b, -+ struct bpos pos, unsigned level, -+ enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p, -+ unsigned long ip) ++int __bch2_btree_node_lock(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct bpos pos, unsigned level, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip) +{ + struct btree_path *linked; + unsigned reason; @@ -20834,7 +20959,6 @@ index 000000000..a1512eb06 + if (btree_node_locked(linked, level) && + bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, + linked->cached)) <= 0) { -+ BUG_ON(trans->in_traverse_all); + reason = 7; + goto deadlock; + } @@ -20851,8 +20975,7 @@ index 000000000..a1512eb06 + path->btree_id, + path->cached, + &pos); -+ btree_trans_restart(trans); -+ return false; ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); +} + +/* Btree iterator locking: */ @@ -20890,8 +21013,8 @@ index 000000000..a1512eb06 +/* + * Only for btree_cache.c - only relocks intent locks + */ -+bool bch2_btree_path_relock_intent(struct btree_trans *trans, -+ struct btree_path *path) ++int bch2_btree_path_relock_intent(struct btree_trans *trans, ++ struct btree_path *path) +{ + unsigned l; + @@ -20899,20 +21022,19 @@ index 000000000..a1512eb06 + l < path->locks_want && btree_path_node(path, l); + l++) { + if (!bch2_btree_node_relock(trans, path, l)) { -+ __bch2_btree_path_unlock(path); ++ __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_, + path->btree_id, &path->pos); -+ btree_trans_restart(trans); -+ return false; ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); + } + } + -+ return true; ++ return 0; +} + +__flatten -+static bool bch2_btree_path_relock(struct btree_trans *trans, ++static int bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + bool ret = btree_path_get_locks(trans, path, false); @@ -20920,9 +21042,10 @@ index 000000000..a1512eb06 + if (!ret) { + trace_trans_restart_relock_path(trans->fn, trace_ip, + path->btree_id, &path->pos); -+ btree_trans_restart(trans); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); + } -+ return ret; ++ ++ return 0; +} + +bool __bch2_btree_path_upgrade(struct btree_trans *trans, @@ -20970,7 +21093,8 @@ index 000000000..a1512eb06 + return false; +} + -+void __bch2_btree_path_downgrade(struct btree_path *path, ++void __bch2_btree_path_downgrade(struct btree_trans *trans, ++ struct btree_path *path, + unsigned new_locks_want) +{ + unsigned l; @@ -20982,7 +21106,7 @@ index 000000000..a1512eb06 + while (path->nodes_locked && + (l 
= __fls(path->nodes_locked)) >= path->locks_want) { + if (l > path->level) { -+ btree_node_unlock(path, l); ++ btree_node_unlock(trans, path, l); + } else { + if (btree_node_intent_locked(path, l)) { + six_lock_downgrade(&path->l[l].b->c.lock); @@ -21000,27 +21124,27 @@ index 000000000..a1512eb06 + struct btree_path *path; + + trans_for_each_path(trans, path) -+ bch2_btree_path_downgrade(path); ++ bch2_btree_path_downgrade(trans, path); +} + +/* Btree transaction locking: */ + -+bool bch2_trans_relock(struct btree_trans *trans) ++int bch2_trans_relock(struct btree_trans *trans) +{ + struct btree_path *path; + + if (unlikely(trans->restarted)) -+ return false; ++ return -BCH_ERR_transaction_restart_relock; + + trans_for_each_path(trans, path) + if (path->should_be_locked && -+ !bch2_btree_path_relock(trans, path, _RET_IP_)) { ++ bch2_btree_path_relock(trans, path, _RET_IP_)) { + trace_trans_restart_relock(trans->fn, _RET_IP_, + path->btree_id, &path->pos); + BUG_ON(!trans->restarted); -+ return false; ++ return -BCH_ERR_transaction_restart_relock; + } -+ return true; ++ return 0; +} + +void bch2_trans_unlock(struct btree_trans *trans) @@ -21028,7 +21152,7 @@ index 000000000..a1512eb06 + struct btree_path *path; + + trans_for_each_path(trans, path) -+ __bch2_btree_path_unlock(path); ++ __bch2_btree_path_unlock(trans, path); + + /* + * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking @@ -21056,7 +21180,7 @@ index 000000000..a1512eb06 + bkey_cmp(ck->key.pos, path->pos)); + + if (!locked) -+ btree_node_unlock(path, 0); ++ btree_node_unlock(trans, path, 0); +} + +static void bch2_btree_path_verify_level(struct btree_trans *trans, @@ -21113,7 +21237,7 @@ index 000000000..a1512eb06 + } + + if (!locked) -+ btree_node_unlock(path, level); ++ btree_node_unlock(trans, path, level); + return; +err: + bch2_bpos_to_text(&buf1, path->pos); @@ -21490,27 +21614,29 @@ index 000000000..a1512eb06 + bch2_btree_node_iter_peek_all(&l->iter, l->b)); +} + -+static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c, ++static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, + struct btree_path *path, + struct btree_path_level *l, + struct bkey *u) +{ -+ struct bkey_s_c k = __btree_iter_unpack(c, l, u, ++ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, + bch2_btree_node_iter_peek(&l->iter, l->b)); + + path->pos = k.k ? k.k->p : l->b->key.k.p; ++ bch2_btree_path_verify_level(trans, path, l - path->l); + return k; +} + -+static inline struct bkey_s_c btree_path_level_prev(struct bch_fs *c, ++static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, + struct btree_path *path, + struct btree_path_level *l, + struct bkey *u) +{ -+ struct bkey_s_c k = __btree_iter_unpack(c, l, u, ++ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, + bch2_btree_node_iter_prev(&l->iter, l->b)); + + path->pos = k.k ? 
k.k->p : l->b->data->min_key; ++ bch2_btree_path_verify_level(trans, path, l - path->l); + return k; +} + @@ -21585,7 +21711,7 @@ index 000000000..a1512eb06 + } + + if (!parent_locked) -+ btree_node_unlock(path, plevel); ++ btree_node_unlock(trans, path, plevel); +} + +static inline void __btree_path_level_init(struct btree_path *path, @@ -21637,7 +21763,7 @@ index 000000000..a1512eb06 + + if (path->nodes_locked && + t != BTREE_NODE_UNLOCKED) { -+ btree_node_unlock(path, b->c.level); ++ btree_node_unlock(trans, path, b->c.level); + six_lock_increment(&b->c.lock, t); + mark_btree_node_locked(trans, path, b->c.level, t); + } @@ -21665,7 +21791,9 @@ index 000000000..a1512eb06 + struct btree *b = container_of(lock, struct btree, c.lock); + struct btree **rootp = p; + -+ return b == *rootp ? 0 : -1; ++ if (b != *rootp) ++ return BCH_ERR_lock_fail_root_changed; ++ return 0; +} + +static inline int btree_path_lock_root(struct btree_trans *trans, @@ -21677,6 +21805,7 @@ index 000000000..a1512eb06 + struct btree *b, **rootp = &c->btree_roots[path->btree_id].b; + enum six_lock_type lock_type; + unsigned i; ++ int ret; + + EBUG_ON(path->nodes_locked); + @@ -21698,13 +21827,16 @@ index 000000000..a1512eb06 + } + + lock_type = __btree_lock_want(path, path->level); -+ if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX, -+ path->level, lock_type, -+ lock_root_check_fn, rootp, -+ trace_ip))) { -+ if (trans->restarted) -+ return -EINTR; -+ continue; ++ ret = btree_node_lock(trans, path, b, SPOS_MAX, ++ path->level, lock_type, ++ lock_root_check_fn, rootp, ++ trace_ip); ++ if (unlikely(ret)) { ++ if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) ++ continue; ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ return ret; ++ BUG(); + } + + if (likely(b == READ_ONCE(*rootp) && @@ -21756,7 +21888,7 @@ index 000000000..a1512eb06 + } + + if (!was_locked) -+ btree_node_unlock(path, path->level); ++ btree_node_unlock(trans, path, path->level); + + bch2_bkey_buf_exit(&tmp, c); + return ret; @@ -21791,7 +21923,7 @@ index 000000000..a1512eb06 + } + + if (!was_locked) -+ btree_node_unlock(path, path->level); ++ btree_node_unlock(trans, path, path->level); + + bch2_bkey_buf_exit(&tmp, c); + return ret; @@ -21816,7 +21948,7 @@ index 000000000..a1512eb06 + bp->mem_ptr = (unsigned long)b; + + if (!locked) -+ btree_node_unlock(path, plevel); ++ btree_node_unlock(trans, path, plevel); +} + +static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, @@ -21889,7 +22021,7 @@ index 000000000..a1512eb06 + btree_node_mem_ptr_set(trans, path, level + 1, b); + + if (btree_node_read_locked(path, level + 1)) -+ btree_node_unlock(path, level + 1); ++ btree_node_unlock(trans, path, level + 1); + path->level = level; + + bch2_btree_path_verify_locks(path); @@ -21909,11 +22041,11 @@ index 000000000..a1512eb06 + int i, ret = 0; + + if (trans->in_traverse_all) -+ return -EINTR; ++ return -BCH_ERR_transaction_restart_in_traverse_all; + + trans->in_traverse_all = true; +retry_all: -+ trans->restarted = false; ++ trans->restarted = 0; + trans->traverse_all_idx = U8_MAX; + + trans_for_each_path(trans, path) @@ -21957,7 +22089,8 @@ index 000000000..a1512eb06 + */ + if (path->uptodate) { + ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); -+ if (ret == -EINTR || ret == -ENOMEM) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || ++ ret == -ENOMEM) + goto retry_all; + if (ret) + goto err; @@ -21998,9 +22131,10 @@ index 000000000..a1512eb06 + return true; +} + -+static void 
btree_path_set_level_up(struct btree_path *path) ++static void btree_path_set_level_up(struct btree_trans *trans, ++ struct btree_path *path) +{ -+ btree_node_unlock(path, path->level); ++ btree_node_unlock(trans, path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); @@ -22016,7 +22150,7 @@ index 000000000..a1512eb06 + + for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) + if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) -+ btree_node_unlock(path, l); ++ btree_node_unlock(trans, path, l); + + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_path_verify(trans, path); @@ -22030,7 +22164,7 @@ index 000000000..a1512eb06 + + while (btree_path_node(path, l) && + !btree_path_good_node(trans, path, l, check_pos)) { -+ btree_node_unlock(path, l); ++ btree_node_unlock(trans, path, l); + path->l[l].b = BTREE_ITER_NO_NODE_UP; + l++; + } @@ -22041,7 +22175,7 @@ index 000000000..a1512eb06 + i++) + if (!bch2_btree_node_relock(trans, path, i)) + while (l <= i) { -+ btree_node_unlock(path, l); ++ btree_node_unlock(trans, path, l); + path->l[l].b = BTREE_ITER_NO_NODE_UP; + l++; + } @@ -22064,19 +22198,17 @@ index 000000000..a1512eb06 + unsigned long trace_ip) +{ + unsigned depth_want = path->level; -+ int ret = 0; ++ int ret = trans->restarted; + -+ if (unlikely(trans->restarted)) { -+ ret = -EINTR; ++ if (unlikely(ret)) + goto out; -+ } + + /* + * Ensure we obey path->should_be_locked: if it's set, we can't unlock + * and re-traverse the path without a transaction restart: + */ + if (path->should_be_locked) { -+ ret = bch2_btree_path_relock(trans, path, trace_ip) ? 0 : -EINTR; ++ ret = bch2_btree_path_relock(trans, path, trace_ip); + goto out; + } + @@ -22110,7 +22242,7 @@ index 000000000..a1512eb06 + goto out; + } + -+ __bch2_btree_path_unlock(path); ++ __bch2_btree_path_unlock(trans, path); + path->level = depth_want; + + if (ret == -EIO) @@ -22125,7 +22257,7 @@ index 000000000..a1512eb06 + + path->uptodate = BTREE_ITER_UPTODATE; +out: -+ BUG_ON((ret == -EINTR) != !!trans->restarted); ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); + bch2_btree_path_verify(trans, path); + return ret; +} @@ -22133,6 +22265,16 @@ index 000000000..a1512eb06 +int __must_check bch2_btree_path_traverse(struct btree_trans *trans, + struct btree_path *path, unsigned flags) +{ ++ if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { ++ unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U); ++ u64 mask = ~(~0ULL << restart_probability_bits); ++ ++ if ((prandom_u32() & mask) == mask) { ++ trace_transaction_restart_injected(trans->fn, _RET_IP_); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); ++ } ++ } ++ + if (path->uptodate < BTREE_ITER_NEED_RELOCK) + return 0; + @@ -22207,7 +22349,7 @@ index 000000000..a1512eb06 + bch2_btree_path_check_sort(trans, path, cmp); + + if (unlikely(path->cached)) { -+ btree_node_unlock(path, 0); ++ btree_node_unlock(trans, path, 0); + path->l[0].b = BTREE_ITER_NO_NODE_CACHED; + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + goto out; @@ -22230,7 +22372,7 @@ index 000000000..a1512eb06 + + if (l != path->level) { + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); -+ __bch2_btree_path_unlock(path); ++ __bch2_btree_path_unlock(trans, path); + } +out: + bch2_btree_path_verify(trans, path); @@ -22271,7 +22413,7 @@ index 000000000..a1512eb06 + +static inline void __bch2_path_free(struct btree_trans *trans, 
struct btree_path *path) +{ -+ __bch2_btree_path_unlock(path); ++ __bch2_btree_path_unlock(trans, path); + btree_path_list_remove(trans, path); + trans->paths_allocated &= ~(1ULL << path->idx); +} @@ -22609,26 +22751,25 @@ index 000000000..a1512eb06 + + /* got to end? */ + if (!btree_path_node(path, path->level + 1)) { -+ btree_path_set_level_up(path); ++ btree_path_set_level_up(trans, path); + return NULL; + } + + if (!bch2_btree_node_relock(trans, path, path->level + 1)) { -+ __bch2_btree_path_unlock(path); ++ __bch2_btree_path_unlock(trans, path); + path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; + path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, + path->btree_id, &path->pos); -+ btree_trans_restart(trans); -+ ret = -EINTR; ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + goto err; + } + + b = btree_path_node(path, path->level + 1); + + if (!bpos_cmp(iter->pos, b->key.k.p)) { -+ btree_node_unlock(path, path->level); ++ btree_node_unlock(trans, path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; + } else { @@ -22997,8 +23138,8 @@ index 000000000..a1512eb06 +out: + if (iter->update_path) { + if (iter->update_path->uptodate && -+ !bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)) { -+ k = bkey_s_c_err(-EINTR); ++ (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_))) { ++ k = bkey_s_c_err(ret); + } else { + BUG_ON(!(iter->update_path->nodes_locked & 1)); + iter->update_path->should_be_locked = true; @@ -23065,7 +23206,7 @@ index 000000000..a1512eb06 + (iter->advanced && + !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) { + iter->pos = path_l(iter->path)->b->key.k.p; -+ btree_path_set_level_up(iter->path); ++ btree_path_set_level_up(trans, iter->path); + iter->advanced = false; + continue; + } @@ -23165,13 +23306,13 @@ index 000000000..a1512eb06 + goto out; + } + -+ k = btree_path_level_peek(trans->c, iter->path, ++ k = btree_path_level_peek(trans, iter->path, + &iter->path->l[0], &iter->k); + if (!k.k || + ((iter->flags & BTREE_ITER_IS_EXTENTS) + ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0 + : bpos_cmp(k.k->p, search_key) > 0)) -+ k = btree_path_level_prev(trans->c, iter->path, ++ k = btree_path_level_prev(trans, iter->path, + &iter->path->l[0], &iter->k); + + bch2_btree_path_check_sort(trans, iter->path, 0); @@ -23655,8 +23796,7 @@ index 000000000..a1512eb06 + + if (old_bytes) { + trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes); -+ btree_trans_restart(trans); -+ return ERR_PTR(-EINTR); ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); + } + } + @@ -23670,11 +23810,11 @@ index 000000000..a1512eb06 + * bch2_trans_begin() - reset a transaction after a interrupted attempt + * @trans: transaction to reset + * -+ * While iterating over nodes or updating nodes a attempt to lock a btree -+ * node may return EINTR when the trylock fails. When this occurs -+ * bch2_trans_begin() should be called and the transaction retried. ++ * While iterating over nodes or updating nodes a attempt to lock a btree node ++ * may return BCH_ERR_transaction_restart when the trylock fails. When this ++ * occurs bch2_trans_begin() should be called and the transaction retried. 
+ */ -+void bch2_trans_begin(struct btree_trans *trans) ++u32 bch2_trans_begin(struct btree_trans *trans) +{ + struct btree_path *path; + @@ -23712,12 +23852,28 @@ index 000000000..a1512eb06 + path->preserve = false; + } + -+ bch2_trans_cond_resched(trans); ++ if (!trans->restarted && ++ (need_resched() || ++ ktime_get_ns() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { ++ bch2_trans_unlock(trans); ++ cond_resched(); ++ bch2_trans_relock(trans); ++ } + ++ trans->last_restarted_ip = _RET_IP_; + if (trans->restarted) + bch2_btree_path_traverse_all(trans); + -+ trans->restarted = false; ++ trans->last_begin_time = ktime_get_ns(); ++ return trans->restart_count; ++} ++ ++void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count) ++{ ++ bch2_trans_inconsistent_on(trans_was_restarted(trans, restart_count), trans, ++ "trans->restart_count %u, should be %u, last restarted by %ps\n", ++ trans->restart_count, restart_count, ++ (void *) trans->last_restarted_ip); +} + +static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) @@ -23751,8 +23907,18 @@ index 000000000..a1512eb06 + memset(trans, 0, sizeof(*trans)); + trans->c = c; + trans->fn = fn; ++ trans->last_begin_time = ktime_get_ns(); + trans->task = current; + ++ while (c->lock_held_stats.names[trans->lock_name_idx] != fn ++ && c->lock_held_stats.names[trans->lock_name_idx] != 0) ++ trans->lock_name_idx++; ++ ++ if (trans->lock_name_idx >= BCH_LOCK_TIME_NR) ++ pr_warn_once("lock_times array not big enough!"); ++ else ++ c->lock_held_stats.names[trans->lock_name_idx] = fn; ++ + bch2_trans_alloc_paths(trans, c); + + if (expected_mem_bytes) { @@ -23942,10 +24108,10 @@ index 000000000..a1512eb06 +} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h new file mode 100644 -index 000000000..9da0a4152 +index 000000000000..1b02f75d4cab --- /dev/null +++ b/fs/bcachefs/btree_iter.h -@@ -0,0 +1,411 @@ +@@ -0,0 +1,556 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_ITER_H +#define _BCACHEFS_BTREE_ITER_H @@ -23953,6 +24119,8 @@ index 000000000..9da0a4152 +#include "bset.h" +#include "btree_types.h" + ++#include ++ +static inline void __btree_path_get(struct btree_path *path, bool intent) +{ + path->ref++; @@ -24107,19 +24275,36 @@ index 000000000..9da0a4152 + struct btree *, struct btree_node_iter *, + struct bkey_packed *, unsigned, unsigned); + -+bool bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); ++int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); + +void bch2_path_put(struct btree_trans *, struct btree_path *, bool); + -+bool bch2_trans_relock(struct btree_trans *); ++int bch2_trans_relock(struct btree_trans *); +void bch2_trans_unlock(struct btree_trans *); + -+__always_inline -+static inline int btree_trans_restart(struct btree_trans *trans) ++static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count) +{ -+ trans->restarted = true; -+ bch2_trans_unlock(trans); -+ return -EINTR; ++ return restart_count != trans->restart_count; ++} ++ ++void bch2_trans_verify_not_restarted(struct btree_trans *, u32); ++ ++__always_inline ++static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int err) ++{ ++ BUG_ON(err <= 0); ++ BUG_ON(!bch2_err_matches(err, BCH_ERR_transaction_restart)); ++ ++ trans->restarted = err; ++ trans->restart_count++; ++ return -err; ++} ++ ++__always_inline ++static inline int btree_trans_restart(struct btree_trans *trans, int err) ++{ ++ 
btree_trans_restart_nounlock(trans, err); ++ return -err; +} + +bool bch2_btree_node_upgrade(struct btree_trans *, @@ -24139,14 +24324,15 @@ index 000000000..9da0a4152 + : path->uptodate == BTREE_ITER_UPTODATE; +} + -+void __bch2_btree_path_downgrade(struct btree_path *, unsigned); ++void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned); + -+static inline void bch2_btree_path_downgrade(struct btree_path *path) ++static inline void bch2_btree_path_downgrade(struct btree_trans *trans, ++ struct btree_path *path) +{ + unsigned new_locks_want = path->level + !!path->intent_ref; + + if (path->locks_want > new_locks_want) -+ __bch2_btree_path_downgrade(path, new_locks_want); ++ __bch2_btree_path_downgrade(trans, path, new_locks_want); +} + +void bch2_trans_downgrade(struct btree_trans *); @@ -24231,7 +24417,7 @@ index 000000000..9da0a4152 +} + +void *bch2_trans_kmalloc(struct btree_trans *, size_t); -+void bch2_trans_begin(struct btree_trans *); ++u32 bch2_trans_begin(struct btree_trans *); + +static inline struct btree * +__btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter) @@ -24239,7 +24425,7 @@ index 000000000..9da0a4152 + struct btree *b; + + while (b = bch2_btree_iter_peek_node(iter), -+ PTR_ERR_OR_ZERO(b) == -EINTR) ++ bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) + bch2_trans_begin(trans); + + return b; @@ -24263,6 +24449,15 @@ index 000000000..9da0a4152 + return PTR_ERR_OR_ZERO(k.k); +} + ++static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, ++ unsigned flags) ++{ ++ BUG_ON(flags & BTREE_ITER_ALL_LEVELS); ++ ++ return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : ++ bch2_btree_iter_peek_prev(iter); ++} ++ +static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, + unsigned flags) +{ @@ -24286,8 +24481,12 @@ index 000000000..9da0a4152 + +static inline int btree_trans_too_many_iters(struct btree_trans *trans) +{ -+ return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2 -+ ? 
-EINTR : 0; ++ if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX) { ++ trace_trans_restart_too_many_iters(trans->fn, _THIS_IP_); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); ++ } ++ ++ return 0; +} + +static inline struct bkey_s_c @@ -24298,12 +24497,124 @@ index 000000000..9da0a4152 + + while (btree_trans_too_many_iters(trans) || + (k = bch2_btree_iter_peek_type(iter, flags), -+ bkey_err(k) == -EINTR)) ++ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) + bch2_trans_begin(trans); + + return k; +} + ++#define lockrestart_do(_trans, _do) \ ++({ \ ++ u32 _restart_count; \ ++ int _ret; \ ++ \ ++ do { \ ++ _restart_count = bch2_trans_begin(_trans); \ ++ _ret = (_do); \ ++ } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart)); \ ++ \ ++ if (!_ret) \ ++ bch2_trans_verify_not_restarted(_trans, _restart_count);\ ++ \ ++ _ret; \ ++}) ++ ++/* ++ * nested_lockrestart_do(), nested_commit_do(): ++ * ++ * These are like lockrestart_do() and commit_do(), with two differences: ++ * ++ * - We don't call bch2_trans_begin() unless we had a transaction restart ++ * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a ++ * transaction restart ++ */ ++#define nested_lockrestart_do(_trans, _do) \ ++({ \ ++ u32 _restart_count, _orig_restart_count; \ ++ int _ret; \ ++ \ ++ _restart_count = _orig_restart_count = (_trans)->restart_count; \ ++ \ ++ while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\ ++ _restart_count = bch2_trans_begin(_trans); \ ++ \ ++ if (!_ret) \ ++ bch2_trans_verify_not_restarted(_trans, _restart_count);\ ++ \ ++ if (!_ret && trans_was_restarted(_trans, _orig_restart_count)) \ ++ _ret = -BCH_ERR_transaction_restart_nested; \ ++ \ ++ _ret; \ ++}) ++ ++#define for_each_btree_key2(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _do) \ ++({ \ ++ int _ret = 0; \ ++ \ ++ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ \ ++ while (1) { \ ++ u32 _restart_count = bch2_trans_begin(_trans); \ ++ (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ ++ if (!(_k).k) { \ ++ _ret = 0; \ ++ break; \ ++ } \ ++ \ ++ _ret = bkey_err(_k) ?: (_do); \ ++ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ ++ continue; \ ++ if (_ret) \ ++ break; \ ++ bch2_trans_verify_not_restarted(_trans, _restart_count);\ ++ if (!bch2_btree_iter_advance(&(_iter))) \ ++ break; \ ++ } \ ++ \ ++ bch2_trans_iter_exit((_trans), &(_iter)); \ ++ _ret; \ ++}) ++ ++#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _do) \ ++({ \ ++ int _ret = 0; \ ++ \ ++ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ \ ++ while (1) { \ ++ u32 _restart_count = bch2_trans_begin(_trans); \ ++ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ ++ if (!(_k).k) { \ ++ _ret = 0; \ ++ break; \ ++ } \ ++ \ ++ _ret = bkey_err(_k) ?: (_do); \ ++ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ ++ continue; \ ++ if (_ret) \ ++ break; \ ++ bch2_trans_verify_not_restarted(_trans, _restart_count);\ ++ if (!bch2_btree_iter_rewind(&(_iter))) \ ++ break; \ ++ } \ ++ \ ++ bch2_trans_iter_exit((_trans), &(_iter)); \ ++ _ret; \ ++}) ++ ++#define for_each_btree_key_commit(_trans, _iter, _btree_id, \ ++ _start, _iter_flags, _k, \ ++ _disk_res, _journal_seq, _commit_flags,\ ++ _do) \ ++ for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ ++ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ ++ (_journal_seq), (_commit_flags))) ++ 
+#define for_each_btree_key(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ @@ -24359,10 +24670,10 @@ index 000000000..9da0a4152 +#endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c new file mode 100644 -index 000000000..a5b0a956e +index 000000000000..661006e427f2 --- /dev/null +++ b/fs/bcachefs/btree_key_cache.c -@@ -0,0 +1,850 @@ +@@ -0,0 +1,855 @@ + +#include "bcachefs.h" +#include "btree_cache.h" @@ -24370,6 +24681,7 @@ index 000000000..a5b0a956e +#include "btree_key_cache.h" +#include "btree_locking.h" +#include "btree_update.h" ++#include "errcode.h" +#include "error.h" +#include "journal.h" +#include "journal_reclaim.h" @@ -24657,7 +24969,7 @@ index 000000000..a5b0a956e + if (!bch2_btree_node_relock(trans, ck_path, 0)) { + trace_trans_restart_relock_key_cache_fill(trans->fn, + _THIS_IP_, ck_path->btree_id, &ck_path->pos); -+ ret = btree_trans_restart(trans); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); + goto err; + } + @@ -24712,8 +25024,10 @@ index 000000000..a5b0a956e + struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); + const struct btree_path *path = p; + -+ return ck->key.btree_id == path->btree_id && -+ !bpos_cmp(ck->key.pos, path->pos) ? 0 : -1; ++ if (ck->key.btree_id != path->btree_id && ++ bpos_cmp(ck->key.pos, path->pos)) ++ return BCH_ERR_lock_fail_node_reused; ++ return 0; +} + +__flatten @@ -24752,14 +25066,15 @@ index 000000000..a5b0a956e + } else { + enum six_lock_type lock_want = __btree_lock_want(path, 0); + -+ if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0, -+ lock_want, -+ bkey_cached_check_fn, path, _THIS_IP_)) { -+ if (!trans->restarted) ++ ret = btree_node_lock(trans, path, (void *) ck, path->pos, 0, ++ lock_want, ++ bkey_cached_check_fn, path, _THIS_IP_); ++ if (ret) { ++ if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) + goto retry; -+ -+ ret = -EINTR; -+ goto err; ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto err; ++ BUG(); + } + + if (ck->key.btree_id != path->btree_id || @@ -24778,7 +25093,7 @@ index 000000000..a5b0a956e + if (!path->locks_want && + !__bch2_btree_path_upgrade(trans, path, 1)) { + trace_transaction_restart_ip(trans->fn, _THIS_IP_); -+ ret = btree_trans_restart(trans); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + goto err; + } + @@ -24795,8 +25110,8 @@ index 000000000..a5b0a956e + + return ret; +err: -+ if (ret != -EINTR) { -+ btree_node_unlock(path, 0); ++ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ++ btree_node_unlock(trans, path, 0); + path->l[0].b = BTREE_ITER_NO_NODE_ERROR; + } + return ret; @@ -24862,13 +25177,14 @@ index 000000000..a5b0a956e + ? 
JOURNAL_WATERMARK_reserved + : 0)| + commit_flags); -+ if (ret) { -+ bch2_fs_fatal_err_on(ret != -EINTR && -+ ret != -EAGAIN && -+ !bch2_journal_error(j), c, -+ "error flushing key cache: %i", ret); ++ ++ bch2_fs_fatal_err_on(ret && ++ !bch2_err_matches(ret, BCH_ERR_transaction_restart) && ++ !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && ++ !bch2_journal_error(j), c, ++ "error flushing key cache: %s", bch2_err_str(ret)); ++ if (ret) + goto out; -+ } + + bch2_journal_pin_drop(j, &ck->journal); + bch2_journal_preres_put(j, &ck->res); @@ -25215,7 +25531,7 @@ index 000000000..a5b0a956e +} diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h new file mode 100644 -index 000000000..670746e72 +index 000000000000..670746e72dab --- /dev/null +++ b/fs/bcachefs/btree_key_cache.h @@ -0,0 +1,47 @@ @@ -25268,10 +25584,10 @@ index 000000000..670746e72 +#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h new file mode 100644 -index 000000000..67c970d72 +index 000000000000..49eef650e436 --- /dev/null +++ b/fs/bcachefs/btree_locking.h -@@ -0,0 +1,259 @@ +@@ -0,0 +1,289 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_LOCKING_H +#define _BCACHEFS_BTREE_LOCKING_H @@ -25332,7 +25648,7 @@ index 000000000..67c970d72 + path->nodes_intent_locked &= ~(1 << level); +} + -+static inline void mark_btree_node_locked(struct btree_trans *trans, ++static inline void mark_btree_node_locked_noreset(struct btree_trans *trans, + struct btree_path *path, + unsigned level, + enum six_lock_type type) @@ -25347,11 +25663,22 @@ index 000000000..67c970d72 + path->nodes_intent_locked |= type << level; +} + ++static inline void mark_btree_node_locked(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned level, ++ enum six_lock_type type) ++{ ++ mark_btree_node_locked_noreset(trans, path, level, type); ++#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS ++ path->l[level].lock_taken_time = ktime_get_ns(); ++#endif ++} ++ +static inline void mark_btree_node_intent_locked(struct btree_trans *trans, + struct btree_path *path, + unsigned level) +{ -+ mark_btree_node_locked(trans, path, level, SIX_LOCK_intent); ++ mark_btree_node_locked_noreset(trans, path, level, SIX_LOCK_intent); +} + +static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) @@ -25373,23 +25700,35 @@ index 000000000..67c970d72 + return BTREE_NODE_UNLOCKED; +} + -+static inline void btree_node_unlock(struct btree_path *path, unsigned level) ++static inline void btree_node_unlock(struct btree_trans *trans, ++ struct btree_path *path, unsigned level) +{ + int lock_type = btree_node_locked_type(path, level); + + EBUG_ON(level >= BTREE_MAX_DEPTH); + -+ if (lock_type != BTREE_NODE_UNLOCKED) ++ if (lock_type != BTREE_NODE_UNLOCKED) { + six_unlock_type(&path->l[level].b->c.lock, lock_type); ++#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS ++ if (trans->lock_name_idx < BCH_LOCK_TIME_NR) { ++ struct bch_fs *c = trans->c; ++ ++ __bch2_time_stats_update(&c->lock_held_stats.times[trans->lock_name_idx], ++ path->l[level].lock_taken_time, ++ ktime_get_ns()); ++ } ++#endif ++ } + mark_btree_node_unlocked(path, level); +} + -+static inline void __bch2_btree_path_unlock(struct btree_path *path) ++static inline void __bch2_btree_path_unlock(struct btree_trans *trans, ++ struct btree_path *path) +{ + btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); + + while (path->nodes_locked) -+ btree_node_unlock(path, __ffs(path->nodes_locked)); ++ 
btree_node_unlock(trans, path, __ffs(path->nodes_locked)); +} + +static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) @@ -25406,7 +25745,7 @@ index 000000000..67c970d72 + } +} + -+static inline bool btree_node_lock_type(struct btree_trans *trans, ++static inline int btree_node_lock_type(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct bpos pos, unsigned level, @@ -25415,10 +25754,10 @@ index 000000000..67c970d72 +{ + struct bch_fs *c = trans->c; + u64 start_time; -+ bool ret; ++ int ret; + + if (six_trylock_type(&b->c.lock, type)) -+ return true; ++ return 0; + + start_time = local_clock(); + @@ -25428,13 +25767,14 @@ index 000000000..67c970d72 + trans->locking_level = level; + trans->locking_lock_type = type; + trans->locking = b; -+ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; ++ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p); + trans->locking = NULL; + + if (ret) -+ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); ++ return ret; + -+ return ret; ++ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); ++ return 0; +} + +/* @@ -25457,26 +25797,34 @@ index 000000000..67c970d72 + return false; +} + -+bool __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, -+ struct btree *, struct bpos, unsigned, -+ enum six_lock_type, -+ six_lock_should_sleep_fn, void *, -+ unsigned long); ++int __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, ++ struct btree *, struct bpos, unsigned, ++ enum six_lock_type, ++ six_lock_should_sleep_fn, void *, ++ unsigned long); + -+static inline bool btree_node_lock(struct btree_trans *trans, ++static inline int btree_node_lock(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, struct bpos pos, unsigned level, + enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ ++ int ret = 0; ++ + EBUG_ON(level >= BTREE_MAX_DEPTH); + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); + -+ return likely(six_trylock_type(&b->c.lock, type)) || -+ btree_node_lock_increment(trans, b, level, type) || -+ __bch2_btree_node_lock(trans, path, b, pos, level, type, -+ should_sleep_fn, p, ip); ++ if (likely(six_trylock_type(&b->c.lock, type)) || ++ btree_node_lock_increment(trans, b, level, type) || ++ !(ret = __bch2_btree_node_lock(trans, path, b, pos, level, type, ++ should_sleep_fn, p, ip))) { ++#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS ++ path->l[b->c.level].lock_taken_time = ktime_get_ns(); ++#endif ++ } ++ ++ return ret; +} + +bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned); @@ -25529,14 +25877,12 @@ index 000000000..67c970d72 +} + +#endif /* _BCACHEFS_BTREE_LOCKING_H */ -+ -+ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h new file mode 100644 -index 000000000..1e4d1fecc +index 000000000000..a2826dfe13cb --- /dev/null +++ b/fs/bcachefs/btree_types.h -@@ -0,0 +1,687 @@ +@@ -0,0 +1,697 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_TYPES_H +#define _BCACHEFS_BTREE_TYPES_H @@ -25790,6 +26136,9 @@ index 000000000..1e4d1fecc + struct btree *b; + struct btree_node_iter iter; + u32 lock_seq; ++#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS ++ u64 lock_taken_time; ++#endif + } l[BTREE_MAX_DEPTH]; +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned long ip_allocated; @@ -25923,10 +26272,13 @@ index 000000000..1e4d1fecc + +#define BTREE_TRANS_MEM_MAX (1U << 16) + ++#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000 ++ 
+struct btree_trans { + struct bch_fs *c; + const char *fn; + struct list_head list; ++ u64 last_begin_time; + struct btree *locking; + unsigned locking_path_idx; + struct bpos locking_pos; @@ -25941,9 +26293,12 @@ index 000000000..1e4d1fecc + u8 traverse_all_idx; + bool used_mempool:1; + bool in_traverse_all:1; -+ bool restarted:1; + bool memory_allocation_failure:1; + bool is_initial_gc:1; ++ enum bch_errcode restarted:16; ++ u32 restart_count; ++ unsigned long last_restarted_ip; ++ + /* + * For when bch2_trans_update notices we'll be splitting a compressed + * extent: @@ -25973,6 +26328,7 @@ index 000000000..1e4d1fecc + unsigned journal_u64s; + unsigned journal_preres_u64s; + struct replicas_delta_list *fs_usage_deltas; ++ int lock_name_idx; +}; + +#define BTREE_FLAGS() \ @@ -26226,10 +26582,10 @@ index 000000000..1e4d1fecc +#endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h new file mode 100644 -index 000000000..28f958577 +index 000000000000..89941fb8caa0 --- /dev/null +++ b/fs/bcachefs/btree_update.h -@@ -0,0 +1,156 @@ +@@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_H +#define _BCACHEFS_BTREE_UPDATE_H @@ -26322,7 +26678,6 @@ index 000000000..28f958577 + * This is main entry point for btree updates. + * + * Return values: -+ * -EINTR: locking changed, this function should be called again. + * -EROFS: filesystem read only + * -EIO: journal or btree node IO error + */ @@ -26338,30 +26693,33 @@ index 000000000..28f958577 + return __bch2_trans_commit(trans); +} + -+#define lockrestart_do(_trans, _do) \ -+({ \ -+ int _ret; \ -+ \ -+ do { \ -+ bch2_trans_begin(_trans); \ -+ _ret = (_do); \ -+ } while (_ret == -EINTR); \ -+ \ -+ _ret; \ -+}) -+ -+#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ ++#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ + lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_flags))) + ++#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ ++ nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ ++ (_journal_seq), (_flags))) ++ +#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ +({ \ + struct btree_trans trans; \ + int _ret; \ + \ + bch2_trans_init(&trans, (_c), 0, 0); \ -+ _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ -+ _do); \ ++ _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \ ++ bch2_trans_exit(&trans); \ ++ \ ++ _ret; \ ++}) ++ ++#define bch2_trans_run(_c, _do) \ ++({ \ ++ struct btree_trans trans; \ ++ int _ret; \ ++ \ ++ bch2_trans_init(&trans, (_c), 0, 0); \ ++ _ret = (_do); \ + bch2_trans_exit(&trans); \ + \ + _ret; \ @@ -26388,10 +26746,10 @@ index 000000000..28f958577 +#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c new file mode 100644 -index 000000000..965fdfbfa +index 000000000000..5525635ec04a --- /dev/null +++ b/fs/bcachefs/btree_update_interior.c -@@ -0,0 +1,2253 @@ +@@ -0,0 +1,2266 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -26572,12 +26930,13 @@ index 000000000..965fdfbfa + six_unlock_intent(&b->c.lock); +} + -+static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, ++static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, + struct disk_reservation *res, + struct closure *cl, + bool interior_node, + unsigned flags) +{ ++ struct bch_fs *c = trans->c; + struct 
write_point *wp; + struct btree *b; + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; @@ -26607,7 +26966,7 @@ index 000000000..965fdfbfa + mutex_unlock(&c->btree_reserve_cache_lock); + +retry: -+ wp = bch2_alloc_sectors_start(c, ++ wp = bch2_alloc_sectors_start_trans(trans, + c->opts.metadata_target ?: + c->opts.foreground_target, + 0, @@ -26806,18 +27165,16 @@ index 000000000..965fdfbfa + } +} + -+static int bch2_btree_reserve_get(struct btree_update *as, ++static int bch2_btree_reserve_get(struct btree_trans *trans, ++ struct btree_update *as, + unsigned nr_nodes[2], -+ unsigned flags) ++ unsigned flags, ++ struct closure *cl) +{ + struct bch_fs *c = as->c; -+ struct closure cl; + struct btree *b; + unsigned interior; -+ int ret; -+ -+ closure_init_stack(&cl); -+retry: ++ int ret = 0; + + BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); + @@ -26828,18 +27185,17 @@ index 000000000..965fdfbfa + * BTREE_INSERT_NOWAIT only applies to btree node allocation, not + * blocking on this lock: + */ -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ ret = bch2_btree_cache_cannibalize_lock(c, cl); + if (ret) -+ goto err; ++ return ret; + + for (interior = 0; interior < 2; interior++) { + struct prealloc_nodes *p = as->prealloc_nodes + interior; + + while (p->nr < nr_nodes[interior]) { -+ b = __bch2_btree_node_alloc(c, &as->disk_res, -+ flags & BTREE_INSERT_NOWAIT -+ ? NULL : &cl, -+ interior, flags); ++ b = __bch2_btree_node_alloc(trans, &as->disk_res, ++ flags & BTREE_INSERT_NOWAIT ? NULL : cl, ++ interior, flags); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto err; @@ -26848,18 +27204,8 @@ index 000000000..965fdfbfa + p->b[p->nr++] = b; + } + } -+ -+ bch2_btree_cache_cannibalize_unlock(c); -+ closure_sync(&cl); -+ return 0; +err: + bch2_btree_cache_cannibalize_unlock(c); -+ closure_sync(&cl); -+ -+ if (ret == -EAGAIN) -+ goto retry; -+ -+ trace_btree_reserve_get_fail(c, nr_nodes[0] + nr_nodes[1], &cl); + return ret; +} + @@ -27004,7 +27350,7 @@ index 000000000..965fdfbfa + * which may require allocations as well. 
+ */ + bch2_trans_init(&trans, c, 0, 512); -+ ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq, ++ ret = commit_do(&trans, &as->disk_res, &journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM| @@ -27374,6 +27720,7 @@ index 000000000..965fdfbfa + unsigned update_level = level; + int journal_flags = flags & JOURNAL_WATERMARK_MASK; + int ret = 0; ++ u32 restart_count = trans->restart_count; + + BUG_ON(!path->should_be_locked); + @@ -27401,7 +27748,7 @@ index 000000000..965fdfbfa + if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { + trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, + path->btree_id, &path->pos); -+ ret = btree_trans_restart(trans); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + return ERR_PTR(ret); + } + @@ -27410,9 +27757,10 @@ index 000000000..965fdfbfa + else if (!down_read_trylock(&c->gc_lock)) { + bch2_trans_unlock(trans); + down_read(&c->gc_lock); -+ if (!bch2_trans_relock(trans)) { ++ ret = bch2_trans_relock(trans); ++ if (ret) { + up_read(&c->gc_lock); -+ return ERR_PTR(-EINTR); ++ return ERR_PTR(ret); + } + } + @@ -27447,16 +27795,24 @@ index 000000000..965fdfbfa + if (ret) + goto err; + -+ bch2_trans_unlock(trans); -+ + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + BTREE_UPDATE_JOURNAL_RES, -+ journal_flags); ++ journal_flags|JOURNAL_RES_GET_NONBLOCK); + if (ret) { -+ bch2_btree_update_free(as); -+ trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); -+ btree_trans_restart(trans); -+ return ERR_PTR(ret); ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, ++ BTREE_UPDATE_JOURNAL_RES, ++ journal_flags); ++ if (ret) { ++ trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); ++ goto err; ++ } ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ goto err; + } + + ret = bch2_disk_reservation_get(c, &as->disk_res, @@ -27466,15 +27822,31 @@ index 000000000..965fdfbfa + if (ret) + goto err; + -+ ret = bch2_btree_reserve_get(as, nr_nodes, flags); ++ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); ++ if (ret && ret != -EINTR) { ++ struct closure cl; ++ ++ closure_init_stack(&cl); ++ ++ bch2_trans_unlock(trans); ++ ++ do { ++ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); ++ closure_sync(&cl); ++ } while (ret == -EAGAIN); ++ ++ if (ret) { ++ trace_btree_reserve_get_fail(trans->fn, _RET_IP_, ++ nr_nodes[0] + nr_nodes[1]); ++ goto err; ++ } ++ } ++ ++ ret = bch2_trans_relock(trans); + if (ret) + goto err; + -+ if (!bch2_trans_relock(trans)) { -+ ret = -EINTR; -+ goto err; -+ } -+ ++ bch2_trans_verify_not_restarted(trans, restart_count); + return as; +err: + bch2_btree_update_free(as); @@ -28224,7 +28596,7 @@ index 000000000..965fdfbfa + + bch2_btree_update_done(as); +out: -+ bch2_btree_path_downgrade(iter->path); ++ bch2_btree_path_downgrade(trans, iter->path); + return ret; +} + @@ -28337,7 +28709,7 @@ index 000000000..965fdfbfa + BUG_ON(iter2.path->level != b->c.level); + BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); + -+ btree_node_unlock(iter2.path, iter2.path->level); ++ btree_node_unlock(trans, iter2.path, iter2.path->level); + path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; + iter2.path->level++; + btree_path_set_dirty(iter2.path, BTREE_ITER_NEED_TRAVERSE); @@ -28411,10 +28783,8 @@ index 000000000..965fdfbfa + int ret = 0; + + if (!btree_node_intent_locked(path, b->c.level) && -+ 
!bch2_btree_path_upgrade(trans, path, b->c.level + 1)) { -+ btree_trans_restart(trans); -+ return -EINTR; -+ } ++ !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + + closure_init_stack(&cl); + @@ -28427,8 +28797,9 @@ index 000000000..965fdfbfa + if (ret) { + bch2_trans_unlock(trans); + closure_sync(&cl); -+ if (!bch2_trans_relock(trans)) -+ return -EINTR; ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ return ret; + } + + new_hash = bch2_btree_node_mem_alloc(c, false); @@ -28647,7 +29018,7 @@ index 000000000..965fdfbfa +} diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h new file mode 100644 -index 000000000..adfc6c24a +index 000000000000..adfc6c24a7a4 --- /dev/null +++ b/fs/bcachefs/btree_update_interior.h @@ -0,0 +1,321 @@ @@ -28974,10 +29345,10 @@ index 000000000..adfc6c24a +#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c new file mode 100644 -index 000000000..aed26b579 +index 000000000000..e2ecbd3bca77 --- /dev/null +++ b/fs/bcachefs/btree_update_leaf.c -@@ -0,0 +1,1815 @@ +@@ -0,0 +1,1800 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -28990,6 +29361,7 @@ index 000000000..aed26b579 +#include "btree_locking.h" +#include "buckets.h" +#include "debug.h" ++#include "errcode.h" +#include "error.h" +#include "extent_update.h" +#include "journal.h" @@ -29262,9 +29634,10 @@ index 000000000..aed26b579 + if (ret) + return ret; + -+ if (!bch2_trans_relock(trans)) { ++ ret = bch2_trans_relock(trans); ++ if (ret) { + trace_trans_restart_journal_preres_get(trans->fn, trace_ip); -+ return -EINTR; ++ return ret; + } + + return 0; @@ -29356,12 +29729,7 @@ index 000000000..aed26b579 + trace_trans_restart_key_cache_key_realloced(trans->fn, _RET_IP_, + path->btree_id, &path->pos, + old_u64s, new_u64s); -+ /* -+ * Not using btree_trans_restart() because we can't unlock here, we have -+ * write locks held: -+ */ -+ trans->restarted = true; -+ return -EINTR; ++ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_key_cache_realloced); +} + +/* Triggers: */ @@ -29553,8 +29921,7 @@ index 000000000..aed26b579 + + if (race_fault()) { + trace_trans_restart_fault_inject(trans->fn, trace_ip); -+ trans->restarted = true; -+ return -EINTR; ++ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); + } + + /* @@ -29786,6 +30153,7 @@ index 000000000..aed26b579 +static inline int trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; ++ int ret; + + trans_for_each_update(trans, i) { + if (same_leaf_as_prev(trans, i)) @@ -29795,10 +30163,11 @@ index 000000000..aed26b579 + if (have_conflicting_read_lock(trans, i->path)) + goto fail; + -+ btree_node_lock_type(trans, i->path, ++ ret = btree_node_lock_type(trans, i->path, + insert_l(i)->b, + i->path->pos, i->level, + SIX_LOCK_write, NULL, NULL); ++ BUG_ON(ret); + } + + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); @@ -29814,7 +30183,7 @@ index 000000000..aed26b579 + } + + trace_trans_restart_would_deadlock_write(trans->fn); -+ return btree_trans_restart(trans); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); +} + +static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) @@ -29945,10 +30314,7 @@ index 000000000..aed26b579 + switch (ret) { + case BTREE_INSERT_BTREE_NODE_FULL: + ret = 
bch2_btree_split_leaf(trans, i->path, trans->flags); -+ if (!ret) -+ return 0; -+ -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + trace_trans_restart_btree_node_split(trans->fn, trace_ip, + i->btree_id, &i->path->pos); + break; @@ -29959,19 +30325,16 @@ index 000000000..aed26b579 + if (ret) + break; + -+ if (bch2_trans_relock(trans)) -+ return 0; -+ -+ trace_trans_restart_mark_replicas(trans->fn, trace_ip); -+ ret = -EINTR; ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ trace_trans_restart_mark_replicas(trans->fn, trace_ip); + break; + case BTREE_INSERT_NEED_JOURNAL_RES: + bch2_trans_unlock(trans); + + if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && + !(trans->flags & JOURNAL_WATERMARK_reserved)) { -+ trans->restarted = true; -+ ret = -EAGAIN; ++ ret = -BCH_ERR_journal_reclaim_would_deadlock; + break; + } + @@ -29979,11 +30342,9 @@ index 000000000..aed26b579 + if (ret) + break; + -+ if (bch2_trans_relock(trans)) -+ return 0; -+ -+ trace_trans_restart_journal_res_get(trans->fn, trace_ip); -+ ret = -EINTR; ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ trace_trans_restart_journal_res_get(trans->fn, trace_ip); + break; + case BTREE_INSERT_NEED_JOURNAL_RECLAIM: + bch2_trans_unlock(trans); @@ -29995,18 +30356,16 @@ index 000000000..aed26b579 + if (ret < 0) + break; + -+ if (bch2_trans_relock(trans)) -+ return 0; -+ -+ trace_trans_restart_journal_reclaim(trans->fn, trace_ip); -+ ret = -EINTR; ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ trace_trans_restart_journal_reclaim(trans->fn, trace_ip); + break; + default: + BUG_ON(ret >= 0); + break; + } + -+ BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted); ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); + BUG_ON(ret == -ENOSPC && + !(trans->flags & BTREE_INSERT_NOWAIT) && + (trans->flags & BTREE_INSERT_NOFAIL)); @@ -30026,13 +30385,11 @@ index 000000000..aed26b579 + + bch2_trans_unlock(trans); + -+ ret = bch2_fs_read_write_early(c); ++ ret = bch2_fs_read_write_early(c) ?: ++ bch2_trans_relock(trans); + if (ret) + return ret; + -+ if (!bch2_trans_relock(trans)) -+ return -EINTR; -+ + percpu_ref_get(&c->writes); + return 0; +} @@ -30104,7 +30461,7 @@ index 000000000..aed26b579 + if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { + trace_trans_restart_upgrade(trans->fn, _RET_IP_, + i->btree_id, &i->path->pos); -+ ret = btree_trans_restart(trans); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + goto out; + } + @@ -30614,8 +30971,7 @@ index 000000000..aed26b579 + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_); -+ btree_trans_restart(trans); -+ return -EINTR; ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); + } + + iter->key_cache_path->should_be_locked = true; @@ -30743,7 +31099,7 @@ index 000000000..aed26b579 + break; + } + -+ if (ret == -EINTR) { ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + goto retry; + } @@ -30795,10 +31151,10 @@ index 000000000..aed26b579 +} diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c new file mode 100644 -index 000000000..1ea7e2baf +index 000000000000..b4be2122c2d5 --- /dev/null +++ b/fs/bcachefs/buckets.c -@@ -0,0 +1,2114 @@ +@@ -0,0 +1,2113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for manipulating bucket marks for garbage collection. 
@@ -31345,22 +31701,6 @@ index 000000000..1ea7e2baf + } + } + -+ if (new_a.data_type == BCH_DATA_free && -+ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) -+ closure_wake_up(&c->freelist_wait); -+ -+ if (new_a.data_type == BCH_DATA_need_discard && -+ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) -+ bch2_do_discards(c); -+ -+ if (old_a.data_type != BCH_DATA_cached && -+ new_a.data_type == BCH_DATA_cached && -+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) -+ bch2_do_invalidates(c); -+ -+ if (new_a.data_type == BCH_DATA_need_gc_gens) -+ bch2_do_gc_gens(c); -+ + percpu_down_read(&c->mark_lock); + if (!gc && new_a.gen != old_a.gen) + *bucket_gen(ca, new.k->p.offset) = new_a.gen; @@ -31400,6 +31740,22 @@ index 000000000..1ea7e2baf + } + } + ++ if (new_a.data_type == BCH_DATA_free && ++ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) ++ closure_wake_up(&c->freelist_wait); ++ ++ if (new_a.data_type == BCH_DATA_need_discard && ++ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) ++ bch2_do_discards(c); ++ ++ if (old_a.data_type != BCH_DATA_cached && ++ new_a.data_type == BCH_DATA_cached && ++ should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) ++ bch2_do_invalidates(c); ++ ++ if (new_a.data_type == BCH_DATA_need_gc_gens) ++ bch2_do_gc_gens(c); ++ + return 0; +} + @@ -32662,7 +33018,7 @@ index 000000000..1ea7e2baf + enum bch_data_type type, + unsigned sectors) +{ -+ return __bch2_trans_do(trans, NULL, NULL, 0, ++ return commit_do(trans, NULL, NULL, 0, + __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); +} + @@ -32740,8 +33096,7 @@ index 000000000..1ea7e2baf + +int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) +{ -+ return bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, -+ __bch2_trans_mark_dev_sb(&trans, ca)); ++ return bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); +} + +/* Disk reservations: */ @@ -32915,7 +33270,7 @@ index 000000000..1ea7e2baf +} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h new file mode 100644 -index 000000000..6881502d9 +index 000000000000..6881502d95f1 --- /dev/null +++ b/fs/bcachefs/buckets.h @@ -0,0 +1,300 @@ @@ -33221,7 +33576,7 @@ index 000000000..6881502d9 +#endif /* _BUCKETS_H */ diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h new file mode 100644 -index 000000000..1dbba7d90 +index 000000000000..1dbba7d906dd --- /dev/null +++ b/fs/bcachefs/buckets_types.h @@ -0,0 +1,103 @@ @@ -33330,7 +33685,7 @@ index 000000000..1dbba7d90 +#endif /* _BUCKETS_TYPES_H */ diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c new file mode 100644 -index 000000000..2e5b95508 +index 000000000000..2e5b955080de --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -0,0 +1,167 @@ @@ -33503,7 +33858,7 @@ index 000000000..2e5b95508 +} diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h new file mode 100644 -index 000000000..d2ae19cbe +index 000000000000..d2ae19cbe18c --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal.h @@ -0,0 +1,15 @@ @@ -33524,7 +33879,7 @@ index 000000000..d2ae19cbe +#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */ diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h new file mode 100644 -index 000000000..fea7f944d +index 000000000000..fea7f944d0ed --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal_types.h 
@@ -0,0 +1,23 @@ @@ -33553,7 +33908,7 @@ index 000000000..fea7f944d +#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c new file mode 100644 -index 000000000..dbb7e5e0b +index 000000000000..dbb7e5e0b35b --- /dev/null +++ b/fs/bcachefs/chardev.c @@ -0,0 +1,760 @@ @@ -34319,7 +34674,7 @@ index 000000000..dbb7e5e0b +#endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h new file mode 100644 -index 000000000..3a4890d39 +index 000000000000..3a4890d39ff9 --- /dev/null +++ b/fs/bcachefs/chardev.h @@ -0,0 +1,31 @@ @@ -34356,13 +34711,14 @@ index 000000000..3a4890d39 +#endif /* _BCACHEFS_CHARDEV_H */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c new file mode 100644 -index 000000000..7c2af6754 +index 000000000000..b5850a761b91 --- /dev/null +++ b/fs/bcachefs/checksum.c -@@ -0,0 +1,707 @@ +@@ -0,0 +1,712 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" ++#include "errcode.h" +#include "super.h" +#include "super-io.h" + @@ -34889,7 +35245,7 @@ index 000000000..7c2af6754 + + ret = bch2_request_key(c->disk_sb.sb, &user_key); + if (ret) { -+ bch_err(c, "error requesting encryption key: %i", ret); ++ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); + goto err; + } + @@ -34914,20 +35270,24 @@ index 000000000..7c2af6754 + +static int bch2_alloc_ciphers(struct bch_fs *c) +{ ++ int ret; ++ + if (!c->chacha20) + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); -+ if (IS_ERR(c->chacha20)) { -+ bch_err(c, "error requesting chacha20 module: %li", -+ PTR_ERR(c->chacha20)); -+ return PTR_ERR(c->chacha20); ++ ret = PTR_ERR_OR_ZERO(c->chacha20); ++ ++ if (ret) { ++ bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret)); ++ return ret; + } + + if (!c->poly1305) + c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); -+ if (IS_ERR(c->poly1305)) { -+ bch_err(c, "error requesting poly1305 module: %li", -+ PTR_ERR(c->poly1305)); -+ return PTR_ERR(c->poly1305); ++ ret = PTR_ERR_OR_ZERO(c->poly1305); ++ ++ if (ret) { ++ bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret)); ++ return ret; + } + + return 0; @@ -34988,7 +35348,7 @@ index 000000000..7c2af6754 + if (keyed) { + ret = bch2_request_key(c->disk_sb.sb, &user_key); + if (ret) { -+ bch_err(c, "error requesting encryption key: %i", ret); ++ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); + goto err; + } + @@ -35040,9 +35400,9 @@ index 000000000..7c2af6754 + pr_verbose_init(c->opts, ""); + + c->sha256 = crypto_alloc_shash("sha256", 0, 0); -+ if (IS_ERR(c->sha256)) { -+ bch_err(c, "error requesting sha256 module"); -+ ret = PTR_ERR(c->sha256); ++ ret = PTR_ERR_OR_ZERO(c->sha256); ++ if (ret) { ++ bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); + goto out; + } + @@ -35069,7 +35429,7 @@ index 000000000..7c2af6754 +} diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h new file mode 100644 -index 000000000..c86c3c05d +index 000000000000..c86c3c05d620 --- /dev/null +++ b/fs/bcachefs/checksum.h @@ -0,0 +1,204 @@ @@ -35279,7 +35639,7 @@ index 000000000..c86c3c05d +#endif /* _BCACHEFS_CHECKSUM_H */ diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c new file mode 100644 -index 000000000..f3ffdbc38 +index 000000000000..f3ffdbc38485 --- /dev/null +++ b/fs/bcachefs/clock.c @@ -0,0 +1,191 @@ @@ -35476,7 +35836,7 @@ index 000000000..f3ffdbc38 +} diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h new file mode 100644 
-index 000000000..70a0f7436 +index 000000000000..70a0f7436c84 --- /dev/null +++ b/fs/bcachefs/clock.h @@ -0,0 +1,38 @@ @@ -35520,7 +35880,7 @@ index 000000000..70a0f7436 +#endif /* _BCACHEFS_CLOCK_H */ diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h new file mode 100644 -index 000000000..5fae0012d +index 000000000000..5fae0012d808 --- /dev/null +++ b/fs/bcachefs/clock_types.h @@ -0,0 +1,37 @@ @@ -35563,7 +35923,7 @@ index 000000000..5fae0012d +#endif /* _BCACHEFS_CLOCK_TYPES_H */ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c new file mode 100644 -index 000000000..f692f35a6 +index 000000000000..f692f35a6a98 --- /dev/null +++ b/fs/bcachefs/compress.c @@ -0,0 +1,639 @@ @@ -36208,7 +36568,7 @@ index 000000000..f692f35a6 +} diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h new file mode 100644 -index 000000000..4bab1f61b +index 000000000000..4bab1f61b3b5 --- /dev/null +++ b/fs/bcachefs/compress.h @@ -0,0 +1,18 @@ @@ -36232,7 +36592,7 @@ index 000000000..4bab1f61b +#endif /* _BCACHEFS_COMPRESS_H */ diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c new file mode 100644 -index 000000000..745f856e6 +index 000000000000..745f856e6d3e --- /dev/null +++ b/fs/bcachefs/counters.c @@ -0,0 +1,107 @@ @@ -36345,7 +36705,7 @@ index 000000000..745f856e6 +}; diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/counters.h new file mode 100644 -index 000000000..4778aa19b +index 000000000000..4778aa19bf34 --- /dev/null +++ b/fs/bcachefs/counters.h @@ -0,0 +1,17 @@ @@ -36368,7 +36728,7 @@ index 000000000..4778aa19b +#endif // _BCACHEFS_COUNTERS_H diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h new file mode 100644 -index 000000000..519ab9b96 +index 000000000000..519ab9b96e67 --- /dev/null +++ b/fs/bcachefs/darray.h @@ -0,0 +1,77 @@ @@ -36451,10 +36811,10 @@ index 000000000..519ab9b96 +#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c new file mode 100644 -index 000000000..cc9ae6dad +index 000000000000..3b442b01ca86 --- /dev/null +++ b/fs/bcachefs/data_update.c -@@ -0,0 +1,379 @@ +@@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -36480,13 +36840,13 @@ index 000000000..cc9ae6dad + struct bch_fs *c = trans->c; + struct btree_iter iter, update_iter; + struct bkey_s_c k; -+ struct snapshots_seen s; ++ snapshot_id_list s; + int ret; + + if (!btree_type_has_snapshots(id)) + return 0; + -+ snapshots_seen_init(&s); ++ darray_init(&s); + + if (!bkey_cmp(old_pos, new_pos)) + return 0; @@ -36498,7 +36858,6 @@ index 000000000..cc9ae6dad + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { -+next: + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) @@ -36509,11 +36868,9 @@ index 000000000..cc9ae6dad + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { + struct bkey_i *update; -+ u32 *i; + -+ darray_for_each(s.ids, i) -+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, *i)) -+ goto next; ++ if (snapshot_list_has_ancestor(c, &s, k.k->p.snapshot)) ++ continue; + + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + @@ -36536,13 +36893,13 @@ index 000000000..cc9ae6dad + if (ret) + break; + -+ ret = snapshots_seen_add(c, &s, k.k->p.snapshot); ++ ret = snapshot_list_add(c, &s, k.k->p.snapshot); + if (ret) + break; + } + } + bch2_trans_iter_exit(trans, &iter); -+ darray_exit(&s.ids); ++ darray_exit(&s); + + return ret; +} @@ -36696,7 +37053,7 @@ index 000000000..cc9ae6dad + bch2_ob_add_backpointer(c, ec_ob, &insert->k); + } +err: 
-+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + if (ret) + break; @@ -36732,7 +37089,7 @@ index 000000000..cc9ae6dad + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&_insert, c); + bch2_bkey_buf_exit(&_new, c); -+ BUG_ON(ret == -EINTR); ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + return ret; +} + @@ -36836,7 +37193,7 @@ index 000000000..cc9ae6dad +} diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h new file mode 100644 -index 000000000..e64505453 +index 000000000000..e64505453a55 --- /dev/null +++ b/fs/bcachefs/data_update.h @@ -0,0 +1,38 @@ @@ -36880,10 +37237,10 @@ index 000000000..e64505453 +#endif /* _BCACHEFS_DATA_UPDATE_H */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c new file mode 100644 -index 000000000..05cae0ed4 +index 000000000000..cd37a1016e25 --- /dev/null +++ b/fs/bcachefs/debug.c -@@ -0,0 +1,707 @@ +@@ -0,0 +1,764 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Assorted bcachefs debug code @@ -37075,6 +37432,7 @@ index 000000000..05cae0ed4 + struct bch_fs *c; + enum btree_id id; + struct bpos from; ++ struct bpos prev_node; + u64 iter; + + struct printbuf buf; @@ -37144,39 +37502,30 @@ index 000000000..05cae0ed4 + i->size = size; + i->ret = 0; + -+ err = flush_buf(i); -+ if (err) -+ return err; -+ -+ if (!i->size) -+ return i->ret; -+ + bch2_trans_init(&trans, i->c, 0, 0); + -+ bch2_trans_iter_init(&trans, &iter, i->id, i->from, -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ k = bch2_btree_iter_peek(&iter); -+ -+ while (k.k && !(err = bkey_err(k))) { -+ bch2_bkey_val_to_text(&i->buf, i->c, k); -+ prt_char(&i->buf, '\n'); -+ -+ k = bch2_btree_iter_next(&iter); -+ i->from = iter.pos; -+ ++ err = for_each_btree_key2(&trans, iter, i->id, i->from, ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ({ + err = flush_buf(i); + if (err) + break; + + if (!i->size) + break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_bkey_val_to_text(&i->buf, i->c, k); ++ prt_newline(&i->buf); ++ 0; ++ })); ++ i->from = iter.pos; ++ ++ if (!err) ++ err = flush_buf(i); + + bch2_trans_exit(&trans); + -+ return err < 0 ? 
err : i->ret; ++ return err ?: i->ret; +} + +static const struct file_operations btree_debug_ops = { @@ -37246,7 +37595,6 @@ index 000000000..05cae0ed4 + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ struct btree *prev_node = NULL; + int err; + + i->ubuf = buf; @@ -37262,44 +37610,36 @@ index 000000000..05cae0ed4 + + bch2_trans_init(&trans, i->c, 0, 0); + -+ bch2_trans_iter_init(&trans, &iter, i->id, i->from, -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ -+ while ((k = bch2_btree_iter_peek(&iter)).k && -+ !(err = bkey_err(k))) { ++ err = for_each_btree_key2(&trans, iter, i->id, i->from, ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ({ + struct btree_path_level *l = &iter.path->l[0]; + struct bkey_packed *_k = + bch2_btree_node_iter_peek(&l->iter, l->b); + -+ if (l->b != prev_node) { -+ bch2_btree_node_to_text(&i->buf, i->c, l->b); -+ err = flush_buf(i); -+ if (err) -+ break; -+ } -+ prev_node = l->b; -+ -+ bch2_bfloat_to_text(&i->buf, l->b, _k); -+ err = flush_buf(i); -+ if (err) -+ break; -+ -+ bch2_btree_iter_advance(&iter); -+ i->from = iter.pos; -+ + err = flush_buf(i); + if (err) + break; + + if (!i->size) + break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (bpos_cmp(l->b->key.k.p, i->prev_node) > 0) { ++ bch2_btree_node_to_text(&i->buf, i->c, l->b); ++ i->prev_node = l->b->key.k.p; ++ } ++ ++ bch2_bfloat_to_text(&i->buf, l->b, _k); ++ 0; ++ })); ++ i->from = iter.pos; ++ ++ if (!err) ++ err = flush_buf(i); + + bch2_trans_exit(&trans); + -+ return err < 0 ? err : i->ret; ++ return err ?: i->ret; +} + +static const struct file_operations bfloat_failed_debug_ops = { @@ -37522,6 +37862,75 @@ index 000000000..05cae0ed4 + .read = bch2_journal_pins_read, +}; + ++static int lock_held_stats_open(struct inode *inode, struct file *file) ++{ ++ struct bch_fs *c = inode->i_private; ++ struct dump_iter *i; ++ ++ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); ++ ++ if (!i) ++ return -ENOMEM; ++ ++ i->iter = 0; ++ i->c = c; ++ i->buf = PRINTBUF; ++ file->private_data = i; ++ ++ return 0; ++} ++ ++static int lock_held_stats_release(struct inode *inode, struct file *file) ++{ ++ struct dump_iter *i = file->private_data; ++ ++ printbuf_exit(&i->buf); ++ kfree(i); ++ ++ return 0; ++} ++ ++static ssize_t lock_held_stats_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct lock_held_stats *lhs = &i->c->lock_held_stats; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ while (lhs->names[i->iter] != 0 && i->iter < BCH_LOCK_TIME_NR) { ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ break; ++ ++ prt_printf(&i->buf, "%s:", lhs->names[i->iter]); ++ prt_newline(&i->buf); ++ printbuf_indent_add(&i->buf, 8); ++ bch2_time_stats_to_text(&i->buf, &lhs->times[i->iter]); ++ printbuf_indent_sub(&i->buf, 8); ++ prt_newline(&i->buf); ++ i->iter++; ++ } ++ ++ if (i->buf.allocation_failure) ++ return -ENOMEM; ++ ++ return i->ret; ++} ++ ++static const struct file_operations lock_held_stats_op = { ++ .owner = THIS_MODULE, ++ .open = lock_held_stats_open, ++ .release = lock_held_stats_release, ++ .read = lock_held_stats_read, ++}; ++ +void bch2_fs_debug_exit(struct bch_fs *c) +{ + if (!IS_ERR_OR_NULL(c->fs_debug_dir)) @@ -37550,6 +37959,11 @@ index 000000000..05cae0ed4 + debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, + c->btree_debug, &journal_pins_ops); + ++ if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { ++ 
debugfs_create_file("lock_held_stats", 0400, c->fs_debug_dir, ++ c, &lock_held_stats_op); ++ } ++ + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); + if (IS_ERR_OR_NULL(c->btree_debug_dir)) + return; @@ -37593,7 +38007,7 @@ index 000000000..05cae0ed4 +} diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h new file mode 100644 -index 000000000..0b86736e5 +index 000000000000..0b86736e5e1b --- /dev/null +++ b/fs/bcachefs/debug.h @@ -0,0 +1,30 @@ @@ -37629,7 +38043,7 @@ index 000000000..0b86736e5 +#endif /* _BCACHEFS_DEBUG_H */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c new file mode 100644 -index 000000000..0cbb765cd +index 000000000000..4d942d224a08 --- /dev/null +++ b/fs/bcachefs/dirent.c @@ -0,0 +1,565 @@ @@ -38106,7 +38520,7 @@ index 000000000..0cbb765cd + + ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, + name, inum, 0); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + if (!ret) + bch2_trans_iter_exit(&trans, &iter); @@ -38191,7 +38605,7 @@ index 000000000..0cbb765cd + } + bch2_trans_iter_exit(&trans, &iter); +err: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); @@ -38200,7 +38614,7 @@ index 000000000..0cbb765cd +} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h new file mode 100644 -index 000000000..b1466932c +index 000000000000..b1466932c768 --- /dev/null +++ b/fs/bcachefs/dirent.h @@ -0,0 +1,67 @@ @@ -38273,7 +38687,7 @@ index 000000000..b1466932c +#endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c new file mode 100644 -index 000000000..7bd441367 +index 000000000000..7bd4413671d2 --- /dev/null +++ b/fs/bcachefs/disk_groups.c @@ -0,0 +1,506 @@ @@ -38785,7 +39199,7 @@ index 000000000..7bd441367 +} diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h new file mode 100644 -index 000000000..de9154805 +index 000000000000..de915480514b --- /dev/null +++ b/fs/bcachefs/disk_groups.h @@ -0,0 +1,90 @@ @@ -38881,10 +39295,10 @@ index 000000000..de9154805 +#endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c new file mode 100644 -index 000000000..6ce352c52 +index 000000000000..f33acf1af110 --- /dev/null +++ b/fs/bcachefs/ec.c -@@ -0,0 +1,1695 @@ +@@ -0,0 +1,1673 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* erasure coding */ @@ -39459,18 +39873,14 @@ index 000000000..6ce352c52 + struct btree_iter *iter) +{ + size_t idx = iter->pos.offset; -+ int ret = 0; + + if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN)) -+ return ret; ++ return 0; + + bch2_trans_unlock(trans); -+ ret = -EINTR; + -+ if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL)) -+ return ret; -+ -+ return -ENOMEM; ++ return __ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL) ?: ++ bch2_trans_relock(trans); +} + +static ssize_t stripe_idx_to_delete(struct bch_fs *c) @@ -39613,7 +40023,7 @@ index 000000000..6ce352c52 + struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); + int ret; + -+ for_each_btree_key(trans, iter, BTREE_ID_stripes, start_pos, ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { + if (start_pos.offset) { @@ -39627,12 +40037,13 @@ index 000000000..6ce352c52 + } + + if (bkey_deleted(k.k)) -+ goto found_slot; ++ break; + } + -+ goto err; -+found_slot: -+ start_pos = iter.pos; ++ c->ec_stripe_hint = 
iter.pos.offset; ++ ++ if (ret) ++ goto err; + + ret = ec_stripe_mem_alloc(trans, &iter); + if (ret) @@ -39641,8 +40052,6 @@ index 000000000..6ce352c52 + stripe->k.p = iter.pos; + + ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0); -+ -+ c->ec_stripe_hint = start_pos.offset; +err: + bch2_trans_iter_exit(trans, &iter); + @@ -39709,80 +40118,62 @@ index 000000000..6ce352c52 + }; +} + -+static int ec_stripe_update_ptrs(struct bch_fs *c, ++static int ec_stripe_update_extent(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct ec_stripe_buf *s, ++ struct bpos end) ++{ ++ const struct bch_extent_ptr *ptr_c; ++ struct bch_extent_ptr *ptr, *ec_ptr = NULL; ++ struct bkey_i *n; ++ int ret, dev, block; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ return 1; ++ ++ if (extent_has_stripe_ptr(k, s->key.k.p.offset)) ++ return 0; ++ ++ ptr_c = bkey_matches_stripe(&s->key.v, k, &block); ++ /* ++ * It doesn't generally make sense to erasure code cached ptrs: ++ * XXX: should we be incrementing a counter? ++ */ ++ if (!ptr_c || ptr_c->cached) ++ return 0; ++ ++ dev = s->key.v.ptrs[block].dev; ++ ++ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(n, k); ++ ++ bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); ++ ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev); ++ BUG_ON(!ec_ptr); ++ ++ extent_stripe_ptr_add(bkey_i_to_s_extent(n), s, ec_ptr, block); ++ ++ return bch2_trans_update(trans, iter, n, 0); ++} ++ ++static int ec_stripe_update_extents(struct bch_fs *c, + struct ec_stripe_buf *s, + struct bkey *pos) +{ -+ struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ struct bkey_s_extent e; -+ struct bkey_buf sk; -+ struct bpos next_pos; -+ int ret = 0, dev, block; + -+ bch2_bkey_buf_init(&sk); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ /* XXX this doesn't support the reflink btree */ -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, -+ bkey_start_pos(pos), -+ BTREE_ITER_INTENT); -+retry: -+ while (bch2_trans_begin(&trans), -+ (k = bch2_btree_iter_peek(&iter)).k && -+ !(ret = bkey_err(k)) && -+ bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { -+ const struct bch_extent_ptr *ptr_c; -+ struct bch_extent_ptr *ptr, *ec_ptr = NULL; -+ -+ if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { -+ bch2_btree_iter_advance(&iter); -+ continue; -+ } -+ -+ ptr_c = bkey_matches_stripe(&s->key.v, k, &block); -+ /* -+ * It doesn't generally make sense to erasure code cached ptrs: -+ * XXX: should we be incrementing a counter? 
-+ */ -+ if (!ptr_c || ptr_c->cached) { -+ bch2_btree_iter_advance(&iter); -+ continue; -+ } -+ -+ dev = s->key.v.ptrs[block].dev; -+ -+ bch2_bkey_buf_reassemble(&sk, c, k); -+ e = bkey_i_to_s_extent(sk.k); -+ -+ bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev); -+ ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev); -+ BUG_ON(!ec_ptr); -+ -+ extent_stripe_ptr_add(e, s, ec_ptr, block); -+ -+ bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k)); -+ next_pos = sk.k->k.p; -+ -+ ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(&trans, &iter, sk.k, 0) ?: -+ bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+ if (!ret) -+ bch2_btree_iter_set_pos(&iter, next_pos); -+ if (ret) -+ break; -+ } -+ if (ret == -EINTR) -+ goto retry; -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ bch2_trans_exit(&trans); -+ bch2_bkey_buf_exit(&sk, c); -+ -+ return ret; ++ return bch2_trans_run(c, ++ for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_extents, bkey_start_pos(pos), ++ BTREE_ITER_NOT_EXTENTS|BTREE_ITER_INTENT, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ ec_stripe_update_extent(&trans, &iter, k, s, pos->p))); +} + +/* @@ -39853,9 +40244,10 @@ index 000000000..6ce352c52 + } + + for_each_keylist_key(&s->keys, k) { -+ ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k); ++ ret = ec_stripe_update_extents(c, &s->new_stripe, &k->k); + if (ret) { -+ bch_err(c, "error creating stripe: error %i updating pointers", ret); ++ bch_err(c, "error creating stripe: error updating pointers: %s", ++ bch2_err_str(ret)); + break; + } + } @@ -40582,7 +40974,7 @@ index 000000000..6ce352c52 +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h new file mode 100644 -index 000000000..a4c13d61a +index 000000000000..a4c13d61af10 --- /dev/null +++ b/fs/bcachefs/ec.h @@ -0,0 +1,230 @@ @@ -40818,7 +41210,7 @@ index 000000000..a4c13d61a +#endif /* _BCACHEFS_EC_H */ diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h new file mode 100644 -index 000000000..edd93da66 +index 000000000000..edd93da663c1 --- /dev/null +++ b/fs/bcachefs/ec_types.h @@ -0,0 +1,46 @@ @@ -40868,30 +41260,139 @@ index 000000000..edd93da66 +typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; + +#endif /* _BCACHEFS_EC_TYPES_H */ +diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c +new file mode 100644 +index 000000000000..9da8a5973af0 +--- /dev/null ++++ b/fs/bcachefs/errcode.c +@@ -0,0 +1,51 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "errcode.h" ++ ++#include ++ ++static const char * const bch2_errcode_strs[] = { ++#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err, ++ BCH_ERRCODES() ++#undef x ++ NULL ++}; ++ ++#define BCH_ERR_0 0 ++ ++static unsigned bch2_errcode_parents[] = { ++#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = BCH_ERR_##class, ++ BCH_ERRCODES() ++#undef x ++}; ++ ++const char *bch2_err_str(int err) ++{ ++ const char *errstr; ++ err = abs(err); ++ ++ BUG_ON(err >= BCH_ERR_MAX); ++ ++ if (err >= BCH_ERR_START) ++ errstr = bch2_errcode_strs[err - BCH_ERR_START]; ++ else if (err) ++ errstr = errname(err); ++ else ++ errstr = "(No error)"; ++ return errstr ?: "(Invalid error)"; ++} ++ ++bool __bch2_err_matches(int err, int class) ++{ ++ err = abs(err); ++ class = abs(class); ++ ++ BUG_ON(err >= BCH_ERR_MAX); ++ BUG_ON(class >= BCH_ERR_MAX); ++ ++ while (err >= BCH_ERR_START && err != class) ++ err = bch2_errcode_parents[err - BCH_ERR_START]; ++ ++ return err == class; ++} diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h new file mode 100644 
-index 000000000..f7d12915c +index 000000000000..95925c8434b3 --- /dev/null +++ b/fs/bcachefs/errcode.h -@@ -0,0 +1,12 @@ +@@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERRCODE_H +#define _BCACHEFS_ERRCODE_H + -+enum { -+ /* Bucket allocator: */ -+ OPEN_BUCKETS_EMPTY = 2048, -+ FREELIST_EMPTY, /* Allocator thread not keeping up */ -+ INSUFFICIENT_DEVICES, ++#define BCH_ERRCODES() \ ++ x(0, open_buckets_empty) \ ++ x(0, freelist_empty) \ ++ x(freelist_empty, no_buckets_found) \ ++ x(0, insufficient_devices) \ ++ x(0, transaction_restart) \ ++ x(transaction_restart, transaction_restart_fault_inject) \ ++ x(transaction_restart, transaction_restart_relock) \ ++ x(transaction_restart, transaction_restart_relock_path) \ ++ x(transaction_restart, transaction_restart_relock_path_intent) \ ++ x(transaction_restart, transaction_restart_relock_after_fill) \ ++ x(transaction_restart, transaction_restart_too_many_iters) \ ++ x(transaction_restart, transaction_restart_lock_node_reused) \ ++ x(transaction_restart, transaction_restart_fill_relock) \ ++ x(transaction_restart, transaction_restart_fill_mem_alloc_fail)\ ++ x(transaction_restart, transaction_restart_mem_realloced) \ ++ x(transaction_restart, transaction_restart_in_traverse_all) \ ++ x(transaction_restart, transaction_restart_would_deadlock) \ ++ x(transaction_restart, transaction_restart_would_deadlock_write)\ ++ x(transaction_restart, transaction_restart_upgrade) \ ++ x(transaction_restart, transaction_restart_key_cache_fill) \ ++ x(transaction_restart, transaction_restart_key_cache_raced) \ ++ x(transaction_restart, transaction_restart_key_cache_realloced)\ ++ x(transaction_restart, transaction_restart_journal_preres_get) \ ++ x(transaction_restart, transaction_restart_nested) \ ++ x(0, lock_fail_node_reused) \ ++ x(0, lock_fail_root_changed) \ ++ x(0, journal_reclaim_would_deadlock) \ ++ x(0, fsck) \ ++ x(fsck, fsck_fix) \ ++ x(fsck, fsck_ignore) \ ++ x(fsck, fsck_errors_not_fixed) \ ++ x(fsck, fsck_repair_unimplemented) \ ++ x(fsck, fsck_repair_impossible) \ ++ x(0, need_snapshot_cleanup) \ ++ x(0, need_topology_repair) ++ ++enum bch_errcode { ++ BCH_ERR_START = 2048, ++#define x(class, err) BCH_ERR_##err, ++ BCH_ERRCODES() ++#undef x ++ BCH_ERR_MAX +}; + ++const char *bch2_err_str(int); ++bool __bch2_err_matches(int, int); ++ ++static inline bool _bch2_err_matches(int err, int class) ++{ ++ return err && __bch2_err_matches(err, class); ++} ++ ++#define bch2_err_matches(_err, _class) \ ++({ \ ++ BUILD_BUG_ON(!__builtin_constant_p(_class)); \ ++ _bch2_err_matches(_err, _class); \ ++}) ++ +#endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c new file mode 100644 -index 000000000..8279a9ba7 +index 000000000000..f6a895b2ceb7 --- /dev/null +++ b/fs/bcachefs/error.c -@@ -0,0 +1,185 @@ +@@ -0,0 +1,184 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "error.h" @@ -40962,8 +41463,7 @@ index 000000000..8279a9ba7 +#include "tools-util.h" +#endif + -+enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, -+ const char *fmt, ...) ++int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) 
+{ + struct fsck_err_state *s = NULL; + va_list args; @@ -40977,10 +41477,10 @@ index 000000000..8279a9ba7 + + if (c->opts.errors == BCH_ON_ERROR_continue) { + bch_err(c, "fixing"); -+ return FSCK_ERR_FIX; ++ return -BCH_ERR_fsck_fix; + } else { + bch2_inconsistent_error(c); -+ return FSCK_ERR_EXIT; ++ return -BCH_ERR_fsck_errors_not_fixed; + } + } + @@ -41050,14 +41550,14 @@ index 000000000..8279a9ba7 + + if (fix) { + set_bit(BCH_FS_ERRORS_FIXED, &c->flags); -+ return FSCK_ERR_FIX; ++ return -BCH_ERR_fsck_fix; + } else { + set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); + set_bit(BCH_FS_ERROR, &c->flags); + return c->opts.fix_errors == FSCK_OPT_EXIT || + !(flags & FSCK_CAN_IGNORE) -+ ? FSCK_ERR_EXIT -+ : FSCK_ERR_IGNORE; ++ ? -BCH_ERR_fsck_errors_not_fixed ++ : -BCH_ERR_fsck_ignore; + } +} + @@ -41079,10 +41579,10 @@ index 000000000..8279a9ba7 +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h new file mode 100644 -index 000000000..6e63c3818 +index 000000000000..b603d738c549 --- /dev/null +++ b/fs/bcachefs/error.h -@@ -0,0 +1,238 @@ +@@ -0,0 +1,223 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERROR_H +#define _BCACHEFS_ERROR_H @@ -41176,14 +41676,6 @@ index 000000000..6e63c3818 + * be able to repair: + */ + -+enum { -+ BCH_FSCK_OK = 0, -+ BCH_FSCK_ERRORS_NOT_FIXED = 1, -+ BCH_FSCK_REPAIR_UNIMPLEMENTED = 2, -+ BCH_FSCK_REPAIR_IMPOSSIBLE = 3, -+ BCH_FSCK_UNKNOWN_VERSION = 4, -+}; -+ +enum fsck_err_opts { + FSCK_OPT_EXIT, + FSCK_OPT_YES, @@ -41191,13 +41683,6 @@ index 000000000..6e63c3818 + FSCK_OPT_ASK, +}; + -+enum fsck_err_ret { -+ FSCK_ERR_IGNORE = 0, -+ FSCK_ERR_FIX = 1, -+ FSCK_ERR_EXIT = 2, -+ FSCK_ERR_START_TOPOLOGY_REPAIR = 3, -+}; -+ +struct fsck_err_state { + struct list_head list; + const char *fmt; @@ -41212,21 +41697,21 @@ index 000000000..6e63c3818 +#define FSCK_NO_RATELIMIT (1 << 3) + +__printf(3, 4) __cold -+enum fsck_err_ret bch2_fsck_err(struct bch_fs *, -+ unsigned, const char *, ...); ++int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...); +void bch2_flush_fsck_errs(struct bch_fs *); + +#define __fsck_err(c, _flags, msg, ...) 
\ +({ \ -+ int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\ ++ int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__); \ + \ -+ if (_fix == FSCK_ERR_EXIT) { \ ++ if (_ret != -BCH_ERR_fsck_fix && \ ++ _ret != -BCH_ERR_fsck_ignore) { \ + bch_err(c, "Unable to continue, halting"); \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ ret = _ret; \ + goto fsck_err; \ + } \ + \ -+ _fix; \ ++ _ret == -BCH_ERR_fsck_fix; \ +}) + +/* These macros return true if error should be fixed: */ @@ -41323,7 +41808,7 @@ index 000000000..6e63c3818 +#endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c new file mode 100644 -index 000000000..2fd5d9672 +index 000000000000..2fd5d9672a44 --- /dev/null +++ b/fs/bcachefs/extent_update.c @@ -0,0 +1,178 @@ @@ -41507,7 +41992,7 @@ index 000000000..2fd5d9672 +} diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h new file mode 100644 -index 000000000..6f5cf4493 +index 000000000000..6f5cf449361a --- /dev/null +++ b/fs/bcachefs/extent_update.h @@ -0,0 +1,12 @@ @@ -41525,7 +42010,7 @@ index 000000000..6f5cf4493 +#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c new file mode 100644 -index 000000000..2ca13014b +index 000000000000..2ca13014b9c4 --- /dev/null +++ b/fs/bcachefs/extents.c @@ -0,0 +1,1324 @@ @@ -42855,7 +43340,7 @@ index 000000000..2ca13014b +} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h new file mode 100644 -index 000000000..3c17b8113 +index 000000000000..3c17b81130bb --- /dev/null +++ b/fs/bcachefs/extents.h @@ -0,0 +1,685 @@ @@ -43546,7 +44031,7 @@ index 000000000..3c17b8113 +#endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h new file mode 100644 -index 000000000..43d6c341e +index 000000000000..43d6c341ecca --- /dev/null +++ b/fs/bcachefs/extents_types.h @@ -0,0 +1,40 @@ @@ -43592,7 +44077,7 @@ index 000000000..43d6c341e +#endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h new file mode 100644 -index 000000000..05429c963 +index 000000000000..05429c9631cd --- /dev/null +++ b/fs/bcachefs/eytzinger.h @@ -0,0 +1,281 @@ @@ -43879,7 +44364,7 @@ index 000000000..05429c963 +#endif /* _EYTZINGER_H */ diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h new file mode 100644 -index 000000000..cdb272708 +index 000000000000..cdb272708a4b --- /dev/null +++ b/fs/bcachefs/fifo.h @@ -0,0 +1,127 @@ @@ -44012,7 +44497,7 @@ index 000000000..cdb272708 +#endif /* _BCACHEFS_FIFO_H */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c new file mode 100644 -index 000000000..53ffc6842 +index 000000000000..53ffc684223c --- /dev/null +++ b/fs/bcachefs/fs-common.c @@ -0,0 +1,496 @@ @@ -44514,7 +44999,7 @@ index 000000000..53ffc6842 +} diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h new file mode 100644 -index 000000000..dde237859 +index 000000000000..dde237859514 --- /dev/null +++ b/fs/bcachefs/fs-common.h @@ -0,0 +1,43 @@ @@ -44563,7 +45048,7 @@ index 000000000..dde237859 +#endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c new file mode 100644 -index 000000000..bcfd9e5f3 +index 000000000000..f37bc43e27f4 --- /dev/null +++ b/fs/bcachefs/fs-io.c @@ -0,0 +1,3496 @@ @@ -44978,7 +45463,7 @@ index 000000000..bcfd9e5f3 + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + 
bch2_trans_exit(&trans); + @@ -45614,10 +46099,9 @@ index 000000000..bcfd9e5f3 + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ -+ if (!bch2_trans_relock(trans)) { -+ ret = -EINTR; ++ ret = bch2_trans_relock(trans); ++ if (ret) + break; -+ } + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, rbio->bio.bi_iter.bi_sector)); @@ -45670,7 +46154,7 @@ index 000000000..bcfd9e5f3 +err: + bch2_trans_iter_exit(trans, &iter); + -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (ret) { @@ -46642,7 +47126,7 @@ index 000000000..bcfd9e5f3 + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: -+ if (err == -EINTR) ++ if (bch2_err_matches(err, BCH_ERR_transaction_restart)) + goto retry; + bch2_trans_exit(&trans); + @@ -47018,7 +47502,7 @@ index 000000000..bcfd9e5f3 + start = iter.pos; + bch2_trans_iter_exit(&trans, &iter); +err: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); @@ -47408,7 +47892,8 @@ index 000000000..bcfd9e5f3 + bch2_trans_copy_iter(&dst, &src); + bch2_trans_copy_iter(&del, &src); + -+ while (ret == 0 || ret == -EINTR) { ++ while (ret == 0 || ++ bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; @@ -47610,7 +48095,7 @@ index 000000000..bcfd9e5f3 +bkey_err: + bch2_quota_reservation_put(c, inode, "a_res); + bch2_disk_reservation_put(c, &disk_res); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + } + @@ -47890,7 +48375,7 @@ index 000000000..bcfd9e5f3 + } + bch2_trans_iter_exit(&trans, &iter); +err: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); @@ -48005,7 +48490,7 @@ index 000000000..bcfd9e5f3 + } + bch2_trans_iter_exit(&trans, &iter); +err: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); @@ -48065,7 +48550,7 @@ index 000000000..bcfd9e5f3 +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h new file mode 100644 -index 000000000..7f2d7f454 +index 000000000000..7f2d7f454be4 --- /dev/null +++ b/fs/bcachefs/fs-io.h @@ -0,0 +1,56 @@ @@ -48127,7 +48612,7 @@ index 000000000..7f2d7f454 +#endif /* _BCACHEFS_FS_IO_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c new file mode 100644 -index 000000000..9f329a624 +index 000000000000..9f329a624c12 --- /dev/null +++ b/fs/bcachefs/fs-ioctl.c @@ -0,0 +1,523 @@ @@ -48656,7 +49141,7 @@ index 000000000..9f329a624 +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h new file mode 100644 -index 000000000..f201980ef +index 000000000000..f201980ef2c3 --- /dev/null +++ b/fs/bcachefs/fs-ioctl.h @@ -0,0 +1,81 @@ @@ -48743,7 +49228,7 @@ index 000000000..f201980ef +#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c new file mode 100644 -index 000000000..bb94ba58a +index 000000000000..6d57bd87bfd5 --- /dev/null +++ b/fs/bcachefs/fs.c @@ -0,0 +1,1939 @@ @@ -48757,6 +49242,7 @@ index 000000000..bb94ba58a +#include "buckets.h" +#include "chardev.h" +#include "dirent.h" ++#include "errcode.h" +#include "extents.h" +#include "fs.h" +#include "fs-common.h" @@ -48902,7 +49388,7 @@ index 000000000..bb94ba58a + + bch2_trans_iter_exit(&trans, 
&iter); + -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); @@ -49072,7 +49558,7 @@ index 000000000..bb94ba58a + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, + KEY_TYPE_QUOTA_WARN); +err_before_quota: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + goto err_trans; + } @@ -49192,7 +49678,7 @@ index 000000000..bb94ba58a + mutex_lock(&inode->ei_update_lock); + bch2_trans_init(&trans, c, 4, 1024); + -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + bch2_link_trans(&trans, + inode_inum(dir), &dir_u, + inode_inum(inode), &inode_u, @@ -49241,7 +49727,7 @@ index 000000000..bb94ba58a + bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); + bch2_trans_init(&trans, c, 4, 1024); + -+ ret = __bch2_trans_do(&trans, NULL, NULL, ++ ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_unlink_trans(&trans, + inode_inum(dir), &dir_u, @@ -49363,7 +49849,7 @@ index 000000000..bb94ba58a + goto err; + } + -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + bch2_rename_trans(&trans, + inode_inum(src_dir), &src_dir_u, + inode_inum(dst_dir), &dst_dir_u, @@ -49503,7 +49989,7 @@ index 000000000..bb94ba58a +btree_err: + bch2_trans_iter_exit(&trans, &inode_iter); + -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + if (unlikely(ret)) + goto err_trans; @@ -49734,7 +50220,7 @@ index 000000000..bb94ba58a + start = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (!ret && have_extent) @@ -50084,7 +50570,7 @@ index 000000000..bb94ba58a + memcpy(name, d.v->d_name, name_len); + name[name_len] = '\0'; +err: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(&trans, &iter1); @@ -50619,10 +51105,9 @@ index 000000000..bb94ba58a + sb->s_shrink.seeks = 0; + + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); -+ if (IS_ERR(vinode)) { -+ bch_err(c, "error mounting: error getting root inode %i", -+ (int) PTR_ERR(vinode)); -+ ret = PTR_ERR(vinode); ++ ret = PTR_ERR_OR_ZERO(vinode); ++ if (ret) { ++ bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret)); + goto err_put_super; + } + @@ -50688,7 +51173,7 @@ index 000000000..bb94ba58a +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h new file mode 100644 -index 000000000..9f4b57e30 +index 000000000000..9f4b57e30e2a --- /dev/null +++ b/fs/bcachefs/fs.h @@ -0,0 +1,208 @@ @@ -50902,10 +51387,10 @@ index 000000000..9f4b57e30 +#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c new file mode 100644 -index 000000000..81bfd6ea2 +index 000000000000..bb8cab7cb405 --- /dev/null +++ b/fs/bcachefs/fsck.c -@@ -0,0 +1,2413 @@ +@@ -0,0 +1,2390 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -51044,9 +51529,9 @@ index 000000000..81bfd6ea2 + + ret = bch2_inode_unpack(k, inode); +err: -+ if (ret && ret != -EINTR) -+ bch_err(trans->c, "error %i fetching inode %llu", -+ ret, inode_nr); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(trans->c, "error fetching inode %llu: %s", ++ inode_nr, bch2_err_str(ret)); + bch2_trans_iter_exit(trans, &iter); + return ret; +} @@ -51072,9 +51557,9 @@ index 000000000..81bfd6ea2 + if (!ret) + 
*snapshot = iter.pos.snapshot; +err: -+ if (ret && ret != -EINTR) -+ bch_err(trans->c, "error %i fetching inode %llu:%u", -+ ret, inode_nr, *snapshot); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(trans->c, "error fetching inode %llu:%u: %s", ++ inode_nr, *snapshot, bch2_err_str(ret)); + bch2_trans_iter_exit(trans, &iter); + return ret; +} @@ -51128,17 +51613,19 @@ index 000000000..81bfd6ea2 + struct bch_inode_unpacked *inode, + u32 snapshot) +{ -+ int ret = __bch2_trans_do(trans, NULL, NULL, ++ int ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __write_inode(trans, inode, snapshot)); + if (ret) -+ bch_err(trans->c, "error in fsck: error %i updating inode", ret); ++ bch_err(trans->c, "error in fsck: error updating inode: %s", ++ bch2_err_str(ret)); + return ret; +} + +static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) +{ ++ struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; + struct bch_inode_unpacked inode_u; @@ -51171,7 +51658,7 @@ index 000000000..81bfd6ea2 + goto err; + + if (!bkey_is_inode(k.k)) { -+ bch2_fs_inconsistent(trans->c, ++ bch2_fs_inconsistent(c, + "inode %llu:%u not found when deleting", + inum, snapshot); + ret = -EIO; @@ -51181,11 +51668,8 @@ index 000000000..81bfd6ea2 + bch2_inode_unpack(k, &inode_u); + + /* Subvolume root? */ -+ if (inode_u.bi_subvol) { -+ ret = bch2_subvolume_delete(trans, inode_u.bi_subvol); -+ if (ret) -+ goto err; -+ } ++ if (inode_u.bi_subvol) ++ bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); + + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter.pos; @@ -51196,7 +51680,7 @@ index 000000000..81bfd6ea2 + BTREE_INSERT_NOFAIL); +err: + bch2_trans_iter_exit(trans, &iter); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + return ret; @@ -51223,8 +51707,8 @@ index 000000000..81bfd6ea2 + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &iter); +err: -+ if (ret && ret != -EINTR) -+ bch_err(c, "error %i from __remove_dirent()", ret); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from __remove_dirent(): %s", bch2_err_str(ret)); + return ret; +} + @@ -51259,8 +51743,8 @@ index 000000000..81bfd6ea2 + goto create_lostfound; + } + -+ if (ret && ret != -EINTR) -+ bch_err(c, "error looking up lost+found: %i", ret); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret)); + if (ret) + return ret; + @@ -51282,8 +51766,8 @@ index 000000000..81bfd6ea2 + lostfound, &lostfound_str, + 0, 0, S_IFDIR|0700, 0, NULL, NULL, + (subvol_inum) { }, 0); -+ if (ret && ret != -EINTR) -+ bch_err(c, "error creating lost+found: %i", ret); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error creating lost+found: %s", bch2_err_str(ret)); + return ret; +} + @@ -51342,13 +51826,13 @@ index 000000000..81bfd6ea2 + struct bch_inode_unpacked *inode, + u32 inode_snapshot) +{ -+ int ret = __bch2_trans_do(trans, NULL, NULL, ++ int ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + __reattach_inode(trans, inode, inode_snapshot)); + if (ret) { -+ bch_err(trans->c, "error %i reattaching inode %llu", -+ ret, inode->bi_inum); ++ bch_err(trans->c, "error reattaching inode %llu: %s", ++ inode->bi_inum, bch2_err_str(ret)); + return ret; + } 
+ @@ -51379,19 +51863,82 @@ index 000000000..81bfd6ea2 + return ret; +} + -+static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos) ++struct snapshots_seen_entry { ++ u32 id; ++ u32 equiv; ++}; ++ ++struct snapshots_seen { ++ struct bpos pos; ++ DARRAY(struct snapshots_seen_entry) ids; ++}; ++ ++static inline void snapshots_seen_exit(struct snapshots_seen *s) +{ -+ pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; ++ darray_exit(&s->ids); ++} ++ ++static inline void snapshots_seen_init(struct snapshots_seen *s) ++{ ++ memset(s, 0, sizeof(*s)); ++} ++ ++static int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) ++{ ++ struct snapshots_seen_entry *i, n = { id, id }; ++ int ret; ++ ++ darray_for_each(s->ids, i) { ++ if (n.equiv < i->equiv) ++ break; ++ ++ if (i->equiv == n.equiv) { ++ bch_err(c, "adding duplicate snapshot in snapshots_seen_add()"); ++ return -EINVAL; ++ } ++ } ++ ++ ret = darray_insert_item(&s->ids, i - s->ids.data, n); ++ if (ret) ++ bch_err(c, "error reallocating snapshots_seen table (size %zu)", ++ s->ids.size); ++ return ret; ++} ++ ++static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, ++ enum btree_id btree_id, struct bpos pos) ++{ ++ struct snapshots_seen_entry *i, n = { ++ .id = pos.snapshot, ++ .equiv = bch2_snapshot_equiv(c, pos.snapshot), ++ }; ++ int ret; + + if (bkey_cmp(s->pos, pos)) + s->ids.nr = 0; ++ ++ pos.snapshot = n.equiv; + s->pos = pos; + -+ /* Might get called multiple times due to lock restarts */ -+ if (s->ids.nr && s->ids.data[s->ids.nr - 1] == pos.snapshot) -+ return 0; ++ darray_for_each(s->ids, i) ++ if (i->equiv == n.equiv) { ++ if (i->id != n.id) { ++ bch_err(c, "snapshot deletion did not run correctly:\n" ++ " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", ++ bch2_btree_ids[btree_id], ++ pos.inode, pos.offset, ++ i->id, n.id, n.equiv); ++ return -BCH_ERR_need_snapshot_cleanup; ++ } + -+ return snapshots_seen_add(c, s, pos.snapshot); ++ return 0; ++ } ++ ++ ret = darray_push(&s->ids, n); ++ if (ret) ++ bch_err(c, "error reallocating snapshots_seen table (size %zu)", ++ s->ids.size); ++ return ret; +} + +/** @@ -51404,15 +51951,15 @@ index 000000000..81bfd6ea2 + u32 id, u32 ancestor) +{ + ssize_t i; ++ u32 top = seen->ids.nr ? 
seen->ids.data[seen->ids.nr - 1].equiv : 0; + + BUG_ON(id > ancestor); -+ -+ id = snapshot_t(c, id)->equiv; -+ ancestor = snapshot_t(c, ancestor)->equiv; ++ BUG_ON(!bch2_snapshot_is_equiv(c, id)); ++ BUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); + + /* @ancestor should be the snapshot most recently added to @seen */ -+ BUG_ON(!seen->ids.nr || seen->ids.data[seen->ids.nr - 1] != ancestor); -+ BUG_ON(seen->pos.snapshot != ancestor); ++ BUG_ON(ancestor != seen->pos.snapshot); ++ BUG_ON(ancestor != top); + + if (id == ancestor) + return true; @@ -51421,10 +51968,10 @@ index 000000000..81bfd6ea2 + return false; + + for (i = seen->ids.nr - 2; -+ i >= 0 && seen->ids.data[i] >= id; ++ i >= 0 && seen->ids.data[i].equiv >= id; + --i) -+ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]) && -+ bch2_snapshot_is_ancestor(c, seen->ids.data[i], ancestor)) ++ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv) && ++ bch2_snapshot_is_ancestor(c, seen->ids.data[i].equiv, ancestor)) + return false; + + return true; @@ -51449,8 +51996,9 @@ index 000000000..81bfd6ea2 + : bch2_snapshot_is_ancestor(c, src, dst); +} + -+#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ -+ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && (_i)->snapshot <= (_snapshot); _i++)\ ++#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ ++ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ ++ (_i)->snapshot <= (_snapshot); _i++) \ + if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) + +struct inode_walker_entry { @@ -51485,7 +52033,7 @@ index 000000000..81bfd6ea2 + + return darray_push(&w->inodes, ((struct inode_walker_entry) { + .inode = u, -+ .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv, ++ .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot), + })); +} + @@ -51495,10 +52043,11 @@ index 000000000..81bfd6ea2 + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; -+ unsigned i, ancestor_pos; ++ u32 restart_count = trans->restart_count; ++ unsigned i; + int ret; + -+ pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; ++ pos.snapshot = bch2_snapshot_equiv(c, pos.snapshot); + + if (pos.inode == w->cur_inum) { + w->first_this_inode = false; @@ -51522,6 +52071,10 @@ index 000000000..81bfd6ea2 + + w->cur_inum = pos.inode; + w->first_this_inode = true; ++ ++ if (trans_was_restarted(trans, restart_count)) ++ return -BCH_ERR_transaction_restart_nested; ++ +lookup_snapshot: + for (i = 0; i < w->inodes.nr; i++) + if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot)) @@ -51531,17 +52084,20 @@ index 000000000..81bfd6ea2 + BUG_ON(pos.snapshot > w->inodes.data[i].snapshot); + + if (pos.snapshot != w->inodes.data[i].snapshot) { -+ ancestor_pos = i; ++ struct inode_walker_entry e = w->inodes.data[i]; ++ ++ e.snapshot = pos.snapshot; ++ e.count = 0; ++ ++ bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", ++ pos.inode, pos.snapshot, w->inodes.data[i].snapshot); + + while (i && w->inodes.data[i - 1].snapshot > pos.snapshot) + --i; + -+ ret = darray_insert_item(&w->inodes, i, w->inodes.data[ancestor_pos]); ++ ret = darray_insert_item(&w->inodes, i, e); + if (ret) + return ret; -+ -+ w->inodes.data[i].snapshot = pos.snapshot; -+ w->inodes.data[i].count = 0; + } + + return i; @@ -51561,17 +52117,19 @@ index 000000000..81bfd6ea2 + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); 
++ + if (k.k->p.offset != inum) + break; + -+ if (!bkey_is_inode(k.k)) ++ if (!ref_visible(c, s, s->pos.snapshot, equiv)) + continue; + -+ if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) { ++ if (bkey_is_inode(k.k)) + add_inode(c, w, k); -+ if (k.k->p.snapshot >= s->pos.snapshot) -+ break; -+ } ++ ++ if (equiv >= s->pos.snapshot) ++ break; + } + bch2_trans_iter_exit(trans, &iter); + @@ -51586,7 +52144,7 @@ index 000000000..81bfd6ea2 + struct printbuf buf = PRINTBUF; + int ret = 0; + -+ if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, ++ if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, + "key in missing snapshot: %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, @@ -51678,44 +52236,40 @@ index 000000000..81bfd6ea2 + "hashed to %llu\n%s", + bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, + (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)) == FSCK_ERR_IGNORE) -+ return 0; -+ -+ ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); -+ if (ret) { -+ bch_err(c, "hash_redo_key err %i", ret); -+ return ret; ++ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { ++ ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); ++ if (ret) { ++ bch_err(c, "hash_redo_key err %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ret = -BCH_ERR_transaction_restart_nested; + } -+ ret = -EINTR; +fsck_err: + goto out; +} + +static int check_inode(struct btree_trans *trans, + struct btree_iter *iter, ++ struct bkey_s_c k, + struct bch_inode_unpacked *prev, ++ struct snapshots_seen *s, + bool full) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k; + struct bch_inode_unpacked u; + bool do_update = false; + int ret; + -+ k = bch2_btree_iter_peek(iter); -+ if (!k.k) -+ return 0; -+ -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ + ret = check_key_has_snapshot(trans, iter, k); + if (ret < 0) + goto err; + if (ret) + return 0; + ++ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); ++ if (ret) ++ goto err; ++ + /* + * if snapshot id isn't a leaf node, skip it - deletion in + * particular is not atomic, so on the internal snapshot nodes @@ -51754,7 +52308,8 @@ index 000000000..81bfd6ea2 + + ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); + if (ret) -+ bch_err(c, "error in fsck: error %i while deleting inode", ret); ++ bch_err(c, "error in fsck: error while deleting inode: %s", ++ bch2_err_str(ret)); + return ret; + } + @@ -51777,7 +52332,8 @@ index 000000000..81bfd6ea2 + POS(u.bi_inum, U64_MAX), + 0, NULL); + if (ret) { -+ bch_err(c, "error in fsck: error %i truncating inode", ret); ++ bch_err(c, "error in fsck: error truncating inode: %s", ++ bch2_err_str(ret)); + return ret; + } + @@ -51802,8 +52358,8 @@ index 000000000..81bfd6ea2 + + sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); + if (sectors < 0) { -+ bch_err(c, "error in fsck: error %i recounting inode sectors", -+ (int) sectors); ++ bch_err(c, "error in fsck: error recounting inode sectors: %s", ++ bch2_err_str(sectors)); + return sectors; + } + @@ -51820,15 +52376,15 @@ index 000000000..81bfd6ea2 + } + + if (do_update) { -+ ret = write_inode(trans, &u, iter->pos.snapshot); ++ ret = __write_inode(trans, &u, iter->pos.snapshot); + if (ret) -+ bch_err(c, "error in fsck: error %i " -+ "updating inode", ret); ++ bch_err(c, "error in fsck: error updating inode: %s", ++ bch2_err_str(ret)); + } +err: +fsck_err: + if (ret) -+ bch_err(c, "error %i from 
check_inode()", ret); ++ bch_err(c, "error from check_inode(): %s", bch2_err_str(ret)); + return ret; +} + @@ -51838,90 +52394,66 @@ index 000000000..81bfd6ea2 + struct btree_trans trans; + struct btree_iter iter; + struct bch_inode_unpacked prev = { 0 }; ++ struct snapshots_seen s; ++ struct bkey_s_c k; + int ret; + ++ snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ -+ do { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_inode(&trans, &iter, &prev, full)); -+ if (ret) -+ break; -+ } while (bch2_btree_iter_advance(&iter)); -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, ++ POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_inode(&trans, &iter, k, &prev, &s, full)); + + bch2_trans_exit(&trans); ++ snapshots_seen_exit(&s); + if (ret) -+ bch_err(c, "error %i from check_inodes()", ret); ++ bch_err(c, "error from check_inodes(): %s", bch2_err_str(ret)); + return ret; +} + -+static int check_subvol(struct btree_trans *trans, -+ struct btree_iter *iter) ++/* ++ * Checking for overlapping extents needs to be reimplemented ++ */ ++#if 0 ++static int fix_overlapping_extent(struct btree_trans *trans, ++ struct bkey_s_c k, struct bpos cut_at) +{ -+ struct bkey_s_c k; -+ struct bkey_s_c_subvolume subvol; ++ struct btree_iter iter; ++ struct bkey_i *u; + int ret; + -+ k = bch2_btree_iter_peek(iter); -+ if (!k.k) -+ return 0; -+ -+ ret = bkey_err(k); ++ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + -+ if (k.k->type != KEY_TYPE_subvolume) -+ return 0; ++ bkey_reassemble(u, k); ++ bch2_cut_front(cut_at, u); + -+ subvol = bkey_s_c_to_subvolume(k); + -+ if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { -+ ret = bch2_subvolume_delete(trans, iter->pos.offset); -+ if (ret && ret != -EINTR) -+ bch_err(trans->c, "error deleting subvolume %llu: %i", -+ iter->pos.offset, ret); -+ if (ret) -+ return ret; -+ } ++ /* ++ * We don't want to go through the extent_handle_overwrites path: ++ * ++ * XXX: this is going to screw up disk accounting, extent triggers ++ * assume things about extent overwrites - we should be running the ++ * triggers manually here ++ */ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p, ++ BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); + -+ return 0; -+} -+ -+noinline_for_stack -+static int check_subvols(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ int ret; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes, -+ POS_MIN, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH); -+ -+ do { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_subvol(&trans, &iter)); -+ if (ret) -+ break; -+ } while (bch2_btree_iter_advance(&iter)); -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ bch2_trans_exit(&trans); ++ BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ bch2_trans_iter_exit(trans, &iter); + return ret; +} ++#endif + +static struct bkey_s_c_dirent 
dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, @@ -51981,15 +52513,15 @@ index 000000000..81bfd6ea2 +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; -+ int ret = 0, ret2 = 0; ++ u32 restart_count = trans->restart_count; ++ int ret = 0; + s64 count2; + + darray_for_each(w->inodes, i) { + if (i->inode.bi_sectors == i->count) + continue; + -+ count2 = lockrestart_do(trans, -+ bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot)); ++ count2 = bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot); + + if (i->count != count2) { + bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu", @@ -52002,101 +52534,33 @@ index 000000000..81bfd6ea2 + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, + "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", + w->cur_inum, i->snapshot, -+ i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE) -+ continue; -+ -+ i->inode.bi_sectors = i->count; -+ ret = write_inode(trans, &i->inode, i->snapshot); -+ if (ret) -+ break; -+ ret2 = -EINTR; -+ } -+fsck_err: -+ if (ret) -+ bch_err(c, "error %i from check_i_sectors()", ret); -+ return ret ?: ret2; -+} -+ -+struct extent_end { -+ u32 snapshot; -+ u64 offset; -+}; -+ -+typedef DARRAY(struct extent_end) extent_ends; -+ -+static int extent_ends_at(extent_ends *extent_ends, -+ struct bkey_s_c k) -+{ -+ struct extent_end *i, n = (struct extent_end) { -+ .snapshot = k.k->p.snapshot, -+ .offset = k.k->p.offset, -+ }; -+ -+ darray_for_each(*extent_ends, i) { -+ if (i->snapshot == k.k->p.snapshot) { -+ *i = n; -+ return 0; -+ } -+ -+ if (i->snapshot >= k.k->p.snapshot) -+ break; -+ } -+ -+ return darray_insert_item(extent_ends, i - extent_ends->data, n); -+} -+ -+static int check_extent_start(struct btree_trans *trans, -+ struct snapshots_seen *s, -+ extent_ends *extent_ends, -+ struct bkey_s_c k, -+ struct btree_iter *iter) -+{ -+ struct bch_fs *c = trans->c; -+ struct extent_end *i; -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; -+ -+ darray_for_each(*extent_ends, i) { -+ if (fsck_err_on(i->offset > bkey_start_offset(k.k) && -+ key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot), c, -+ "overlapping extents: extent in snapshot %u ends at %llu overlaps with\n%s", -+ i->snapshot, -+ i->offset, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -+ struct bkey_i *update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ bkey_reassemble(update, k); -+ ret = bch2_trans_update_extent(trans, iter, update, 0); -+ if (!ret) -+ goto err; ++ i->inode.bi_sectors, i->count)) { ++ i->inode.bi_sectors = i->count; ++ ret = write_inode(trans, &i->inode, i->snapshot); ++ if (ret) ++ break; + } + } -+err: +fsck_err: -+ printbuf_exit(&buf); -+ return ret; ++ if (ret) { ++ bch_err(c, "error from check_i_sectors(): %s", bch2_err_str(ret)); ++ return ret; ++ } ++ if (trans_was_restarted(trans, restart_count)) ++ return -BCH_ERR_transaction_restart_nested; ++ return 0; +} + +static int check_extent(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, + struct inode_walker *inode, -+ struct snapshots_seen *s, -+ extent_ends *extent_ends) ++ struct snapshots_seen *s) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k; + struct inode_walker_entry *i; + struct printbuf buf = PRINTBUF; ++ struct bpos equiv; + int ret = 0; -+peek: -+ k = bch2_btree_iter_peek(iter); -+ if (!k.k) -+ goto out; -+ -+ ret = bkey_err(k); -+ if (ret) -+ goto err; + + ret = 
check_key_has_snapshot(trans, iter, k); + if (ret) { @@ -52104,7 +52568,10 @@ index 000000000..81bfd6ea2 + goto out; + } + -+ ret = snapshots_seen_update(c, s, k.k->p); ++ equiv = k.k->p; ++ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); ++ ++ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + if (ret) + goto err; + @@ -52112,23 +52579,28 @@ index 000000000..81bfd6ea2 + goto out; + + if (inode->cur_inum != k.k->p.inode) { -+ extent_ends->nr = 0; -+ + ret = check_i_sectors(trans, inode); + if (ret) + goto err; + } + -+ if (!iter->path->should_be_locked) { -+ /* -+ * hack: check_i_sectors may have handled a transaction restart, -+ * it shouldn't be but we need to fix the new i_sectors check -+ * code and delete the old bch2_count_inode_sectors() first -+ */ -+ goto peek; -+ } ++ BUG_ON(!iter->path->should_be_locked); ++#if 0 ++ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { ++ char buf1[200]; ++ char buf2[200]; + -+ ret = __walk_inode(trans, inode, k.k->p); ++ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); ++ bch2_bkey_val_to_text(&PBUF(buf2), c, k); ++ ++ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { ++ ret = fix_overlapping_extent(trans, k, prev.k->k.p) ++ ?: -BCH_ERR_transaction_restart_nested; ++ goto out; ++ } ++ } ++#endif ++ ret = __walk_inode(trans, inode, equiv); + if (ret < 0) + goto err; + @@ -52160,42 +52632,56 @@ index 000000000..81bfd6ea2 + goto out; + } + -+ if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) { -+ for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) { -+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && -+ k.k->type != KEY_TYPE_reservation && -+ k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, -+ "extent type %u offset %llu past end of inode %llu, i_size %llu", -+ k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) { -+ bch2_fs_lazy_rw(c); -+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, -+ SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9, -+ k.k->p.snapshot), -+ POS(k.k->p.inode, U64_MAX), -+ 0, NULL) ?: -EINTR; -+ goto out; ++ /* ++ * Check inodes in reverse order, from oldest snapshots to newest, so ++ * that we emit the fewest number of whiteouts necessary: ++ */ ++ for (i = inode->inodes.data + inode->inodes.nr - 1; ++ i >= inode->inodes.data; ++ --i) { ++ if (i->snapshot > equiv.snapshot || ++ !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot)) ++ continue; ++ ++ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ k.k->type != KEY_TYPE_reservation && ++ k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, ++ "extent type past end of inode %llu:%u, i_size %llu\n %s", ++ i->inode.bi_inum, i->snapshot, i->inode.bi_size, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ struct btree_iter iter2; ++ ++ bch2_trans_copy_iter(&iter2, iter); ++ bch2_btree_iter_set_snapshot(&iter2, i->snapshot); ++ ret = bch2_btree_iter_traverse(&iter2) ?: ++ bch2_btree_delete_at(trans, &iter2, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ bch2_trans_iter_exit(trans, &iter2); ++ if (ret) ++ goto err; ++ ++ if (i->snapshot != equiv.snapshot) { ++ ret = snapshots_seen_add(c, s, i->snapshot); ++ if (ret) ++ goto err; + } + } + } + -+ ret = check_extent_start(trans, s, extent_ends, k, iter); -+ if (ret) -+ goto err; -+ + if (bkey_extent_is_allocation(k.k)) -+ for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) ++ for_each_visible_inode(c, s, inode, equiv.snapshot, i) + i->count += k.k->size; ++#if 0 ++ 
bch2_bkey_buf_reassemble(&prev, c, k); ++#endif + -+ ret = extent_ends_at(extent_ends, k); -+ if (ret) -+ goto err; +out: +err: +fsck_err: + printbuf_exit(&buf); + -+ if (ret && ret != -EINTR) -+ bch_err(c, "error %i from check_extent()", ret); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from check_extent(): %s", bch2_err_str(ret)); + return ret; +} + @@ -52210,36 +52696,34 @@ index 000000000..81bfd6ea2 + struct snapshots_seen s; + struct btree_trans trans; + struct btree_iter iter; -+ extent_ends extent_ends = { 0 }; ++ struct bkey_s_c k; + int ret = 0; + ++#if 0 ++ struct bkey_buf prev; ++ bch2_bkey_buf_init(&prev); ++ prev.k->k = KEY(0, 0, 0); ++#endif + snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + bch_verbose(c, "checking extents"); + -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, -+ POS(BCACHEFS_ROOT_INO, 0), -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ -+ do { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_extent(&trans, &iter, &w, &s, &extent_ends)); -+ if (ret) -+ break; -+ } while (bch2_btree_iter_advance(&iter)); -+ bch2_trans_iter_exit(&trans, &iter); -+ darray_exit(&extent_ends); ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, ++ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_extent(&trans, &iter, k, &w, &s)); ++#if 0 ++ bch2_bkey_buf_exit(&prev, c); ++#endif + inode_walker_exit(&w); + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); + + if (ret) -+ bch_err(c, "error %i from check_extents()", ret); ++ bch_err(c, "error from check_extents(): %s", bch2_err_str(ret)); + return ret; +} + @@ -52247,7 +52731,8 @@ index 000000000..81bfd6ea2 +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; -+ int ret = 0, ret2 = 0; ++ u32 restart_count = trans->restart_count; ++ int ret = 0; + s64 count2; + + darray_for_each(w->inodes, i) { @@ -52273,13 +52758,16 @@ index 000000000..81bfd6ea2 + ret = write_inode(trans, &i->inode, i->snapshot); + if (ret) + break; -+ ret2 = -EINTR; + } + } +fsck_err: -+ if (ret) -+ bch_err(c, "error %i from check_subdir_count()", ret); -+ return ret ?: ret2; ++ if (ret) { ++ bch_err(c, "error from check_subdir_count(): %s", bch2_err_str(ret)); ++ return ret; ++ } ++ if (trans_was_restarted(trans, restart_count)) ++ return -BCH_ERR_transaction_restart_nested; ++ return 0; +} + +static int check_dirent_target(struct btree_trans *trans, @@ -52396,31 +52884,24 @@ index 000000000..81bfd6ea2 +fsck_err: + printbuf_exit(&buf); + -+ if (ret && ret != -EINTR) -+ bch_err(c, "error %i from check_target()", ret); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from check_target(): %s", bch2_err_str(ret)); + return ret; +} + +static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, + struct bch_hash_info *hash_info, + struct inode_walker *dir, + struct inode_walker *target, + struct snapshots_seen *s) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k; + struct bkey_s_c_dirent d; + struct inode_walker_entry *i; + struct printbuf buf = PRINTBUF; ++ struct bpos equiv; + int ret = 0; -+peek: -+ k = bch2_btree_iter_peek(iter); -+ if (!k.k) -+ goto out; -+ -+ ret = bkey_err(k); -+ if (ret) -+ goto err; + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) { @@ -52428,7 +52909,10 @@ index 
000000000..81bfd6ea2 + goto out; + } + -+ ret = snapshots_seen_update(c, s, k.k->p); ++ equiv = k.k->p; ++ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); ++ ++ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + if (ret) + goto err; + @@ -52441,12 +52925,9 @@ index 000000000..81bfd6ea2 + goto err; + } + -+ if (!iter->path->should_be_locked) { -+ /* hack: see check_extent() */ -+ goto peek; -+ } ++ BUG_ON(!iter->path->should_be_locked); + -+ ret = __walk_inode(trans, dir, k.k->p); ++ ret = __walk_inode(trans, dir, equiv); + if (ret < 0) + goto err; + @@ -52546,7 +53027,8 @@ index 000000000..81bfd6ea2 + goto err; + + if (fsck_err_on(!target->inodes.nr, c, -+ "dirent points to missing inode:\n%s", ++ "dirent points to missing inode: (equiv %u)\n%s", ++ equiv.snapshot, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { @@ -52564,7 +53046,7 @@ index 000000000..81bfd6ea2 + } + + if (d.v->d_type == DT_DIR) -+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) ++ for_each_visible_inode(c, s, dir, equiv.snapshot, i) + i->count++; + +out: @@ -52572,8 +53054,8 @@ index 000000000..81bfd6ea2 +fsck_err: + printbuf_exit(&buf); + -+ if (ret && ret != -EINTR) -+ bch_err(c, "error %i from check_dirent()", ret); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from check_dirent(): %s", bch2_err_str(ret)); + return ret; +} + @@ -52590,6 +53072,7 @@ index 000000000..81bfd6ea2 + struct bch_hash_info hash_info; + struct btree_trans trans; + struct btree_iter iter; ++ struct bkey_s_c k; + int ret = 0; + + bch_verbose(c, "checking dirents"); @@ -52597,22 +53080,13 @@ index 000000000..81bfd6ea2 + snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents, -+ POS(BCACHEFS_ROOT_INO, 0), -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ -+ do { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_dirent(&trans, &iter, &hash_info, -+ &dir, &target, &s)); -+ if (ret) -+ break; -+ } while (bch2_btree_iter_advance(&iter)); -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, ++ k, ++ NULL, NULL, ++ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s)); + + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); @@ -52620,26 +53094,18 @@ index 000000000..81bfd6ea2 + inode_walker_exit(&target); + + if (ret) -+ bch_err(c, "error %i from check_dirents()", ret); ++ bch_err(c, "error from check_dirents(): %s", bch2_err_str(ret)); + return ret; +} + +static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, + struct bch_hash_info *hash_info, + struct inode_walker *inode) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k; + int ret; + -+ k = bch2_btree_iter_peek(iter); -+ if (!k.k) -+ return 0; -+ -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret; @@ -52663,8 +53129,8 @@ index 000000000..81bfd6ea2 + + ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); +fsck_err: -+ if (ret && ret != -EINTR) -+ bch_err(c, "error %i from check_xattr()", ret); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from check_xattr(): %s", bch2_err_str(ret)); + return 
ret; +} + @@ -52678,33 +53144,25 @@ index 000000000..81bfd6ea2 + struct bch_hash_info hash_info; + struct btree_trans trans; + struct btree_iter iter; ++ struct bkey_s_c k; + int ret = 0; + + bch_verbose(c, "checking xattrs"); + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, -+ POS(BCACHEFS_ROOT_INO, 0), -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ -+ do { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_xattr(&trans, &iter, &hash_info, -+ &inode)); -+ if (ret) -+ break; -+ } while (bch2_btree_iter_advance(&iter)); -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, ++ k, ++ NULL, NULL, ++ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_xattr(&trans, &iter, k, &hash_info, &inode)); + + bch2_trans_exit(&trans); + + if (ret) -+ bch_err(c, "error %i from check_xattrs()", ret); ++ bch_err(c, "error from check_xattrs(): %s", bch2_err_str(ret)); + return ret; +} + @@ -52731,12 +53189,12 @@ index 000000000..81bfd6ea2 + root_subvol.v.flags = 0; + root_subvol.v.snapshot = cpu_to_le32(snapshot); + root_subvol.v.inode = cpu_to_le64(inum); -+ ret = __bch2_trans_do(trans, NULL, NULL, ++ ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i)); + if (ret) { -+ bch_err(c, "error writing root subvol: %i", ret); ++ bch_err(c, "error writing root subvol: %s", bch2_err_str(ret)); + goto err; + } + @@ -52755,7 +53213,7 @@ index 000000000..81bfd6ea2 + + ret = __write_inode(trans, &root_inode, snapshot); + if (ret) -+ bch_err(c, "error writing root inode: %i", ret); ++ bch_err(c, "error writing root inode: %s", bch2_err_str(ret)); + } +err: +fsck_err: @@ -52820,7 +53278,7 @@ index 000000000..81bfd6ea2 + struct bch_fs *c = trans->c; + int ret = 0; + -+ snapshot = snapshot_t(c, snapshot)->equiv; ++ snapshot = bch2_snapshot_equiv(c, snapshot); + p->nr = 0; + + while (!(inode->bi_inum == BCACHEFS_ROOT_INO && @@ -52894,7 +53352,7 @@ index 000000000..81bfd6ea2 + if (!fsck_err(c, "directory structure loop")) + return 0; + -+ ret = __bch2_trans_do(trans, NULL, NULL, ++ ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + remove_backpointer(trans, inode)); @@ -52908,7 +53366,7 @@ index 000000000..81bfd6ea2 + } +fsck_err: + if (ret) -+ bch_err(c, "%s: err %i", __func__, ret); ++ bch_err(c, "%s: err %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -52952,8 +53410,6 @@ index 000000000..81bfd6ea2 + } + bch2_trans_iter_exit(&trans, &iter); + -+ BUG_ON(ret == -EINTR); -+ + darray_exit(&path); + + bch2_trans_exit(&trans); @@ -53105,7 +53561,7 @@ index 000000000..81bfd6ea2 + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { -+ ret = snapshots_seen_update(c, &s, k.k->p); ++ ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); + if (ret) + break; + @@ -53117,7 +53573,7 @@ index 000000000..81bfd6ea2 + d.v->d_type != DT_SUBVOL) + inc_link(c, &s, links, range_start, range_end, + le64_to_cpu(d.v->d_inum), -+ d.k->p.snapshot); ++ bch2_snapshot_equiv(c, d.k->p.snapshot)); + break; + } + } @@ -53131,6 +53587,47 @@ index 000000000..81bfd6ea2 + return ret; +} + ++static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct nlink_table 
*links, ++ size_t *idx, u64 range_end) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked u; ++ struct nlink *link = &links->d[*idx]; ++ int ret = 0; ++ ++ if (k.k->p.offset >= range_end) ++ return 1; ++ ++ if (!bkey_is_inode(k.k)) ++ return 0; ++ ++ BUG_ON(bch2_inode_unpack(k, &u)); ++ ++ if (S_ISDIR(le16_to_cpu(u.bi_mode))) ++ return 0; ++ ++ if (!u.bi_nlink) ++ return 0; ++ ++ while ((cmp_int(link->inum, k.k->p.offset) ?: ++ cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { ++ BUG_ON(*idx == links->nr); ++ link = &links->d[++*idx]; ++ } ++ ++ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, ++ "inode %llu type %s has wrong i_nlink (%u, should be %u)", ++ u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], ++ bch2_inode_nlink_get(&u), link->count)) { ++ bch2_inode_nlink_set(&u, link->count); ++ ret = __write_inode(trans, &u, k.k->p.snapshot); ++ } ++fsck_err: ++ return ret; ++} ++ +noinline_for_stack +static int check_nlinks_update_hardlinks(struct bch_fs *c, + struct nlink_table *links, @@ -53139,56 +53636,25 @@ index 000000000..81bfd6ea2 + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ struct bch_inode_unpacked u; -+ struct nlink *link = links->d; ++ size_t idx = 0; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_inodes, -+ POS(0, range_start), -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { -+ if (k.k->p.offset >= range_end) -+ break; ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, ++ POS(0, range_start), ++ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end)); + -+ if (!bkey_is_inode(k.k)) -+ continue; -+ -+ BUG_ON(bch2_inode_unpack(k, &u)); -+ -+ if (S_ISDIR(le16_to_cpu(u.bi_mode))) -+ continue; -+ -+ if (!u.bi_nlink) -+ continue; -+ -+ while ((cmp_int(link->inum, k.k->p.offset) ?: -+ cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { -+ link++; -+ BUG_ON(link >= links->d + links->nr); -+ } -+ -+ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, -+ "inode %llu type %s has wrong i_nlink (%u, should be %u)", -+ u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], -+ bch2_inode_nlink_get(&u), link->count)) { -+ bch2_inode_nlink_set(&u, link->count); -+ -+ ret = write_inode(&trans, &u, k.k->p.snapshot); -+ if (ret) -+ bch_err(c, "error in fsck: error %i updating inode", ret); -+ } -+ } -+fsck_err: -+ bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + -+ if (ret) ++ if (ret < 0) { + bch_err(c, "error in fsck: btree error %i while walking inodes", ret); ++ return ret; ++ } + -+ return ret; ++ return 0; +} + +noinline_for_stack @@ -53228,21 +53694,13 @@ index 000000000..81bfd6ea2 + return ret; +} + -+static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter) ++static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k) +{ -+ struct bkey_s_c k; + struct bkey_s_c_reflink_p p; + struct bkey_i_reflink_p *u; + int ret; + -+ k = bch2_btree_iter_peek(iter); -+ if (!k.k) -+ return 0; -+ -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ + if (k.k->type != KEY_TYPE_reflink_p) + return 0; + @@ -53278,20 +53736,11 @@ index 000000000..81bfd6ea2 + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ 
BTREE_ITER_ALL_SNAPSHOTS, k, ret) { -+ if (k.k->type == KEY_TYPE_reflink_p) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ fix_reflink_p_key(&trans, &iter)); -+ if (ret) -+ break; -+ } -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_extents, POS_MIN, ++ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ fix_reflink_p_key(&trans, &iter, k)); + + bch2_trans_exit(&trans); + return ret; @@ -53303,9 +53752,12 @@ index 000000000..81bfd6ea2 + */ +int bch2_fsck_full(struct bch_fs *c) +{ -+ return bch2_fs_snapshots_check(c) ?: ++ int ret; ++again: ++ ret = bch2_fs_check_snapshots(c) ?: ++ bch2_fs_check_subvols(c) ?: ++ bch2_delete_dead_snapshots(c) ?: + check_inodes(c, true) ?: -+ check_subvols(c) ?: + check_extents(c) ?: + check_dirents(c) ?: + check_xattrs(c) ?: @@ -53313,15 +53765,25 @@ index 000000000..81bfd6ea2 + check_directory_structure(c) ?: + check_nlinks(c) ?: + fix_reflink_p(c); ++ ++ if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) { ++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++ goto again; ++ } ++ ++ return ret; +} + +int bch2_fsck_walk_inodes_only(struct bch_fs *c) +{ -+ return check_inodes(c, false); ++ return bch2_fs_check_snapshots(c) ?: ++ bch2_fs_check_subvols(c) ?: ++ bch2_delete_dead_snapshots(c) ?: ++ check_inodes(c, false); +} diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h new file mode 100644 -index 000000000..264f2706b +index 000000000000..264f2706b12d --- /dev/null +++ b/fs/bcachefs/fsck.h @@ -0,0 +1,8 @@ @@ -53335,7 +53797,7 @@ index 000000000..264f2706b +#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c new file mode 100644 -index 000000000..6a2b94908 +index 000000000000..083106006747 --- /dev/null +++ b/fs/bcachefs/inode.c @@ -0,0 +1,771 @@ @@ -53980,7 +54442,7 @@ index 000000000..6a2b94908 + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: -+ if (ret && ret != -EINTR) ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + break; + } + @@ -54051,7 +54513,7 @@ index 000000000..6a2b94908 + BTREE_INSERT_NOFAIL); +err: + bch2_trans_iter_exit(&trans, &iter); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); @@ -54112,7 +54574,7 @@ index 000000000..6a2b94908 +} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h new file mode 100644 -index 000000000..2ac2fc105 +index 000000000000..2ac2fc10513b --- /dev/null +++ b/fs/bcachefs/inode.h @@ -0,0 +1,189 @@ @@ -54307,10 +54769,10 @@ index 000000000..2ac2fc105 +#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c new file mode 100644 -index 000000000..50fa57234 +index 000000000000..971f8ba00dbd --- /dev/null +++ b/fs/bcachefs/io.c -@@ -0,0 +1,2417 @@ +@@ -0,0 +1,2422 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some low level IO code, and hacks for various block layer limitations @@ -54625,7 +55087,7 @@ index 000000000..50fa57234 +} + +/* -+ * Returns -EINTR if we had to drop locks: ++ * Returns -BCH_ERR_transacton_restart if we had to drop locks: + */ +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + subvol_inum inum, u64 end, @@ -54638,7 +55100,8 @@ index 000000000..50fa57234 + int ret = 0, ret2 = 0; + u32 snapshot; + -+ while (!ret || ret == -EINTR) { ++ while (!ret || ++ bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + 
struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; @@ -54697,7 +55160,10 @@ index 000000000..50fa57234 + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + -+ return ret == -EINTR ? 0 : ret; ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ ret = 0; ++ ++ return ret; +} + +int bch2_write_index_default(struct bch_write_op *op) @@ -54728,7 +55194,7 @@ index 000000000..50fa57234 + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, + &sk.k->k.p.snapshot); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; @@ -54743,7 +55209,7 @@ index 000000000..50fa57234 + op->flags & BCH_WRITE_CHECK_ENOSPC); + bch2_trans_iter_exit(&trans, &iter); + -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; @@ -54893,7 +55359,7 @@ index 000000000..50fa57234 + u64 sectors_start = keylist_sectors(keys); + int ret = op->index_update_fn(op); + -+ BUG_ON(ret == -EINTR); ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + BUG_ON(keylist_sectors(keys) && !ret); + + op->written += sectors_start - keylist_sectors(keys); @@ -56629,10 +57095,9 @@ index 000000000..50fa57234 + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ -+ if (!bch2_trans_relock(&trans)) { -+ ret = -EINTR; ++ ret = bch2_trans_relock(&trans); ++ if (ret) + break; -+ } + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, bvec_iter.bi_sector)); @@ -56686,7 +57151,9 @@ index 000000000..50fa57234 +err: + bch2_trans_iter_exit(&trans, &iter); + -+ if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || ++ ret == READ_RETRY || ++ ret == READ_RETRY_AVOID) + goto retry; + + bch2_trans_exit(&trans); @@ -56730,7 +57197,7 @@ index 000000000..50fa57234 +} diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h new file mode 100644 -index 000000000..fb5114518 +index 000000000000..fb5114518666 --- /dev/null +++ b/fs/bcachefs/io.h @@ -0,0 +1,189 @@ @@ -56925,7 +57392,7 @@ index 000000000..fb5114518 +#endif /* _BCACHEFS_IO_H */ diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h new file mode 100644 -index 000000000..78bff13d3 +index 000000000000..78bff13d36f2 --- /dev/null +++ b/fs/bcachefs/io_types.h @@ -0,0 +1,161 @@ @@ -57092,7 +57559,7 @@ index 000000000..78bff13d3 +#endif /* _BCACHEFS_IO_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c new file mode 100644 -index 000000000..b561ed787 +index 000000000000..937ed1395e46 --- /dev/null +++ b/fs/bcachefs/journal.c @@ -0,0 +1,1429 @@ @@ -57981,7 +58448,7 @@ index 000000000..b561ed787 + + if (!new_fs) { + for (i = 0; i < nr_got; i++) { -+ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, ++ ret = bch2_trans_run(c, + bch2_trans_mark_metadata_bucket(&trans, ca, + bu[i], BCH_DATA_journal, + ca->mi.bucket_size)); @@ -58527,7 +58994,7 @@ index 000000000..b561ed787 +} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h new file mode 100644 -index 000000000..d3caa7ea7 +index 000000000000..d3caa7ea7ce9 --- /dev/null +++ b/fs/bcachefs/journal.h @@ -0,0 +1,521 @@ @@ -59054,7 +59521,7 @@ index 000000000..d3caa7ea7 +#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c new file mode 100644 -index 000000000..0ff78a274 +index 000000000000..6fa2c54c1af4 --- /dev/null +++ b/fs/bcachefs/journal_io.c @@ 
-0,0 +1,1735 @@ @@ -59257,7 +59724,7 @@ index 000000000..0ff78a274 + bch_err(c, "corrupt metadata before write:\n" \ + msg, ##__VA_ARGS__); \ + if (bch2_fs_inconsistent(c)) { \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ ret = -BCH_ERR_fsck_errors_not_fixed; \ + goto fsck_err; \ + } \ + break; \ @@ -59918,7 +60385,7 @@ index 000000000..0ff78a274 + end - offset, sectors_read, + READ); + switch (ret) { -+ case BCH_FSCK_OK: ++ case 0: + sectors = vstruct_sectors(j, c->block_bits); + break; + case JOURNAL_ENTRY_REREAD: @@ -60795,7 +61262,7 @@ index 000000000..0ff78a274 +} diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h new file mode 100644 -index 000000000..30e995c81 +index 000000000000..30e995c81fc4 --- /dev/null +++ b/fs/bcachefs/journal_io.h @@ -0,0 +1,59 @@ @@ -60860,14 +61327,15 @@ index 000000000..30e995c81 +#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c new file mode 100644 -index 000000000..fdc94e831 +index 000000000000..6f0ab411c98e --- /dev/null +++ b/fs/bcachefs/journal_reclaim.c -@@ -0,0 +1,849 @@ +@@ -0,0 +1,852 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" ++#include "errcode.h" +#include "error.h" +#include "journal.h" +#include "journal_io.h" @@ -61606,15 +62074,17 @@ index 000000000..fdc94e831 +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct task_struct *p; ++ int ret; + + if (j->reclaim_thread) + return 0; + + p = kthread_create(bch2_journal_reclaim_thread, j, + "bch-reclaim/%s", c->name); -+ if (IS_ERR(p)) { -+ bch_err(c, "error creating journal reclaim thread: %li", PTR_ERR(p)); -+ return PTR_ERR(p); ++ ret = PTR_ERR_OR_ZERO(p); ++ if (ret) { ++ bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret)); ++ return ret; + } + + get_task_struct(p); @@ -61715,7 +62185,7 @@ index 000000000..fdc94e831 +} diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h new file mode 100644 -index 000000000..0fd1af120 +index 000000000000..0fd1af120db5 --- /dev/null +++ b/fs/bcachefs/journal_reclaim.h @@ -0,0 +1,86 @@ @@ -61807,7 +62277,7 @@ index 000000000..0fd1af120 +#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c new file mode 100644 -index 000000000..001cecec1 +index 000000000000..001cecec1291 --- /dev/null +++ b/fs/bcachefs/journal_sb.c @@ -0,0 +1,220 @@ @@ -62033,7 +62503,7 @@ index 000000000..001cecec1 +} diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h new file mode 100644 -index 000000000..a39192e9f +index 000000000000..a39192e9f6f4 --- /dev/null +++ b/fs/bcachefs/journal_sb.h @@ -0,0 +1,24 @@ @@ -62063,7 +62533,7 @@ index 000000000..a39192e9f +int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c new file mode 100644 -index 000000000..d9b4042a2 +index 000000000000..5c555b3703c0 --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -0,0 +1,322 @@ @@ -62341,7 +62811,7 @@ index 000000000..d9b4042a2 + !test_bit(BCH_FS_STOPPING, &c->flags)) + b = bch2_btree_iter_next_node(&iter); + -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); @@ -62391,7 +62861,7 @@ index 000000000..d9b4042a2 +} diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h new file mode 100644 -index 000000000..afb886ec8 +index 
000000000000..afb886ec8e25 --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -0,0 +1,22 @@ @@ -62419,7 +62889,7 @@ index 000000000..afb886ec8 +#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h new file mode 100644 -index 000000000..a6cdb885a +index 000000000000..a6cdb885ad41 --- /dev/null +++ b/fs/bcachefs/journal_types.h @@ -0,0 +1,340 @@ @@ -62765,7 +63235,7 @@ index 000000000..a6cdb885a +#endif /* _BCACHEFS_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c new file mode 100644 -index 000000000..cda77835b +index 000000000000..cda77835b9ea --- /dev/null +++ b/fs/bcachefs/keylist.c @@ -0,0 +1,67 @@ @@ -62838,7 +63308,7 @@ index 000000000..cda77835b +#endif diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h new file mode 100644 -index 000000000..195799bb2 +index 000000000000..195799bb20bc --- /dev/null +++ b/fs/bcachefs/keylist.h @@ -0,0 +1,76 @@ @@ -62920,7 +63390,7 @@ index 000000000..195799bb2 +#endif /* _BCACHEFS_KEYLIST_H */ diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h new file mode 100644 -index 000000000..4b3ff7d8a +index 000000000000..4b3ff7d8a875 --- /dev/null +++ b/fs/bcachefs/keylist_types.h @@ -0,0 +1,16 @@ @@ -62942,10 +63412,10 @@ index 000000000..4b3ff7d8a +#endif /* _BCACHEFS_KEYLIST_TYPES_H */ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c new file mode 100644 -index 000000000..5a09b5500 +index 000000000000..53e607d72274 --- /dev/null +++ b/fs/bcachefs/lru.c -@@ -0,0 +1,219 @@ +@@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -63078,25 +63548,18 @@ index 000000000..5a09b5500 +} + +static int bch2_check_lru_key(struct btree_trans *trans, -+ struct btree_iter *lru_iter) ++ struct btree_iter *lru_iter, ++ struct bkey_s_c lru_k) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; -+ struct bkey_s_c lru_k, k; ++ struct bkey_s_c k; + struct bch_alloc_v4 a; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + struct bpos alloc_pos; + int ret; + -+ lru_k = bch2_btree_iter_peek(lru_iter); -+ if (!lru_k.k) -+ return 0; -+ -+ ret = bkey_err(lru_k); -+ if (ret) -+ return ret; -+ + alloc_pos = POS(lru_k.k->p.inode, + le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx)); + @@ -63150,16 +63613,10 @@ index 000000000..5a09b5500 + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_lru, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ bch2_check_lru_key(&trans, &iter)); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ bch2_check_lru_key(&trans, &iter, k)); + + bch2_trans_exit(&trans); + return ret; @@ -63167,7 +63624,7 @@ index 000000000..5a09b5500 +} diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h new file mode 100644 -index 000000000..3decb7b1d +index 000000000000..3decb7b1dde2 --- /dev/null +++ b/fs/bcachefs/lru.h @@ -0,0 +1,19 @@ @@ -63192,10 +63649,10 @@ index 000000000..3decb7b1d +#endif /* _BCACHEFS_LRU_H */ diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c new file mode 100644 -index 000000000..5345697f2 +index 000000000000..8b258d966d04 --- /dev/null +++ b/fs/bcachefs/migrate.c -@@ -0,0 +1,193 @@ +@@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for moving data off a 
device. @@ -63206,6 +63663,7 @@ index 000000000..5345697f2 +#include "btree_update.h" +#include "btree_update_interior.h" +#include "buckets.h" ++#include "errcode.h" +#include "extents.h" +#include "io.h" +#include "journal.h" @@ -63233,83 +63691,74 @@ index 000000000..5345697f2 + return 0; +} + -+static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags, -+ enum btree_id btree_id) ++static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ unsigned dev_idx, ++ int flags) +{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_buf sk; -+ int ret = 0; ++ struct bch_fs *c = trans->c; ++ struct bkey_i *n; ++ int ret; + -+ bch2_bkey_buf_init(&sk); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ if (!bch2_bkey_has_device(k, dev_idx)) ++ return 0; + -+ bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); ++ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ return ret; + -+ while ((bch2_trans_begin(&trans), -+ (k = bch2_btree_iter_peek(&iter)).k) && -+ !(ret = bkey_err(k))) { -+ if (!bch2_bkey_has_device(k, dev_idx)) { -+ bch2_btree_iter_advance(&iter); -+ continue; -+ } ++ bkey_reassemble(n, k); + -+ bch2_bkey_buf_reassemble(&sk, c, k); ++ ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false); ++ if (ret) ++ return ret; + -+ ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), -+ dev_idx, flags, false); -+ if (ret) -+ break; ++ /* ++ * If the new extent no longer has any pointers, bch2_extent_normalize() ++ * will do the appropriate thing with it (turning it into a ++ * KEY_TYPE_error key, or just a discard if it was a cached extent) ++ */ ++ bch2_extent_normalize(c, bkey_i_to_s(n)); + -+ /* -+ * If the new extent no longer has any pointers, bch2_extent_normalize() -+ * will do the appropriate thing with it (turning it into a -+ * KEY_TYPE_error key, or just a discard if it was a cached extent) -+ */ -+ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); ++ /* ++ * Since we're not inserting through an extent iterator ++ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), ++ * we aren't using the extent overwrite path to delete, we're ++ * just using the normal key deletion path: ++ */ ++ if (bkey_deleted(&n->k)) ++ n->k.size = 0; + -+ /* -+ * Since we're not inserting through an extent iterator -+ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), -+ * we aren't using the extent overwrite path to delete, we're -+ * just using the normal key deletion path: -+ */ -+ if (bkey_deleted(&sk.k->k)) -+ sk.k->k.size = 0; -+ -+ ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(&trans, &iter, sk.k, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: -+ bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+ -+ /* -+ * don't want to leave ret == -EINTR, since if we raced and -+ * something else overwrote the key we could spuriously return -+ * -EINTR below: -+ */ -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ bch2_trans_exit(&trans); -+ bch2_bkey_buf_exit(&sk, c); -+ -+ BUG_ON(ret == -EINTR); -+ -+ return ret; ++ return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +{ -+ return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_extents) ?: -+ __bch2_dev_usrdata_drop(c, dev_idx, flags, 
BTREE_ID_reflink); ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ enum btree_id id; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ if (!btree_type_has_ptrs(id)) ++ continue; ++ ++ ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags)); ++ if (ret) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; +} + +static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) @@ -63352,19 +63801,20 @@ index 000000000..5345697f2 + } + + ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false); -+ if (ret == -EINTR) { ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } + + if (ret) { -+ bch_err(c, "Error updating btree node key: %i", ret); ++ bch_err(c, "Error updating btree node key: %s", ++ bch2_err_str(ret)); + break; + } +next: + bch2_btree_iter_next_node(&iter); + } -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); @@ -63379,7 +63829,7 @@ index 000000000..5345697f2 + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&k, c); + -+ BUG_ON(ret == -EINTR); ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + + return ret; +} @@ -63391,7 +63841,7 @@ index 000000000..5345697f2 +} diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h new file mode 100644 -index 000000000..027efaa0d +index 000000000000..027efaa0d575 --- /dev/null +++ b/fs/bcachefs/migrate.h @@ -0,0 +1,7 @@ @@ -63404,10 +63854,10 @@ index 000000000..027efaa0d +#endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c new file mode 100644 -index 000000000..9748b8653 +index 000000000000..2fc247451390 --- /dev/null +++ b/fs/bcachefs/move.c -@@ -0,0 +1,951 @@ +@@ -0,0 +1,952 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -63419,6 +63869,7 @@ index 000000000..9748b8653 +#include "btree_update_interior.h" +#include "disk_groups.h" +#include "ec.h" ++#include "errcode.h" +#include "error.h" +#include "inode.h" +#include "io.h" @@ -63780,7 +64231,7 @@ index 000000000..9748b8653 + ret = lookup_inode(trans, + SPOS(0, k.k->p.inode, k.k->p.snapshot), + &inode); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + + if (!ret) @@ -63828,7 +64279,7 @@ index 000000000..9748b8653 + break; + + ret = bkey_err(k); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; @@ -63859,7 +64310,7 @@ index 000000000..9748b8653 + ret2 = bch2_move_extent(&trans, ctxt, io_opts, + btree_id, k, data_opts); + if (ret2) { -+ if (ret2 == -EINTR) ++ if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) + continue; + + if (ret2 == -ENOMEM) { @@ -63984,7 +64435,7 @@ index 000000000..9748b8653 + + ret = bch2_get_next_backpointer(&trans, bucket, gen, + &bp_offset, &bp); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; @@ -63999,7 +64450,7 @@ index 000000000..9748b8653 + k = bch2_backpointer_get_key(&trans, &iter, + bucket, bp_offset, bp); + ret = bkey_err(k); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; @@ -64026,7 +64477,7 @@ index 000000000..9748b8653 + + ret = 
bch2_move_extent(&trans, ctxt, io_opts, + bp.btree_id, k, data_opts); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ @@ -64045,7 +64496,7 @@ index 000000000..9748b8653 + b = bch2_backpointer_get_node(&trans, &iter, + bucket, bp_offset, bp); + ret = PTR_ERR_OR_ZERO(b); -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; @@ -64055,7 +64506,7 @@ index 000000000..9748b8653 + ret = bch2_btree_node_rewrite(&trans, &iter, b, 0); + bch2_trans_iter_exit(&trans, &iter); + -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; @@ -64150,14 +64601,14 @@ index 000000000..9748b8653 + goto next; + + ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret; -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; +next: + bch2_btree_iter_next_node(&iter); + } -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); @@ -64169,7 +64620,7 @@ index 000000000..9748b8653 + bch2_trans_exit(&trans); + + if (ret) -+ bch_err(c, "error %i in bch2_move_btree", ret); ++ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); + + bch2_btree_interior_updates_flush(c); + @@ -64361,7 +64812,7 @@ index 000000000..9748b8653 +} diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h new file mode 100644 -index 000000000..c0fec69bb +index 000000000000..c0fec69bbb6a --- /dev/null +++ b/fs/bcachefs/move.h @@ -0,0 +1,67 @@ @@ -64434,7 +64885,7 @@ index 000000000..c0fec69bb +#endif /* _BCACHEFS_MOVE_H */ diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h new file mode 100644 -index 000000000..9df6d1813 +index 000000000000..9df6d18137a5 --- /dev/null +++ b/fs/bcachefs/move_types.h @@ -0,0 +1,19 @@ @@ -64459,10 +64910,10 @@ index 000000000..9df6d1813 +#endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c new file mode 100644 -index 000000000..f9ad4cb26 +index 000000000000..f913864eaa4f --- /dev/null +++ b/fs/bcachefs/movinggc.c -@@ -0,0 +1,282 @@ +@@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Moving/copying garbage collector @@ -64478,6 +64929,7 @@ index 000000000..f9ad4cb26 +#include "buckets.h" +#include "clock.h" +#include "disk_groups.h" ++#include "errcode.h" +#include "error.h" +#include "extents.h" +#include "eytzinger.h" @@ -64627,7 +65079,7 @@ index 000000000..f9ad4cb26 + bch2_moving_ctxt_exit(&ctxt); + + if (ret < 0) -+ bch_err(c, "error %i from bch2_move_data() in copygc", ret); ++ bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); + + trace_copygc(c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0); + return ret; @@ -64716,6 +65168,7 @@ index 000000000..f9ad4cb26 +int bch2_copygc_start(struct bch_fs *c) +{ + struct task_struct *t; ++ int ret; + + if (c->copygc_thread) + return 0; @@ -64727,9 +65180,10 @@ index 000000000..f9ad4cb26 + return -ENOMEM; + + t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); -+ if (IS_ERR(t)) { -+ bch_err(c, "error creating copygc thread: %li", PTR_ERR(t)); -+ return PTR_ERR(t); ++ ret = PTR_ERR_OR_ZERO(t); ++ if (ret) { ++ bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret)); ++ return ret; + } + + get_task_struct(t); @@ -64747,14 +65201,15 @@ index 000000000..f9ad4cb26 +} 
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h new file mode 100644 -index 000000000..922738247 +index 000000000000..e85c8136a46e --- /dev/null +++ b/fs/bcachefs/movinggc.h -@@ -0,0 +1,9 @@ +@@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MOVINGGC_H +#define _BCACHEFS_MOVINGGC_H + ++unsigned long bch2_copygc_wait_amount(struct bch_fs *); +void bch2_copygc_stop(struct bch_fs *); +int bch2_copygc_start(struct bch_fs *); +void bch2_fs_copygc_init(struct bch_fs *); @@ -64762,7 +65217,7 @@ index 000000000..922738247 +#endif /* _BCACHEFS_MOVINGGC_H */ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c new file mode 100644 -index 000000000..407b221e8 +index 000000000000..407b221e8f6c --- /dev/null +++ b/fs/bcachefs/opts.c @@ -0,0 +1,578 @@ @@ -65346,10 +65801,10 @@ index 000000000..407b221e8 +} diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h new file mode 100644 -index 000000000..2f5f49cb7 +index 000000000000..5b8586ecb374 --- /dev/null +++ b/fs/bcachefs/opts.h -@@ -0,0 +1,504 @@ +@@ -0,0 +1,509 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_OPTS_H +#define _BCACHEFS_OPTS_H @@ -65693,6 +66148,11 @@ index 000000000..2f5f49cb7 + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Don't open device in exclusive mode") \ ++ x(direct_io, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, true, \ ++ NULL, "Use O_DIRECT (userspace only)") \ + x(sb, u64, \ + OPT_MOUNT, \ + OPT_UINT(0, S64_MAX), \ @@ -65856,13 +66316,14 @@ index 000000000..2f5f49cb7 +#endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c new file mode 100644 -index 000000000..d764dc7ab +index 000000000000..454c76e03be9 --- /dev/null +++ b/fs/bcachefs/quota.c -@@ -0,0 +1,859 @@ +@@ -0,0 +1,823 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" ++#include "errcode.h" +#include "inode.h" +#include "quota.h" +#include "subvolume.h" @@ -66232,6 +66693,9 @@ index 000000000..d764dc7ab + + BUG_ON(k.k->p.inode >= QTYP_NR); + ++ if (!((1U << k.k->p.inode) & enabled_qtypes(c))) ++ return 0; ++ + switch (k.k->type) { + case KEY_TYPE_quota: + dq = bkey_s_c_to_quota(k); @@ -66255,30 +66719,6 @@ index 000000000..d764dc7ab + return 0; +} + -+static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_quotas, POS(type, 0), -+ BTREE_ITER_PREFETCH, k, ret) { -+ if (k.k->p.inode != type) -+ break; -+ -+ ret = __bch2_quota_set(c, k); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ +void bch2_fs_quota_exit(struct bch_fs *c) +{ + unsigned i; @@ -66317,22 +66757,14 @@ index 000000000..d764dc7ab +} + +static int bch2_fs_quota_read_inode(struct btree_trans *trans, -+ struct btree_iter *iter) ++ struct btree_iter *iter, ++ struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked u; + struct bch_subvolume subvolume; -+ struct bkey_s_c k; + int ret; + -+ k = bch2_btree_iter_peek(iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ if (!k.k) -+ return 1; -+ + ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume); + if (ret) + return ret; @@ -66361,36 +66793,28 @@ index 000000000..d764dc7ab + +int bch2_fs_quota_read(struct bch_fs *c) +{ -+ unsigned i, qtypes = enabled_qtypes(c); -+ struct bch_memquota_type *q; + 
struct btree_trans trans; + struct btree_iter iter; ++ struct bkey_s_c k; + int ret; + + mutex_lock(&c->sb_lock); + bch2_sb_quota_read(c); + mutex_unlock(&c->sb_lock); + -+ for_each_set_qtype(c, i, q, qtypes) { -+ ret = bch2_quota_init_type(c, i); -+ if (ret) -+ return ret; -+ } -+ + bch2_trans_init(&trans, c, 0, 0); + -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ do { -+ ret = lockrestart_do(&trans, -+ bch2_fs_quota_read_inode(&trans, &iter)); -+ } while (!ret); -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas, ++ POS_MIN, BTREE_ITER_PREFETCH, k, ++ __bch2_quota_set(c, k)) ?: ++ for_each_btree_key2(&trans, iter, BTREE_ID_inodes, ++ POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ bch2_fs_quota_read_inode(&trans, &iter, k)); ++ if (ret) ++ bch_err(c, "err in quota_read: %s", bch2_err_str(ret)); + + bch2_trans_exit(&trans); -+ return ret < 0 ? ret : 0; ++ return ret; +} + +/* Enable/disable/delete quotas for an entire filesystem: */ @@ -66721,7 +67145,7 @@ index 000000000..d764dc7ab +#endif /* CONFIG_BCACHEFS_QUOTA */ diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h new file mode 100644 -index 000000000..8c67ae1da +index 000000000000..8c67ae1da7c7 --- /dev/null +++ b/fs/bcachefs/quota.h @@ -0,0 +1,71 @@ @@ -66798,7 +67222,7 @@ index 000000000..8c67ae1da +#endif /* _BCACHEFS_QUOTA_H */ diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h new file mode 100644 -index 000000000..6a136083d +index 000000000000..6a136083d389 --- /dev/null +++ b/fs/bcachefs/quota_types.h @@ -0,0 +1,43 @@ @@ -66847,10 +67271,10 @@ index 000000000..6a136083d +#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c new file mode 100644 -index 000000000..31da40933 +index 000000000000..ecc64dd92b05 --- /dev/null +++ b/fs/bcachefs/rebalance.c -@@ -0,0 +1,358 @@ +@@ -0,0 +1,361 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -66859,6 +67283,7 @@ index 000000000..31da40933 +#include "buckets.h" +#include "clock.h" +#include "disk_groups.h" ++#include "errcode.h" +#include "extents.h" +#include "io.h" +#include "move.h" @@ -67184,6 +67609,7 @@ index 000000000..31da40933 +int bch2_rebalance_start(struct bch_fs *c) +{ + struct task_struct *p; ++ int ret; + + if (c->rebalance.thread) + return 0; @@ -67192,9 +67618,10 @@ index 000000000..31da40933 + return 0; + + p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); -+ if (IS_ERR(p)) { -+ bch_err(c, "error creating rebalance thread: %li", PTR_ERR(p)); -+ return PTR_ERR(p); ++ ret = PTR_ERR_OR_ZERO(p); ++ if (ret) { ++ bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret)); ++ return ret; + } + + get_task_struct(p); @@ -67211,7 +67638,7 @@ index 000000000..31da40933 +} diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h new file mode 100644 -index 000000000..7ade0bb81 +index 000000000000..7ade0bb81cce --- /dev/null +++ b/fs/bcachefs/rebalance.h @@ -0,0 +1,28 @@ @@ -67245,7 +67672,7 @@ index 000000000..7ade0bb81 +#endif /* _BCACHEFS_REBALANCE_H */ diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h new file mode 100644 -index 000000000..7462a92e9 +index 000000000000..7462a92e9598 --- /dev/null +++ b/fs/bcachefs/rebalance_types.h @@ -0,0 +1,26 @@ @@ -67277,10 +67704,10 @@ index 000000000..7462a92e9 +#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git 
a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c new file mode 100644 -index 000000000..eea025a83 +index 000000000000..b070bdf01500 --- /dev/null +++ b/fs/bcachefs/recovery.c -@@ -0,0 +1,1584 @@ +@@ -0,0 +1,1597 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -67294,6 +67721,7 @@ index 000000000..eea025a83 +#include "buckets.h" +#include "dirent.h" +#include "ec.h" ++#include "errcode.h" +#include "error.h" +#include "fs-common.h" +#include "fsck.h" @@ -67370,9 +67798,9 @@ index 000000000..eea025a83 + return keys->d + idx_to_pos(keys, idx); +} + -+static size_t bch2_journal_key_search(struct journal_keys *keys, -+ enum btree_id id, unsigned level, -+ struct bpos pos) ++static size_t __bch2_journal_key_search(struct journal_keys *keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) +{ + size_t l = 0, r = keys->nr, m; + @@ -67390,7 +67818,14 @@ index 000000000..eea025a83 + BUG_ON(l && + __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); + -+ return idx_to_pos(keys, l); ++ return l; ++} ++ ++static size_t bch2_journal_key_search(struct journal_keys *keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); +} + +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, @@ -67399,22 +67834,21 @@ index 000000000..eea025a83 +{ + struct journal_keys *keys = &c->journal_keys; + unsigned iters = 0; ++ struct journal_key *k; +search: + if (!*idx) -+ *idx = bch2_journal_key_search(keys, btree_id, level, pos); ++ *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + -+ while (*idx < keys->size && -+ keys->d[*idx].btree_id == btree_id && -+ keys->d[*idx].level == level && -+ bpos_cmp(keys->d[*idx].k->k.p, end_pos) <= 0) { -+ if (bpos_cmp(keys->d[*idx].k->k.p, pos) >= 0 && -+ !keys->d[*idx].overwritten) -+ return keys->d[*idx].k; ++ while (*idx < keys->nr && ++ (k = idx_to_key(keys, *idx), ++ k->btree_id == btree_id && ++ k->level == level && ++ bpos_cmp(k->k->k.p, end_pos) <= 0)) { ++ if (bpos_cmp(k->k->k.p, pos) >= 0 && ++ !k->overwritten) ++ return k->k; + + (*idx)++; -+ if (*idx == keys->gap) -+ *idx += keys->size - keys->nr; -+ + iters++; + if (iters == 10) { + *idx = 0; @@ -68436,7 +68870,7 @@ index 000000000..eea025a83 +use_clean: + if (!clean) { + bch_err(c, "no superblock clean section found"); -+ ret = BCH_FSCK_REPAIR_IMPOSSIBLE; ++ ret = -BCH_ERR_fsck_repair_impossible; + goto err; + + } @@ -68711,10 +69145,16 @@ index 000000000..eea025a83 + bch2_journal_entries_free(c); + } + kfree(clean); ++ ++ if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) { ++ bch2_fs_read_write_early(c); ++ bch2_delete_dead_snapshots_async(c); ++ } ++ + if (ret) -+ bch_err(c, "Error in recovery: %s (%i)", err, ret); ++ bch_err(c, "Error in recovery: %s (%s)", err, bch2_err_str(ret)); + else -+ bch_verbose(c, "ret %i", ret); ++ bch_verbose(c, "ret %s", bch2_err_str(ret)); + return ret; +err: +fsck_err: @@ -68867,7 +69307,7 @@ index 000000000..eea025a83 +} diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h new file mode 100644 -index 000000000..8c0348e8b +index 000000000000..8c0348e8b84c --- /dev/null +++ b/fs/bcachefs/recovery.h @@ -0,0 +1,58 @@ @@ -68931,10 +69371,10 @@ index 000000000..8c0348e8b +#endif /* _BCACHEFS_RECOVERY_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c new file mode 100644 -index 000000000..2038e3502 +index 000000000000..d5c14bb2992d --- /dev/null +++ b/fs/bcachefs/reflink.c -@@ -0,0 +1,421 
@@ +@@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_buf.h" @@ -69236,7 +69676,8 @@ index 000000000..2038e3502 + bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start, + BTREE_ITER_INTENT); + -+ while ((ret == 0 || ret == -EINTR) && ++ while ((ret == 0 || ++ bch2_err_matches(ret, BCH_ERR_transaction_restart)) && + bkey_cmp(dst_iter.pos, dst_end) < 0) { + struct disk_reservation disk_res = { 0 }; + @@ -69346,7 +69787,7 @@ index 000000000..2038e3502 + } + + bch2_trans_iter_exit(&trans, &inode_iter); -+ } while (ret2 == -EINTR); ++ } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); + + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&new_src, c); @@ -69358,7 +69799,7 @@ index 000000000..2038e3502 +} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h new file mode 100644 -index 000000000..f9848dc3e +index 000000000000..f9848dc3eebb --- /dev/null +++ b/fs/bcachefs/reflink.h @@ -0,0 +1,76 @@ @@ -69440,7 +69881,7 @@ index 000000000..f9848dc3e +#endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c new file mode 100644 -index 000000000..9cb47ba62 +index 000000000000..9cb47ba62bc3 --- /dev/null +++ b/fs/bcachefs/replicas.c @@ -0,0 +1,1073 @@ @@ -70519,7 +70960,7 @@ index 000000000..9cb47ba62 +} diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h new file mode 100644 -index 000000000..87820b2e1 +index 000000000000..87820b2e1ad3 --- /dev/null +++ b/fs/bcachefs/replicas.h @@ -0,0 +1,106 @@ @@ -70631,7 +71072,7 @@ index 000000000..87820b2e1 +#endif /* _BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h new file mode 100644 -index 000000000..0535b1d37 +index 000000000000..0535b1d3760e --- /dev/null +++ b/fs/bcachefs/replicas_types.h @@ -0,0 +1,10 @@ @@ -70647,7 +71088,7 @@ index 000000000..0535b1d37 +#endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c new file mode 100644 -index 000000000..c062edb3f +index 000000000000..c062edb3fbc2 --- /dev/null +++ b/fs/bcachefs/siphash.c @@ -0,0 +1,173 @@ @@ -70826,7 +71267,7 @@ index 000000000..c062edb3f +} diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h new file mode 100644 -index 000000000..3dfaf34a4 +index 000000000000..3dfaf34a43b2 --- /dev/null +++ b/fs/bcachefs/siphash.h @@ -0,0 +1,87 @@ @@ -70919,7 +71360,7 @@ index 000000000..3dfaf34a4 +#endif /* _SIPHASH_H_ */ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h new file mode 100644 -index 000000000..591bbb9f8 +index 000000000000..591bbb9f8beb --- /dev/null +++ b/fs/bcachefs/str_hash.h @@ -0,0 +1,351 @@ @@ -71276,30 +71717,28 @@ index 000000000..591bbb9f8 +#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c new file mode 100644 -index 000000000..60b60de83 +index 000000000000..b5b0f5e39f97 --- /dev/null +++ b/fs/bcachefs/subvolume.c -@@ -0,0 +1,1095 @@ +@@ -0,0 +1,1108 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "btree_update.h" ++#include "errcode.h" +#include "error.h" +#include "fs.h" +#include "subvolume.h" + +/* Snapshot tree: */ + -+static void bch2_delete_dead_snapshots_work(struct work_struct *); -+static void bch2_delete_dead_snapshots(struct bch_fs *); -+ +void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + -+ prt_printf(out, "is_subvol %llu deleted %llu parent %u 
children %u %u subvol %u", ++ prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u", + BCH_SNAPSHOT_SUBVOL(s.v), + BCH_SNAPSHOT_DELETED(s.v), + le32_to_cpu(s.v->parent), @@ -71416,7 +71855,7 @@ index 000000000..60b60de83 + if (!id) + return 0; + -+ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); ++ ret = snapshot_lookup(trans, id, &v); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %u not found", id); + if (ret) @@ -71425,157 +71864,206 @@ index 000000000..60b60de83 + return !BCH_SNAPSHOT_DELETED(&v); +} + -+static int bch2_snapshots_set_equiv(struct btree_trans *trans) ++static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; ++ unsigned i, nr_live = 0, live_idx = 0; + struct bkey_s_c_snapshot snap; -+ unsigned i; -+ int ret; ++ u32 id = k.k->p.offset, child[2]; + -+ for_each_btree_key(trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, ret) { -+ u32 id = k.k->p.offset, child[2]; -+ unsigned nr_live = 0, live_idx = 0; ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; + -+ if (k.k->type != KEY_TYPE_snapshot) -+ continue; ++ snap = bkey_s_c_to_snapshot(k); + -+ snap = bkey_s_c_to_snapshot(k); -+ child[0] = le32_to_cpu(snap.v->children[0]); -+ child[1] = le32_to_cpu(snap.v->children[1]); ++ child[0] = le32_to_cpu(snap.v->children[0]); ++ child[1] = le32_to_cpu(snap.v->children[1]); + -+ for (i = 0; i < 2; i++) { -+ ret = snapshot_live(trans, child[i]); -+ if (ret < 0) -+ goto err; ++ for (i = 0; i < 2; i++) { ++ int ret = snapshot_live(trans, child[i]); ++ if (ret < 0) ++ return ret; + -+ if (ret) -+ live_idx = i; -+ nr_live += ret; -+ } -+ -+ snapshot_t(c, id)->equiv = nr_live == 1 -+ ? snapshot_t(c, child[live_idx])->equiv -+ : id; ++ if (ret) ++ live_idx = i; ++ nr_live += ret; + } -+err: -+ bch2_trans_iter_exit(trans, &iter); + -+ if (ret) -+ bch_err(c, "error walking snapshots: %i", ret); -+ -+ return ret; ++ snapshot_t(c, id)->equiv = nr_live == 1 ++ ? 
snapshot_t(c, child[live_idx])->equiv ++ : id; ++ return 0; +} + +/* fsck: */ -+static int bch2_snapshot_check(struct btree_trans *trans, -+ struct bkey_s_c_snapshot s) ++static int check_snapshot(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) +{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c_snapshot s; + struct bch_subvolume subvol; + struct bch_snapshot v; ++ struct printbuf buf = PRINTBUF; ++ bool should_have_subvol; + u32 i, id; -+ int ret; ++ int ret = 0; + -+ id = le32_to_cpu(s.v->subvol); -+ ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol)); -+ if (ret == -ENOENT) -+ bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u", -+ s.k->p.offset, id); -+ if (ret) -+ return ret; -+ -+ if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { -+ bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", -+ s.k->p.offset); -+ return -EINVAL; -+ } ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; + ++ s = bkey_s_c_to_snapshot(k); + id = le32_to_cpu(s.v->parent); + if (id) { -+ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); ++ ret = snapshot_lookup(trans, id, &v); + if (ret == -ENOENT) -+ bch_err(trans->c, "snapshot node %llu has nonexistent parent %u", -+ s.k->p.offset, id); ++ bch_err(c, "snapshot with nonexistent parent:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); + if (ret) -+ return ret; ++ goto err; + + if (le32_to_cpu(v.children[0]) != s.k->p.offset && + le32_to_cpu(v.children[1]) != s.k->p.offset) { -+ bch_err(trans->c, "snapshot parent %u missing pointer to child %llu", ++ bch_err(c, "snapshot parent %u missing pointer to child %llu", + id, s.k->p.offset); -+ return -EINVAL; ++ ret = -EINVAL; ++ goto err; + } + } + + for (i = 0; i < 2 && s.v->children[i]; i++) { + id = le32_to_cpu(s.v->children[i]); + -+ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); ++ ret = snapshot_lookup(trans, id, &v); + if (ret == -ENOENT) -+ bch_err(trans->c, "snapshot node %llu has nonexistent child %u", ++ bch_err(c, "snapshot node %llu has nonexistent child %u", + s.k->p.offset, id); + if (ret) -+ return ret; ++ goto err; + + if (le32_to_cpu(v.parent) != s.k->p.offset) { -+ bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)", ++ bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", + id, le32_to_cpu(v.parent), s.k->p.offset); -+ return -EINVAL; ++ ret = -EINVAL; ++ goto err; + } + } + ++ should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) && ++ !BCH_SNAPSHOT_DELETED(s.v); ++ ++ if (should_have_subvol) { ++ id = le32_to_cpu(s.v->subvol); ++ ret = bch2_subvolume_get(trans, id, 0, false, &subvol); ++ if (ret == -ENOENT) ++ bch_err(c, "snapshot points to nonexistent subvolume:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); ++ if (ret) ++ goto err; ++ ++ if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { ++ bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", ++ s.k->p.offset); ++ ret = -EINVAL; ++ goto err; ++ } ++ } else { ++ if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ++ struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u)); ++ ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&u->k_i, s.s_c); ++ u->v.subvol = 0; ++ ret = bch2_trans_update(trans, iter, &u->k_i, 0); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ if (BCH_SNAPSHOT_DELETED(s.v)) 
++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_fs_check_snapshots(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_snapshot(&trans, &iter, k)); ++ ++ if (ret) ++ bch_err(c, "error %i checking snapshots", ret); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int check_subvol(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_subvolume subvol; ++ struct bch_snapshot snapshot; ++ unsigned snapid; ++ int ret; ++ ++ if (k.k->type != KEY_TYPE_subvolume) ++ return 0; ++ ++ subvol = bkey_s_c_to_subvolume(k); ++ snapid = le32_to_cpu(subvol.v->snapshot); ++ ret = snapshot_lookup(trans, snapid, &snapshot); ++ ++ if (ret == -ENOENT) ++ bch_err(trans->c, "subvolume %llu points to nonexistent snapshot %u", ++ k.k->p.offset, snapid); ++ if (ret) ++ return ret; ++ ++ if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { ++ ret = bch2_subvolume_delete(trans, iter->pos.offset); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(trans->c, "error deleting subvolume %llu: %s", ++ iter->pos.offset, bch2_err_str(ret)); ++ if (ret) ++ return ret; ++ } ++ + return 0; +} + -+int bch2_fs_snapshots_check(struct bch_fs *c) ++int bch2_fs_check_subvols(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ struct bch_snapshot s; -+ unsigned id; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, ret) { -+ if (k.k->type != KEY_TYPE_snapshot) -+ continue; ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_subvol(&trans, &iter, k)); + -+ ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k)); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ret) { -+ bch_err(c, "error %i checking snapshots", ret); -+ goto err; -+ } -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, -+ POS_MIN, 0, k, ret) { -+ if (k.k->type != KEY_TYPE_subvolume) -+ continue; -+again_2: -+ id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); -+ ret = snapshot_lookup(&trans, id, &s); -+ -+ if (ret == -EINTR) { -+ k = bch2_btree_iter_peek(&iter); -+ goto again_2; -+ } else if (ret == -ENOENT) -+ bch_err(c, "subvolume %llu points to nonexistent snapshot %u", -+ k.k->p.offset, id); -+ else if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+err: + bch2_trans_exit(&trans); ++ + return ret; +} + @@ -71589,49 +72077,19 @@ index 000000000..60b60de83 + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ bool have_deleted = false; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, ret) { -+ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) -+ break; ++ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0) ?: ++ bch2_snapshot_set_equiv(&trans, k)); + -+ if (k.k->type != KEY_TYPE_snapshot) { -+ bch_err(c, "found wrong key type %u in snapshot node table", -+ k.k->type); 
-+ continue; -+ } -+ -+ if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) -+ have_deleted = true; -+ -+ ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ret) -+ goto err; -+ -+ ret = bch2_snapshots_set_equiv(&trans); -+ if (ret) -+ goto err; -+err: + bch2_trans_exit(&trans); + -+ if (!ret && have_deleted) { -+ bch_info(c, "restarting deletion of dead snapshots"); -+ if (c->opts.fsck) { -+ bch2_delete_dead_snapshots_work(&c->snapshot_delete_work); -+ } else { -+ bch2_delete_dead_snapshots(c); -+ } -+ } -+ ++ if (ret) ++ bch_err(c, "error starting snapshots: %s", bch2_err_str(ret)); + return ret; +} + @@ -71668,8 +72126,10 @@ index 000000000..60b60de83 + goto err; + + bkey_reassemble(&s->k_i, k); -+ + SET_BCH_SNAPSHOT_DELETED(&s->v, true); ++ SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); ++ s->v.subvol = 0; ++ + ret = bch2_trans_update(trans, &iter, &s->k_i, 0); + if (ret) + goto err; @@ -71833,6 +72293,7 @@ index 000000000..60b60de83 + + n->v.children[0] = cpu_to_le32(new_snapids[0]); + n->v.children[1] = cpu_to_le32(new_snapids[1]); ++ n->v.subvol = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); + ret = bch2_trans_update(trans, &iter, &n->k_i, 0); + if (ret) @@ -71843,126 +72304,100 @@ index 000000000..60b60de83 + return ret; +} + -+static int snapshot_id_add(snapshot_id_list *s, u32 id) -+{ -+ BUG_ON(snapshot_list_has_id(s, id)); -+ -+ return darray_push(s, id); -+} -+ -+static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, -+ snapshot_id_list *deleted, -+ enum btree_id btree_id) ++static int snapshot_delete_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ snapshot_id_list *deleted, ++ snapshot_id_list *equiv_seen, ++ struct bpos *last_pos) +{ + struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ snapshot_id_list equiv_seen = { 0 }; -+ struct bpos last_pos = POS_MIN; -+ int ret = 0; ++ u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; + -+ /* -+ * XXX: We should also delete whiteouts that no longer overwrite -+ * anything -+ */ ++ if (bkey_cmp(k.k->p, *last_pos)) ++ equiv_seen->nr = 0; ++ *last_pos = k.k->p; + -+ bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ -+ while ((bch2_trans_begin(trans), -+ (k = bch2_btree_iter_peek(&iter)).k) && -+ !(ret = bkey_err(k))) { -+ u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; -+ -+ if (bkey_cmp(k.k->p, last_pos)) -+ equiv_seen.nr = 0; -+ last_pos = k.k->p; -+ -+ if (snapshot_list_has_id(deleted, k.k->p.snapshot) || -+ snapshot_list_has_id(&equiv_seen, equiv)) { -+ if (btree_id == BTREE_ID_inodes && -+ bch2_btree_key_cache_flush(trans, btree_id, iter.pos)) -+ continue; -+ -+ ret = __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL, -+ bch2_btree_iter_traverse(&iter) ?: -+ bch2_btree_delete_at(trans, &iter, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); -+ if (ret) -+ break; -+ } else { -+ ret = snapshot_id_add(&equiv_seen, equiv); -+ if (ret) -+ break; -+ } -+ -+ bch2_btree_iter_advance(&iter); ++ if (snapshot_list_has_id(deleted, k.k->p.snapshot) || ++ snapshot_list_has_id(equiv_seen, equiv)) { ++ return bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ } else { ++ return snapshot_list_add(c, equiv_seen, equiv); + } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ darray_exit(&equiv_seen); -+ -+ return ret; +} + -+static void 
bch2_delete_dead_snapshots_work(struct work_struct *work) ++static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_snapshot snap; ++ u32 children[2]; ++ int ret; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ if (BCH_SNAPSHOT_DELETED(snap.v) || ++ BCH_SNAPSHOT_SUBVOL(snap.v)) ++ return 0; ++ ++ children[0] = le32_to_cpu(snap.v->children[0]); ++ children[1] = le32_to_cpu(snap.v->children[1]); ++ ++ ret = snapshot_live(trans, children[0]) ?: ++ snapshot_live(trans, children[1]); ++ if (ret < 0) ++ return ret; ++ ++ if (!ret) ++ return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); ++ return 0; ++} ++ ++int bch2_delete_dead_snapshots(struct bch_fs *c) +{ -+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; + snapshot_id_list deleted = { 0 }; -+ u32 i, id, children[2]; ++ u32 i, id; + int ret = 0; + ++ if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) ++ return 0; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) { ++ ret = bch2_fs_read_write_early(c); ++ if (ret) { ++ bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ } ++ + bch2_trans_init(&trans, c, 0, 0); + + /* + * For every snapshot node: If we have no live children and it's not + * pointed to by a subvolume, delete it: + */ -+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, ret) { -+ if (k.k->type != KEY_TYPE_snapshot) -+ continue; -+ -+ snap = bkey_s_c_to_snapshot(k); -+ if (BCH_SNAPSHOT_DELETED(snap.v) || -+ BCH_SNAPSHOT_SUBVOL(snap.v)) -+ continue; -+ -+ children[0] = le32_to_cpu(snap.v->children[0]); -+ children[1] = le32_to_cpu(snap.v->children[1]); -+ -+ ret = snapshot_live(&trans, children[0]) ?: -+ snapshot_live(&trans, children[1]); -+ if (ret < 0) -+ break; -+ if (ret) -+ continue; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_snapshot_node_set_deleted(&trans, iter.pos.offset)); -+ if (ret) { -+ bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret); -+ break; -+ } -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ NULL, NULL, 0, ++ bch2_delete_redundant_snapshot(&trans, &iter, k)); + if (ret) { -+ bch_err(c, "error walking snapshots: %i", ret); ++ bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); + goto err; + } + -+ ret = bch2_snapshots_set_equiv(&trans); -+ if (ret) ++ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ bch2_snapshot_set_equiv(&trans, k)); ++ if (ret) { ++ bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); + goto err; ++ } + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { @@ -71971,7 +72406,7 @@ index 000000000..60b60de83 + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v)) { -+ ret = snapshot_id_add(&deleted, k.k->p.offset); ++ ret = snapshot_list_add(c, &deleted, k.k->p.offset); + if (ret) + break; + } @@ -71979,39 +72414,59 @@ index 000000000..60b60de83 + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { -+ bch_err(c, "error walking snapshots: %i", ret); ++ bch_err(c, "error walking snapshots: %s", bch2_err_str(ret)); + goto err; + } + + for (id = 0; id < BTREE_ID_NR; id++) { ++ struct bpos last_pos = POS_MIN; ++ 
snapshot_id_list equiv_seen = { 0 }; ++ + if (!btree_type_has_snapshots(id)) + continue; + -+ ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id); ++ ret = for_each_btree_key_commit(&trans, iter, ++ id, POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)); ++ ++ darray_exit(&equiv_seen); ++ + if (ret) { -+ bch_err(c, "error deleting snapshot keys: %i", ret); ++ bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret)); + goto err; + } + } + + for (i = 0; i < deleted.nr; i++) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + bch2_snapshot_node_delete(&trans, deleted.data[i])); + if (ret) { -+ bch_err(c, "error deleting snapshot %u: %i", -+ deleted.data[i], ret); ++ bch_err(c, "error deleting snapshot %u: %s", ++ deleted.data[i], bch2_err_str(ret)); + goto err; + } + } ++ ++ clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); +err: + darray_exit(&deleted); + bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static void bch2_delete_dead_snapshots_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); ++ ++ bch2_delete_dead_snapshots(c); + percpu_ref_put(&c->writes); +} + -+static void bch2_delete_dead_snapshots(struct bch_fs *c) ++void bch2_delete_dead_snapshots_async(struct bch_fs *c) +{ -+ if (unlikely(!percpu_ref_tryget_live(&c->writes))) ++ if (!percpu_ref_tryget_live(&c->writes)) + return; + + if (!queue_work(system_long_wq, &c->snapshot_delete_work)) @@ -72021,7 +72476,14 @@ index 000000000..60b60de83 +static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ -+ bch2_delete_dead_snapshots(trans->c); ++ struct bch_fs *c = trans->c; ++ ++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++ ++ if (!test_bit(BCH_FS_FSCK_DONE, &c->flags)) ++ return 0; ++ ++ bch2_delete_dead_snapshots_async(c); + return 0; +} + @@ -72112,7 +72574,6 @@ index 000000000..60b60de83 + struct bkey_s_c k; + struct bkey_s_c_subvolume subvol; + struct btree_trans_commit_hook *h; -+ struct bkey_i *delete; + u32 snapid; + int ret = 0; + @@ -72134,14 +72595,7 @@ index 000000000..60b60de83 + subvol = bkey_s_c_to_subvolume(k); + snapid = le32_to_cpu(subvol.v->snapshot); + -+ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); -+ ret = PTR_ERR_OR_ZERO(delete); -+ if (ret) -+ goto err; -+ -+ bkey_init(&delete->k); -+ delete->k.p = iter.pos; -+ ret = bch2_trans_update(trans, &iter, delete, 0); ++ ret = bch2_btree_delete_at(trans, &iter, 0); + if (ret) + goto err; + @@ -72182,7 +72636,7 @@ index 000000000..60b60de83 + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_subvolume_delete(&trans, *id)); + if (ret) { -+ bch_err(c, "error %i deleting subvolume %u", ret, *id); ++ bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret)); + break; + } + } @@ -72207,7 +72661,7 @@ index 000000000..60b60de83 + + mutex_lock(&c->snapshots_unlinked_lock); + if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) -+ ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol); ++ ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol); + mutex_unlock(&c->snapshots_unlinked_lock); + + if (ret) @@ -72377,10 +72831,10 @@ index 000000000..60b60de83 +} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h new file mode 100644 -index 000000000..b1739d29c +index 000000000000..02a636644988 --- /dev/null +++ 
b/fs/bcachefs/subvolume.h -@@ -0,0 +1,126 @@ +@@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_H +#define _BCACHEFS_SUBVOLUME_H @@ -72410,6 +72864,16 @@ index 000000000..b1739d29c + return snapshot_t(c, id)->parent; +} + ++static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) ++{ ++ return snapshot_t(c, id)->equiv; ++} ++ ++static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) ++{ ++ return id == snapshot_t(c, id)->equiv; ++} ++ +static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id) +{ + struct snapshot_t *s = snapshot_t(c, id); @@ -72441,31 +72905,6 @@ index 000000000..b1739d29c + return id == ancestor; +} + -+struct snapshots_seen { -+ struct bpos pos; -+ DARRAY(u32) ids; -+}; -+ -+static inline void snapshots_seen_exit(struct snapshots_seen *s) -+{ -+ kfree(s->ids.data); -+ s->ids.data = NULL; -+} -+ -+static inline void snapshots_seen_init(struct snapshots_seen *s) -+{ -+ memset(s, 0, sizeof(*s)); -+} -+ -+static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) -+{ -+ int ret = darray_push(&s->ids, id); -+ if (ret) -+ bch_err(c, "error reallocating snapshots_seen table (size %zu)", -+ s->ids.size); -+ return ret; -+} -+ +static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) +{ + u32 *i; @@ -72476,7 +72915,30 @@ index 000000000..b1739d29c + return false; +} + -+int bch2_fs_snapshots_check(struct bch_fs *); ++static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) ++{ ++ u32 *i; ++ ++ darray_for_each(*s, i) ++ if (bch2_snapshot_is_ancestor(c, id, *i)) ++ return true; ++ return false; ++} ++ ++static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) ++{ ++ int ret; ++ ++ BUG_ON(snapshot_list_has_id(s, id)); ++ ret = darray_push(s, id); ++ if (ret) ++ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); ++ return ret; ++} ++ ++int bch2_fs_check_snapshots(struct bch_fs *); ++int bch2_fs_check_subvols(struct bch_fs *); ++ +void bch2_fs_snapshots_exit(struct bch_fs *); +int bch2_fs_snapshots_start(struct bch_fs *); + @@ -72499,6 +72961,9 @@ index 000000000..b1739d29c +int bch2_snapshot_node_create(struct btree_trans *, u32, + u32 *, u32 *, unsigned); + ++int bch2_delete_dead_snapshots(struct bch_fs *); ++void bch2_delete_dead_snapshots_async(struct bch_fs *); ++ +int bch2_subvolume_delete(struct btree_trans *, u32); +int bch2_subvolume_unlink(struct btree_trans *, u32); +int bch2_subvolume_create(struct btree_trans *, u64, u32, @@ -72509,7 +72974,7 @@ index 000000000..b1739d29c +#endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h new file mode 100644 -index 000000000..f7562b5d5 +index 000000000000..f7562b5d51df --- /dev/null +++ b/fs/bcachefs/subvolume_types.h @@ -0,0 +1,9 @@ @@ -72524,7 +72989,7 @@ index 000000000..f7562b5d5 +#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c new file mode 100644 -index 000000000..8b8130993 +index 000000000000..8b8130993a59 --- /dev/null +++ b/fs/bcachefs/super-io.c @@ -0,0 +1,1602 @@ @@ -74132,7 +74597,7 @@ index 000000000..8b8130993 +} diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h new file mode 100644 -index 000000000..14a25f6fe +index 000000000000..14a25f6fe29a --- /dev/null +++ b/fs/bcachefs/super-io.h @@ -0,0 +1,126 @@ @@ -74264,10 +74729,10 @@ index 000000000..14a25f6fe +#endif /* _BCACHEFS_SUPER_IO_H */ diff --git 
a/fs/bcachefs/super.c b/fs/bcachefs/super.c new file mode 100644 -index 000000000..290897403 +index 000000000000..7c6348001ae3 --- /dev/null +++ b/fs/bcachefs/super.c -@@ -0,0 +1,1970 @@ +@@ -0,0 +1,1950 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs setup/teardown code, and some metadata io - read a superblock and @@ -74294,6 +74759,7 @@ index 000000000..290897403 +#include "debug.h" +#include "disk_groups.h" +#include "ec.h" ++#include "errcode.h" +#include "error.h" +#include "fs.h" +#include "fs-io.h" @@ -75200,31 +75666,10 @@ index 000000000..290897403 + up_write(&c->state_lock); + return ret; +err: -+ switch (ret) { -+ case BCH_FSCK_ERRORS_NOT_FIXED: -+ bch_err(c, "filesystem contains errors: please report this to the developers"); -+ pr_cont("mount with -o fix_errors to repair\n"); -+ break; -+ case BCH_FSCK_REPAIR_UNIMPLEMENTED: -+ bch_err(c, "filesystem contains errors: please report this to the developers"); -+ pr_cont("repair unimplemented: inform the developers so that it can be added\n"); -+ break; -+ case BCH_FSCK_REPAIR_IMPOSSIBLE: -+ bch_err(c, "filesystem contains errors, but repair impossible"); -+ break; -+ case BCH_FSCK_UNKNOWN_VERSION: -+ bch_err(c, "unknown metadata version"); -+ break; -+ case -ENOMEM: -+ bch_err(c, "cannot allocate memory"); -+ break; -+ case -EIO: -+ bch_err(c, "IO error"); -+ break; -+ } ++ bch_err(c, "error starting filesystem: %s", bch2_err_str(ret)); + -+ if (ret >= 0) -+ ret = -EIO; ++ if (ret < -BCH_ERR_START) ++ ret = -EINVAL; + goto out; +} + @@ -75708,7 +76153,7 @@ index 000000000..290897403 + bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, + BTREE_TRIGGER_NORUN, NULL); + if (ret) -+ bch_err(c, "error %i removing dev alloc info", ret); ++ bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret)); + + return ret; +} @@ -75736,7 +76181,7 @@ index 000000000..290897403 + + ret = bch2_dev_data_drop(c, ca->dev_idx, flags); + if (ret) { -+ bch_err(ca, "Remove failed: error %i dropping data", ret); ++ bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret)); + goto err; + } + @@ -75748,7 +76193,7 @@ index 000000000..290897403 + + ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); + if (ret) { -+ bch_err(ca, "Remove failed: error %i flushing journal", ret); ++ bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret)); + goto err; + } + @@ -75760,7 +76205,7 @@ index 000000000..290897403 + + ret = bch2_replicas_gc2(c); + if (ret) { -+ bch_err(ca, "Remove failed: error %i from replicas gc", ret); ++ bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret)); + goto err; + } + @@ -75824,7 +76269,7 @@ index 000000000..290897403 + + ret = bch2_read_super(path, &opts, &sb); + if (ret) { -+ bch_err(c, "device add error: error reading super: %i", ret); ++ bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret)); + goto err; + } + @@ -75917,13 +76362,13 @@ index 000000000..290897403 + + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) { -+ bch_err(c, "device add error: error marking new superblock: %i", ret); ++ bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret)); + goto err_late; + } + + ret = bch2_fs_freespace_init(c); + if (ret) { -+ bch_err(c, "device add error: error initializing free space: %i", ret); ++ bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); + goto err_late; + } + @@ -75985,8 +76430,8 @@ index 000000000..290897403 + + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) { 
-+ bch_err(c, "error bringing %s online: error %i from bch2_trans_mark_dev_sb", -+ path, ret); ++ bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s", ++ path, bch2_err_str(ret)); + goto err; + } + @@ -76055,7 +76500,7 @@ index 000000000..290897403 + + ret = bch2_dev_buckets_resize(c, ca, nbuckets); + if (ret) { -+ bch_err(ca, "Resize error: %i", ret); ++ bch_err(ca, "Resize error: %s", bch2_err_str(ret)); + goto err; + } + @@ -76240,7 +76685,7 @@ index 000000000..290897403 +module_init(bcachefs_init); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h new file mode 100644 -index 000000000..8501adaff +index 000000000000..8501adaff4c2 --- /dev/null +++ b/fs/bcachefs/super.h @@ -0,0 +1,264 @@ @@ -76510,7 +76955,7 @@ index 000000000..8501adaff +#endif /* _BCACHEFS_SUPER_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h new file mode 100644 -index 000000000..89419fc79 +index 000000000000..89419fc7930d --- /dev/null +++ b/fs/bcachefs/super_types.h @@ -0,0 +1,51 @@ @@ -76567,7 +77012,7 @@ index 000000000..89419fc79 +#endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c new file mode 100644 -index 000000000..2c650055f +index 000000000000..2c650055f530 --- /dev/null +++ b/fs/bcachefs/sysfs.c @@ -0,0 +1,943 @@ @@ -77516,7 +77961,7 @@ index 000000000..2c650055f +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h new file mode 100644 -index 000000000..222cd5062 +index 000000000000..222cd5062702 --- /dev/null +++ b/fs/bcachefs/sysfs.h @@ -0,0 +1,48 @@ @@ -77570,10 +78015,10 @@ index 000000000..222cd5062 +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c new file mode 100644 -index 000000000..1954891ce +index 000000000000..56058a56f2a2 --- /dev/null +++ b/fs/bcachefs/tests.c -@@ -0,0 +1,947 @@ +@@ -0,0 +1,976 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifdef CONFIG_BCACHEFS_TESTS + @@ -77618,29 +78063,29 @@ index 000000000..1954891ce + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); + -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &k.k_i, 0)); + if (ret) { -+ bch_err(c, "update error in test_delete: %i", ret); ++ bch_err(c, "update error in test_delete: %s", bch2_err_str(ret)); + goto err; + } + + pr_info("deleting once"); -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { -+ bch_err(c, "delete error (first) in test_delete: %i", ret); ++ bch_err(c, "delete error (first) in test_delete: %s", bch2_err_str(ret)); + goto err; + } + + pr_info("deleting twice"); -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { -+ bch_err(c, "delete error (second) in test_delete: %i", ret); ++ bch_err(c, "delete error (second) in test_delete: %s", bch2_err_str(ret)); + goto err; + } +err: @@ -77664,22 +78109,22 @@ index 000000000..1954891ce + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); + -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &k.k_i, 0)); + if (ret) { -+ bch_err(c, "update error in test_delete_written: 
%i", ret); ++ bch_err(c, "update error in test_delete_written: %s", bch2_err_str(ret)); + goto err; + } + + bch2_trans_unlock(&trans); + bch2_journal_flush_all_pins(&c->journal); + -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { -+ bch_err(c, "delete error in test_delete_written: %i", ret); ++ bch_err(c, "delete error in test_delete_written: %s", bch2_err_str(ret)); + goto err; + } +err: @@ -77712,7 +78157,7 @@ index 000000000..1954891ce + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, + NULL, NULL, 0); + if (ret) { -+ bch_err(c, "insert error in test_iterate: %i", ret); ++ bch_err(c, "insert error in test_iterate: %s", bch2_err_str(ret)); + goto err; + } + } @@ -77721,20 +78166,30 @@ index 000000000..1954891ce + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), 0, k, ret) { -+ if (k.k->p.inode) -+ break; -+ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ({ + BUG_ON(k.k->p.offset != i++); ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); ++ goto err; + } + + BUG_ON(i != nr); + + pr_info("iterating backwards"); + -+ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) -+ BUG_ON(k.k->p.offset != --i); ++ ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, U64_MAX, U32_MAX), 0, k, ++ ({ ++ BUG_ON(k.k->p.offset != --i); ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret)); ++ goto err; ++ } + + BUG_ON(i); +err: @@ -77768,7 +78223,7 @@ index 000000000..1954891ce + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) { -+ bch_err(c, "insert error in test_iterate_extents: %i", ret); ++ bch_err(c, "insert error in test_iterate_extents: %s", bch2_err_str(ret)); + goto err; + } + } @@ -77777,19 +78232,31 @@ index 000000000..1954891ce + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_extents, -+ SPOS(0, 0, U32_MAX), 0, k, ret) { ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i); + i = k.k->p.offset; ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); ++ goto err; + } + + BUG_ON(i != nr); + + pr_info("iterating backwards"); + -+ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) { -+ BUG_ON(k.k->p.offset != i); -+ i = bkey_start_offset(k.k); ++ ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents, ++ SPOS(0, U64_MAX, U32_MAX), 0, k, ++ ({ ++ BUG_ON(k.k->p.offset != i); ++ i = bkey_start_offset(k.k); ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret)); ++ goto err; + } + + BUG_ON(i); @@ -77823,7 +78290,7 @@ index 000000000..1954891ce + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, + NULL, NULL, 0); + if (ret) { -+ bch_err(c, "insert error in test_iterate_slots: %i", ret); ++ bch_err(c, "insert error in test_iterate_slots: %s", bch2_err_str(ret)); + goto err; + } + } @@ -77832,15 +78299,16 @@ index 000000000..1954891ce + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), 0, k, ret) { -+ if (k.k->p.inode) -+ break; -+ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ({ + BUG_ON(k.k->p.offset != i); + i += 2; 
++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); ++ goto err; + } -+ bch2_trans_iter_exit(&trans, &iter); + + BUG_ON(i != nr * 2); + @@ -77848,17 +78316,23 @@ index 000000000..1954891ce + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), -+ BTREE_ITER_SLOTS, k, ret) { ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), ++ BTREE_ITER_SLOTS, k, ({ ++ if (i >= nr * 2) ++ break; ++ + BUG_ON(k.k->p.offset != i); + BUG_ON(bkey_deleted(k.k) != (i & 1)); + + i++; -+ if (i == nr * 2) -+ break; ++ 0; ++ })); ++ if (ret < 0) { ++ bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret)); ++ goto err; + } -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = 0; +err: + bch2_trans_exit(&trans); + return ret; @@ -77889,7 +78363,7 @@ index 000000000..1954891ce + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) { -+ bch_err(c, "insert error in test_iterate_slots_extents: %i", ret); ++ bch_err(c, "insert error in test_iterate_slots_extents: %s", bch2_err_str(ret)); + goto err; + } + } @@ -77898,13 +78372,17 @@ index 000000000..1954891ce + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_extents, -+ SPOS(0, 0, U32_MAX), 0, k, ret) { ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i + 8); + BUG_ON(k.k->size != 8); + i += 16; ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); ++ goto err; + } -+ bch2_trans_iter_exit(&trans, &iter); + + BUG_ON(i != nr); + @@ -77912,19 +78390,23 @@ index 000000000..1954891ce + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_extents, -+ SPOS(0, 0, U32_MAX), -+ BTREE_ITER_SLOTS, k, ret) { ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), ++ BTREE_ITER_SLOTS, k, ({ ++ if (i == nr) ++ break; + BUG_ON(bkey_deleted(k.k) != !(i % 16)); + + BUG_ON(bkey_start_offset(k.k) != i); + BUG_ON(k.k->size != 8); + i = k.k->p.offset; -+ -+ if (i == nr) -+ break; ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret)); ++ goto err; + } -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = 0; +err: + bch2_trans_exit(&trans); + return 0; @@ -77944,10 +78426,10 @@ index 000000000..1954891ce + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); + -+ k = bch2_btree_iter_peek(&iter); ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + BUG_ON(k.k); + -+ k = bch2_btree_iter_peek(&iter); ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + BUG_ON(k.k); + + bch2_trans_iter_exit(&trans, &iter); @@ -77965,10 +78447,10 @@ index 000000000..1954891ce + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), 0); + -+ k = bch2_btree_iter_peek(&iter); ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + BUG_ON(k.k); + -+ k = bch2_btree_iter_peek(&iter); ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + BUG_ON(k.k); + + bch2_trans_iter_exit(&trans, &iter); @@ -77995,7 +78477,7 @@ index 000000000..1954891ce + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) -+ bch_err(c, "insert error in insert_test_extent: %i", ret); ++ bch_err(c, "insert error in insert_test_extent: %s", bch2_err_str(ret)); + return ret; +} + @@ -78058,7 +78540,7 
@@ index 000000000..1954891ce + bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, snapid_lo), 0); -+ k = bch2_btree_iter_peek(&iter); ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + + BUG_ON(k.k->p.snapshot != U32_MAX); + @@ -78094,7 +78576,7 @@ index 000000000..1954891ce + + ret = test_snapshot_filter(c, snapids[0], snapids[1]); + if (ret) { -+ bch_err(c, "err %i from test_snapshot_filter", ret); ++ bch_err(c, "err from test_snapshot_filter: %s", bch2_err_str(ret)); + return ret; + } + @@ -78128,10 +78610,10 @@ index 000000000..1954891ce + k.k.p.offset = test_rand(); + k.k.p.snapshot = U32_MAX; + -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); + if (ret) { -+ bch_err(c, "error in rand_insert: %i", ret); ++ bch_err(c, "error in rand_insert: %s", bch2_err_str(ret)); + break; + } + } @@ -78157,7 +78639,7 @@ index 000000000..1954891ce + k[j].k.p.snapshot = U32_MAX; + } + -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?: @@ -78167,7 +78649,7 @@ index 000000000..1954891ce + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i)); + if (ret) { -+ bch_err(c, "error in rand_insert_multi: %i", ret); ++ bch_err(c, "error in rand_insert_multi: %s", bch2_err_str(ret)); + break; + } + } @@ -78191,10 +78673,10 @@ index 000000000..1954891ce + for (i = 0; i < nr; i++) { + bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); + -+ k = bch2_btree_iter_peek(&iter); ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + ret = bkey_err(k); + if (ret) { -+ bch_err(c, "error in rand_lookup: %i", ret); ++ bch_err(c, "error in rand_lookup: %s", bch2_err_str(ret)); + break; + } + } @@ -78214,10 +78696,10 @@ index 000000000..1954891ce + + bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); + -+ k = bch2_btree_iter_peek(iter); ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(iter))); + ret = bkey_err(k); -+ if (ret && ret != -EINTR) -+ bch_err(trans->c, "lookup error in rand_mixed: %i", ret); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(trans->c, "lookup error in rand_mixed: %s", bch2_err_str(ret)); + if (ret) + return ret; + @@ -78244,10 +78726,10 @@ index 000000000..1954891ce + + for (i = 0; i < nr; i++) { + rand = test_rand(); -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + rand_mixed_trans(&trans, &iter, &cookie, i, rand)); + if (ret) { -+ bch_err(c, "update error in rand_mixed: %i", ret); ++ bch_err(c, "update error in rand_mixed: %s", bch2_err_str(ret)); + break; + } + } @@ -78265,7 +78747,7 @@ index 000000000..1954891ce + + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, + BTREE_ITER_INTENT); -+ k = bch2_btree_iter_peek(&iter); ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + ret = bkey_err(k); + if (ret) + goto err; @@ -78290,10 +78772,10 @@ index 000000000..1954891ce + for (i = 0; i < nr; i++) { + struct bpos pos = SPOS(0, test_rand(), U32_MAX); + -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ ret = commit_do(&trans, NULL, NULL, 0, + __do_delete(&trans, pos)); + if (ret) { -+ bch_err(c, "error in rand_delete: 
%i", ret); ++ bch_err(c, "error in rand_delete: %s", bch2_err_str(ret)); + break; + } + } @@ -78309,28 +78791,23 @@ index 000000000..1954891ce + struct bkey_s_c k; + struct bkey_i_cookie insert; + int ret = 0; -+ u64 i = 0; + + bkey_cookie_init(&insert.k_i); + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ insert.k.p = iter.pos; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(&trans, &iter, &insert.k_i, 0)); -+ if (ret) { -+ bch_err(c, "error in seq_insert: %i", ret); -+ break; -+ } -+ -+ if (++i == nr) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ++ NULL, NULL, 0, ++ ({ ++ if (iter.pos.offset >= nr) ++ break; ++ insert.k.p = iter.pos; ++ bch2_trans_update(&trans, &iter, &insert.k_i, 0); ++ })); ++ if (ret) ++ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); + + bch2_trans_exit(&trans); + return ret; @@ -78345,10 +78822,11 @@ index 000000000..1954891ce + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), 0, k, ret) -+ ; -+ bch2_trans_iter_exit(&trans, &iter); ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ++ 0); ++ if (ret) ++ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); + + bch2_trans_exit(&trans); + return ret; @@ -78363,22 +78841,18 @@ index 000000000..1954891ce + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), -+ BTREE_ITER_INTENT, k, ret) { -+ struct bkey_i_cookie u; ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), ++ BTREE_ITER_INTENT, k, ++ NULL, NULL, 0, ++ ({ ++ struct bkey_i_cookie u; + -+ bkey_reassemble(&u.k_i, k); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(&trans, &iter, &u.k_i, 0)); -+ if (ret) { -+ bch_err(c, "error in seq_overwrite: %i", ret); -+ break; -+ } -+ } -+ bch2_trans_iter_exit(&trans, &iter); ++ bkey_reassemble(&u.k_i, k); ++ bch2_trans_update(&trans, &iter, &u.k_i, 0); ++ })); ++ if (ret) ++ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); + + bch2_trans_exit(&trans); + return ret; @@ -78392,7 +78866,7 @@ index 000000000..1954891ce + SPOS(0, 0, U32_MAX), SPOS_MAX, + 0, NULL); + if (ret) -+ bch_err(c, "error in seq_delete: %i", ret); ++ bch_err(c, "error in seq_delete: %s", bch2_err_str(ret)); + return ret; +} + @@ -78429,7 +78903,7 @@ index 000000000..1954891ce + + ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); + if (ret) { -+ bch_err(j->c, "%ps: error %i", j->fn, ret); ++ bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret)); + j->ret = ret; + } + @@ -78523,7 +78997,7 @@ index 000000000..1954891ce +#endif /* CONFIG_BCACHEFS_TESTS */ diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h new file mode 100644 -index 000000000..c73b18aea +index 000000000000..c73b18aea7e0 --- /dev/null +++ b/fs/bcachefs/tests.h @@ -0,0 +1,15 @@ @@ -78544,7 +79018,7 @@ index 000000000..c73b18aea +#endif /* _BCACHEFS_TEST_H */ diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c new file mode 100644 -index 000000000..59e8dfa3d +index 000000000000..59e8dfa3d245 --- /dev/null +++ b/fs/bcachefs/trace.c @@ -0,0 +1,12 @@ @@ -78562,10 +79036,10 @@ 
index 000000000..59e8dfa3d +#include diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c new file mode 100644 -index 000000000..8ef4b5915 +index 000000000000..ee2c7d9e7050 --- /dev/null +++ b/fs/bcachefs/util.c -@@ -0,0 +1,958 @@ +@@ -0,0 +1,964 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * random utiility code, for bcache but in theory not specific to bcache @@ -78944,31 +79418,37 @@ index 000000000..8ef4b5915 + u64 q, last_q = 0; + int i; + -+ prt_printf(out, "count:\t\t%llu\n", ++ prt_printf(out, "count:\t\t%llu", + stats->count); -+ prt_printf(out, "rate:\t\t%llu/sec\n", ++ prt_newline(out); ++ prt_printf(out, "rate:\t\t%llu/sec", + freq ? div64_u64(NSEC_PER_SEC, freq) : 0); ++ prt_newline(out); + + prt_printf(out, "frequency:\t"); + pr_time_units(out, freq); + -+ prt_printf(out, "\navg duration:\t"); ++ prt_newline(out); ++ prt_printf(out, "avg duration:\t"); + pr_time_units(out, stats->average_duration); + -+ prt_printf(out, "\nmax duration:\t"); ++ prt_newline(out); ++ prt_printf(out, "max duration:\t"); + pr_time_units(out, stats->max_duration); + + i = eytzinger0_first(NR_QUANTILES); + u = pick_time_units(stats->quantiles.entries[i].m); + -+ prt_printf(out, "\nquantiles (%s):\t", u->name); ++ prt_newline(out); ++ prt_printf(out, "quantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + q = max(stats->quantiles.entries[i].m, last_q); -+ prt_printf(out, "%llu%s", -+ div_u64(q, u->nsecs), -+ is_last ? "\n" : " "); ++ prt_printf(out, "%llu ", ++ div_u64(q, u->nsecs)); ++ if (is_last) ++ prt_newline(out); + last_q = q; + } +} @@ -79526,7 +80006,7 @@ index 000000000..8ef4b5915 +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h new file mode 100644 -index 000000000..1fe66fd91 +index 000000000000..1fe66fd91ccc --- /dev/null +++ b/fs/bcachefs/util.h @@ -0,0 +1,783 @@ @@ -80315,7 +80795,7 @@ index 000000000..1fe66fd91 +#endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c new file mode 100644 -index 000000000..5143b603b +index 000000000000..5143b603bf67 --- /dev/null +++ b/fs/bcachefs/varint.c @@ -0,0 +1,121 @@ @@ -80442,7 +80922,7 @@ index 000000000..5143b603b +} diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h new file mode 100644 -index 000000000..92a182fb3 +index 000000000000..92a182fb3d7a --- /dev/null +++ b/fs/bcachefs/varint.h @@ -0,0 +1,11 @@ @@ -80459,7 +80939,7 @@ index 000000000..92a182fb3 +#endif /* _BCACHEFS_VARINT_H */ diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h new file mode 100644 -index 000000000..53a694d71 +index 000000000000..53a694d71967 --- /dev/null +++ b/fs/bcachefs/vstructs.h @@ -0,0 +1,63 @@ @@ -80528,7 +81008,7 @@ index 000000000..53a694d71 +#endif /* _VSTRUCTS_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c new file mode 100644 -index 000000000..123612716 +index 000000000000..186ffab542d5 --- /dev/null +++ b/fs/bcachefs/xattr.c @@ -0,0 +1,648 @@ @@ -80878,7 +81358,7 @@ index 000000000..123612716 + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: -+ if (ret == -EINTR) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); @@ -81182,7 +81662,7 @@ index 000000000..123612716 +} diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h new file mode 100644 -index 000000000..66d7a1e30 +index 000000000000..66d7a1e30350 --- /dev/null +++ b/fs/bcachefs/xattr.h @@ -0,0 +1,50 @@ @@ -81237,7 +81717,7 @@ index 000000000..66d7a1e30 + +#endif /* _BCACHEFS_XATTR_H 
*/ diff --git a/fs/d_path.c b/fs/d_path.c -index e4e0ebad1..1bd9e85f2 100644 +index e4e0ebad1f15..1bd9e85f2f65 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -5,6 +5,7 @@ @@ -81290,7 +81770,7 @@ index e4e0ebad1..1bd9e85f2 100644 * Helper function for dentry_operations.d_dname() members */ diff --git a/fs/dcache.c b/fs/dcache.c -index 93f4f5ee0..d90ed65e2 100644 +index 93f4f5ee07bf..d90ed65e2a75 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3193,9 +3193,8 @@ void d_genocide(struct dentry *parent) @@ -81319,7 +81799,7 @@ index 93f4f5ee0..d90ed65e2 100644 } EXPORT_SYMBOL(d_tmpfile); diff --git a/fs/inode.c b/fs/inode.c -index bd4da9c52..ac0da28a1 100644 +index bd4da9c5207e..ac0da28a1ac6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -56,8 +56,23 @@ @@ -81806,7 +82286,7 @@ index bd4da9c52..ac0da28a1 100644 14, HASH_ZERO, diff --git a/include/linux/bio.h b/include/linux/bio.h -index 00450fd86..c11103a87 100644 +index 00450fd86bb4..c11103a8720a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -483,7 +483,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, @@ -81824,7 +82304,7 @@ index 00450fd86..c11103a87 100644 static inline void bio_release_pages(struct bio *bio, bool mark_dirty) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 108e3d114..20f76bd27 100644 +index 108e3d114bfc..20f76bd27b9a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -873,6 +873,7 @@ extern const char *blk_op_str(unsigned int op); @@ -81839,7 +82319,7 @@ diff --git a/drivers/md/bcache/closure.h b/include/linux/closure.h similarity index 94% rename from drivers/md/bcache/closure.h rename to include/linux/closure.h -index c88cdc4ae..36b4a83f9 100644 +index c88cdc4ae4ec..36b4a83f9b77 100644 --- a/drivers/md/bcache/closure.h +++ b/include/linux/closure.h @@ -155,7 +155,7 @@ struct closure { @@ -81947,7 +82427,7 @@ index c88cdc4ae..36b4a83f9 100644 + #endif /* _LINUX_CLOSURE_H */ diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h -index 445e80517..57e7d0b94 100644 +index 445e80517cab..57e7d0b94119 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -371,4 +371,9 @@ @@ -81961,7 +82441,7 @@ index 445e80517..57e7d0b94 100644 + #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ diff --git a/include/linux/dcache.h b/include/linux/dcache.h -index f5bba5148..6c661059a 100644 +index f5bba51480b2..6c661059a55b 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -248,6 +248,7 @@ extern struct dentry * d_make_root(struct inode *); @@ -81981,7 +82461,7 @@ index f5bba5148..6c661059a 100644 /* Allocation counts.. 
*/ diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h -index fe848901f..5a3cc0e1d 100644 +index fe848901fcc3..5a3cc0e1da9b 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -98,6 +98,12 @@ enum fid_type { @@ -81998,7 +82478,7 @@ index fe848901f..5a3cc0e1d 100644 * 128 bit child FID (struct lu_fid) * 128 bit parent FID (struct lu_fid) diff --git a/include/linux/fs.h b/include/linux/fs.h -index bbde95387..98f62ebf9 100644 +index bbde95387a23..98f62ebf9224 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -637,7 +637,8 @@ struct inode { @@ -82039,7 +82519,7 @@ index bbde95387..98f62ebf9 100644 } diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h -index 107613f7d..c74b73769 100644 +index 107613f7d792..c74b7376990d 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -38,6 +38,7 @@ @@ -82150,7 +82630,7 @@ index 107613f7d..c74b73769 100644 /** diff --git a/include/linux/kernel.h b/include/linux/kernel.h -index fe6efb24d..9ba5a53c6 100644 +index fe6efb24d151..9ba5a53c6ad5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -202,11 +202,17 @@ static inline void might_fault(void) { } @@ -82185,7 +82665,7 @@ index fe6efb24d..9ba5a53c6 100644 /* diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h -index ae1b54144..8ee2bf5af 100644 +index ae1b541446c9..8ee2bf5af131 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -143,6 +143,28 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n) @@ -82218,7 +82698,7 @@ index ae1b54144..8ee2bf5af 100644 { bit_spin_lock(0, (unsigned long *)b); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h -index 467b94257..c46b0c76c 100644 +index 467b94257105..c46b0c76c064 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -336,6 +336,8 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); @@ -82241,7 +82721,7 @@ index 467b94257..c46b0c76c 100644 enum xhlock_context_t { diff --git a/include/linux/pretty-printers.h b/include/linux/pretty-printers.h new file mode 100644 -index 000000000..f39d8edfb +index 000000000000..f39d8edfba02 --- /dev/null +++ b/include/linux/pretty-printers.h @@ -0,0 +1,10 @@ @@ -82257,7 +82737,7 @@ index 000000000..f39d8edfb +#endif /* _LINUX_PRETTY_PRINTERS_H */ diff --git a/include/linux/printbuf.h b/include/linux/printbuf.h new file mode 100644 -index 000000000..861c5d75f +index 000000000000..861c5d75f852 --- /dev/null +++ b/include/linux/printbuf.h @@ -0,0 +1,283 @@ @@ -82545,7 +83025,7 @@ index 000000000..861c5d75f + +#endif /* _LINUX_PRINTBUF_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h -index a8911b1f3..252bac976 100644 +index a8911b1f35aa..252bac976763 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -859,6 +859,7 @@ struct task_struct { @@ -82558,7 +83038,7 @@ index a8911b1f3..252bac976 100644 struct vmacache vmacache; diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h deleted file mode 100644 -index 5b31c5147..000000000 +index 5b31c5147969..000000000000 --- a/include/linux/seq_buf.h +++ /dev/null @@ -1,162 +0,0 @@ @@ -82725,7 +83205,7 @@ index 5b31c5147..000000000 - -#endif /* _LINUX_SEQ_BUF_H */ diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h -index 76fbf92b0..12967748f 100644 +index 76fbf92b04d9..12967748f9f7 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -2,6 +2,8 @@ @@ -82768,7 +83248,7 @@ index 76fbf92b0..12967748f 100644 
#endif diff --git a/include/linux/six.h b/include/linux/six.h new file mode 100644 -index 000000000..477c33eb0 +index 000000000000..477c33eb00d7 --- /dev/null +++ b/include/linux/six.h @@ -0,0 +1,203 @@ @@ -82976,7 +83456,7 @@ index 000000000..477c33eb0 + +#endif /* _LINUX_SIX_H */ diff --git a/include/linux/string.h b/include/linux/string.h -index b6572aeca..0a737d5b9 100644 +index b6572aeca2f5..0a737d5b9203 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -195,7 +195,12 @@ int __sysfs_match_string(const char * const *array, size_t n, const char *s); @@ -82993,7 +83473,7 @@ index b6572aeca..0a737d5b9 100644 int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h -index 4d72258d4..52e0f1d28 100644 +index 4d72258d42fd..52e0f1d283b9 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -10,6 +10,7 @@ @@ -83033,7 +83513,7 @@ index 4d72258d4..52e0f1d28 100644 unsigned int flags, const char *only) { diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h -index e6e95a9f0..48471e32f 100644 +index e6e95a9f07a5..48471e32f8e4 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -496,7 +496,7 @@ struct dynevent_cmd; @@ -83046,7 +83526,7 @@ index e6e95a9f0..48471e32f 100644 unsigned int n_fields; enum dynevent_type type; diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h -index 5a2c650d9..d2b51007b 100644 +index 5a2c650d9e1c..d2b51007b3b9 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -2,10 +2,12 @@ @@ -83118,7 +83598,7 @@ index 5a2c650d9..d2b51007b 100644 extern void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h -index b159c2789..0f4151e98 100644 +index b159c2789961..0f4151e98331 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -144,6 +144,7 @@ extern void *vzalloc(unsigned long size) __alloc_size(1); @@ -83129,12 +83609,92 @@ index b159c2789..0f4151e98 100644 extern void *vmalloc_32(unsigned long size) __alloc_size(1); extern void *vmalloc_32_user(unsigned long size) __alloc_size(1); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1); +diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h +index 24a509f559ee..0b20ee6854d6 100644 +--- a/include/net/9p/9p.h ++++ b/include/net/9p/9p.h +@@ -539,12 +539,12 @@ struct p9_rstatfs { + struct p9_fcall { + u32 size; + u8 id; ++ bool used_mempool; + u16 tag; + + size_t offset; + size_t capacity; + +- struct kmem_cache *cache; + u8 *sdata; + }; + +diff --git a/include/net/9p/client.h b/include/net/9p/client.h +index ec1d1706f43c..832dcc866a20 100644 +--- a/include/net/9p/client.h ++++ b/include/net/9p/client.h +@@ -9,6 +9,7 @@ + #ifndef NET_9P_CLIENT_H + #define NET_9P_CLIENT_H + ++#include + #include + #include + +@@ -76,7 +77,7 @@ enum p9_req_status_t { + struct p9_req_t { + int status; + int t_err; +- struct kref refcount; ++ refcount_t refcount; + wait_queue_head_t wq; + struct p9_fcall tc; + struct p9_fcall rc; +@@ -107,6 +108,14 @@ struct p9_client { + void *trans; + struct kmem_cache *fcall_cache; + ++ /* ++ * We need two identical mempools because it's not safe to allocate ++ * multiple elements from the same pool (without freeing the first); ++ * that will deadlock if multiple threads need the last element at the ++ * 
same time. ++ */ ++ mempool_t pools[2]; ++ + union { + struct { + int rfd; +@@ -222,20 +231,21 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode, + kgid_t gid, struct p9_qid *qid); + int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status); + int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl); +-void p9_fcall_fini(struct p9_fcall *fc); ++void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc, ++ int fc_idx); + struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag); + + static inline void p9_req_get(struct p9_req_t *r) + { +- kref_get(&r->refcount); ++ refcount_inc(&r->refcount); + } + + static inline int p9_req_try_get(struct p9_req_t *r) + { +- return kref_get_unless_zero(&r->refcount); ++ return refcount_inc_not_zero(&r->refcount); + } + +-int p9_req_put(struct p9_req_t *r); ++int p9_req_put(struct p9_client *c, struct p9_req_t *r); + + void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status); + diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h new file mode 100644 -index 000000000..66ad356e9 +index 000000000000..140834e7406e --- /dev/null +++ b/include/trace/events/bcachefs.h -@@ -0,0 +1,1020 @@ +@@ -0,0 +1,1048 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bcachefs @@ -83447,24 +84007,27 @@ index 000000000..66ad356e9 +); + +TRACE_EVENT(btree_reserve_get_fail, -+ TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl), -+ TP_ARGS(c, required, cl), ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ size_t required), ++ TP_ARGS(trans_fn, caller_ip, required), + + TP_STRUCT__entry( -+ __field(dev_t, dev ) ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) + __field(size_t, required ) -+ __field(struct closure *, cl ) + ), + + TP_fast_assign( -+ __entry->dev = c->dev; -+ __entry->required = required; -+ __entry->cl = cl; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->required = required; + ), + -+ TP_printk("%d,%d required %zu by %p", -+ MAJOR(__entry->dev), MINOR(__entry->dev), -+ __entry->required, __entry->cl) ++ TP_printk("%s %pS required %zu", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ __entry->required) +); + +DEFINE_EVENT(btree_node, btree_split, @@ -83593,55 +84156,68 @@ index 000000000..66ad356e9 + +TRACE_EVENT(bucket_alloc_fail, + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, ++ u64 free, + u64 avail, ++ u64 copygc_wait_amount, ++ s64 copygc_waiting_for, + u64 seen, + u64 open, + u64 need_journal_commit, + u64 nouse, + bool nonblocking, -+ int ret), -+ TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret), ++ const char *err), ++ TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, ++ seen, open, need_journal_commit, nouse, nonblocking, err), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, reserve, 16 ) ++ __field(u64, free ) + __field(u64, avail ) ++ __field(u64, copygc_wait_amount ) ++ __field(s64, copygc_waiting_for ) + __field(u64, seen ) + __field(u64, open ) + __field(u64, need_journal_commit ) + __field(u64, nouse ) + __field(bool, nonblocking ) -+ __field(int, ret ) ++ __array(char, err, 16 ) + ), + + TP_fast_assign( + __entry->dev = ca->dev; + strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); ++ __entry->free = free; + __entry->avail = avail; ++ __entry->copygc_wait_amount = 
copygc_wait_amount; ++ __entry->copygc_waiting_for = copygc_waiting_for; + __entry->seen = seen; + __entry->open = open; + __entry->need_journal_commit = need_journal_commit; + __entry->nouse = nouse; + __entry->nonblocking = nonblocking; -+ __entry->ret = ret; ++ strlcpy(__entry->err, err, sizeof(__entry->err)); + ), + -+ TP_printk("%d,%d reserve %s avail %llu seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u ret %i", ++ TP_printk("%d,%d reserve %s free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->reserve, ++ __entry->free, + __entry->avail, ++ __entry->copygc_wait_amount, ++ __entry->copygc_waiting_for, + __entry->seen, + __entry->open, + __entry->need_journal_commit, + __entry->nouse, + __entry->nonblocking, -+ __entry->ret) ++ __entry->err) +); + +TRACE_EVENT(discard_buckets, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, -+ u64 need_journal_commit, u64 discarded, int ret), -+ TP_ARGS(c, seen, open, need_journal_commit, discarded, ret), ++ u64 need_journal_commit, u64 discarded, const char *err), ++ TP_ARGS(c, seen, open, need_journal_commit, discarded, err), + + TP_STRUCT__entry( + __field(dev_t, dev ) @@ -83649,7 +84225,7 @@ index 000000000..66ad356e9 + __field(u64, open ) + __field(u64, need_journal_commit ) + __field(u64, discarded ) -+ __field(int, ret ) ++ __array(char, err, 16 ) + ), + + TP_fast_assign( @@ -83658,16 +84234,16 @@ index 000000000..66ad356e9 + __entry->open = open; + __entry->need_journal_commit = need_journal_commit; + __entry->discarded = discarded; -+ __entry->ret = ret; ++ strlcpy(__entry->err, err, sizeof(__entry->err)); + ), + -+ TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu ret %i", ++ TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->seen, + __entry->open, + __entry->need_journal_commit, + __entry->discarded, -+ __entry->ret) ++ __entry->err) +); + +TRACE_EVENT(invalidate_bucket, @@ -83815,6 +84391,12 @@ index 000000000..66ad356e9 + TP_ARGS(trans_fn, caller_ip) +); + ++DEFINE_EVENT(transaction_event, transaction_restart_injected, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ +DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip), @@ -83863,6 +84445,12 @@ index 000000000..66ad356e9 + TP_ARGS(trans_fn, caller_ip) +); + ++DEFINE_EVENT(transaction_event, trans_restart_too_many_iters, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ +DECLARE_EVENT_CLASS(transaction_restart_iter, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, @@ -84156,7 +84744,7 @@ index 000000000..66ad356e9 +/* This part must be outside protection */ +#include diff --git a/init/init_task.c b/init/init_task.c -index 73cc8f035..3e3aed110 100644 +index 73cc8f03511a..3e3aed110153 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -85,6 +85,7 @@ struct task_struct init_task @@ -84168,7 +84756,7 @@ index 73cc8f035..3e3aed110 100644 .fn = do_no_restart_syscall, }, diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks -index 4198f0273..b2abd9a5d 100644 +index 4198f0273ecd..b2abd9a5d9ab 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB @@ -84179,7 +84767,7 @@ index 4198f0273..b2abd9a5d 
100644 +config SIXLOCKS + bool diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile -index d51cabf28..cadbf6520 100644 +index d51cabf28f38..cadbf6520c4b 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -32,3 +32,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o @@ -84188,7 +84776,7 @@ index d51cabf28..cadbf6520 100644 obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o +obj-$(CONFIG_SIXLOCKS) += six.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c -index c06cab654..9426050d3 100644 +index c06cab6546ed..9426050d30d9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6459,6 +6459,26 @@ void debug_check_no_locks_held(void) @@ -84220,7 +84808,7 @@ index c06cab654..9426050d3 100644 { diff --git a/kernel/locking/six.c b/kernel/locking/six.c new file mode 100644 -index 000000000..fca120872 +index 000000000000..fca1208720b6 --- /dev/null +++ b/kernel/locking/six.c @@ -0,0 +1,759 @@ @@ -84984,7 +85572,7 @@ index 000000000..fca120872 +} +EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc); diff --git a/kernel/module.c b/kernel/module.c -index 6529c84c5..df4959bda 100644 +index 6529c84c536f..df4959bda595 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2834,9 +2834,7 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) @@ -84998,8 +85586,28 @@ index 6529c84c5..df4959bda 100644 } bool __weak module_init_section(const char *name) +diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c +index 9ed5ce989415..3428568bb3f1 100644 +--- a/kernel/stacktrace.c ++++ b/kernel/stacktrace.c +@@ -151,6 +151,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, + put_task_stack(tsk); + return c.len; + } ++EXPORT_SYMBOL(stack_trace_save_tsk); + + /** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array +@@ -301,6 +302,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task, + save_stack_trace_tsk(task, &trace); + return trace.nr_entries; + } ++EXPORT_SYMBOL(stack_trace_save_tsk); + + /** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c -index 114c31bdf..7c7fd7b66 100644 +index c0c98b0c86e7..9d91153228fd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1672,15 +1672,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) @@ -85119,7 +85727,7 @@ index 114c31bdf..7c7fd7b66 100644 printk(KERN_TRACE "%s", s->buffer); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c -index e34e8182e..eabeeb97b 100644 +index e34e8182ee4b..eabeeb97b55e 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -295,21 +295,19 @@ int dynevent_arg_add(struct dynevent_cmd *cmd, @@ -85209,7 +85817,7 @@ index e34e8182e..eabeeb97b 100644 cmd->run_command = run_command; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c -index b458a9afa..70cfd1241 100644 +index b458a9afa2c0..70cfd1241018 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1059,7 +1059,7 @@ static void append_filter_err(struct trace_array *tr, @@ -85222,7 +85830,7 @@ index b458a9afa..70cfd1241 100644 kfree(filter->filter_string); filter->filter_string = buf; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c -index 5e8c07aef..ddb2a2737 100644 +index 5e8c07aef071..ddb2a2737b82 100644 --- a/kernel/trace/trace_events_synth.c +++ 
b/kernel/trace/trace_events_synth.c @@ -5,13 +5,14 @@ @@ -85303,7 +85911,7 @@ index 5e8c07aef..ddb2a2737 100644 return ret; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c -index 203204cad..9f270fdde 100644 +index 203204cadf92..9f270fdde99b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1022,9 +1022,9 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, @@ -85320,7 +85928,7 @@ index 203204cad..9f270fdde 100644 trace_seq_puts(s, " */\n"); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c -index 134397432..6e4485b04 100644 +index 13439743285c..6e4485b042d8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -915,7 +915,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) @@ -85333,7 +85941,7 @@ index 134397432..6e4485b04 100644 /** diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c -index 9c90b3a7d..48c08f29c 100644 +index 9c90b3a7dce2..48c08f29c342 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -25,11 +25,9 @@ @@ -85612,7 +86220,7 @@ index 9c90b3a7d..48c08f29c 100644 return 0; } diff --git a/lib/Kconfig b/lib/Kconfig -index 55f0bba8f..9161ac314 100644 +index 55f0bba8f8c0..9161ac314358 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -491,6 +491,9 @@ config ASSOCIATIVE_ARRAY @@ -85626,7 +86234,7 @@ index 55f0bba8f..9161ac314 100644 bool depends on !NO_IOMEM diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index 7e2829701..2bef39841 100644 +index 7e282970177a..2bef39841f8e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1723,6 +1723,15 @@ config DEBUG_CREDENTIALS @@ -85646,7 +86254,7 @@ index 7e2829701..2bef39841 100644 bool "Force round-robin CPU selection for unbound work items" depends on DEBUG_KERNEL diff --git a/lib/Makefile b/lib/Makefile -index 60843ab66..d98f3c92b 100644 +index 60843ab661ba..d98f3c92badb 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -30,11 +30,11 @@ endif @@ -85677,7 +86285,7 @@ diff --git a/drivers/md/bcache/closure.c b/lib/closure.c similarity index 88% rename from drivers/md/bcache/closure.c rename to lib/closure.c -index d8d9394a6..b38ded00b 100644 +index d8d9394a6beb..b38ded00b9b0 100644 --- a/drivers/md/bcache/closure.c +++ b/lib/closure.c @@ -6,13 +6,12 @@ @@ -85790,8 +86398,17 @@ index d8d9394a6..b38ded00b 100644 -MODULE_AUTHOR("Kent Overstreet "); -MODULE_LICENSE("GPL"); +#endif +diff --git a/lib/errname.c b/lib/errname.c +index 05cbf731545f..82ea4778f478 100644 +--- a/lib/errname.c ++++ b/lib/errname.c +@@ -222,3 +222,4 @@ const char *errname(int err) + + return err > 0 ? 
name + 1 : name; + } ++EXPORT_SYMBOL(errname); diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c -index f25eb111c..41f1bcdc4 100644 +index f25eb111c051..41f1bcdc4488 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -1,4 +1,5 @@ @@ -85898,7 +86515,7 @@ index f25eb111c..41f1bcdc4 100644 { if (level) { diff --git a/lib/hexdump.c b/lib/hexdump.c -index 06833d404..9556f15ad 100644 +index 06833d404398..9556f15ad295 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -9,6 +9,7 @@ @@ -86205,7 +86822,7 @@ index 06833d404..9556f15ad 100644 unsigned char linebuf[32 * 3 + 2 + 32 + 1]; diff --git a/lib/pretty-printers.c b/lib/pretty-printers.c new file mode 100644 -index 000000000..addbac95e +index 000000000000..addbac95e065 --- /dev/null +++ b/lib/pretty-printers.c @@ -0,0 +1,60 @@ @@ -86271,7 +86888,7 @@ index 000000000..addbac95e +EXPORT_SYMBOL(prt_bitflags); diff --git a/lib/printbuf.c b/lib/printbuf.c new file mode 100644 -index 000000000..047470025 +index 000000000000..047470025748 --- /dev/null +++ b/lib/printbuf.c @@ -0,0 +1,258 @@ @@ -86535,7 +87152,7 @@ index 000000000..047470025 +EXPORT_SYMBOL(prt_units_s64); diff --git a/lib/seq_buf.c b/lib/seq_buf.c deleted file mode 100644 -index 0a68f7aa8..000000000 +index 0a68f7aa85d6..000000000000 --- a/lib/seq_buf.c +++ /dev/null @@ -1,397 +0,0 @@ @@ -86937,7 +87554,7 @@ index 0a68f7aa8..000000000 - return 0; -} diff --git a/lib/string_helpers.c b/lib/string_helpers.c -index 5ed3beb06..d247bf945 100644 +index 5ed3beb066e6..d247bf945f16 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -15,6 +15,7 @@ @@ -87277,7 +87894,7 @@ index 5ed3beb06..d247bf945 100644 EXPORT_SYMBOL(string_escape_mem); diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c -index 5144899d3..f9e97879d 100644 +index 5144899d3c6b..f9e97879dcdf 100644 --- a/lib/test_hexdump.c +++ b/lib/test_hexdump.c @@ -25,36 +25,19 @@ static const char * const test_data_1[] __initconst = { @@ -87344,7 +87961,7 @@ index 5144899d3..f9e97879d 100644 result = test_data_1; diff --git a/lib/test_printf.c b/lib/test_printf.c -index 07309c45f..ac5f9f0eb 100644 +index 07309c45f327..ac5f9f0eb4e0 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -9,6 +9,7 @@ @@ -87409,7 +88026,7 @@ index 07309c45f..ac5f9f0eb 100644 kfree(alloced_buffer); } diff --git a/lib/vsprintf.c b/lib/vsprintf.c -index 40d26a07a..dfca8a7c9 100644 +index 40d26a07a133..dfca8a7c93ed 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -44,6 +44,7 @@ @@ -90226,7 +90843,7 @@ index 40d26a07a..dfca8a7c9 100644 EXPORT_SYMBOL_GPL(bstr_printf); diff --git a/mm/Makefile b/mm/Makefile -index 4cc13f317..7e852599b 100644 +index 4cc13f3179a5..7e852599b917 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -54,7 +54,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ @@ -90239,7 +90856,7 @@ index 4cc13f317..7e852599b 100644 # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o diff --git a/mm/filemap.c b/mm/filemap.c -index be1859a27..222bcfe7a 100644 +index be1859a276e1..222bcfe7afa0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2223,6 +2223,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, @@ -90251,7 +90868,7 @@ index be1859a27..222bcfe7a 100644 /** * find_get_pages_contig - gang contiguous pagecache lookup diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index 598fece89..57861dc9f 100644 +index 598fece89e2b..57861dc9fee5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -62,7 +62,7 @@ @@ -90360,7 +90977,7 @@ index 
598fece89..57861dc9f 100644 #define K(x) ((x) << (PAGE_SHIFT-10)) diff --git a/mm/nommu.c b/mm/nommu.c -index 9d7afc2d9..dd5302026 100644 +index 9d7afc2d959e..dd53020262d8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -281,6 +281,24 @@ void *vzalloc_node(unsigned long size, int node) @@ -90389,7 +91006,7 @@ index 9d7afc2d9..dd5302026 100644 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size diff --git a/mm/oom_kill.c b/mm/oom_kill.c -index 49d7df39b..9c550a283 100644 +index 49d7df39b02d..9c550a283037 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -168,27 +168,6 @@ static bool oom_unkillable_task(struct task_struct *p) @@ -90433,7 +91050,7 @@ diff --git a/lib/show_mem.c b/mm/show_mem.c similarity index 83% rename from lib/show_mem.c rename to mm/show_mem.c -index 1c26c14ff..47225158c 100644 +index 1c26c14ffbb9..47225158ce49 100644 --- a/lib/show_mem.c +++ b/mm/show_mem.c @@ -7,6 +7,9 @@ @@ -90457,7 +91074,7 @@ index 1c26c14ff..47225158c 100644 + printk("%pf()", CALL_PP(shrinkers_to_text)); } diff --git a/mm/slab.h b/mm/slab.h -index 95eb34174..a91fc5aa1 100644 +index 95eb34174c1b..a91fc5aa1054 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -805,10 +805,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) @@ -90476,7 +91093,7 @@ index 95eb34174..a91fc5aa1 100644 } #endif diff --git a/mm/slab_common.c b/mm/slab_common.c -index 2b3206a2c..333f431e0 100644 +index 2b3206a2c3b5..333f431e0708 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -24,6 +24,7 @@ @@ -90568,7 +91185,7 @@ index 2b3206a2c..333f431e0 100644 } diff --git a/mm/vmalloc.c b/mm/vmalloc.c -index cadfbb515..60456a184 100644 +index cadfbb5155ea..60456a184b6a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3363,6 +3363,27 @@ void *vzalloc_node(unsigned long size, int node) @@ -90600,7 +91217,7 @@ index cadfbb515..60456a184 100644 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) diff --git a/mm/vmscan.c b/mm/vmscan.c -index 1678802e0..d911c5e33 100644 +index 1678802e03e7..d911c5e3304e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -50,6 +50,7 @@ @@ -90718,8 +91335,330 @@ index 1678802e0..d911c5e33 100644 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); total_scan -= shrinkctl->nr_scanned; +diff --git a/net/9p/client.c b/net/9p/client.c +index 8bba0d9cf975..e14074d031c6 100644 +--- a/net/9p/client.c ++++ b/net/9p/client.c +@@ -218,23 +218,29 @@ static int parse_opts(char *opts, struct p9_client *clnt) + return ret; + } + +-static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, +- int alloc_msize) ++static void p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, ++ int fc_idx, unsigned alloc_msize) + { +- if (likely(c->fcall_cache) && alloc_msize == c->msize) { +- fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS); +- fc->cache = c->fcall_cache; +- } else { +- fc->sdata = kmalloc(alloc_msize, GFP_NOFS); +- fc->cache = NULL; +- } +- if (!fc->sdata) +- return -ENOMEM; ++ gfp_t gfp = GFP_NOFS|__GFP_NOWARN; ++ ++ BUG_ON(alloc_msize > c->msize); ++ ++ fc->sdata = NULL; ++ fc->used_mempool = false; + fc->capacity = alloc_msize; +- return 0; ++ ++ if (alloc_msize < c->msize) ++ fc->sdata = kmalloc(alloc_msize, gfp); ++ ++ if (!fc->sdata) { ++ fc->sdata = mempool_alloc(&c->pools[fc_idx], gfp); ++ fc->used_mempool = true; ++ fc->capacity = c->msize; ++ } + } + +-void p9_fcall_fini(struct p9_fcall *fc) ++void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc, ++ int fc_idx) + { + /* 
sdata can be NULL for interrupted requests in trans_rdma, + * and kmem_cache_free does not do NULL-check for us +@@ -242,8 +248,8 @@ void p9_fcall_fini(struct p9_fcall *fc) + if (unlikely(!fc->sdata)) + return; + +- if (fc->cache) +- kmem_cache_free(fc->cache, fc->sdata); ++ if (fc->used_mempool) ++ mempool_free(fc->sdata, &c->pools[fc_idx]); + else + kfree(fc->sdata); + } +@@ -270,10 +276,8 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size) + if (!req) + return ERR_PTR(-ENOMEM); + +- if (p9_fcall_init(c, &req->tc, alloc_msize)) +- goto free_req; +- if (p9_fcall_init(c, &req->rc, alloc_msize)) +- goto free; ++ p9_fcall_init(c, &req->tc, 0, alloc_msize); ++ p9_fcall_init(c, &req->rc, 1, alloc_msize); + + p9pdu_reset(&req->tc); + p9pdu_reset(&req->rc); +@@ -305,14 +309,13 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size) + * callback), so p9_client_cb eats the second ref there + * as the pointer is duplicated directly by virtqueue_add_sgs() + */ +- refcount_set(&req->refcount.refcount, 2); ++ refcount_set(&req->refcount, 2); + + return req; + + free: +- p9_fcall_fini(&req->tc); +- p9_fcall_fini(&req->rc); +-free_req: ++ p9_fcall_fini(c, &req->tc, 0); ++ p9_fcall_fini(c, &req->rc, 1); + kmem_cache_free(p9_req_cache, req); + return ERR_PTR(-ENOMEM); + } +@@ -341,7 +344,7 @@ struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag) + if (!p9_req_try_get(req)) + goto again; + if (req->tc.tag != tag) { +- p9_req_put(req); ++ p9_req_put(c, req); + goto again; + } + } +@@ -367,21 +370,18 @@ static int p9_tag_remove(struct p9_client *c, struct p9_req_t *r) + spin_lock_irqsave(&c->lock, flags); + idr_remove(&c->reqs, tag); + spin_unlock_irqrestore(&c->lock, flags); +- return p9_req_put(r); +-} +- +-static void p9_req_free(struct kref *ref) +-{ +- struct p9_req_t *r = container_of(ref, struct p9_req_t, refcount); +- +- p9_fcall_fini(&r->tc); +- p9_fcall_fini(&r->rc); +- kmem_cache_free(p9_req_cache, r); ++ return p9_req_put(c, r); + } + +-int p9_req_put(struct p9_req_t *r) ++int p9_req_put(struct p9_client *c, struct p9_req_t *r) + { +- return kref_put(&r->refcount, p9_req_free); ++ if (refcount_dec_and_test(&r->refcount)) { ++ p9_fcall_fini(c, &r->tc, 0); ++ p9_fcall_fini(c, &r->rc, 1); ++ kmem_cache_free(p9_req_cache, r); ++ return 1; ++ } ++ return 0; + } + EXPORT_SYMBOL(p9_req_put); + +@@ -426,7 +426,7 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) + + wake_up(&req->wq); + p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag); +- p9_req_put(req); ++ p9_req_put(c, req); + } + EXPORT_SYMBOL(p9_client_cb); + +@@ -709,7 +709,7 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, + reterr: + p9_tag_remove(c, req); + /* We have to put also the 2nd reference as it won't be used */ +- p9_req_put(req); ++ p9_req_put(c, req); + return ERR_PTR(err); + } + +@@ -746,7 +746,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) 
+ err = c->trans_mod->request(c, req); + if (err < 0) { + /* write won't happen */ +- p9_req_put(req); ++ p9_req_put(c, req); + if (err != -ERESTARTSYS && err != -EFAULT) + c->status = Disconnected; + goto recalc_sigpending; +@@ -1002,7 +1002,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) + char *client_id; + + err = 0; +- clnt = kmalloc(sizeof(*clnt), GFP_KERNEL); ++ clnt = kzalloc(sizeof(*clnt), GFP_KERNEL); + if (!clnt) + return ERR_PTR(-ENOMEM); + +@@ -1053,10 +1053,6 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) + goto close_trans; + } + +- err = p9_client_version(clnt); +- if (err) +- goto close_trans; +- + /* P9_HDRSZ + 4 is the smallest packet header we can have that is + * followed by data accessed from userspace by read + */ +@@ -1066,6 +1062,15 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) + clnt->msize - (P9_HDRSZ + 4), + NULL); + ++ err = mempool_init_slab_pool(&clnt->pools[0], 4, clnt->fcall_cache) ?: ++ mempool_init_slab_pool(&clnt->pools[1], 4, clnt->fcall_cache); ++ if (err) ++ goto close_trans; ++ ++ err = p9_client_version(clnt); ++ if (err) ++ goto close_trans; ++ + return clnt; + + close_trans: +@@ -1073,6 +1078,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) + put_trans: + v9fs_put_trans(clnt->trans_mod); + free_client: ++ mempool_exit(&clnt->pools[1]); ++ mempool_exit(&clnt->pools[0]); + kfree(clnt); + return ERR_PTR(err); + } +@@ -1097,6 +1104,8 @@ void p9_client_destroy(struct p9_client *clnt) + + p9_tag_cleanup(clnt); + ++ mempool_exit(&clnt->pools[1]); ++ mempool_exit(&clnt->pools[0]); + kmem_cache_destroy(clnt->fcall_cache); + kfree(clnt); + } +diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c +index 8f8f95e39b03..007c3f45fe05 100644 +--- a/net/9p/trans_fd.c ++++ b/net/9p/trans_fd.c +@@ -378,7 +378,7 @@ static void p9_read_work(struct work_struct *work) + m->rc.sdata = NULL; + m->rc.offset = 0; + m->rc.capacity = 0; +- p9_req_put(m->rreq); ++ p9_req_put(m->client, m->rreq); + m->rreq = NULL; + } + +@@ -492,7 +492,7 @@ static void p9_write_work(struct work_struct *work) + m->wpos += err; + if (m->wpos == m->wsize) { + m->wpos = m->wsize = 0; +- p9_req_put(m->wreq); ++ p9_req_put(m->client, m->wreq); + m->wreq = NULL; + } + +@@ -695,7 +695,7 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) + if (req->status == REQ_STATUS_UNSENT) { + list_del(&req->req_list); + req->status = REQ_STATUS_FLSHD; +- p9_req_put(req); ++ p9_req_put(client, req); + ret = 0; + } + spin_unlock(&client->lock); +@@ -722,7 +722,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) + list_del(&req->req_list); + req->status = REQ_STATUS_FLSHD; + spin_unlock(&client->lock); +- p9_req_put(req); ++ p9_req_put(client, req); + + return 0; + } +@@ -883,12 +883,12 @@ static void p9_conn_destroy(struct p9_conn *m) + p9_mux_poll_stop(m); + cancel_work_sync(&m->rq); + if (m->rreq) { +- p9_req_put(m->rreq); ++ p9_req_put(m->client, m->rreq); + m->rreq = NULL; + } + cancel_work_sync(&m->wq); + if (m->wreq) { +- p9_req_put(m->wreq); ++ p9_req_put(m->client, m->wreq); + m->wreq = NULL; + } + +diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c +index 88e563826674..99d878d70d56 100644 +--- a/net/9p/trans_rdma.c ++++ b/net/9p/trans_rdma.c +@@ -350,7 +350,7 @@ send_done(struct ib_cq *cq, struct ib_wc *wc) + c->busa, c->req->tc.size, + DMA_TO_DEVICE); + up(&rdma->sq_sem); +- p9_req_put(c->req); ++ p9_req_put(client, c->req); + kfree(c); + } + 
+@@ -431,7 +431,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) + if (unlikely(atomic_read(&rdma->excess_rc) > 0)) { + if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) { + /* Got one! */ +- p9_fcall_fini(&req->rc); ++ p9_fcall_fini(client, &req->rc, 1); + req->rc.sdata = NULL; + goto dont_need_post_recv; + } else { +diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c +index b24a4fb0f0a2..147972bf2e79 100644 +--- a/net/9p/trans_virtio.c ++++ b/net/9p/trans_virtio.c +@@ -199,7 +199,7 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req) + /* Reply won't come, so drop req ref */ + static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req) + { +- p9_req_put(req); ++ p9_req_put(client, req); + return 0; + } + +@@ -523,7 +523,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, + kvfree(out_pages); + if (!kicked) { + /* reply won't come */ +- p9_req_put(req); ++ p9_req_put(client, req); + } + return err; + } +diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c +index 77883b6788cd..4cf0c78d4d22 100644 +--- a/net/9p/trans_xen.c ++++ b/net/9p/trans_xen.c +@@ -163,7 +163,7 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) + ring->intf->out_prod = prod; + spin_unlock_irqrestore(&ring->lock, flags); + notify_remote_via_irq(ring->irq); +- p9_req_put(p9_req); ++ p9_req_put(client, p9_req); + + return 0; + } diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c -index 4d1a94736..a2097955d 100644 +index 4d1a947367f9..a2097955dace 100644 --- a/tools/testing/nvdimm/test/ndtest.c +++ b/tools/testing/nvdimm/test/ndtest.c @@ -12,7 +12,7 @@ @@ -90774,5 +91713,5 @@ index 4d1a94736..a2097955d 100644 static DEVICE_ATTR_RO(flags); -- -2.37.0.rc0.15.g3b9a5a33c2 +2.37.1 diff --git a/linux-tkg-patches/5.19/0008-5.19-bcachefs.patch b/linux-tkg-patches/5.19/0008-5.19-bcachefs.patch new file mode 100644 index 0000000..9673d36 --- /dev/null +++ b/linux-tkg-patches/5.19/0008-5.19-bcachefs.patch @@ -0,0 +1,91722 @@ +From db2079ca2ce5000c3dfe3656c1c7f580b053a325 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 22 Jul 2022 14:23:53 +0200 +Subject: [PATCH] 5.19-bcachefs + +Signed-off-by: Peter Jung +--- + .github/ISSUE_TEMPLATE/bug_report.md | 61 + + Documentation/core-api/printk-formats.rst | 22 + + arch/powerpc/kernel/process.c | 16 +- + arch/powerpc/kernel/security.c | 75 +- + arch/powerpc/platforms/pseries/papr_scm.c | 34 +- + arch/x86/kernel/cpu/resctrl/rdtgroup.c | 16 +- + block/bio.c | 34 +- + block/blk-core.c | 1 + + block/blk.h | 1 - + drivers/acpi/apei/erst-dbg.c | 1 + + drivers/block/loop.c | 2 - + drivers/clk/tegra/clk-bpmp.c | 21 +- + drivers/input/joystick/analog.c | 23 +- + drivers/md/bcache/Kconfig | 10 +- + drivers/md/bcache/Makefile | 4 +- + drivers/md/bcache/bcache.h | 2 +- + drivers/md/bcache/super.c | 1 - + drivers/md/bcache/util.h | 3 +- + drivers/pci/p2pdma.c | 21 +- + fs/Kconfig | 1 + + fs/Makefile | 1 + + fs/bcachefs/Kconfig | 59 + + fs/bcachefs/Makefile | 69 + + fs/bcachefs/acl.c | 406 ++ + fs/bcachefs/acl.h | 58 + + fs/bcachefs/alloc_background.c | 1552 ++++++++ + fs/bcachefs/alloc_background.h | 183 + + fs/bcachefs/alloc_foreground.c | 1380 +++++++ + fs/bcachefs/alloc_foreground.h | 181 + + fs/bcachefs/alloc_types.h | 87 + + fs/bcachefs/backpointers.c | 875 ++++ + fs/bcachefs/backpointers.h | 38 + + fs/bcachefs/bcachefs.h | 1000 +++++ + fs/bcachefs/bcachefs_format.h | 2052 ++++++++++ + fs/bcachefs/bcachefs_ioctl.h | 368 ++ + 
fs/bcachefs/bkey.c | 1175 ++++++ + fs/bcachefs/bkey.h | 566 +++ + fs/bcachefs/bkey_buf.h | 60 + + fs/bcachefs/bkey_methods.c | 503 +++ + fs/bcachefs/bkey_methods.h | 175 + + fs/bcachefs/bkey_sort.c | 198 + + fs/bcachefs/bkey_sort.h | 44 + + fs/bcachefs/bset.c | 1598 ++++++++ + fs/bcachefs/bset.h | 615 +++ + fs/bcachefs/btree_cache.c | 1170 ++++++ + fs/bcachefs/btree_cache.h | 107 + + fs/bcachefs/btree_gc.c | 2098 ++++++++++ + fs/bcachefs/btree_gc.h | 112 + + fs/bcachefs/btree_io.c | 2150 ++++++++++ + fs/bcachefs/btree_io.h | 222 ++ + fs/bcachefs/btree_iter.c | 3515 +++++++++++++++++ + fs/bcachefs/btree_iter.h | 556 +++ + fs/bcachefs/btree_key_cache.c | 855 ++++ + fs/bcachefs/btree_key_cache.h | 47 + + fs/bcachefs/btree_locking.h | 289 ++ + fs/bcachefs/btree_types.h | 697 ++++ + fs/bcachefs/btree_update.h | 158 + + fs/bcachefs/btree_update_interior.c | 2266 +++++++++++ + fs/bcachefs/btree_update_interior.h | 321 ++ + fs/bcachefs/btree_update_leaf.c | 1800 +++++++++ + fs/bcachefs/buckets.c | 2113 ++++++++++ + fs/bcachefs/buckets.h | 300 ++ + fs/bcachefs/buckets_types.h | 103 + + fs/bcachefs/buckets_waiting_for_journal.c | 167 + + fs/bcachefs/buckets_waiting_for_journal.h | 15 + + .../buckets_waiting_for_journal_types.h | 23 + + fs/bcachefs/chardev.c | 760 ++++ + fs/bcachefs/chardev.h | 31 + + fs/bcachefs/checksum.c | 712 ++++ + fs/bcachefs/checksum.h | 204 + + fs/bcachefs/clock.c | 191 + + fs/bcachefs/clock.h | 38 + + fs/bcachefs/clock_types.h | 37 + + fs/bcachefs/compress.c | 639 +++ + fs/bcachefs/compress.h | 18 + + fs/bcachefs/counters.c | 107 + + fs/bcachefs/counters.h | 17 + + fs/bcachefs/darray.h | 77 + + fs/bcachefs/data_update.c | 376 ++ + fs/bcachefs/data_update.h | 38 + + fs/bcachefs/debug.c | 764 ++++ + fs/bcachefs/debug.h | 30 + + fs/bcachefs/dirent.c | 565 +++ + fs/bcachefs/dirent.h | 67 + + fs/bcachefs/disk_groups.c | 506 +++ + fs/bcachefs/disk_groups.h | 90 + + fs/bcachefs/ec.c | 1673 ++++++++ + fs/bcachefs/ec.h | 230 ++ + fs/bcachefs/ec_types.h | 46 + + fs/bcachefs/errcode.c | 51 + + fs/bcachefs/errcode.h | 64 + + fs/bcachefs/error.c | 184 + + fs/bcachefs/error.h | 223 ++ + fs/bcachefs/extent_update.c | 178 + + fs/bcachefs/extent_update.h | 12 + + fs/bcachefs/extents.c | 1324 +++++++ + fs/bcachefs/extents.h | 685 ++++ + fs/bcachefs/extents_types.h | 40 + + fs/bcachefs/eytzinger.h | 281 ++ + fs/bcachefs/fifo.h | 127 + + fs/bcachefs/fs-common.c | 496 +++ + fs/bcachefs/fs-common.h | 43 + + fs/bcachefs/fs-io.c | 3496 ++++++++++++++++ + fs/bcachefs/fs-io.h | 56 + + fs/bcachefs/fs-ioctl.c | 523 +++ + fs/bcachefs/fs-ioctl.h | 81 + + fs/bcachefs/fs.c | 1939 +++++++++ + fs/bcachefs/fs.h | 208 + + fs/bcachefs/fsck.c | 2390 +++++++++++ + fs/bcachefs/fsck.h | 8 + + fs/bcachefs/inode.c | 771 ++++ + fs/bcachefs/inode.h | 189 + + fs/bcachefs/io.c | 2422 ++++++++++++ + fs/bcachefs/io.h | 189 + + fs/bcachefs/io_types.h | 161 + + fs/bcachefs/journal.c | 1429 +++++++ + fs/bcachefs/journal.h | 521 +++ + fs/bcachefs/journal_io.c | 1735 ++++++++ + fs/bcachefs/journal_io.h | 59 + + fs/bcachefs/journal_reclaim.c | 852 ++++ + fs/bcachefs/journal_reclaim.h | 86 + + fs/bcachefs/journal_sb.c | 220 ++ + fs/bcachefs/journal_sb.h | 24 + + fs/bcachefs/journal_seq_blacklist.c | 322 ++ + fs/bcachefs/journal_seq_blacklist.h | 22 + + fs/bcachefs/journal_types.h | 340 ++ + fs/bcachefs/keylist.c | 67 + + fs/bcachefs/keylist.h | 76 + + fs/bcachefs/keylist_types.h | 16 + + fs/bcachefs/lru.c | 206 + + fs/bcachefs/lru.h | 19 + + fs/bcachefs/migrate.c | 186 + + fs/bcachefs/migrate.h | 7 + + fs/bcachefs/move.c | 952 
+++++ + fs/bcachefs/move.h | 67 + + fs/bcachefs/move_types.h | 19 + + fs/bcachefs/movinggc.c | 285 ++ + fs/bcachefs/movinggc.h | 10 + + fs/bcachefs/opts.c | 578 +++ + fs/bcachefs/opts.h | 509 +++ + fs/bcachefs/quota.c | 823 ++++ + fs/bcachefs/quota.h | 71 + + fs/bcachefs/quota_types.h | 43 + + fs/bcachefs/rebalance.c | 361 ++ + fs/bcachefs/rebalance.h | 28 + + fs/bcachefs/rebalance_types.h | 26 + + fs/bcachefs/recovery.c | 1597 ++++++++ + fs/bcachefs/recovery.h | 58 + + fs/bcachefs/reflink.c | 422 ++ + fs/bcachefs/reflink.h | 76 + + fs/bcachefs/replicas.c | 1073 +++++ + fs/bcachefs/replicas.h | 106 + + fs/bcachefs/replicas_types.h | 10 + + fs/bcachefs/siphash.c | 173 + + fs/bcachefs/siphash.h | 87 + + fs/bcachefs/str_hash.h | 351 ++ + fs/bcachefs/subvolume.c | 1108 ++++++ + fs/bcachefs/subvolume.h | 137 + + fs/bcachefs/subvolume_types.h | 9 + + fs/bcachefs/super-io.c | 1602 ++++++++ + fs/bcachefs/super-io.h | 126 + + fs/bcachefs/super.c | 1950 +++++++++ + fs/bcachefs/super.h | 264 ++ + fs/bcachefs/super_types.h | 51 + + fs/bcachefs/sysfs.c | 943 +++++ + fs/bcachefs/sysfs.h | 48 + + fs/bcachefs/tests.c | 976 +++++ + fs/bcachefs/tests.h | 15 + + fs/bcachefs/trace.c | 12 + + fs/bcachefs/util.c | 964 +++++ + fs/bcachefs/util.h | 783 ++++ + fs/bcachefs/varint.c | 121 + + fs/bcachefs/varint.h | 11 + + fs/bcachefs/vstructs.h | 63 + + fs/bcachefs/xattr.c | 648 +++ + fs/bcachefs/xattr.h | 50 + + fs/d_path.c | 35 + + fs/dcache.c | 10 +- + fs/inode.c | 218 +- + include/linux/bio.h | 7 +- + include/linux/blkdev.h | 1 + + .../md/bcache => include/linux}/closure.h | 39 +- + include/linux/compiler_attributes.h | 5 + + include/linux/dcache.h | 2 + + include/linux/exportfs.h | 6 + + include/linux/fs.h | 9 +- + include/linux/generic-radix-tree.h | 68 +- + include/linux/kernel.h | 12 + + include/linux/list_bl.h | 22 + + include/linux/lockdep.h | 4 + + include/linux/pretty-printers.h | 10 + + include/linux/printbuf.h | 283 ++ + include/linux/sched.h | 1 + + include/linux/seq_buf.h | 162 - + include/linux/shrinker.h | 8 + + include/linux/six.h | 203 + + include/linux/string.h | 5 + + include/linux/string_helpers.h | 8 +- + include/linux/trace_events.h | 2 +- + include/linux/trace_seq.h | 17 +- + include/linux/vmalloc.h | 1 + + include/net/9p/9p.h | 2 +- + include/net/9p/client.h | 20 +- + include/trace/events/bcachefs.h | 1048 +++++ + init/init_task.c | 1 + + kernel/Kconfig.locks | 3 + + kernel/locking/Makefile | 1 + + kernel/locking/lockdep.c | 20 + + kernel/locking/six.c | 759 ++++ + kernel/module/main.c | 4 +- + kernel/stacktrace.c | 2 + + kernel/trace/trace.c | 45 +- + kernel/trace/trace_dynevent.c | 34 +- + kernel/trace/trace_events_filter.c | 2 +- + kernel/trace/trace_events_synth.c | 32 +- + kernel/trace/trace_functions_graph.c | 6 +- + kernel/trace/trace_kprobe.c | 2 +- + kernel/trace/trace_seq.c | 111 +- + lib/Kconfig | 3 + + lib/Kconfig.debug | 9 + + lib/Makefile | 8 +- + {drivers/md/bcache => lib}/closure.c | 35 +- + lib/errname.c | 1 + + lib/generic-radix-tree.c | 76 +- + lib/hexdump.c | 246 +- + lib/pretty-printers.c | 60 + + lib/printbuf.c | 258 ++ + lib/seq_buf.c | 397 -- + lib/string_helpers.c | 224 +- + lib/test_hexdump.c | 30 +- + lib/test_printf.c | 33 +- + lib/vsprintf.c | 1740 ++++---- + mm/Makefile | 2 +- + mm/filemap.c | 1 + + mm/memcontrol.c | 68 +- + mm/nommu.c | 18 + + mm/oom_kill.c | 23 - + {lib => mm}/show_mem.c | 8 + + mm/slab.h | 6 +- + mm/slab_common.c | 53 +- + mm/vmalloc.c | 21 + + mm/vmscan.c | 88 + + net/9p/client.c | 97 +- + net/9p/trans_fd.c | 12 +- + net/9p/trans_rdma.c | 
4 +- + net/9p/trans_virtio.c | 4 +- + net/9p/trans_xen.c | 2 +- + tools/testing/nvdimm/test/ndtest.c | 22 +- + 248 files changed, 84382 insertions(+), 2223 deletions(-) + create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md + create mode 100644 fs/bcachefs/Kconfig + create mode 100644 fs/bcachefs/Makefile + create mode 100644 fs/bcachefs/acl.c + create mode 100644 fs/bcachefs/acl.h + create mode 100644 fs/bcachefs/alloc_background.c + create mode 100644 fs/bcachefs/alloc_background.h + create mode 100644 fs/bcachefs/alloc_foreground.c + create mode 100644 fs/bcachefs/alloc_foreground.h + create mode 100644 fs/bcachefs/alloc_types.h + create mode 100644 fs/bcachefs/backpointers.c + create mode 100644 fs/bcachefs/backpointers.h + create mode 100644 fs/bcachefs/bcachefs.h + create mode 100644 fs/bcachefs/bcachefs_format.h + create mode 100644 fs/bcachefs/bcachefs_ioctl.h + create mode 100644 fs/bcachefs/bkey.c + create mode 100644 fs/bcachefs/bkey.h + create mode 100644 fs/bcachefs/bkey_buf.h + create mode 100644 fs/bcachefs/bkey_methods.c + create mode 100644 fs/bcachefs/bkey_methods.h + create mode 100644 fs/bcachefs/bkey_sort.c + create mode 100644 fs/bcachefs/bkey_sort.h + create mode 100644 fs/bcachefs/bset.c + create mode 100644 fs/bcachefs/bset.h + create mode 100644 fs/bcachefs/btree_cache.c + create mode 100644 fs/bcachefs/btree_cache.h + create mode 100644 fs/bcachefs/btree_gc.c + create mode 100644 fs/bcachefs/btree_gc.h + create mode 100644 fs/bcachefs/btree_io.c + create mode 100644 fs/bcachefs/btree_io.h + create mode 100644 fs/bcachefs/btree_iter.c + create mode 100644 fs/bcachefs/btree_iter.h + create mode 100644 fs/bcachefs/btree_key_cache.c + create mode 100644 fs/bcachefs/btree_key_cache.h + create mode 100644 fs/bcachefs/btree_locking.h + create mode 100644 fs/bcachefs/btree_types.h + create mode 100644 fs/bcachefs/btree_update.h + create mode 100644 fs/bcachefs/btree_update_interior.c + create mode 100644 fs/bcachefs/btree_update_interior.h + create mode 100644 fs/bcachefs/btree_update_leaf.c + create mode 100644 fs/bcachefs/buckets.c + create mode 100644 fs/bcachefs/buckets.h + create mode 100644 fs/bcachefs/buckets_types.h + create mode 100644 fs/bcachefs/buckets_waiting_for_journal.c + create mode 100644 fs/bcachefs/buckets_waiting_for_journal.h + create mode 100644 fs/bcachefs/buckets_waiting_for_journal_types.h + create mode 100644 fs/bcachefs/chardev.c + create mode 100644 fs/bcachefs/chardev.h + create mode 100644 fs/bcachefs/checksum.c + create mode 100644 fs/bcachefs/checksum.h + create mode 100644 fs/bcachefs/clock.c + create mode 100644 fs/bcachefs/clock.h + create mode 100644 fs/bcachefs/clock_types.h + create mode 100644 fs/bcachefs/compress.c + create mode 100644 fs/bcachefs/compress.h + create mode 100644 fs/bcachefs/counters.c + create mode 100644 fs/bcachefs/counters.h + create mode 100644 fs/bcachefs/darray.h + create mode 100644 fs/bcachefs/data_update.c + create mode 100644 fs/bcachefs/data_update.h + create mode 100644 fs/bcachefs/debug.c + create mode 100644 fs/bcachefs/debug.h + create mode 100644 fs/bcachefs/dirent.c + create mode 100644 fs/bcachefs/dirent.h + create mode 100644 fs/bcachefs/disk_groups.c + create mode 100644 fs/bcachefs/disk_groups.h + create mode 100644 fs/bcachefs/ec.c + create mode 100644 fs/bcachefs/ec.h + create mode 100644 fs/bcachefs/ec_types.h + create mode 100644 fs/bcachefs/errcode.c + create mode 100644 fs/bcachefs/errcode.h + create mode 100644 fs/bcachefs/error.c + create mode 100644 fs/bcachefs/error.h + create 
mode 100644 fs/bcachefs/extent_update.c + create mode 100644 fs/bcachefs/extent_update.h + create mode 100644 fs/bcachefs/extents.c + create mode 100644 fs/bcachefs/extents.h + create mode 100644 fs/bcachefs/extents_types.h + create mode 100644 fs/bcachefs/eytzinger.h + create mode 100644 fs/bcachefs/fifo.h + create mode 100644 fs/bcachefs/fs-common.c + create mode 100644 fs/bcachefs/fs-common.h + create mode 100644 fs/bcachefs/fs-io.c + create mode 100644 fs/bcachefs/fs-io.h + create mode 100644 fs/bcachefs/fs-ioctl.c + create mode 100644 fs/bcachefs/fs-ioctl.h + create mode 100644 fs/bcachefs/fs.c + create mode 100644 fs/bcachefs/fs.h + create mode 100644 fs/bcachefs/fsck.c + create mode 100644 fs/bcachefs/fsck.h + create mode 100644 fs/bcachefs/inode.c + create mode 100644 fs/bcachefs/inode.h + create mode 100644 fs/bcachefs/io.c + create mode 100644 fs/bcachefs/io.h + create mode 100644 fs/bcachefs/io_types.h + create mode 100644 fs/bcachefs/journal.c + create mode 100644 fs/bcachefs/journal.h + create mode 100644 fs/bcachefs/journal_io.c + create mode 100644 fs/bcachefs/journal_io.h + create mode 100644 fs/bcachefs/journal_reclaim.c + create mode 100644 fs/bcachefs/journal_reclaim.h + create mode 100644 fs/bcachefs/journal_sb.c + create mode 100644 fs/bcachefs/journal_sb.h + create mode 100644 fs/bcachefs/journal_seq_blacklist.c + create mode 100644 fs/bcachefs/journal_seq_blacklist.h + create mode 100644 fs/bcachefs/journal_types.h + create mode 100644 fs/bcachefs/keylist.c + create mode 100644 fs/bcachefs/keylist.h + create mode 100644 fs/bcachefs/keylist_types.h + create mode 100644 fs/bcachefs/lru.c + create mode 100644 fs/bcachefs/lru.h + create mode 100644 fs/bcachefs/migrate.c + create mode 100644 fs/bcachefs/migrate.h + create mode 100644 fs/bcachefs/move.c + create mode 100644 fs/bcachefs/move.h + create mode 100644 fs/bcachefs/move_types.h + create mode 100644 fs/bcachefs/movinggc.c + create mode 100644 fs/bcachefs/movinggc.h + create mode 100644 fs/bcachefs/opts.c + create mode 100644 fs/bcachefs/opts.h + create mode 100644 fs/bcachefs/quota.c + create mode 100644 fs/bcachefs/quota.h + create mode 100644 fs/bcachefs/quota_types.h + create mode 100644 fs/bcachefs/rebalance.c + create mode 100644 fs/bcachefs/rebalance.h + create mode 100644 fs/bcachefs/rebalance_types.h + create mode 100644 fs/bcachefs/recovery.c + create mode 100644 fs/bcachefs/recovery.h + create mode 100644 fs/bcachefs/reflink.c + create mode 100644 fs/bcachefs/reflink.h + create mode 100644 fs/bcachefs/replicas.c + create mode 100644 fs/bcachefs/replicas.h + create mode 100644 fs/bcachefs/replicas_types.h + create mode 100644 fs/bcachefs/siphash.c + create mode 100644 fs/bcachefs/siphash.h + create mode 100644 fs/bcachefs/str_hash.h + create mode 100644 fs/bcachefs/subvolume.c + create mode 100644 fs/bcachefs/subvolume.h + create mode 100644 fs/bcachefs/subvolume_types.h + create mode 100644 fs/bcachefs/super-io.c + create mode 100644 fs/bcachefs/super-io.h + create mode 100644 fs/bcachefs/super.c + create mode 100644 fs/bcachefs/super.h + create mode 100644 fs/bcachefs/super_types.h + create mode 100644 fs/bcachefs/sysfs.c + create mode 100644 fs/bcachefs/sysfs.h + create mode 100644 fs/bcachefs/tests.c + create mode 100644 fs/bcachefs/tests.h + create mode 100644 fs/bcachefs/trace.c + create mode 100644 fs/bcachefs/util.c + create mode 100644 fs/bcachefs/util.h + create mode 100644 fs/bcachefs/varint.c + create mode 100644 fs/bcachefs/varint.h + create mode 100644 fs/bcachefs/vstructs.h + create mode 
100644 fs/bcachefs/xattr.c + create mode 100644 fs/bcachefs/xattr.h + rename {drivers/md/bcache => include/linux}/closure.h (94%) + create mode 100644 include/linux/pretty-printers.h + create mode 100644 include/linux/printbuf.h + delete mode 100644 include/linux/seq_buf.h + create mode 100644 include/linux/six.h + create mode 100644 include/trace/events/bcachefs.h + create mode 100644 kernel/locking/six.c + rename {drivers/md/bcache => lib}/closure.c (88%) + create mode 100644 lib/pretty-printers.c + create mode 100644 lib/printbuf.c + delete mode 100644 lib/seq_buf.c + rename {lib => mm}/show_mem.c (83%) + +diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md +new file mode 100644 +index 000000000000..8af34357dd98 +--- /dev/null ++++ b/.github/ISSUE_TEMPLATE/bug_report.md +@@ -0,0 +1,61 @@ ++--- ++name: Bug report ++about: Create a report to help us improve ++title: " [short commit id]" ++labels: bug ++assignees: YellowOnion ++ ++--- ++ ++**Please search for duplicates** ++ ++**Version** ++ ++Make sure you're using a reasonably new version. ++ ++Provide the commit hash from the kernel version (preferable) or tools, don't say "I'm using the latest master" as that will very quickly become out of date. ++ ++**Generic info** ++Provide the output of: ++``` ++bcachefs fs usage ++bcachefs show-super ++``` ++**Tools bugs** ++ ++* pull the latest version, compile it, do not strip the binary. ++* provide the exact commands you used to run. ++* run with gdb: `gdb -ex run --args ./bcacehfs ` ++ ++If you get an assert/segfault etc: ++* type `bt` in to and provide the output here. ++ ++If the tools lockup: ++* run `perf top -p $(pidof bcachefs)` and provide a screenshot. ++* press ctrl+c to interrupt the process and provide the output of `bt`. ++ ++**Kernel bugs** ++Compile the kernel with these flags: ++ ++``` ++CONFIG_PREEMPT=y ++CONFIG_BCACHEFS_DEBUG=y ++CONFIG_KALLSYMS=y ++CONFIG_KALLSYMS_ALL=y ++CONFIG_DEBUG_FS=y ++CONFIG_DYNAMIC_FTRACE=y ++CONFIG_FTRACE=y ++``` ++Provide the output of `dmesg` either in a paste-bin or as attachment, if less than 30~ lines just provide inline here. ++ ++ ++**Optional Advanced** ++ ++If lockup or performance issues: ++* run `perf record` and `perf record -e 'bcachefs:*' -o events.data` both during the window of issue and then ctrl+c. ++* run `perf archive` to dump symbols. ++* archive, compress and upload the files: `perf.data`, `events.data` and `perf.data.tar.bz2`. ++ ++Upload large files to a file storage provider: ++* provide the output of `bcachefs list_journal -a | zstd -f -T0 -o ../journal.log.zst` ++*compress & upload all the `metdata.dump.*` files from: bcachefs dump -o metadata.dump +diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst +index 5e89497ba314..4f4a35b3aadc 100644 +--- a/Documentation/core-api/printk-formats.rst ++++ b/Documentation/core-api/printk-formats.rst +@@ -625,6 +625,28 @@ Examples:: + %p4cc Y10 little-endian (0x20303159) + %p4cc NV12 big-endian (0xb231564e) + ++Calling a pretty printer function ++--------------------------------- ++ ++:: ++ ++ %pf(%p) pretty printer function taking one argument ++ %pf(%p,%p) pretty printer function taking two arguments ++ ++For calling generic pretty printers. A pretty printer is a function that takes ++as its first argument a pointer to a printbuf, and then zero or more additional ++pointer arguments. 
For example: ++ ++ void foo_to_text(struct printbuf *out, struct foo *foo) ++ { ++ pr_buf(out, "bar=%u baz=%u", foo->bar, foo->baz); ++ } ++ ++ printf("%pf(%p)", CALL_PP(foo_to_text, foo)); ++ ++Note that a pretty-printer may not sleep if called from printk(). If called from ++pr_buf() or sprintf() there are no such restrictions. ++ + Thanks + ====== + +diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c +index 0fbda89cd1bb..05654dbeb2c4 100644 +--- a/arch/powerpc/kernel/process.c ++++ b/arch/powerpc/kernel/process.c +@@ -37,7 +37,7 @@ + #include + #include + #include +-#include ++#include + + #include + #include +@@ -1396,32 +1396,30 @@ void show_user_instructions(struct pt_regs *regs) + { + unsigned long pc; + int n = NR_INSN_TO_PRINT; +- struct seq_buf s; + char buf[96]; /* enough for 8 times 9 + 2 chars */ ++ struct printbuf s = PRINTBUF_EXTERN(buf, sizeof(buf)); + + pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int)); + +- seq_buf_init(&s, buf, sizeof(buf)); +- + while (n) { + int i; + +- seq_buf_clear(&s); ++ printbuf_reset(&s); + + for (i = 0; i < 8 && n; i++, n--, pc += sizeof(int)) { + int instr; + + if (copy_from_user_nofault(&instr, (void __user *)pc, + sizeof(instr))) { +- seq_buf_printf(&s, "XXXXXXXX "); ++ prt_printf(&s, "XXXXXXXX "); + continue; + } +- seq_buf_printf(&s, regs->nip == pc ? "<%08x> " : "%08x ", instr); ++ prt_printf(&s, regs->nip == pc ? "<%08x> " : "%08x ", instr); + } + +- if (!seq_buf_has_overflowed(&s)) ++ if (printbuf_remaining(&s)) + pr_info("%s[%d]: code: %s\n", current->comm, +- current->pid, s.buffer); ++ current->pid, s.buf); + } + } + +diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c +index d96fd14bd7c9..b34de62e65ce 100644 +--- a/arch/powerpc/kernel/security.c ++++ b/arch/powerpc/kernel/security.c +@@ -10,7 +10,7 @@ + #include + #include + #include +-#include ++#include + #include + + #include +@@ -144,31 +144,28 @@ void __init setup_spectre_v2(void) + #ifdef CONFIG_PPC_BOOK3S_64 + ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) + { ++ struct printbuf s = PRINTBUF_EXTERN(buf, PAGE_SIZE); + bool thread_priv; + + thread_priv = security_ftr_enabled(SEC_FTR_L1D_THREAD_PRIV); + + if (rfi_flush) { +- struct seq_buf s; +- seq_buf_init(&s, buf, PAGE_SIZE - 1); + +- seq_buf_printf(&s, "Mitigation: RFI Flush"); ++ prt_printf(&s, "Mitigation: RFI Flush"); + if (thread_priv) +- seq_buf_printf(&s, ", L1D private per thread"); +- +- seq_buf_printf(&s, "\n"); +- +- return s.len; ++ prt_printf(&s, ", L1D private per thread"); ++ ++ prt_printf(&s, "\n"); ++ } else if (thread_priv) { ++ prt_printf(&s, "Vulnerable: L1D private per thread\n"); ++ } else if (!security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV) && ++ !security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR)) { ++ prt_printf(&s, "Not affected\n"); ++ } else { ++ prt_printf(&s, "Vulnerable\n"); + } + +- if (thread_priv) +- return sprintf(buf, "Vulnerable: L1D private per thread\n"); +- +- if (!security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV) && +- !security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR)) +- return sprintf(buf, "Not affected\n"); +- +- return sprintf(buf, "Vulnerable\n"); ++ return printbuf_written(&s); + } + + ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf) +@@ -179,70 +176,66 @@ ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *b + + ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) + { +- struct seq_buf s; +- +- seq_buf_init(&s, 
buf, PAGE_SIZE - 1); ++ struct printbuf s = PRINTBUF_EXTERN(buf, PAGE_SIZE); + + if (security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR)) { + if (barrier_nospec_enabled) +- seq_buf_printf(&s, "Mitigation: __user pointer sanitization"); ++ prt_printf(&s, "Mitigation: __user pointer sanitization"); + else +- seq_buf_printf(&s, "Vulnerable"); ++ prt_printf(&s, "Vulnerable"); + + if (security_ftr_enabled(SEC_FTR_SPEC_BAR_ORI31)) +- seq_buf_printf(&s, ", ori31 speculation barrier enabled"); ++ prt_printf(&s, ", ori31 speculation barrier enabled"); + +- seq_buf_printf(&s, "\n"); ++ prt_printf(&s, "\n"); + } else +- seq_buf_printf(&s, "Not affected\n"); ++ prt_printf(&s, "Not affected\n"); + +- return s.len; ++ return printbuf_written(&s); + } + + ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) + { +- struct seq_buf s; ++ struct printbuf s = PRINTBUF_EXTERN(buf, PAGE_SIZE); + bool bcs, ccd; + +- seq_buf_init(&s, buf, PAGE_SIZE - 1); +- + bcs = security_ftr_enabled(SEC_FTR_BCCTRL_SERIALISED); + ccd = security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED); + + if (bcs || ccd) { +- seq_buf_printf(&s, "Mitigation: "); ++ prt_printf(&s, "Mitigation: "); + + if (bcs) +- seq_buf_printf(&s, "Indirect branch serialisation (kernel only)"); ++ prt_printf(&s, "Indirect branch serialisation (kernel only)"); + + if (bcs && ccd) +- seq_buf_printf(&s, ", "); ++ prt_printf(&s, ", "); + + if (ccd) +- seq_buf_printf(&s, "Indirect branch cache disabled"); ++ prt_printf(&s, "Indirect branch cache disabled"); + + } else if (count_cache_flush_type != BRANCH_CACHE_FLUSH_NONE) { +- seq_buf_printf(&s, "Mitigation: Software count cache flush"); ++ prt_printf(&s, "Mitigation: Software count cache flush"); + + if (count_cache_flush_type == BRANCH_CACHE_FLUSH_HW) +- seq_buf_printf(&s, " (hardware accelerated)"); ++ prt_printf(&s, " (hardware accelerated)"); + + } else if (btb_flush_enabled) { +- seq_buf_printf(&s, "Mitigation: Branch predictor state flush"); ++ prt_printf(&s, "Mitigation: Branch predictor state flush"); + } else { +- seq_buf_printf(&s, "Vulnerable"); ++ prt_printf(&s, "Vulnerable"); + } + + if (bcs || ccd || count_cache_flush_type != BRANCH_CACHE_FLUSH_NONE) { + if (link_stack_flush_type != BRANCH_CACHE_FLUSH_NONE) +- seq_buf_printf(&s, ", Software link stack flush"); ++ prt_printf(&s, ", Software link stack flush"); + if (link_stack_flush_type == BRANCH_CACHE_FLUSH_HW) +- seq_buf_printf(&s, " (hardware accelerated)"); ++ prt_printf(&s, " (hardware accelerated)"); + } + +- seq_buf_printf(&s, "\n"); ++ prt_printf(&s, "\n"); + +- return s.len; ++ return printbuf_written(&s); + } + + #ifdef CONFIG_PPC_BOOK3S_64 +diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c +index 82cae08976bc..fe2b41858b5f 100644 +--- a/arch/powerpc/platforms/pseries/papr_scm.c ++++ b/arch/powerpc/platforms/pseries/papr_scm.c +@@ -12,7 +12,7 @@ + #include + #include + #include +-#include ++#include + #include + + #include +@@ -1142,7 +1142,7 @@ static ssize_t perf_stats_show(struct device *dev, + { + int index; + ssize_t rc; +- struct seq_buf s; ++ struct printbuf s = PRINTBUF_EXTERN(buf, PAGE_SIZE); + struct papr_scm_perf_stat *stat; + struct papr_scm_perf_stats *stats; + struct nvdimm *dimm = to_nvdimm(dev); +@@ -1165,18 +1165,17 @@ static ssize_t perf_stats_show(struct device *dev, + * values. Since stat_id is essentially a char string of + * 8 bytes, simply use the string format specifier to print it. 
+ */ +- seq_buf_init(&s, buf, PAGE_SIZE); + for (index = 0, stat = stats->scm_statistic; + index < be32_to_cpu(stats->num_statistics); + ++index, ++stat) { +- seq_buf_printf(&s, "%.8s = 0x%016llX\n", +- stat->stat_id, +- be64_to_cpu(stat->stat_val)); ++ prt_printf(&s, "%.8s = 0x%016llX\n", ++ stat->stat_id, ++ be64_to_cpu(stat->stat_val)); + } + + free_stats: + kfree(stats); +- return rc ? rc : (ssize_t)seq_buf_used(&s); ++ return rc ?: printbuf_written(&s); + } + static DEVICE_ATTR_ADMIN_RO(perf_stats); + +@@ -1185,7 +1184,7 @@ static ssize_t flags_show(struct device *dev, + { + struct nvdimm *dimm = to_nvdimm(dev); + struct papr_scm_priv *p = nvdimm_provider_data(dimm); +- struct seq_buf s; ++ struct printbuf s = PRINTBUF_EXTERN(buf, PAGE_SIZE); + u64 health; + int rc; + +@@ -1196,29 +1195,28 @@ static ssize_t flags_show(struct device *dev, + /* Copy health_bitmap locally, check masks & update out buffer */ + health = READ_ONCE(p->health_bitmap); + +- seq_buf_init(&s, buf, PAGE_SIZE); + if (health & PAPR_PMEM_UNARMED_MASK) +- seq_buf_printf(&s, "not_armed "); ++ prt_printf(&s, "not_armed "); + + if (health & PAPR_PMEM_BAD_SHUTDOWN_MASK) +- seq_buf_printf(&s, "flush_fail "); ++ prt_printf(&s, "flush_fail "); + + if (health & PAPR_PMEM_BAD_RESTORE_MASK) +- seq_buf_printf(&s, "restore_fail "); ++ prt_printf(&s, "restore_fail "); + + if (health & PAPR_PMEM_ENCRYPTED) +- seq_buf_printf(&s, "encrypted "); ++ prt_printf(&s, "encrypted "); + + if (health & PAPR_PMEM_SMART_EVENT_MASK) +- seq_buf_printf(&s, "smart_notify "); ++ prt_printf(&s, "smart_notify "); + + if (health & PAPR_PMEM_SCRUBBED_AND_LOCKED) +- seq_buf_printf(&s, "scrubbed locked "); ++ prt_printf(&s, "scrubbed locked "); + +- if (seq_buf_used(&s)) +- seq_buf_printf(&s, "\n"); ++ if (printbuf_written(&s)) ++ prt_printf(&s, "\n"); + +- return seq_buf_used(&s); ++ return printbuf_written(&s); + } + DEVICE_ATTR_RO(flags); + +diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c +index f276aff521e8..50c12711a249 100644 +--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c ++++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c +@@ -19,7 +19,7 @@ + #include + #include + #include +-#include ++#include + #include + #include + #include +@@ -51,7 +51,7 @@ static struct kernfs_node *kn_mongrp; + /* Kernel fs node for "mon_data" directory under root */ + static struct kernfs_node *kn_mondata; + +-static struct seq_buf last_cmd_status; ++static struct printbuf last_cmd_status; + static char last_cmd_status_buf[512]; + + struct dentry *debugfs_resctrl; +@@ -59,13 +59,13 @@ struct dentry *debugfs_resctrl; + void rdt_last_cmd_clear(void) + { + lockdep_assert_held(&rdtgroup_mutex); +- seq_buf_clear(&last_cmd_status); ++ printbuf_reset(&last_cmd_status); + } + + void rdt_last_cmd_puts(const char *s) + { + lockdep_assert_held(&rdtgroup_mutex); +- seq_buf_puts(&last_cmd_status, s); ++ prt_str(&last_cmd_status, s); + } + + void rdt_last_cmd_printf(const char *fmt, ...) +@@ -74,7 +74,7 @@ void rdt_last_cmd_printf(const char *fmt, ...) 
+ + va_start(ap, fmt); + lockdep_assert_held(&rdtgroup_mutex); +- seq_buf_vprintf(&last_cmd_status, fmt, ap); ++ prt_vprintf(&last_cmd_status, fmt, ap); + va_end(ap); + } + +@@ -833,7 +833,7 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of, + int len; + + mutex_lock(&rdtgroup_mutex); +- len = seq_buf_used(&last_cmd_status); ++ len = printbuf_written(&last_cmd_status); + if (len) + seq_printf(seq, "%.*s", len, last_cmd_status_buf); + else +@@ -3248,8 +3248,8 @@ int __init rdtgroup_init(void) + { + int ret = 0; + +- seq_buf_init(&last_cmd_status, last_cmd_status_buf, +- sizeof(last_cmd_status_buf)); ++ last_cmd_status = PRINTBUF_EXTERN(last_cmd_status_buf, ++ sizeof(last_cmd_status_buf)); + + ret = rdtgroup_setup_root(); + if (ret) +diff --git a/block/bio.c b/block/bio.c +index 51c99f2c5c90..2d0d7f13d59a 100644 +--- a/block/bio.c ++++ b/block/bio.c +@@ -582,15 +582,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) + } + EXPORT_SYMBOL(bio_kmalloc); + +-void zero_fill_bio(struct bio *bio) ++void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) + { + struct bio_vec bv; + struct bvec_iter iter; + +- bio_for_each_segment(bv, bio, iter) ++ __bio_for_each_segment(bv, bio, iter, start) + memzero_bvec(&bv); + } +-EXPORT_SYMBOL(zero_fill_bio); ++EXPORT_SYMBOL(zero_fill_bio_iter); + + /** + * bio_truncate - truncate the bio to small size of @new_size +@@ -1363,17 +1363,27 @@ EXPORT_SYMBOL(__bio_advance); + void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, + struct bio *src, struct bvec_iter *src_iter) + { ++ struct bio_vec src_bv, dst_bv; ++ void *src_p, *dst_p; ++ unsigned bytes; ++ + while (src_iter->bi_size && dst_iter->bi_size) { +- struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); +- struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); +- unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); +- void *src_buf = bvec_kmap_local(&src_bv); +- void *dst_buf = bvec_kmap_local(&dst_bv); ++ src_bv = bio_iter_iovec(src, *src_iter); ++ dst_bv = bio_iter_iovec(dst, *dst_iter); ++ ++ bytes = min(src_bv.bv_len, dst_bv.bv_len); ++ ++ src_p = kmap_atomic(src_bv.bv_page); ++ dst_p = kmap_atomic(dst_bv.bv_page); ++ ++ memcpy(dst_p + dst_bv.bv_offset, ++ src_p + src_bv.bv_offset, ++ bytes); + +- memcpy(dst_buf, src_buf, bytes); ++ kunmap_atomic(dst_p); ++ kunmap_atomic(src_p); + +- kunmap_local(dst_buf); +- kunmap_local(src_buf); ++ flush_dcache_page(dst_bv.bv_page); + + bio_advance_iter_single(src, src_iter, bytes); + bio_advance_iter_single(dst, dst_iter, bytes); +@@ -1447,6 +1457,7 @@ void bio_set_pages_dirty(struct bio *bio) + set_page_dirty_lock(bvec->bv_page); + } + } ++EXPORT_SYMBOL_GPL(bio_set_pages_dirty); + + /* + * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. 
+@@ -1506,6 +1517,7 @@ void bio_check_pages_dirty(struct bio *bio) + spin_unlock_irqrestore(&bio_dirty_lock, flags); + schedule_work(&bio_dirty_work); + } ++EXPORT_SYMBOL_GPL(bio_check_pages_dirty); + + static inline bool bio_remaining_done(struct bio *bio) + { +diff --git a/block/blk-core.c b/block/blk-core.c +index 27fb1357ad4b..7697abda9fad 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -207,6 +207,7 @@ const char *blk_status_to_str(blk_status_t status) + return ""; + return blk_errors[idx].name; + } ++EXPORT_SYMBOL_GPL(blk_status_to_str); + + /** + * blk_sync_queue - cancel any pending callbacks on a queue +diff --git a/block/blk.h b/block/blk.h +index 434017701403..066fd89c916b 100644 +--- a/block/blk.h ++++ b/block/blk.h +@@ -240,7 +240,6 @@ static inline void blk_integrity_del(struct gendisk *disk) + + unsigned long blk_rq_timeout(unsigned long timeout); + void blk_add_timer(struct request *req); +-const char *blk_status_to_str(blk_status_t status); + + bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs); +diff --git a/drivers/acpi/apei/erst-dbg.c b/drivers/acpi/apei/erst-dbg.c +index 8bc71cdc2270..370993c9c381 100644 +--- a/drivers/acpi/apei/erst-dbg.c ++++ b/drivers/acpi/apei/erst-dbg.c +@@ -11,6 +11,7 @@ + * Author: Huang Ying + */ + ++#include + #include + #include + #include +diff --git a/drivers/block/loop.c b/drivers/block/loop.c +index 084f9b8a0ba3..7a420623ac38 100644 +--- a/drivers/block/loop.c ++++ b/drivers/block/loop.c +@@ -1166,8 +1166,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) + if (!release) + blk_mq_unfreeze_queue(lo->lo_queue); + +- disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); +- + if (lo->lo_flags & LO_FLAGS_PARTSCAN) { + int err; + +diff --git a/drivers/clk/tegra/clk-bpmp.c b/drivers/clk/tegra/clk-bpmp.c +index 3748a39dae7c..7e3b48ed9d45 100644 +--- a/drivers/clk/tegra/clk-bpmp.c ++++ b/drivers/clk/tegra/clk-bpmp.c +@@ -5,7 +5,7 @@ + + #include + #include +-#include ++#include + #include + + #include +@@ -365,39 +365,38 @@ static void tegra_bpmp_clk_info_dump(struct tegra_bpmp *bpmp, + const struct tegra_bpmp_clk_info *info) + { + const char *prefix = ""; +- struct seq_buf buf; ++ struct printbuf buf = PRINTBUF; + unsigned int i; +- char flags[64]; +- +- seq_buf_init(&buf, flags, sizeof(flags)); + + if (info->flags) +- seq_buf_printf(&buf, "("); ++ prt_printf(&buf, "("); + + if (info->flags & TEGRA_BPMP_CLK_HAS_MUX) { +- seq_buf_printf(&buf, "%smux", prefix); ++ prt_printf(&buf, "%smux", prefix); + prefix = ", "; + } + + if ((info->flags & TEGRA_BPMP_CLK_HAS_SET_RATE) == 0) { +- seq_buf_printf(&buf, "%sfixed", prefix); ++ prt_printf(&buf, "%sfixed", prefix); + prefix = ", "; + } + + if (info->flags & TEGRA_BPMP_CLK_IS_ROOT) { +- seq_buf_printf(&buf, "%sroot", prefix); ++ prt_printf(&buf, "%sroot", prefix); + prefix = ", "; + } + + if (info->flags) +- seq_buf_printf(&buf, ")"); ++ prt_printf(&buf, ")"); + + dev_printk(level, bpmp->dev, "%03u: %s\n", info->id, info->name); +- dev_printk(level, bpmp->dev, " flags: %lx %s\n", info->flags, flags); ++ dev_printk(level, bpmp->dev, " flags: %lx %s\n", info->flags, printbuf_str(&buf)); + dev_printk(level, bpmp->dev, " parents: %u\n", info->num_parents); + + for (i = 0; i < info->num_parents; i++) + dev_printk(level, bpmp->dev, " %03u\n", info->parents[i]); ++ ++ printbuf_exit(&buf); + } + + static int tegra_bpmp_probe_clocks(struct tegra_bpmp *bpmp, +diff --git a/drivers/input/joystick/analog.c 
b/drivers/input/joystick/analog.c +index 3088c5b829f0..a8c5f90e8208 100644 +--- a/drivers/input/joystick/analog.c ++++ b/drivers/input/joystick/analog.c +@@ -19,7 +19,7 @@ + #include + #include + #include +-#include ++#include + #include + #include + +@@ -339,24 +339,21 @@ static void analog_calibrate_timer(struct analog_port *port) + + static void analog_name(struct analog *analog) + { +- struct seq_buf s; ++ struct printbuf buf = PRINTBUF_EXTERN(analog->name, sizeof(analog->name)); + +- seq_buf_init(&s, analog->name, sizeof(analog->name)); +- seq_buf_printf(&s, "Analog %d-axis %d-button", +- hweight8(analog->mask & ANALOG_AXES_STD), +- hweight8(analog->mask & ANALOG_BTNS_STD) + !!(analog->mask & ANALOG_BTNS_CHF) * 2 + +- hweight16(analog->mask & ANALOG_BTNS_GAMEPAD) + !!(analog->mask & ANALOG_HBTN_CHF) * 4); ++ prt_printf(&buf, "Analog %d-axis %d-button", ++ hweight8(analog->mask & ANALOG_AXES_STD), ++ hweight8(analog->mask & ANALOG_BTNS_STD) + !!(analog->mask & ANALOG_BTNS_CHF) * 2 + ++ hweight16(analog->mask & ANALOG_BTNS_GAMEPAD) + !!(analog->mask & ANALOG_HBTN_CHF) * 4); + + if (analog->mask & ANALOG_HATS_ALL) +- seq_buf_printf(&s, " %d-hat", +- hweight16(analog->mask & ANALOG_HATS_ALL)); +- ++ prt_printf(&buf, " %d-hat", hweight16(analog->mask & ANALOG_HATS_ALL)); + if (analog->mask & ANALOG_HAT_FCS) +- seq_buf_printf(&s, " FCS"); ++ prt_printf(&buf, " FCS"); + if (analog->mask & ANALOG_ANY_CHF) +- seq_buf_printf(&s, (analog->mask & ANALOG_SAITEK) ? " Saitek" : " CHF"); ++ prt_printf(&buf, (analog->mask & ANALOG_SAITEK) ? " Saitek" : " CHF"); + +- seq_buf_printf(&s, (analog->mask & ANALOG_GAMEPAD) ? " gamepad" : " joystick"); ++ prt_printf(&buf, (analog->mask & ANALOG_GAMEPAD) ? " gamepad" : " joystick"); + } + + /* +diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig +index cf3e8096942a..f1a1f0c4a0ea 100644 +--- a/drivers/md/bcache/Kconfig ++++ b/drivers/md/bcache/Kconfig +@@ -4,6 +4,7 @@ config BCACHE + tristate "Block device as cache" + select BLOCK_HOLDER_DEPRECATED if SYSFS + select CRC64 ++ select CLOSURES + help + Allows a block device to be used as cache for other devices; uses + a btree for indexing and the layout is optimized for SSDs. +@@ -19,15 +20,6 @@ config BCACHE_DEBUG + Enables extra debugging tools, allows expensive runtime checks to be + turned on. + +-config BCACHE_CLOSURES_DEBUG +- bool "Debug closures" +- depends on BCACHE +- select DEBUG_FS +- help +- Keeps all active closures in a linked list and provides a debugfs +- interface to list them, which makes it possible to see asynchronous +- operations that get stuck. 
+- + config BCACHE_ASYNC_REGISTRATION + bool "Asynchronous device registration (EXPERIMENTAL)" + depends on BCACHE +diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile +index 5b87e59676b8..054e8a33a7ab 100644 +--- a/drivers/md/bcache/Makefile ++++ b/drivers/md/bcache/Makefile +@@ -2,6 +2,6 @@ + + obj-$(CONFIG_BCACHE) += bcache.o + +-bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ +- io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ ++bcache-y := alloc.o bset.o btree.o debug.o extents.o io.o\ ++ journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o features.o +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 2acda9cea0f9..bf96b3e6b6eb 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -179,6 +179,7 @@ + #define pr_fmt(fmt) "bcache: %s() " fmt, __func__ + + #include ++#include + #include + #include + #include +@@ -192,7 +193,6 @@ + #include "bcache_ondisk.h" + #include "bset.h" + #include "util.h" +-#include "closure.h" + + struct bucket { + atomic_t pin; +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 3563d15dbaf2..9249aba333bc 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2913,7 +2913,6 @@ static int __init bcache_init(void) + goto err; + + bch_debug_init(); +- closure_debug_init(); + + bcache_is_reboot = false; + +diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h +index 6f3cb7c92130..f61ab1bada6c 100644 +--- a/drivers/md/bcache/util.h ++++ b/drivers/md/bcache/util.h +@@ -4,6 +4,7 @@ + #define _BCACHE_UTIL_H + + #include ++#include + #include + #include + #include +@@ -13,8 +14,6 @@ + #include + #include + +-#include "closure.h" +- + struct closure; + + #ifdef CONFIG_BCACHE_DEBUG +diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c +index 462b429ad243..f06328035b9c 100644 +--- a/drivers/pci/p2pdma.c ++++ b/drivers/pci/p2pdma.c +@@ -17,7 +17,7 @@ + #include + #include + #include +-#include ++#include + #include + + enum pci_p2pdma_map_type { +@@ -281,12 +281,9 @@ static int pci_bridge_has_acs_redir(struct pci_dev *pdev) + return 0; + } + +-static void seq_buf_print_bus_devfn(struct seq_buf *buf, struct pci_dev *pdev) ++static void prt_bus_devfn(struct printbuf *buf, struct pci_dev *pdev) + { +- if (!buf) +- return; +- +- seq_buf_printf(buf, "%s;", pci_name(pdev)); ++ prt_printf(buf, "%s;", pci_name(pdev)); + } + + static bool cpu_supports_p2pdma(void) +@@ -460,13 +457,11 @@ calc_map_type_and_dist(struct pci_dev *provider, struct pci_dev *client, + struct pci_dev *a = provider, *b = client, *bb; + bool acs_redirects = false; + struct pci_p2pdma *p2pdma; +- struct seq_buf acs_list; + int acs_cnt = 0; + int dist_a = 0; + int dist_b = 0; + char buf[128]; +- +- seq_buf_init(&acs_list, buf, sizeof(buf)); ++ struct printbuf acs_list = PRINTBUF_EXTERN(buf, sizeof(buf)); + + /* + * Note, we don't need to take references to devices returned by +@@ -477,7 +472,7 @@ calc_map_type_and_dist(struct pci_dev *provider, struct pci_dev *client, + dist_b = 0; + + if (pci_bridge_has_acs_redir(a)) { +- seq_buf_print_bus_devfn(&acs_list, a); ++ prt_bus_devfn(&acs_list, a); + acs_cnt++; + } + +@@ -506,7 +501,7 @@ calc_map_type_and_dist(struct pci_dev *provider, struct pci_dev *client, + break; + + if (pci_bridge_has_acs_redir(bb)) { +- seq_buf_print_bus_devfn(&acs_list, bb); ++ prt_bus_devfn(&acs_list, bb); + acs_cnt++; + } + +@@ -521,11 +516,11 @@ calc_map_type_and_dist(struct pci_dev *provider, 
struct pci_dev *client, + } + + if (verbose) { +- acs_list.buffer[acs_list.len-1] = 0; /* drop final semicolon */ ++ acs_list.buf[acs_list.pos-1] = 0; /* drop final semicolon */ + pci_warn(client, "ACS redirect is set between the client and provider (%s)\n", + pci_name(provider)); + pci_warn(client, "to disable ACS redirect for this path, add the kernel parameter: pci=disable_acs_redir=%s\n", +- acs_list.buffer); ++ acs_list.buf); + } + acs_redirects = true; + +diff --git a/fs/Kconfig b/fs/Kconfig +index 5976eb33535f..6d2c4231494a 100644 +--- a/fs/Kconfig ++++ b/fs/Kconfig +@@ -40,6 +40,7 @@ source "fs/ocfs2/Kconfig" + source "fs/btrfs/Kconfig" + source "fs/nilfs2/Kconfig" + source "fs/f2fs/Kconfig" ++source "fs/bcachefs/Kconfig" + source "fs/zonefs/Kconfig" + + endif # BLOCK +diff --git a/fs/Makefile b/fs/Makefile +index 208a74e0b00e..5d5c8c792058 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -134,6 +134,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ + obj-$(CONFIG_BTRFS_FS) += btrfs/ + obj-$(CONFIG_GFS2_FS) += gfs2/ + obj-$(CONFIG_F2FS_FS) += f2fs/ ++obj-$(CONFIG_BCACHEFS_FS) += bcachefs/ + obj-$(CONFIG_CEPH_FS) += ceph/ + obj-$(CONFIG_PSTORE) += pstore/ + obj-$(CONFIG_EFIVAR_FS) += efivarfs/ +diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig +new file mode 100644 +index 000000000000..008886967841 +--- /dev/null ++++ b/fs/bcachefs/Kconfig +@@ -0,0 +1,59 @@ ++ ++config BCACHEFS_FS ++ tristate "bcachefs filesystem support" ++ depends on BLOCK ++ select EXPORTFS ++ select CLOSURES ++ select LIBCRC32C ++ select CRC64 ++ select FS_POSIX_ACL ++ select LZ4_COMPRESS ++ select LZ4_DECOMPRESS ++ select ZLIB_DEFLATE ++ select ZLIB_INFLATE ++ select ZSTD_COMPRESS ++ select ZSTD_DECOMPRESS ++ select CRYPTO_SHA256 ++ select CRYPTO_CHACHA20 ++ select CRYPTO_POLY1305 ++ select KEYS ++ select SIXLOCKS ++ select RAID6_PQ ++ select XOR_BLOCKS ++ select XXHASH ++ select SRCU ++ select SYMBOLIC_ERRNAME ++ help ++ The bcachefs filesystem - a modern, copy on write filesystem, with ++ support for multiple devices, compression, checksumming, etc. ++ ++config BCACHEFS_QUOTA ++ bool "bcachefs quota support" ++ depends on BCACHEFS_FS ++ select QUOTACTL ++ ++config BCACHEFS_POSIX_ACL ++ bool "bcachefs POSIX ACL support" ++ depends on BCACHEFS_FS ++ select FS_POSIX_ACL ++ ++config BCACHEFS_DEBUG ++ bool "bcachefs debugging" ++ depends on BCACHEFS_FS ++ help ++ Enables many extra debugging checks and assertions. ++ ++ The resulting code will be significantly slower than normal; you ++ probably shouldn't select this option unless you're a developer. 
++ ++config BCACHEFS_TESTS ++ bool "bcachefs unit and performance tests" ++ depends on BCACHEFS_FS ++ help ++ Include some unit and performance tests for the core btree code ++ ++config BCACHEFS_LOCK_TIME_STATS ++ bool "bcachefs lock time statistics" ++ depends on BCACHEFS_FS ++ help ++ Expose statistics for how long we held a lock in debugfs +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +new file mode 100644 +index 000000000000..5dad8ed03a20 +--- /dev/null ++++ b/fs/bcachefs/Makefile +@@ -0,0 +1,69 @@ ++ ++obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o ++ ++bcachefs-y := \ ++ alloc_background.o \ ++ alloc_foreground.o \ ++ backpointers.o \ ++ bkey.o \ ++ bkey_methods.o \ ++ bkey_sort.o \ ++ bset.o \ ++ btree_cache.o \ ++ btree_gc.o \ ++ btree_io.o \ ++ btree_iter.o \ ++ btree_key_cache.o \ ++ btree_update_interior.o \ ++ btree_update_leaf.o \ ++ buckets.o \ ++ buckets_waiting_for_journal.o \ ++ chardev.o \ ++ checksum.o \ ++ clock.o \ ++ compress.o \ ++ counters.o \ ++ debug.o \ ++ dirent.o \ ++ disk_groups.o \ ++ data_update.o \ ++ ec.o \ ++ errcode.o \ ++ error.o \ ++ extents.o \ ++ extent_update.o \ ++ fs.o \ ++ fs-common.o \ ++ fs-ioctl.o \ ++ fs-io.o \ ++ fsck.o \ ++ inode.o \ ++ io.o \ ++ journal.o \ ++ journal_io.o \ ++ journal_reclaim.o \ ++ journal_sb.o \ ++ journal_seq_blacklist.o \ ++ keylist.o \ ++ lru.o \ ++ migrate.o \ ++ move.o \ ++ movinggc.o \ ++ opts.o \ ++ quota.o \ ++ rebalance.o \ ++ recovery.o \ ++ reflink.o \ ++ replicas.o \ ++ siphash.o \ ++ subvolume.o \ ++ super.o \ ++ super-io.o \ ++ sysfs.o \ ++ tests.o \ ++ trace.o \ ++ util.o \ ++ varint.o \ ++ xattr.o ++ ++bcachefs-$(CONFIG_BCACHEFS_POSIX_ACL) += acl.o +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +new file mode 100644 +index 000000000000..5c6ccf685094 +--- /dev/null ++++ b/fs/bcachefs/acl.c +@@ -0,0 +1,406 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ++#include "bcachefs.h" ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "acl.h" ++#include "fs.h" ++#include "xattr.h" ++ ++static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) ++{ ++ return sizeof(bch_acl_header) + ++ sizeof(bch_acl_entry_short) * nr_short + ++ sizeof(bch_acl_entry) * nr_long; ++} ++ ++static inline int acl_to_xattr_type(int type) ++{ ++ switch (type) { ++ case ACL_TYPE_ACCESS: ++ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; ++ case ACL_TYPE_DEFAULT: ++ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; ++ default: ++ BUG(); ++ } ++} ++ ++/* ++ * Convert from filesystem to in-memory representation. 
++ */ ++static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size) ++{ ++ const void *p, *end = value + size; ++ struct posix_acl *acl; ++ struct posix_acl_entry *out; ++ unsigned count = 0; ++ ++ if (!value) ++ return NULL; ++ if (size < sizeof(bch_acl_header)) ++ goto invalid; ++ if (((bch_acl_header *)value)->a_version != ++ cpu_to_le32(BCH_ACL_VERSION)) ++ goto invalid; ++ ++ p = value + sizeof(bch_acl_header); ++ while (p < end) { ++ const bch_acl_entry *entry = p; ++ ++ if (p + sizeof(bch_acl_entry_short) > end) ++ goto invalid; ++ ++ switch (le16_to_cpu(entry->e_tag)) { ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ p += sizeof(bch_acl_entry_short); ++ break; ++ case ACL_USER: ++ case ACL_GROUP: ++ p += sizeof(bch_acl_entry); ++ break; ++ default: ++ goto invalid; ++ } ++ ++ count++; ++ } ++ ++ if (p > end) ++ goto invalid; ++ ++ if (!count) ++ return NULL; ++ ++ acl = posix_acl_alloc(count, GFP_KERNEL); ++ if (!acl) ++ return ERR_PTR(-ENOMEM); ++ ++ out = acl->a_entries; ++ ++ p = value + sizeof(bch_acl_header); ++ while (p < end) { ++ const bch_acl_entry *in = p; ++ ++ out->e_tag = le16_to_cpu(in->e_tag); ++ out->e_perm = le16_to_cpu(in->e_perm); ++ ++ switch (out->e_tag) { ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ p += sizeof(bch_acl_entry_short); ++ break; ++ case ACL_USER: ++ out->e_uid = make_kuid(&init_user_ns, ++ le32_to_cpu(in->e_id)); ++ p += sizeof(bch_acl_entry); ++ break; ++ case ACL_GROUP: ++ out->e_gid = make_kgid(&init_user_ns, ++ le32_to_cpu(in->e_id)); ++ p += sizeof(bch_acl_entry); ++ break; ++ } ++ ++ out++; ++ } ++ ++ BUG_ON(out != acl->a_entries + acl->a_count); ++ ++ return acl; ++invalid: ++ pr_err("invalid acl entry"); ++ return ERR_PTR(-EINVAL); ++} ++ ++#define acl_for_each_entry(acl, acl_e) \ ++ for (acl_e = acl->a_entries; \ ++ acl_e < acl->a_entries + acl->a_count; \ ++ acl_e++) ++ ++/* ++ * Convert from in-memory to filesystem representation. 
++ */ ++static struct bkey_i_xattr * ++bch2_acl_to_xattr(struct btree_trans *trans, ++ const struct posix_acl *acl, ++ int type) ++{ ++ struct bkey_i_xattr *xattr; ++ bch_acl_header *acl_header; ++ const struct posix_acl_entry *acl_e; ++ void *outptr; ++ unsigned nr_short = 0, nr_long = 0, acl_len, u64s; ++ ++ acl_for_each_entry(acl, acl_e) { ++ switch (acl_e->e_tag) { ++ case ACL_USER: ++ case ACL_GROUP: ++ nr_long++; ++ break; ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ nr_short++; ++ break; ++ default: ++ return ERR_PTR(-EINVAL); ++ } ++ } ++ ++ acl_len = bch2_acl_size(nr_short, nr_long); ++ u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); ++ ++ if (u64s > U8_MAX) ++ return ERR_PTR(-E2BIG); ++ ++ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(xattr)) ++ return xattr; ++ ++ bkey_xattr_init(&xattr->k_i); ++ xattr->k.u64s = u64s; ++ xattr->v.x_type = acl_to_xattr_type(type); ++ xattr->v.x_name_len = 0, ++ xattr->v.x_val_len = cpu_to_le16(acl_len); ++ ++ acl_header = xattr_val(&xattr->v); ++ acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); ++ ++ outptr = (void *) acl_header + sizeof(*acl_header); ++ ++ acl_for_each_entry(acl, acl_e) { ++ bch_acl_entry *entry = outptr; ++ ++ entry->e_tag = cpu_to_le16(acl_e->e_tag); ++ entry->e_perm = cpu_to_le16(acl_e->e_perm); ++ switch (acl_e->e_tag) { ++ case ACL_USER: ++ entry->e_id = cpu_to_le32( ++ from_kuid(&init_user_ns, acl_e->e_uid)); ++ outptr += sizeof(bch_acl_entry); ++ break; ++ case ACL_GROUP: ++ entry->e_id = cpu_to_le32( ++ from_kgid(&init_user_ns, acl_e->e_gid)); ++ outptr += sizeof(bch_acl_entry); ++ break; ++ ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ outptr += sizeof(bch_acl_entry_short); ++ break; ++ } ++ } ++ ++ BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); ++ ++ return xattr; ++} ++ ++struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); ++ struct btree_trans trans; ++ struct btree_iter iter = { NULL }; ++ struct bkey_s_c_xattr xattr; ++ struct posix_acl *acl = NULL; ++ struct bkey_s_c k; ++ int ret; ++ ++ if (rcu) ++ return ERR_PTR(-ECHILD); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, ++ &hash, inode_inum(inode), ++ &X_SEARCH(acl_to_xattr_type(type), "", 0), ++ 0); ++ if (ret) { ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ if (ret != -ENOENT) ++ acl = ERR_PTR(ret); ++ goto out; ++ } ++ ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) { ++ acl = ERR_PTR(ret); ++ goto out; ++ } ++ ++ xattr = bkey_s_c_to_xattr(k); ++ acl = bch2_acl_from_disk(xattr_val(xattr.v), ++ le16_to_cpu(xattr.v->x_val_len)); ++ ++ if (!IS_ERR(acl)) ++ set_cached_acl(&inode->v, type, acl); ++out: ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return acl; ++} ++ ++int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, ++ struct bch_inode_unpacked *inode_u, ++ struct posix_acl *acl, int type) ++{ ++ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u); ++ int ret; ++ ++ if (type == ACL_TYPE_DEFAULT && ++ !S_ISDIR(inode_u->bi_mode)) ++ return acl ? 
-EACCES : 0; ++ ++ if (acl) { ++ struct bkey_i_xattr *xattr = ++ bch2_acl_to_xattr(trans, acl, type); ++ if (IS_ERR(xattr)) ++ return PTR_ERR(xattr); ++ ++ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info, ++ inum, &xattr->k_i, 0); ++ } else { ++ struct xattr_search_key search = ++ X_SEARCH(acl_to_xattr_type(type), "", 0); ++ ++ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info, ++ inum, &search); ++ } ++ ++ return ret == -ENOENT ? 0 : ret; ++} ++ ++int bch2_set_acl(struct user_namespace *mnt_userns, ++ struct inode *vinode, struct posix_acl *_acl, int type) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter inode_iter = { NULL }; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *acl; ++ umode_t mode; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ acl = _acl; ++ ++ ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto btree_err; ++ ++ mode = inode_u.bi_mode; ++ ++ if (type == ACL_TYPE_ACCESS) { ++ ret = posix_acl_update_mode(mnt_userns, &inode->v, &mode, &acl); ++ if (ret) ++ goto btree_err; ++ } ++ ++ ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type); ++ if (ret) ++ goto btree_err; ++ ++ inode_u.bi_ctime = bch2_current_time(c); ++ inode_u.bi_mode = mode; ++ ++ ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, NULL, 0); ++btree_err: ++ bch2_trans_iter_exit(&trans, &inode_iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ if (unlikely(ret)) ++ goto err; ++ ++ bch2_inode_update_after_write(&trans, inode, &inode_u, ++ ATTR_CTIME|ATTR_MODE); ++ ++ set_cached_acl(&inode->v, type, acl); ++err: ++ bch2_trans_exit(&trans); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, ++ struct bch_inode_unpacked *inode, ++ umode_t mode, ++ struct posix_acl **new_acl) ++{ ++ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); ++ struct btree_iter iter; ++ struct bkey_s_c_xattr xattr; ++ struct bkey_i_xattr *new; ++ struct posix_acl *acl; ++ struct bkey_s_c k; ++ int ret; ++ ++ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, ++ &hash_info, inum, ++ &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), ++ BTREE_ITER_INTENT); ++ if (ret) ++ return ret == -ENOENT ? 
0 : ret; ++ ++ k = bch2_btree_iter_peek_slot(&iter); ++ xattr = bkey_s_c_to_xattr(k); ++ if (ret) ++ goto err; ++ ++ acl = bch2_acl_from_disk(xattr_val(xattr.v), ++ le16_to_cpu(xattr.v->x_val_len)); ++ ret = PTR_ERR_OR_ZERO(acl); ++ if (IS_ERR_OR_NULL(acl)) ++ goto err; ++ ++ ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); ++ if (ret) ++ goto err; ++ ++ new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); ++ if (IS_ERR(new)) { ++ ret = PTR_ERR(new); ++ goto err; ++ } ++ ++ new->k.p = iter.pos; ++ ret = bch2_trans_update(trans, &iter, &new->k_i, 0); ++ *new_acl = acl; ++ acl = NULL; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ if (!IS_ERR_OR_NULL(acl)) ++ kfree(acl); ++ return ret; ++} ++ ++#endif /* CONFIG_BCACHEFS_POSIX_ACL */ +diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h +new file mode 100644 +index 000000000000..2d76a4897ba8 +--- /dev/null ++++ b/fs/bcachefs/acl.h +@@ -0,0 +1,58 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ACL_H ++#define _BCACHEFS_ACL_H ++ ++struct bch_inode_unpacked; ++struct bch_hash_info; ++struct bch_inode_info; ++struct posix_acl; ++ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ++#define BCH_ACL_VERSION 0x0001 ++ ++typedef struct { ++ __le16 e_tag; ++ __le16 e_perm; ++ __le32 e_id; ++} bch_acl_entry; ++ ++typedef struct { ++ __le16 e_tag; ++ __le16 e_perm; ++} bch_acl_entry_short; ++ ++typedef struct { ++ __le32 a_version; ++} bch_acl_header; ++ ++struct posix_acl *bch2_get_acl(struct inode *, int, bool); ++ ++int bch2_set_acl_trans(struct btree_trans *, subvol_inum, ++ struct bch_inode_unpacked *, ++ struct posix_acl *, int); ++int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int); ++int bch2_acl_chmod(struct btree_trans *, subvol_inum, ++ struct bch_inode_unpacked *, ++ umode_t, struct posix_acl **); ++ ++#else ++ ++static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, ++ struct bch_inode_unpacked *inode_u, ++ struct posix_acl *acl, int type) ++{ ++ return 0; ++} ++ ++static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, ++ struct bch_inode_unpacked *inode, ++ umode_t mode, ++ struct posix_acl **new_acl) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_BCACHEFS_POSIX_ACL */ ++ ++#endif /* _BCACHEFS_ACL_H */ +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +new file mode 100644 +index 000000000000..cd6cbd2064ee +--- /dev/null ++++ b/fs/bcachefs/alloc_background.c +@@ -0,0 +1,1552 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "backpointers.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_key_cache.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "buckets_waiting_for_journal.h" ++#include "clock.h" ++#include "debug.h" ++#include "ec.h" ++#include "error.h" ++#include "lru.h" ++#include "recovery.h" ++#include "varint.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* Persistent alloc info: */ ++ ++static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { ++#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, ++ BCH_ALLOC_FIELDS_V1() ++#undef x ++}; ++ ++struct bkey_alloc_unpacked { ++ u64 journal_seq; ++ u8 gen; ++ u8 oldest_gen; ++ u8 data_type; ++ bool need_discard:1; ++ bool need_inc_gen:1; ++#define x(_name, _bits) u##_bits _name; ++ BCH_ALLOC_FIELDS_V2() ++#undef x ++}; ++ ++static inline 
u64 alloc_field_v1_get(const struct bch_alloc *a, ++ const void **p, unsigned field) ++{ ++ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; ++ u64 v; ++ ++ if (!(a->fields & (1 << field))) ++ return 0; ++ ++ switch (bytes) { ++ case 1: ++ v = *((const u8 *) *p); ++ break; ++ case 2: ++ v = le16_to_cpup(*p); ++ break; ++ case 4: ++ v = le32_to_cpup(*p); ++ break; ++ case 8: ++ v = le64_to_cpup(*p); ++ break; ++ default: ++ BUG(); ++ } ++ ++ *p += bytes; ++ return v; ++} ++ ++static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p, ++ unsigned field, u64 v) ++{ ++ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; ++ ++ if (!v) ++ return; ++ ++ a->v.fields |= 1 << field; ++ ++ switch (bytes) { ++ case 1: ++ *((u8 *) *p) = v; ++ break; ++ case 2: ++ *((__le16 *) *p) = cpu_to_le16(v); ++ break; ++ case 4: ++ *((__le32 *) *p) = cpu_to_le32(v); ++ break; ++ case 8: ++ *((__le64 *) *p) = cpu_to_le64(v); ++ break; ++ default: ++ BUG(); ++ } ++ ++ *p += bytes; ++} ++ ++static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, ++ struct bkey_s_c k) ++{ ++ const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; ++ const void *d = in->data; ++ unsigned idx = 0; ++ ++ out->gen = in->gen; ++ ++#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); ++ BCH_ALLOC_FIELDS_V1() ++#undef x ++} ++ ++static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); ++ const u8 *in = a.v->data; ++ const u8 *end = bkey_val_end(a); ++ unsigned fieldnr = 0; ++ int ret; ++ u64 v; ++ ++ out->gen = a.v->gen; ++ out->oldest_gen = a.v->oldest_gen; ++ out->data_type = a.v->data_type; ++ ++#define x(_name, _bits) \ ++ if (fieldnr < a.v->nr_fields) { \ ++ ret = bch2_varint_decode_fast(in, end, &v); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ ++ } else { \ ++ v = 0; \ ++ } \ ++ out->_name = v; \ ++ if (v != out->_name) \ ++ return -1; \ ++ fieldnr++; ++ ++ BCH_ALLOC_FIELDS_V2() ++#undef x ++ return 0; ++} ++ ++static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k); ++ const u8 *in = a.v->data; ++ const u8 *end = bkey_val_end(a); ++ unsigned fieldnr = 0; ++ int ret; ++ u64 v; ++ ++ out->gen = a.v->gen; ++ out->oldest_gen = a.v->oldest_gen; ++ out->data_type = a.v->data_type; ++ out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); ++ out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); ++ out->journal_seq = le64_to_cpu(a.v->journal_seq); ++ ++#define x(_name, _bits) \ ++ if (fieldnr < a.v->nr_fields) { \ ++ ret = bch2_varint_decode_fast(in, end, &v); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ ++ } else { \ ++ v = 0; \ ++ } \ ++ out->_name = v; \ ++ if (v != out->_name) \ ++ return -1; \ ++ fieldnr++; ++ ++ BCH_ALLOC_FIELDS_V2() ++#undef x ++ return 0; ++} ++ ++static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) ++{ ++ struct bkey_alloc_unpacked ret = { .gen = 0 }; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_alloc: ++ bch2_alloc_unpack_v1(&ret, k); ++ break; ++ case KEY_TYPE_alloc_v2: ++ bch2_alloc_unpack_v2(&ret, k); ++ break; ++ case KEY_TYPE_alloc_v3: ++ bch2_alloc_unpack_v3(&ret, k); ++ break; ++ } ++ ++ return ret; ++} ++ ++struct bkey_i_alloc_v4 * ++bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bpos pos) ++{ ++ struct bkey_s_c k; ++ struct bkey_i_alloc_v4 *a; ++ int ret; ++ ++ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, ++ 
BTREE_ITER_WITH_UPDATES| ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch2_trans_iter_exit(trans, iter); ++ return ERR_PTR(ret); ++ } ++ ++ a = bch2_alloc_to_v4_mut(trans, k); ++ if (IS_ERR(a)) ++ bch2_trans_iter_exit(trans, iter); ++ return a; ++} ++ ++static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) ++{ ++ unsigned i, bytes = offsetof(struct bch_alloc, data); ++ ++ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) ++ if (a->fields & (1 << i)) ++ bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; ++ ++ return DIV_ROUND_UP(bytes, sizeof(u64)); ++} ++ ++int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); ++ ++ /* allow for unknown fields */ ++ if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) { ++ prt_printf(err, "incorrect value size (%zu < %u)", ++ bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_alloc_unpacked u; ++ ++ if (bch2_alloc_unpack_v2(&u, k)) { ++ prt_printf(err, "unpack error"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_alloc_unpacked u; ++ ++ if (bch2_alloc_unpack_v3(&u, k)) { ++ prt_printf(err, "unpack error"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); ++ ++ if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) { ++ prt_printf(err, "bad val size (%lu != %u)", ++ bkey_val_u64s(k.k), alloc_v4_u64s(a.v)); ++ return -EINVAL; ++ } ++ ++ if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && ++ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) { ++ prt_printf(err, "invalid backpointers_start"); ++ return -EINVAL; ++ } ++ ++ if (rw == WRITE) { ++ if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { ++ prt_printf(err, "invalid data type (got %u should be %u)", ++ a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); ++ return -EINVAL; ++ } ++ ++ switch (a.v->data_type) { ++ case BCH_DATA_free: ++ case BCH_DATA_need_gc_gens: ++ case BCH_DATA_need_discard: ++ if (a.v->dirty_sectors || ++ a.v->cached_sectors || ++ a.v->stripe) { ++ prt_printf(err, "empty data type free but have data"); ++ return -EINVAL; ++ } ++ break; ++ case BCH_DATA_sb: ++ case BCH_DATA_journal: ++ case BCH_DATA_btree: ++ case BCH_DATA_user: ++ case BCH_DATA_parity: ++ if (!a.v->dirty_sectors) { ++ prt_printf(err, "data_type %s but dirty_sectors==0", ++ bch2_data_types[a.v->data_type]); ++ return -EINVAL; ++ } ++ break; ++ case BCH_DATA_cached: ++ if (!a.v->cached_sectors || ++ a.v->dirty_sectors || ++ a.v->stripe) { ++ prt_printf(err, "data type inconsistency"); ++ return -EINVAL; ++ } ++ ++ if (!a.v->io_time[READ] && ++ test_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags)) { ++ prt_printf(err, "cached bucket with read_time == 0"); ++ return -EINVAL; ++ } ++ break; ++ case BCH_DATA_stripe: ++ if (!a.v->stripe) { ++ prt_printf(err, "data_type %s but stripe==0", ++ bch2_data_types[a.v->data_type]); ++ return -EINVAL; ++ } ++ break; ++ } ++ } ++ ++ return 0; ++} ++ ++static inline u64 swab40(u64 x) ++{ ++ return (((x & 0x00000000ffULL) << 32)| ++ ((x & 0x000000ff00ULL) << 16)| ++ 
((x & 0x0000ff0000ULL) >> 0)| ++ ((x & 0x00ff000000ULL) >> 16)| ++ ((x & 0xff00000000ULL) >> 32)); ++} ++ ++void bch2_alloc_v4_swab(struct bkey_s k) ++{ ++ struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; ++ struct bch_backpointer *bp, *bps; ++ ++ a->journal_seq = swab64(a->journal_seq); ++ a->flags = swab32(a->flags); ++ a->dirty_sectors = swab32(a->dirty_sectors); ++ a->cached_sectors = swab32(a->cached_sectors); ++ a->io_time[0] = swab64(a->io_time[0]); ++ a->io_time[1] = swab64(a->io_time[1]); ++ a->stripe = swab32(a->stripe); ++ a->nr_external_backpointers = swab32(a->nr_external_backpointers); ++ ++ bps = alloc_v4_backpointers(a); ++ for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { ++ bp->bucket_offset = swab40(bp->bucket_offset); ++ bp->bucket_len = swab32(bp->bucket_len); ++ bch2_bpos_swab(&bp->pos); ++ } ++} ++ ++void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bch_alloc_v4 _a; ++ const struct bch_alloc_v4 *a = &_a; ++ const struct bch_backpointer *bps; ++ unsigned i; ++ ++ if (k.k->type == KEY_TYPE_alloc_v4) ++ a = bkey_s_c_to_alloc_v4(k).v; ++ else ++ bch2_alloc_to_v4(k, &_a); ++ ++ prt_newline(out); ++ printbuf_indent_add(out, 2); ++ ++ prt_printf(out, "gen %u oldest_gen %u data_type %s", ++ a->gen, a->oldest_gen, bch2_data_types[a->data_type]); ++ prt_newline(out); ++ prt_printf(out, "journal_seq %llu", a->journal_seq); ++ prt_newline(out); ++ prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a)); ++ prt_newline(out); ++ prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a)); ++ prt_newline(out); ++ prt_printf(out, "dirty_sectors %u", a->dirty_sectors); ++ prt_newline(out); ++ prt_printf(out, "cached_sectors %u", a->cached_sectors); ++ prt_newline(out); ++ prt_printf(out, "stripe %u", a->stripe); ++ prt_newline(out); ++ prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy); ++ prt_newline(out); ++ prt_printf(out, "io_time[READ] %llu", a->io_time[READ]); ++ prt_newline(out); ++ prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); ++ prt_newline(out); ++ prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a)); ++ printbuf_indent_add(out, 2); ++ ++ bps = alloc_v4_backpointers_c(a); ++ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a); i++) { ++ prt_newline(out); ++ bch2_backpointer_to_text(out, &bps[i]); ++ } ++ ++ printbuf_indent_sub(out, 4); ++} ++ ++void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) ++{ ++ if (k.k->type == KEY_TYPE_alloc_v4) { ++ int d; ++ ++ *out = *bkey_s_c_to_alloc_v4(k).v; ++ ++ d = (int) BCH_ALLOC_V4_U64s - ++ (int) (BCH_ALLOC_V4_BACKPOINTERS_START(out) ?: BCH_ALLOC_V4_U64s_V0); ++ if (unlikely(d > 0)) { ++ memset((u64 *) out + BCH_ALLOC_V4_BACKPOINTERS_START(out), ++ 0, ++ d * sizeof(u64)); ++ SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); ++ } ++ } else { ++ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); ++ ++ *out = (struct bch_alloc_v4) { ++ .journal_seq = u.journal_seq, ++ .flags = u.need_discard, ++ .gen = u.gen, ++ .oldest_gen = u.oldest_gen, ++ .data_type = u.data_type, ++ .stripe_redundancy = u.stripe_redundancy, ++ .dirty_sectors = u.dirty_sectors, ++ .cached_sectors = u.cached_sectors, ++ .io_time[READ] = u.read_time, ++ .io_time[WRITE] = u.write_time, ++ .stripe = u.stripe, ++ }; ++ ++ SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); ++ } ++} ++ ++struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) ++{ ++ unsigned bytes = k.k->type == 
KEY_TYPE_alloc_v4 ++ ? bkey_bytes(k.k) ++ : sizeof(struct bkey_i_alloc_v4); ++ struct bkey_i_alloc_v4 *ret; ++ ++ /* ++ * Reserve space for one more backpointer here: ++ * Not sketchy at doing it this way, nope... ++ */ ++ ret = bch2_trans_kmalloc(trans, bytes + sizeof(struct bch_backpointer)); ++ if (IS_ERR(ret)) ++ return ret; ++ ++ if (k.k->type == KEY_TYPE_alloc_v4) { ++ bkey_reassemble(&ret->k_i, k); ++ ++ if (BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v) < BCH_ALLOC_V4_U64s) { ++ struct bch_backpointer *src, *dst; ++ ++ src = alloc_v4_backpointers(&ret->v); ++ SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); ++ dst = alloc_v4_backpointers(&ret->v); ++ ++ memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) * ++ sizeof(struct bch_backpointer)); ++ memset(src, 0, dst - src); ++ set_alloc_v4_u64s(ret); ++ } ++ } else { ++ bkey_alloc_v4_init(&ret->k_i); ++ ret->k.p = k.k->p; ++ bch2_alloc_to_v4(k, &ret->v); ++ } ++ return ret; ++} ++ ++int bch2_alloc_read(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_alloc_v4 a; ++ struct bch_dev *ca; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ /* ++ * Not a fsck error because this is checked/repaired by ++ * bch2_check_alloc_key() which runs later: ++ */ ++ if (!bch2_dev_bucket_exists(c, k.k->p)) ++ continue; ++ ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ bch2_alloc_to_v4(k, &a); ++ ++ *bucket_gen(ca, k.k->p.offset) = a.gen; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error reading alloc info: %s", bch2_err_str(ret)); ++ ++ return ret; ++} ++ ++/* Free space/discard btree: */ ++ ++static int bch2_bucket_do_index(struct btree_trans *trans, ++ struct bkey_s_c alloc_k, ++ const struct bch_alloc_v4 *a, ++ bool set) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); ++ struct btree_iter iter; ++ struct bkey_s_c old; ++ struct bkey_i *k; ++ enum btree_id btree; ++ enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; ++ enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ if (a->data_type != BCH_DATA_free && ++ a->data_type != BCH_DATA_need_discard) ++ return 0; ++ ++ k = bch2_trans_kmalloc(trans, sizeof(*k)); ++ if (IS_ERR(k)) ++ return PTR_ERR(k); ++ ++ bkey_init(&k->k); ++ k->k.type = new_type; ++ ++ switch (a->data_type) { ++ case BCH_DATA_free: ++ btree = BTREE_ID_freespace; ++ k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); ++ bch2_key_resize(&k->k, 1); ++ break; ++ case BCH_DATA_need_discard: ++ btree = BTREE_ID_need_discard; ++ k->k.p = alloc_k.k->p; ++ break; ++ default: ++ return 0; ++ } ++ ++ bch2_trans_iter_init(trans, &iter, btree, ++ bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ old = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(old); ++ if (ret) ++ goto err; ++ ++ if (ca->mi.freespace_initialized && ++ bch2_trans_inconsistent_on(old.k->type != old_type, trans, ++ "incorrect key when %s %s btree (got %s should be %s)\n" ++ " for %s", ++ set ? 
"setting" : "clearing", ++ bch2_btree_ids[btree], ++ bch2_bkey_types[old.k->type], ++ bch2_bkey_types[old_type], ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ ret = bch2_trans_update(trans, &iter, k, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_trans_mark_alloc(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_alloc_v4 old_a, *new_a; ++ u64 old_lru, new_lru; ++ int ret = 0; ++ ++ /* ++ * Deletion only happens in the device removal path, with ++ * BTREE_TRIGGER_NORUN: ++ */ ++ BUG_ON(new->k.type != KEY_TYPE_alloc_v4); ++ ++ bch2_alloc_to_v4(old, &old_a); ++ new_a = &bkey_i_to_alloc_v4(new)->v; ++ ++ new_a->data_type = alloc_data_type(*new_a, new_a->data_type); ++ ++ if (new_a->dirty_sectors > old_a.dirty_sectors || ++ new_a->cached_sectors > old_a.cached_sectors) { ++ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); ++ new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); ++ SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); ++ } ++ ++ if (data_type_is_empty(new_a->data_type) && ++ BCH_ALLOC_V4_NEED_INC_GEN(new_a) && ++ !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { ++ new_a->gen++; ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); ++ } ++ ++ if (old_a.data_type != new_a->data_type || ++ (new_a->data_type == BCH_DATA_free && ++ alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) { ++ ret = bch2_bucket_do_index(trans, old, &old_a, false) ?: ++ bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true); ++ if (ret) ++ return ret; ++ } ++ ++ if (new_a->data_type == BCH_DATA_cached && ++ !new_a->io_time[READ]) ++ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); ++ ++ old_lru = alloc_lru_idx(old_a); ++ new_lru = alloc_lru_idx(*new_a); ++ ++ if (old_lru != new_lru) { ++ ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset, ++ old_lru, &new_lru, old); ++ if (ret) ++ return ret; ++ ++ if (new_a->data_type == BCH_DATA_cached) ++ new_a->io_time[READ] = new_lru; ++ } ++ ++ return 0; ++} ++ ++static int bch2_check_alloc_key(struct btree_trans *trans, ++ struct btree_iter *alloc_iter, ++ struct btree_iter *discard_iter, ++ struct btree_iter *freespace_iter) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca; ++ struct bch_alloc_v4 a; ++ unsigned discard_key_type, freespace_key_type; ++ struct bkey_s_c alloc_k, k; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ alloc_k = bch2_dev_bucket_exists(c, alloc_iter->pos) ++ ? bch2_btree_iter_peek_slot(alloc_iter) ++ : bch2_btree_iter_peek(alloc_iter); ++ if (!alloc_k.k) ++ return 1; ++ ++ ret = bkey_err(alloc_k); ++ if (ret) ++ return ret; ++ ++ if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, ++ "alloc key for invalid device:bucket %llu:%llu", ++ alloc_k.k->p.inode, alloc_k.k->p.offset)) ++ return bch2_btree_delete_at(trans, alloc_iter, 0); ++ ++ ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); ++ if (!ca->mi.freespace_initialized) ++ return 0; ++ ++ bch2_alloc_to_v4(alloc_k, &a); ++ ++ discard_key_type = a.data_type == BCH_DATA_need_discard ++ ? KEY_TYPE_set : 0; ++ freespace_key_type = a.data_type == BCH_DATA_free ++ ? 
KEY_TYPE_set : 0; ++ ++ bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); ++ bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, a)); ++ ++ k = bch2_btree_iter_peek_slot(discard_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != discard_key_type && ++ (c->opts.reconstruct_alloc || ++ fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n" ++ " %s", ++ bch2_bkey_types[k.k->type], ++ bch2_bkey_types[discard_key_type], ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { ++ struct bkey_i *update = ++ bch2_trans_kmalloc(trans, sizeof(*update)); ++ ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.type = discard_key_type; ++ update->k.p = discard_iter->pos; ++ ++ ret = bch2_trans_update(trans, discard_iter, update, 0); ++ if (ret) ++ goto err; ++ } ++ ++ k = bch2_btree_iter_peek_slot(freespace_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != freespace_key_type && ++ (c->opts.reconstruct_alloc || ++ fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n" ++ " %s", ++ bch2_bkey_types[k.k->type], ++ bch2_bkey_types[freespace_key_type], ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { ++ struct bkey_i *update = ++ bch2_trans_kmalloc(trans, sizeof(*update)); ++ ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.type = freespace_key_type; ++ update->k.p = freespace_iter->pos; ++ bch2_key_resize(&update->k, 1); ++ ++ ret = bch2_trans_update(trans, freespace_iter, update, 0); ++ if (ret) ++ goto err; ++ } ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int bch2_check_discard_freespace_key(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter alloc_iter; ++ struct bkey_s_c alloc_k; ++ struct bch_alloc_v4 a; ++ u64 genbits; ++ struct bpos pos; ++ enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard ++ ? BCH_DATA_need_discard ++ : BCH_DATA_free; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ pos = iter->pos; ++ pos.offset &= ~(~0ULL << 56); ++ genbits = iter->pos.offset & (~0ULL << 56); ++ ++ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); ++ ++ if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, ++ "entry in %s btree for nonexistant dev:bucket %llu:%llu", ++ bch2_btree_ids[iter->btree_id], pos.inode, pos.offset)) ++ goto delete; ++ ++ alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); ++ ret = bkey_err(alloc_k); ++ if (ret) ++ goto err; ++ ++ bch2_alloc_to_v4(alloc_k, &a); ++ ++ if (fsck_err_on(a.data_type != state || ++ (state == BCH_DATA_free && ++ genbits != alloc_freespace_genbits(a)), c, ++ "%s\n incorrectly set in %s index (free %u, genbits %llu should be %llu)", ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), ++ bch2_btree_ids[iter->btree_id], ++ a.data_type == state, ++ genbits >> 56, alloc_freespace_genbits(a) >> 56)) ++ goto delete; ++out: ++err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ printbuf_exit(&buf); ++ return ret; ++delete: ++ ret = bch2_btree_delete_extent_at(trans, iter, ++ iter->btree_id == BTREE_ID_freespace ? 
1 : 0, 0); ++ goto out; ++} ++ ++int bch2_check_alloc_info(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter, discard_iter, freespace_iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH); ++ bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, ++ BTREE_ITER_PREFETCH); ++ bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, ++ BTREE_ITER_PREFETCH); ++ while (1) { ++ ret = commit_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ bch2_check_alloc_key(&trans, &iter, ++ &discard_iter, ++ &freespace_iter)); ++ if (ret) ++ break; ++ ++ bch2_btree_iter_advance(&iter); ++ } ++ bch2_trans_iter_exit(&trans, &freespace_iter); ++ bch2_trans_iter_exit(&trans, &discard_iter); ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret < 0) ++ goto err; ++ ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_need_discard, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ bch2_check_discard_freespace_key(&trans, &iter)) ?: ++ for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_freespace, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ bch2_check_discard_freespace_key(&trans, &iter)); ++err: ++ bch2_trans_exit(&trans); ++ return ret < 0 ? ret : 0; ++} ++ ++static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, ++ struct btree_iter *alloc_iter) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter lru_iter; ++ struct bch_alloc_v4 a; ++ struct bkey_s_c alloc_k, k; ++ struct printbuf buf = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ int ret; ++ ++ alloc_k = bch2_btree_iter_peek(alloc_iter); ++ if (!alloc_k.k) ++ return 0; ++ ++ ret = bkey_err(alloc_k); ++ if (ret) ++ return ret; ++ ++ bch2_alloc_to_v4(alloc_k, &a); ++ ++ if (a.data_type != BCH_DATA_cached) ++ return 0; ++ ++ bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, ++ POS(alloc_k.k->p.inode, a.io_time[READ]), 0); ++ ++ k = bch2_btree_iter_peek_slot(&lru_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (fsck_err_on(!a.io_time[READ], c, ++ "cached bucket with read_time 0\n" ++ " %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || ++ fsck_err_on(k.k->type != KEY_TYPE_lru || ++ le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c, ++ "incorrect/missing lru entry\n" ++ " %s\n" ++ " %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), ++ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { ++ u64 read_time = a.io_time[READ]; ++ ++ if (!a.io_time[READ]) ++ a.io_time[READ] = atomic64_read(&c->io_clock[READ].now); ++ ++ ret = bch2_lru_set(trans, ++ alloc_k.k->p.inode, ++ alloc_k.k->p.offset, ++ &a.io_time[READ]); ++ if (ret) ++ goto err; ++ ++ if (a.io_time[READ] != read_time) { ++ struct bkey_i_alloc_v4 *a_mut = ++ bch2_alloc_to_v4_mut(trans, alloc_k); ++ ret = PTR_ERR_OR_ZERO(a_mut); ++ if (ret) ++ goto err; ++ ++ a_mut->v.io_time[READ] = a.io_time[READ]; ++ ret = bch2_trans_update(trans, alloc_iter, ++ &a_mut->k_i, BTREE_TRIGGER_NORUN); ++ if (ret) ++ goto err; ++ } ++ } ++err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &lru_iter); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_check_alloc_to_lru_refs(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ 
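The discard/freespace consistency checks above rely on how a freespace-btree key encodes both a bucket and its generation: the low 56 bits of the key's offset carry the bucket number, and the top 8 bits carry the bucket's generation gap divided by 16 (alloc_freespace_genbits(), declared further down in alloc_background.h). A minimal illustration of that split, written as a comment rather than code from the patch:

	/*
	 * Assumed layout of a freespace key's offset, matching the masks used in
	 * bch2_check_discard_freespace_key() above and alloc_freespace_pos() below:
	 *
	 *	bucket  = offset & ~(~0ULL << 56)	(low 56 bits)
	 *	genbits = offset &  (~0ULL << 56)	(top 8 bits)
	 *	genbits = ((u64) (u8) (gen - oldest_gen) >> 4) << 56
	 *
	 * Once the bucket's generation moves into a different group of 16, the
	 * stored genbits stop matching alloc_freespace_genbits(), which is the
	 * stale-entry case the check above deletes.
	 */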
int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, ++ POS_MIN, BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ bch2_check_alloc_to_lru_ref(&trans, &iter)); ++ ++ bch2_trans_exit(&trans); ++ return ret < 0 ? ret : 0; ++} ++ ++static int bch2_discard_one_bucket(struct btree_trans *trans, ++ struct btree_iter *need_discard_iter, ++ struct bpos *discard_pos_done, ++ u64 *seen, ++ u64 *open, ++ u64 *need_journal_commit, ++ u64 *discarded) ++{ ++ struct bch_fs *c = trans->c; ++ struct bpos pos = need_discard_iter->pos; ++ struct btree_iter iter = { NULL }; ++ struct bkey_s_c k; ++ struct bch_dev *ca; ++ struct bkey_i_alloc_v4 *a; ++ struct printbuf buf = PRINTBUF; ++ bool did_discard = false; ++ int ret = 0; ++ ++ ca = bch_dev_bkey_exists(c, pos.inode); ++ if (!percpu_ref_tryget(&ca->io_ref)) { ++ bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); ++ return 0; ++ } ++ ++ if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { ++ (*open)++; ++ goto out; ++ } ++ ++ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ++ pos.inode, pos.offset)) { ++ (*need_journal_commit)++; ++ goto out; ++ } ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ need_discard_iter->pos, ++ BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ ++ a = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto out; ++ ++ if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { ++ a->v.gen++; ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); ++ goto write; ++ } ++ ++ if (bch2_trans_inconsistent_on(a->v.journal_seq > c->journal.flushed_seq_ondisk, trans, ++ "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" ++ "%s", ++ a->v.journal_seq, ++ c->journal.flushed_seq_ondisk, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = -EIO; ++ goto out; ++ } ++ ++ if (bch2_trans_inconsistent_on(a->v.data_type != BCH_DATA_need_discard, trans, ++ "bucket incorrectly set in need_discard btree\n" ++ "%s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = -EIO; ++ goto out; ++ } ++ ++ if (bkey_cmp(*discard_pos_done, iter.pos) && ++ ca->mi.discard && !c->opts.nochanges) { ++ /* ++ * This works without any other locks because this is the only ++ * thread that removes items from the need_discard tree ++ */ ++ bch2_trans_unlock(trans); ++ blkdev_issue_discard(ca->disk_sb.bdev, ++ k.k->p.offset * ca->mi.bucket_size, ++ ca->mi.bucket_size, ++ GFP_KERNEL, 0); ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ goto out; ++ } ++ ++ *discard_pos_done = iter.pos; ++ did_discard = true; ++ ++ SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); ++ a->v.data_type = alloc_data_type(a->v, a->v.data_type); ++write: ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); ++ if (ret) ++ goto out; ++ ++ if (did_discard) { ++ this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); ++ (*discarded)++; ++ } ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ percpu_ref_put(&ca->io_ref); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static void bch2_do_discards_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, discard_work); ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; ++ struct 
bpos discard_pos_done = POS_MAX; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ /* ++ * We're doing the commit in bch2_discard_one_bucket instead of using ++ * for_each_btree_key_commit() so that we can increment counters after ++ * successful commit: ++ */ ++ ret = for_each_btree_key2(&trans, iter, ++ BTREE_ID_need_discard, POS_MIN, 0, k, ++ bch2_discard_one_bucket(&trans, &iter, &discard_pos_done, ++ &seen, ++ &open, ++ &need_journal_commit, ++ &discarded)); ++ ++ bch2_trans_exit(&trans); ++ ++ if (need_journal_commit * 2 > seen) ++ bch2_journal_flush_async(&c->journal, NULL); ++ ++ percpu_ref_put(&c->writes); ++ ++ trace_discard_buckets(c, seen, open, need_journal_commit, discarded, ++ bch2_err_str(ret)); ++} ++ ++void bch2_do_discards(struct bch_fs *c) ++{ ++ if (percpu_ref_tryget_live(&c->writes) && ++ !queue_work(system_long_wq, &c->discard_work)) ++ percpu_ref_put(&c->writes); ++} ++ ++static int invalidate_one_bucket(struct btree_trans *trans, ++ struct btree_iter *lru_iter, struct bkey_s_c k, ++ unsigned dev_idx, s64 *nr_to_invalidate) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter alloc_iter = { NULL }; ++ struct bkey_i_alloc_v4 *a; ++ struct bpos bucket; ++ struct printbuf buf = PRINTBUF; ++ unsigned cached_sectors; ++ int ret = 0; ++ ++ if (*nr_to_invalidate <= 0 || k.k->p.inode != dev_idx) ++ return 1; ++ ++ if (k.k->type != KEY_TYPE_lru) { ++ prt_printf(&buf, "non lru key in lru btree:\n "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ ++ if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { ++ bch_err(c, "%s", buf.buf); ++ } else { ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ ret = -EINVAL; ++ } ++ ++ goto out; ++ } ++ ++ bucket = POS(dev_idx, le64_to_cpu(bkey_s_c_to_lru(k).v->idx)); ++ ++ a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto out; ++ ++ if (k.k->p.offset != alloc_lru_idx(a->v)) { ++ prt_printf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n "); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); ++ prt_printf(&buf, "\n "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ ++ if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { ++ bch_err(c, "%s", buf.buf); ++ } else { ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ ret = -EINVAL; ++ } ++ ++ goto out; ++ } ++ ++ if (!a->v.cached_sectors) ++ bch_err(c, "invalidating empty bucket, confused"); ++ ++ cached_sectors = a->v.cached_sectors; ++ ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); ++ a->v.gen++; ++ a->v.data_type = 0; ++ a->v.dirty_sectors = 0; ++ a->v.cached_sectors = 0; ++ a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); ++ a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); ++ ++ ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, ++ BTREE_TRIGGER_BUCKET_INVALIDATE) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); ++ if (ret) ++ goto out; ++ ++ trace_invalidate_bucket(c, bucket.inode, bucket.offset, cached_sectors); ++ this_cpu_inc(c->counters[BCH_COUNTER_bucket_invalidate]); ++ --*nr_to_invalidate; ++out: ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static void bch2_do_invalidates_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); ++ struct bch_dev *ca; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ 
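The per-device loop that follows invalidates at most should_invalidate_buckets() cached buckets per pass; that helper (defined later in alloc_background.h) aims to keep roughly nbuckets/128 buckets free, counting need_discard buckets as free and subtracting the allocator reserve. A small worked example with hypothetical numbers, for illustration only:

	/*
	 * Hypothetical device with 2^20 buckets: want_free = 1048576 >> 7 = 8192.
	 * With 1000 free + 200 need_discard - 300 reserved = 900 effectively free
	 * and 50000 cached buckets, should_invalidate_buckets() returns
	 * clamp(8192 - 900, 0, 50000) = 7292, so up to 7292 cached buckets are
	 * invalidated this pass; once the effectively-free count reaches 8192 it
	 * returns 0 and the pass does nothing.
	 */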
for_each_member_device(ca, c, i) { ++ s64 nr_to_invalidate = ++ should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_lru, ++ POS(ca->dev_idx, 0), BTREE_ITER_INTENT, k, ++ invalidate_one_bucket(&trans, &iter, k, ca->dev_idx, &nr_to_invalidate)); ++ ++ if (ret < 0) { ++ percpu_ref_put(&ca->ref); ++ break; ++ } ++ } ++ ++ bch2_trans_exit(&trans); ++ percpu_ref_put(&c->writes); ++} ++ ++void bch2_do_invalidates(struct bch_fs *c) ++{ ++ if (percpu_ref_tryget_live(&c->writes) && ++ !queue_work(system_long_wq, &c->invalidate_work)) ++ percpu_ref_put(&c->writes); ++} ++ ++static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, struct bch_dev *ca) ++{ ++ struct bch_alloc_v4 a; ++ ++ if (iter->pos.offset >= ca->mi.nbuckets) ++ return 1; ++ ++ bch2_alloc_to_v4(k, &a); ++ return bch2_bucket_do_index(trans, k, &a, true); ++} ++ ++static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_member *m; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, ++ POS(ca->dev_idx, ca->mi.first_bucket), ++ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW, ++ bucket_freespace_init(&trans, &iter, k, ca)); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret < 0) { ++ bch_err(ca, "error initializing free space: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx; ++ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++int bch2_fs_freespace_init(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ bool doing_init = false; ++ ++ /* ++ * We can crash during the device add path, so we need to check this on ++ * every mount: ++ */ ++ ++ for_each_member_device(ca, c, i) { ++ if (ca->mi.freespace_initialized) ++ continue; ++ ++ if (!doing_init) { ++ bch_info(c, "initializing freespace"); ++ doing_init = true; ++ } ++ ++ ret = bch2_dev_freespace_init(c, ca); ++ if (ret) { ++ percpu_ref_put(&ca->ref); ++ return ret; ++ } ++ } ++ ++ if (doing_init) { ++ mutex_lock(&c->sb_lock); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ bch_verbose(c, "done initializing freespace"); ++ } ++ ++ return ret; ++} ++ ++/* Bucket IO clocks: */ ++ ++int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, ++ size_t bucket_nr, int rw) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_i_alloc_v4 *a; ++ u64 now; ++ int ret = 0; ++ ++ a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ return ret; ++ ++ now = atomic64_read(&c->io_clock[rw].now); ++ if (a->v.io_time[rw] == now) ++ goto out; ++ ++ a->v.io_time[rw] = now; ++ ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/* Startup/shutdown (ro/rw): */ ++ ++void bch2_recalc_capacity(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ u64 capacity = 0, reserved_sectors = 0, gc_reserve; ++ unsigned bucket_size_max = 0; ++ unsigned long ra_pages = 0; ++ unsigned i; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ for_each_online_member(ca, c, i) { ++ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; 
++ ++ ra_pages += bdi->ra_pages; ++ } ++ ++ bch2_set_ra_pages(c, ra_pages); ++ ++ for_each_rw_member(ca, c, i) { ++ u64 dev_reserve = 0; ++ ++ /* ++ * We need to reserve buckets (from the number ++ * of currently available buckets) against ++ * foreground writes so that mainly copygc can ++ * make forward progress. ++ * ++ * We need enough to refill the various reserves ++ * from scratch - copygc will use its entire ++ * reserve all at once, then run against when ++ * its reserve is refilled (from the formerly ++ * available buckets). ++ * ++ * This reserve is just used when considering if ++ * allocations for foreground writes must wait - ++ * not -ENOSPC calculations. ++ */ ++ ++ dev_reserve += ca->nr_btree_reserve * 2; ++ dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ ++ ++ dev_reserve += 1; /* btree write point */ ++ dev_reserve += 1; /* copygc write point */ ++ dev_reserve += 1; /* rebalance write point */ ++ ++ dev_reserve *= ca->mi.bucket_size; ++ ++ capacity += bucket_to_sector(ca, ca->mi.nbuckets - ++ ca->mi.first_bucket); ++ ++ reserved_sectors += dev_reserve * 2; ++ ++ bucket_size_max = max_t(unsigned, bucket_size_max, ++ ca->mi.bucket_size); ++ } ++ ++ gc_reserve = c->opts.gc_reserve_bytes ++ ? c->opts.gc_reserve_bytes >> 9 ++ : div64_u64(capacity * c->opts.gc_reserve_percent, 100); ++ ++ reserved_sectors = max(gc_reserve, reserved_sectors); ++ ++ reserved_sectors = min(reserved_sectors, capacity); ++ ++ c->capacity = capacity - reserved_sectors; ++ ++ c->bucket_size_max = bucket_size_max; ++ ++ /* Wake up case someone was waiting for buckets */ ++ closure_wake_up(&c->freelist_wait); ++} ++ ++static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct open_bucket *ob; ++ bool ret = false; ++ ++ for (ob = c->open_buckets; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ++ ob++) { ++ spin_lock(&ob->lock); ++ if (ob->valid && !ob->on_partial_list && ++ ob->dev == ca->dev_idx) ++ ret = true; ++ spin_unlock(&ob->lock); ++ } ++ ++ return ret; ++} ++ ++/* device goes ro: */ ++void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ /* First, remove device from allocation groups: */ ++ ++ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) ++ clear_bit(ca->dev_idx, c->rw_devs[i].d); ++ ++ /* ++ * Capacity is calculated based off of devices in allocation groups: ++ */ ++ bch2_recalc_capacity(c); ++ ++ /* Next, close write points that point to this device... 
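As a worked example of the reserve math in bch2_recalc_capacity() above (all numbers hypothetical): a single rw device with 2^20 buckets of 1024 sectors, first_bucket 16 and an assumed nr_btree_reserve of 64 gives

	dev_reserve = 64 * 2 + (1048576 >> 6) + 3 = 16515 buckets = 16,911,360 sectors
	capacity    = (1048576 - 16) * 1024        = 1,073,725,440 sectors
	reserved    = 2 * 16,911,360               = 33,822,720 sectors

and, assuming the default gc_reserve_percent of 8, gc_reserve = capacity * 8 / 100 ≈ 85.9M sectors wins the max(), so c->capacity ends up ≈ 987.8M sectors, i.e. roughly 92% of the raw space is advertised as usable.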
*/ ++ for (i = 0; i < ARRAY_SIZE(c->write_points); i++) ++ bch2_writepoint_stop(c, ca, &c->write_points[i]); ++ ++ bch2_writepoint_stop(c, ca, &c->copygc_write_point); ++ bch2_writepoint_stop(c, ca, &c->rebalance_write_point); ++ bch2_writepoint_stop(c, ca, &c->btree_write_point); ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ while (c->btree_reserve_cache_nr) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; ++ ++ bch2_open_buckets_put(c, &a->ob); ++ } ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ ++ while (1) { ++ struct open_bucket *ob; ++ ++ spin_lock(&c->freelist_lock); ++ if (!ca->open_buckets_partial_nr) { ++ spin_unlock(&c->freelist_lock); ++ break; ++ } ++ ob = c->open_buckets + ++ ca->open_buckets_partial[--ca->open_buckets_partial_nr]; ++ ob->on_partial_list = false; ++ spin_unlock(&c->freelist_lock); ++ ++ bch2_open_bucket_put(c, ob); ++ } ++ ++ bch2_ec_stop_dev(c, ca); ++ ++ /* ++ * Wake up threads that were blocked on allocation, so they can notice ++ * the device can no longer be removed and the capacity has changed: ++ */ ++ closure_wake_up(&c->freelist_wait); ++ ++ /* ++ * journal_res_get() can block waiting for free space in the journal - ++ * it needs to notice there may not be devices to allocate from anymore: ++ */ ++ wake_up(&c->journal.wait); ++ ++ /* Now wait for any in flight writes: */ ++ ++ closure_wait_event(&c->open_buckets_wait, ++ !bch2_dev_has_open_write_point(c, ca)); ++} ++ ++/* device goes rw: */ ++void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) ++ if (ca->mi.data_allowed & (1 << i)) ++ set_bit(ca->dev_idx, c->rw_devs[i].d); ++} ++ ++void bch2_fs_allocator_background_init(struct bch_fs *c) ++{ ++ spin_lock_init(&c->freelist_lock); ++ INIT_WORK(&c->discard_work, bch2_do_discards_work); ++ INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); ++} +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +new file mode 100644 +index 000000000000..044bc72992d4 +--- /dev/null ++++ b/fs/bcachefs/alloc_background.h +@@ -0,0 +1,183 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_BACKGROUND_H ++#define _BCACHEFS_ALLOC_BACKGROUND_H ++ ++#include "bcachefs.h" ++#include "alloc_types.h" ++#include "buckets.h" ++#include "debug.h" ++#include "super.h" ++ ++/* How out of date a pointer gen is allowed to be: */ ++#define BUCKET_GC_GEN_MAX 96U ++ ++static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) ++{ ++ struct bch_dev *ca; ++ ++ if (!bch2_dev_exists2(c, pos.inode)) ++ return false; ++ ++ ca = bch_dev_bkey_exists(c, pos.inode); ++ return pos.offset >= ca->mi.first_bucket && ++ pos.offset < ca->mi.nbuckets; ++} ++ ++static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) ++{ ++ return a.gen - a.oldest_gen; ++} ++ ++static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors, ++ u32 cached_sectors, ++ u32 stripe, ++ struct bch_alloc_v4 a, ++ enum bch_data_type data_type) ++{ ++ if (dirty_sectors) ++ return data_type; ++ if (stripe) ++ return BCH_DATA_stripe; ++ if (cached_sectors) ++ return BCH_DATA_cached; ++ if (BCH_ALLOC_V4_NEED_DISCARD(&a)) ++ return BCH_DATA_need_discard; ++ if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) ++ return BCH_DATA_need_gc_gens; ++ return BCH_DATA_free; ++} ++ ++static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, ++ enum bch_data_type data_type) ++{ ++ return __alloc_data_type(a.dirty_sectors, a.cached_sectors, ++ 
a.stripe, a, data_type); ++} ++ ++static inline u64 alloc_lru_idx(struct bch_alloc_v4 a) ++{ ++ return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; ++} ++ ++static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) ++{ ++ return ((u64) alloc_gc_gen(a) >> 4) << 56; ++} ++ ++static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a) ++{ ++ pos.offset |= alloc_freespace_genbits(a); ++ return pos; ++} ++ ++static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) ++{ ++ unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: ++ BCH_ALLOC_V4_U64s_V0) + ++ BCH_ALLOC_V4_NR_BACKPOINTERS(a) * ++ (sizeof(struct bch_backpointer) / sizeof(u64)); ++ ++ BUG_ON(ret > U8_MAX - BKEY_U64s); ++ return ret; ++} ++ ++static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) ++{ ++ set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v)); ++} ++ ++struct bkey_i_alloc_v4 * ++bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); ++ ++void bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); ++struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); ++ ++int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); ++ ++#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) ++ ++int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++void bch2_alloc_v4_swab(struct bkey_s); ++void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_alloc (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_v1_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ ++} ++ ++#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_v2_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ ++} ++ ++#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_v3_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ ++} ++ ++#define bch2_bkey_ops_alloc_v4 (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_v4_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++ .swab = bch2_alloc_v4_swab, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ ++} ++ ++static inline bool bkey_is_alloc(const struct bkey *k) ++{ ++ return k->type == KEY_TYPE_alloc || ++ k->type == KEY_TYPE_alloc_v2 || ++ k->type == KEY_TYPE_alloc_v3; ++} ++ ++int bch2_alloc_read(struct bch_fs *); ++ ++int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned, ++ struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_check_alloc_info(struct bch_fs *); ++int bch2_check_alloc_to_lru_refs(struct bch_fs *); ++void bch2_do_discards(struct bch_fs *); ++ ++static inline u64 should_invalidate_buckets(struct bch_dev *ca, ++ struct bch_dev_usage u) ++{ ++ u64 want_free = ca->mi.nbuckets >> 7; ++ u64 free = max_t(s64, 0, ++ u.d[BCH_DATA_free].buckets ++ + u.d[BCH_DATA_need_discard].buckets ++ - bch2_dev_buckets_reserved(ca, RESERVE_none)); ++ ++ return 
clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); ++} ++ ++void bch2_do_invalidates(struct bch_fs *); ++ ++static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) ++{ ++ return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); ++} ++ ++static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a) ++{ ++ return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); ++} ++ ++int bch2_fs_freespace_init(struct bch_fs *); ++ ++void bch2_recalc_capacity(struct bch_fs *); ++ ++void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); ++void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); ++ ++void bch2_fs_allocator_background_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +new file mode 100644 +index 000000000000..6e52230e69e1 +--- /dev/null ++++ b/fs/bcachefs/alloc_foreground.c +@@ -0,0 +1,1380 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright 2012 Google, Inc. ++ * ++ * Foreground allocator code: allocate buckets from freelist, and allocate in ++ * sector granularity from writepoints. ++ * ++ * bch2_bucket_alloc() allocates a single bucket from a specific device. ++ * ++ * bch2_bucket_alloc_set() allocates one or more buckets from different devices ++ * in a given filesystem. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "buckets_waiting_for_journal.h" ++#include "clock.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "io.h" ++#include "journal.h" ++#include "movinggc.h" ++ ++#include ++#include ++#include ++#include ++ ++const char * const bch2_alloc_reserves[] = { ++#define x(t) #t, ++ BCH_ALLOC_RESERVES() ++#undef x ++ NULL ++}; ++ ++/* ++ * Open buckets represent a bucket that's currently being allocated from. They ++ * serve two purposes: ++ * ++ * - They track buckets that have been partially allocated, allowing for ++ * sub-bucket sized allocations - they're used by the sector allocator below ++ * ++ * - They provide a reference to the buckets they own that mark and sweep GC ++ * can find, until the new allocation has a pointer to it inserted into the ++ * btree ++ * ++ * When allocating some space with the sector allocator, the allocation comes ++ * with a reference to an open bucket - the caller is required to put that ++ * reference _after_ doing the index update that makes its allocation reachable. 
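For illustration, a hypothetical caller following that contract could look like the sketch below. It only shows the ordering described above (the open-bucket reference is put after the index update); real callers go through the write point code later in this file, and demo_alloc_one_bucket() is not a function from the patch:

	static int demo_alloc_one_bucket(struct bch_fs *c, struct bch_dev *ca,
					 struct closure *cl)
	{
		struct open_bucket *ob =
			bch2_bucket_alloc(c, ca, RESERVE_none, false, cl);

		if (IS_ERR(ob))
			return PTR_ERR(ob);

		// ... write into the bucket, then insert the key pointing at it ...

		bch2_open_bucket_put(c, ob);	// drop the reference only after the index update
		return 0;
	}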
++ */ ++ ++static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) ++{ ++ open_bucket_idx_t idx = ob - c->open_buckets; ++ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); ++ ++ ob->hash = *slot; ++ *slot = idx; ++} ++ ++static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob) ++{ ++ open_bucket_idx_t idx = ob - c->open_buckets; ++ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); ++ ++ while (*slot != idx) { ++ BUG_ON(!*slot); ++ slot = &c->open_buckets[*slot].hash; ++ } ++ ++ *slot = ob->hash; ++ ob->hash = 0; ++} ++ ++void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ ++ if (ob->ec) { ++ bch2_ec_bucket_written(c, ob); ++ return; ++ } ++ ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&ob->lock); ++ ++ ob->valid = false; ++ ob->data_type = 0; ++ ++ spin_unlock(&ob->lock); ++ percpu_up_read(&c->mark_lock); ++ ++ spin_lock(&c->freelist_lock); ++ bch2_open_bucket_hash_remove(c, ob); ++ ++ ob->freelist = c->open_buckets_freelist; ++ c->open_buckets_freelist = ob - c->open_buckets; ++ ++ c->open_buckets_nr_free++; ++ ca->nr_open_buckets--; ++ spin_unlock(&c->freelist_lock); ++ ++ closure_wake_up(&c->open_buckets_wait); ++} ++ ++void bch2_open_bucket_write_error(struct bch_fs *c, ++ struct open_buckets *obs, ++ unsigned dev) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, obs, ob, i) ++ if (ob->dev == dev && ob->ec) ++ bch2_ec_bucket_cancel(c, ob); ++} ++ ++static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ ++ BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); ++ ++ ob = c->open_buckets + c->open_buckets_freelist; ++ c->open_buckets_freelist = ob->freelist; ++ atomic_set(&ob->pin, 1); ++ ob->data_type = 0; ++ ++ c->open_buckets_nr_free--; ++ return ob; ++} ++ ++static void open_bucket_free_unused(struct bch_fs *c, ++ struct write_point *wp, ++ struct open_bucket *ob) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ bool may_realloc = wp->data_type == BCH_DATA_user; ++ ++ BUG_ON(ca->open_buckets_partial_nr > ++ ARRAY_SIZE(ca->open_buckets_partial)); ++ ++ if (ca->open_buckets_partial_nr < ++ ARRAY_SIZE(ca->open_buckets_partial) && ++ may_realloc) { ++ spin_lock(&c->freelist_lock); ++ ob->on_partial_list = true; ++ ca->open_buckets_partial[ca->open_buckets_partial_nr++] = ++ ob - c->open_buckets; ++ spin_unlock(&c->freelist_lock); ++ ++ closure_wake_up(&c->open_buckets_wait); ++ closure_wake_up(&c->freelist_wait); ++ } else { ++ bch2_open_bucket_put(c, ob); ++ } ++} ++ ++/* _only_ for allocating the journal on a new device: */ ++long bch2_bucket_alloc_new_fs(struct bch_dev *ca) ++{ ++ while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { ++ u64 b = ca->new_fs_bucket_idx++; ++ ++ if (!is_superblock_bucket(ca, b) && ++ (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) ++ return b; ++ } ++ ++ return -1; ++} ++ ++static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) ++{ ++ switch (reserve) { ++ case RESERVE_btree: ++ case RESERVE_btree_movinggc: ++ return 0; ++ case RESERVE_movinggc: ++ return OPEN_BUCKETS_COUNT / 4; ++ default: ++ return OPEN_BUCKETS_COUNT / 2; ++ } ++} ++ ++static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ++ u64 bucket, ++ enum alloc_reserve reserve, ++ struct bch_alloc_v4 *a, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 
*skipped_nouse, ++ struct closure *cl) ++{ ++ struct open_bucket *ob; ++ ++ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { ++ (*skipped_nouse)++; ++ return NULL; ++ } ++ ++ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { ++ (*skipped_open)++; ++ return NULL; ++ } ++ ++ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { ++ (*skipped_need_journal_commit)++; ++ return NULL; ++ } ++ ++ spin_lock(&c->freelist_lock); ++ ++ if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { ++ if (cl) ++ closure_wait(&c->open_buckets_wait, cl); ++ ++ if (!c->blocked_allocate_open_bucket) ++ c->blocked_allocate_open_bucket = local_clock(); ++ ++ spin_unlock(&c->freelist_lock); ++ return ERR_PTR(-BCH_ERR_open_buckets_empty); ++ } ++ ++ /* Recheck under lock: */ ++ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { ++ spin_unlock(&c->freelist_lock); ++ (*skipped_open)++; ++ return NULL; ++ } ++ ++ ob = bch2_open_bucket_alloc(c); ++ ++ spin_lock(&ob->lock); ++ ++ ob->valid = true; ++ ob->sectors_free = ca->mi.bucket_size; ++ ob->alloc_reserve = reserve; ++ ob->dev = ca->dev_idx; ++ ob->gen = a->gen; ++ ob->bucket = bucket; ++ spin_unlock(&ob->lock); ++ ++ ca->nr_open_buckets++; ++ bch2_open_bucket_hash_add(c, ob); ++ ++ if (c->blocked_allocate_open_bucket) { ++ bch2_time_stats_update( ++ &c->times[BCH_TIME_blocked_allocate_open_bucket], ++ c->blocked_allocate_open_bucket); ++ c->blocked_allocate_open_bucket = 0; ++ } ++ ++ if (c->blocked_allocate) { ++ bch2_time_stats_update( ++ &c->times[BCH_TIME_blocked_allocate], ++ c->blocked_allocate); ++ c->blocked_allocate = 0; ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ ++ trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]); ++ return ob; ++} ++ ++static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, ++ enum alloc_reserve reserve, u64 free_entry, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, ++ struct bkey_s_c freespace_k, ++ struct closure *cl) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter = { NULL }; ++ struct bkey_s_c k; ++ struct open_bucket *ob; ++ struct bch_alloc_v4 a; ++ u64 b = free_entry & ~(~0ULL << 56); ++ unsigned genbits = free_entry >> 56; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) { ++ prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" ++ " freespace key ", ++ ca->mi.first_bucket, ca->mi.nbuckets); ++ bch2_bkey_val_to_text(&buf, c, freespace_k); ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ ob = ERR_PTR(-EIO); ++ goto err; ++ } ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) { ++ ob = ERR_PTR(ret); ++ goto err; ++ } ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (genbits != (alloc_freespace_genbits(a) >> 56)) { ++ prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" ++ " freespace key ", ++ genbits, alloc_freespace_genbits(a) >> 56); ++ bch2_bkey_val_to_text(&buf, c, freespace_k); ++ prt_printf(&buf, "\n "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ ob = ERR_PTR(-EIO); ++ goto err; ++ ++ } ++ ++ if (a.data_type != BCH_DATA_free) { ++ prt_printf(&buf, "non free bucket in freespace btree\n" ++ " freespace key "); ++ bch2_bkey_val_to_text(&buf, c, 
freespace_k); ++ prt_printf(&buf, "\n "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ ob = ERR_PTR(-EIO); ++ goto err; ++ } ++ ++ ob = __try_alloc_bucket(c, ca, b, reserve, &a, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl); ++ if (!ob) ++ iter.path->preserve = false; ++err: ++ set_btree_iter_dontneed(&iter); ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ob; ++} ++ ++static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve) ++{ ++ struct open_bucket *ob; ++ int i; ++ ++ spin_lock(&c->freelist_lock); ++ ++ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { ++ ob = c->open_buckets + ca->open_buckets_partial[i]; ++ ++ if (reserve <= ob->alloc_reserve) { ++ array_remove_item(ca->open_buckets_partial, ++ ca->open_buckets_partial_nr, ++ i); ++ ob->on_partial_list = false; ++ ob->alloc_reserve = reserve; ++ spin_unlock(&c->freelist_lock); ++ return ob; ++ } ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ return NULL; ++} ++ ++/* ++ * This path is for before the freespace btree is initialized: ++ * ++ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & ++ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx ++ */ ++static noinline struct open_bucket * ++bch2_bucket_alloc_early(struct btree_trans *trans, ++ struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ u64 *cur_bucket, ++ u64 *buckets_seen, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, ++ struct closure *cl) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct open_bucket *ob = NULL; ++ int ret; ++ ++ *cur_bucket = max_t(u64, *cur_bucket, ca->mi.first_bucket); ++ *cur_bucket = max_t(u64, *cur_bucket, ca->new_fs_bucket_idx); ++ ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket), ++ BTREE_ITER_SLOTS, k, ret) { ++ struct bch_alloc_v4 a; ++ ++ if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) ++ break; ++ ++ if (ca->new_fs_bucket_idx && ++ is_superblock_bucket(ca, k.k->p.offset)) ++ continue; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (a.data_type != BCH_DATA_free) ++ continue; ++ ++ (*buckets_seen)++; ++ ++ ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl); ++ if (ob) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ *cur_bucket = iter.pos.offset; ++ ++ return ob ?: ERR_PTR(ret ?: -BCH_ERR_no_buckets_found); ++} ++ ++static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, ++ struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ u64 *cur_bucket, ++ u64 *buckets_seen, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, ++ struct closure *cl) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct open_bucket *ob = NULL; ++ int ret; ++ ++ BUG_ON(ca->new_fs_bucket_idx); ++ ++ /* ++ * XXX: ++ * On transaction restart, we'd like to restart from the bucket we were ++ * at previously ++ */ ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, ++ POS(ca->dev_idx, *cur_bucket), 0, k, ret) { ++ if (k.k->p.inode != ca->dev_idx) ++ break; ++ ++ for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k)); ++ *cur_bucket < k.k->p.offset; ++ (*cur_bucket)++) { ++ ret = btree_trans_too_many_iters(trans); ++ if (ret) ++ break; ++ ++ (*buckets_seen)++; ++ ++ ob = try_alloc_bucket(trans, ca, 
reserve, ++ *cur_bucket, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ k, cl); ++ if (ob) ++ break; ++ } ++ ++ if (ob || ret) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ob ?: ERR_PTR(ret); ++} ++ ++/** ++ * bch_bucket_alloc - allocate a single bucket from a specific device ++ * ++ * Returns index of bucket on success, 0 on failure ++ * */ ++static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, ++ struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ bool may_alloc_partial, ++ struct closure *cl) ++{ ++ struct bch_fs *c = trans->c; ++ struct open_bucket *ob = NULL; ++ struct bch_dev_usage usage; ++ bool freespace_initialized = READ_ONCE(ca->mi.freespace_initialized); ++ u64 start = freespace_initialized ? 0 : ca->bucket_alloc_trans_early_cursor; ++ u64 avail; ++ u64 cur_bucket = start; ++ u64 buckets_seen = 0; ++ u64 skipped_open = 0; ++ u64 skipped_need_journal_commit = 0; ++ u64 skipped_nouse = 0; ++ bool waiting = false; ++ int ret; ++again: ++ usage = bch2_dev_usage_read(ca); ++ avail = dev_buckets_free(ca, usage, reserve); ++ ++ if (usage.d[BCH_DATA_need_discard].buckets > avail) ++ bch2_do_discards(c); ++ ++ if (usage.d[BCH_DATA_need_gc_gens].buckets > avail) ++ bch2_do_gc_gens(c); ++ ++ if (should_invalidate_buckets(ca, usage)) ++ bch2_do_invalidates(c); ++ ++ if (!avail) { ++ if (cl && !waiting) { ++ closure_wait(&c->freelist_wait, cl); ++ waiting = true; ++ goto again; ++ } ++ ++ if (!c->blocked_allocate) ++ c->blocked_allocate = local_clock(); ++ ++ ob = ERR_PTR(-BCH_ERR_freelist_empty); ++ goto err; ++ } ++ ++ if (waiting) ++ closure_wake_up(&c->freelist_wait); ++ ++ if (may_alloc_partial) { ++ ob = try_alloc_partial_bucket(c, ca, reserve); ++ if (ob) ++ return ob; ++ } ++ ++ ob = likely(ca->mi.freespace_initialized) ++ ? 
bch2_bucket_alloc_freelist(trans, ca, reserve, ++ &cur_bucket, ++ &buckets_seen, ++ &skipped_open, ++ &skipped_need_journal_commit, ++ &skipped_nouse, ++ cl) ++ : bch2_bucket_alloc_early(trans, ca, reserve, ++ &cur_bucket, ++ &buckets_seen, ++ &skipped_open, ++ &skipped_need_journal_commit, ++ &skipped_nouse, ++ cl); ++ ++ if (skipped_need_journal_commit * 2 > avail) ++ bch2_journal_flush_async(&c->journal, NULL); ++ ++ if (!ob && !ret && !freespace_initialized && start) { ++ start = cur_bucket = 0; ++ goto again; ++ } ++ ++ if (!freespace_initialized) ++ ca->bucket_alloc_trans_early_cursor = cur_bucket; ++err: ++ if (!ob) ++ ob = ERR_PTR(ret ?: -BCH_ERR_no_buckets_found); ++ ++ if (IS_ERR(ob)) { ++ trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], ++ usage.d[BCH_DATA_free].buckets, ++ avail, ++ bch2_copygc_wait_amount(c), ++ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), ++ buckets_seen, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl == NULL, ++ bch2_err_str(PTR_ERR(ob))); ++ atomic_long_inc(&c->bucket_alloc_fail); ++ } ++ ++ return ob; ++} ++ ++struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ bool may_alloc_partial, ++ struct closure *cl) ++{ ++ struct open_bucket *ob; ++ ++ bch2_trans_do(c, NULL, NULL, 0, ++ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, ++ may_alloc_partial, cl))); ++ return ob; ++} ++ ++static int __dev_stripe_cmp(struct dev_stripe_state *stripe, ++ unsigned l, unsigned r) ++{ ++ return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - ++ (stripe->next_alloc[l] < stripe->next_alloc[r])); ++} ++ ++#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) ++ ++struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs) ++{ ++ struct dev_alloc_list ret = { .nr = 0 }; ++ unsigned i; ++ ++ for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) ++ ret.devs[ret.nr++] = i; ++ ++ bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); ++ return ret; ++} ++ ++void bch2_dev_stripe_increment(struct bch_dev *ca, ++ struct dev_stripe_state *stripe) ++{ ++ u64 *v = stripe->next_alloc + ca->dev_idx; ++ u64 free_space = dev_buckets_available(ca, RESERVE_none); ++ u64 free_space_inv = free_space ++ ? div64_u64(1ULL << 48, free_space) ++ : 1ULL << 48; ++ u64 scale = *v / 4; ++ ++ if (*v + free_space_inv >= *v) ++ *v += free_space_inv; ++ else ++ *v = U64_MAX; ++ ++ for (v = stripe->next_alloc; ++ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) ++ *v = *v < scale ? 0 : *v - scale; ++} ++ ++#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) ++#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) ++ ++static void add_new_bucket(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags, ++ struct open_bucket *ob) ++{ ++ unsigned durability = ++ bch_dev_bkey_exists(c, ob->dev)->mi.durability; ++ ++ __clear_bit(ob->dev, devs_may_alloc->d); ++ *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) ++ ? 
durability : 1; ++ *have_cache |= !durability; ++ ++ ob_push(c, ptrs, ob); ++} ++ ++static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, ++ struct open_buckets *ptrs, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct bch_fs *c = trans->c; ++ struct dev_alloc_list devs_sorted = ++ bch2_dev_alloc_list(c, stripe, devs_may_alloc); ++ unsigned dev; ++ struct bch_dev *ca; ++ int ret = -BCH_ERR_insufficient_devices; ++ unsigned i; ++ ++ BUG_ON(*nr_effective >= nr_replicas); ++ ++ for (i = 0; i < devs_sorted.nr; i++) { ++ struct open_bucket *ob; ++ ++ dev = devs_sorted.devs[i]; ++ ++ rcu_read_lock(); ++ ca = rcu_dereference(c->devs[dev]); ++ if (ca) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ ++ if (!ca) ++ continue; ++ ++ if (!ca->mi.durability && *have_cache) { ++ percpu_ref_put(&ca->ref); ++ continue; ++ } ++ ++ ob = bch2_bucket_alloc_trans(trans, ca, reserve, ++ flags & BUCKET_MAY_ALLOC_PARTIAL, cl); ++ if (!IS_ERR(ob)) ++ bch2_dev_stripe_increment(ca, stripe); ++ percpu_ref_put(&ca->ref); ++ ++ if (IS_ERR(ob)) { ++ ret = PTR_ERR(ob); ++ if (ret == -EINTR || cl) ++ break; ++ continue; ++ } ++ ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, have_cache, flags, ob); ++ ++ if (*nr_effective >= nr_replicas) { ++ ret = 0; ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++int bch2_bucket_alloc_set(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ return bch2_trans_do(c, NULL, NULL, 0, ++ bch2_bucket_alloc_set_trans(&trans, ptrs, stripe, ++ devs_may_alloc, nr_replicas, ++ nr_effective, have_cache, reserve, ++ flags, cl)); ++} ++ ++/* Allocate from stripes: */ ++ ++/* ++ * if we can't allocate a new stripe because there are already too many ++ * partially filled stripes, force allocating from an existing stripe even when ++ * it's to a device we don't want: ++ */ ++ ++static int bucket_alloc_from_stripe(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_mask *devs_may_alloc, ++ u16 target, ++ unsigned erasure_code, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct dev_alloc_list devs_sorted; ++ struct ec_stripe_head *h; ++ struct open_bucket *ob; ++ struct bch_dev *ca; ++ unsigned i, ec_idx; ++ ++ if (!erasure_code) ++ return 0; ++ ++ if (nr_replicas < 2) ++ return 0; ++ ++ if (ec_open_bucket(c, ptrs)) ++ return 0; ++ ++ h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1, ++ wp == &c->copygc_write_point, ++ cl); ++ if (IS_ERR(h)) ++ return -PTR_ERR(h); ++ if (!h) ++ return 0; ++ ++ devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); ++ ++ for (i = 0; i < devs_sorted.nr; i++) ++ for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { ++ if (!h->s->blocks[ec_idx]) ++ continue; ++ ++ ob = c->open_buckets + h->s->blocks[ec_idx]; ++ if (ob->dev == devs_sorted.devs[i] && ++ !test_and_set_bit(ec_idx, h->s->blocks_allocated)) ++ goto got_bucket; ++ } ++ goto out_put_head; ++got_bucket: ++ ca = bch_dev_bkey_exists(c, ob->dev); ++ ++ ob->ec_idx = ec_idx; ++ ob->ec = h->s; ++ ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, 
have_cache, flags, ob); ++ atomic_inc(&h->s->pin); ++out_put_head: ++ bch2_ec_stripe_head_put(c, h); ++ return 0; ++} ++ ++/* Sector allocator */ ++ ++static void get_buckets_from_writepoint(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags, ++ bool need_ec) ++{ ++ struct open_buckets ptrs_skip = { .nr = 0 }; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ ++ if (*nr_effective < nr_replicas && ++ test_bit(ob->dev, devs_may_alloc->d) && ++ (ca->mi.durability || ++ (wp->data_type == BCH_DATA_user && !*have_cache)) && ++ (ob->ec || !need_ec)) { ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, have_cache, ++ flags, ob); ++ } else { ++ ob_push(c, &ptrs_skip, ob); ++ } ++ } ++ wp->ptrs = ptrs_skip; ++} ++ ++static int open_bucket_add_buckets(struct btree_trans *trans, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_list *devs_have, ++ u16 target, ++ unsigned erasure_code, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *_cl) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_devs_mask devs; ++ struct open_bucket *ob; ++ struct closure *cl = NULL; ++ int ret; ++ unsigned i; ++ ++ rcu_read_lock(); ++ devs = target_rw_devs(c, wp->data_type, target); ++ rcu_read_unlock(); ++ ++ /* Don't allocate from devices we already have pointers to: */ ++ for (i = 0; i < devs_have->nr; i++) ++ __clear_bit(devs_have->devs[i], devs.d); ++ ++ open_bucket_for_each(c, ptrs, ob, i) ++ __clear_bit(ob->dev, devs.d); ++ ++ if (erasure_code) { ++ if (!ec_open_bucket(c, ptrs)) { ++ get_buckets_from_writepoint(c, ptrs, wp, &devs, ++ nr_replicas, nr_effective, ++ have_cache, flags, true); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ } ++ ++ if (!ec_open_bucket(c, ptrs)) { ++ ret = bucket_alloc_from_stripe(c, ptrs, wp, &devs, ++ target, erasure_code, ++ nr_replicas, nr_effective, ++ have_cache, flags, _cl); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || ++ bch2_err_matches(ret, BCH_ERR_freelist_empty) || ++ bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) ++ return ret; ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ } ++ } ++ ++ get_buckets_from_writepoint(c, ptrs, wp, &devs, ++ nr_replicas, nr_effective, ++ have_cache, flags, false); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ ++retry_blocking: ++ /* ++ * Try nonblocking first, so that if one device is full we'll try from ++ * other devices: ++ */ ++ ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, ++ nr_replicas, nr_effective, have_cache, ++ reserve, flags, cl); ++ if (ret && ++ !bch2_err_matches(ret, BCH_ERR_transaction_restart) && ++ !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && ++ !cl && _cl) { ++ cl = _cl; ++ goto retry_blocking; ++ } ++ ++ return ret; ++} ++ ++void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, ++ struct open_buckets *obs) ++{ ++ struct open_buckets ptrs = { .nr = 0 }; ++ struct open_bucket *ob, *ob2; ++ unsigned i, j; ++ ++ open_bucket_for_each(c, obs, ob, i) { ++ bool drop = !ca || ob->dev == ca->dev_idx; ++ ++ if (!drop && ob->ec) { ++ mutex_lock(&ob->ec->lock); ++ for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) { ++ if (!ob->ec->blocks[j]) ++ continue; ++ ++ ob2 = 
c->open_buckets + ob->ec->blocks[j]; ++ drop |= ob2->dev == ca->dev_idx; ++ } ++ mutex_unlock(&ob->ec->lock); ++ } ++ ++ if (drop) ++ bch2_open_bucket_put(c, ob); ++ else ++ ob_push(c, &ptrs, ob); ++ } ++ ++ *obs = ptrs; ++} ++ ++void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, ++ struct write_point *wp) ++{ ++ mutex_lock(&wp->lock); ++ bch2_open_buckets_stop_dev(c, ca, &wp->ptrs); ++ mutex_unlock(&wp->lock); ++} ++ ++static inline struct hlist_head *writepoint_hash(struct bch_fs *c, ++ unsigned long write_point) ++{ ++ unsigned hash = ++ hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); ++ ++ return &c->write_points_hash[hash]; ++} ++ ++static struct write_point *__writepoint_find(struct hlist_head *head, ++ unsigned long write_point) ++{ ++ struct write_point *wp; ++ ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(wp, head, node) ++ if (wp->write_point == write_point) ++ goto out; ++ wp = NULL; ++out: ++ rcu_read_unlock(); ++ return wp; ++} ++ ++static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) ++{ ++ u64 stranded = c->write_points_nr * c->bucket_size_max; ++ u64 free = bch2_fs_usage_read_short(c).free; ++ ++ return stranded * factor > free; ++} ++ ++static bool try_increase_writepoints(struct bch_fs *c) ++{ ++ struct write_point *wp; ++ ++ if (c->write_points_nr == ARRAY_SIZE(c->write_points) || ++ too_many_writepoints(c, 32)) ++ return false; ++ ++ wp = c->write_points + c->write_points_nr++; ++ hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); ++ return true; ++} ++ ++static bool try_decrease_writepoints(struct bch_fs *c, ++ unsigned old_nr) ++{ ++ struct write_point *wp; ++ ++ mutex_lock(&c->write_points_hash_lock); ++ if (c->write_points_nr < old_nr) { ++ mutex_unlock(&c->write_points_hash_lock); ++ return true; ++ } ++ ++ if (c->write_points_nr == 1 || ++ !too_many_writepoints(c, 8)) { ++ mutex_unlock(&c->write_points_hash_lock); ++ return false; ++ } ++ ++ wp = c->write_points + --c->write_points_nr; ++ ++ hlist_del_rcu(&wp->node); ++ mutex_unlock(&c->write_points_hash_lock); ++ ++ bch2_writepoint_stop(c, NULL, wp); ++ return true; ++} ++ ++static void bch2_trans_mutex_lock(struct btree_trans *trans, ++ struct mutex *lock) ++{ ++ if (!mutex_trylock(lock)) { ++ bch2_trans_unlock(trans); ++ mutex_lock(lock); ++ } ++} ++ ++static struct write_point *writepoint_find(struct btree_trans *trans, ++ unsigned long write_point) ++{ ++ struct bch_fs *c = trans->c; ++ struct write_point *wp, *oldest; ++ struct hlist_head *head; ++ ++ if (!(write_point & 1UL)) { ++ wp = (struct write_point *) write_point; ++ bch2_trans_mutex_lock(trans, &wp->lock); ++ return wp; ++ } ++ ++ head = writepoint_hash(c, write_point); ++restart_find: ++ wp = __writepoint_find(head, write_point); ++ if (wp) { ++lock_wp: ++ bch2_trans_mutex_lock(trans, &wp->lock); ++ if (wp->write_point == write_point) ++ goto out; ++ mutex_unlock(&wp->lock); ++ goto restart_find; ++ } ++restart_find_oldest: ++ oldest = NULL; ++ for (wp = c->write_points; ++ wp < c->write_points + c->write_points_nr; wp++) ++ if (!oldest || time_before64(wp->last_used, oldest->last_used)) ++ oldest = wp; ++ ++ bch2_trans_mutex_lock(trans, &oldest->lock); ++ bch2_trans_mutex_lock(trans, &c->write_points_hash_lock); ++ if (oldest >= c->write_points + c->write_points_nr || ++ try_increase_writepoints(c)) { ++ mutex_unlock(&c->write_points_hash_lock); ++ mutex_unlock(&oldest->lock); ++ goto restart_find_oldest; ++ } ++ ++ wp = __writepoint_find(head, write_point); ++ if (wp && wp != 
oldest) { ++ mutex_unlock(&c->write_points_hash_lock); ++ mutex_unlock(&oldest->lock); ++ goto lock_wp; ++ } ++ ++ wp = oldest; ++ hlist_del_rcu(&wp->node); ++ wp->write_point = write_point; ++ hlist_add_head_rcu(&wp->node, head); ++ mutex_unlock(&c->write_points_hash_lock); ++out: ++ wp->last_used = sched_clock(); ++ return wp; ++} ++ ++/* ++ * Get us an open_bucket we can allocate from, return with it locked: ++ */ ++struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *trans, ++ unsigned target, ++ unsigned erasure_code, ++ struct write_point_specifier write_point, ++ struct bch_devs_list *devs_have, ++ unsigned nr_replicas, ++ unsigned nr_replicas_required, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct bch_fs *c = trans->c; ++ struct write_point *wp; ++ struct open_bucket *ob; ++ struct open_buckets ptrs; ++ unsigned nr_effective, write_points_nr; ++ unsigned ob_flags = 0; ++ bool have_cache; ++ int ret; ++ int i; ++ ++ if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) ++ ob_flags |= BUCKET_ALLOC_USE_DURABILITY; ++ ++ BUG_ON(!nr_replicas || !nr_replicas_required); ++retry: ++ ptrs.nr = 0; ++ nr_effective = 0; ++ write_points_nr = c->write_points_nr; ++ have_cache = false; ++ ++ wp = writepoint_find(trans, write_point.v); ++ ++ if (wp->data_type == BCH_DATA_user) ++ ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; ++ ++ /* metadata may not allocate on cache devices: */ ++ if (wp->data_type != BCH_DATA_user) ++ have_cache = true; ++ ++ if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { ++ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, ++ target, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, cl); ++ } else { ++ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, ++ target, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, NULL); ++ if (!ret || ret == -EINTR) ++ goto alloc_done; ++ ++ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, ++ 0, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, cl); ++ } ++alloc_done: ++ BUG_ON(!ret && nr_effective < nr_replicas); ++ ++ if (erasure_code && !ec_open_bucket(c, &ptrs)) ++ pr_debug("failed to get ec bucket: ret %u", ret); ++ ++ if (ret == -BCH_ERR_insufficient_devices && ++ nr_effective >= nr_replicas_required) ++ ret = 0; ++ ++ if (ret) ++ goto err; ++ ++ /* Free buckets we didn't use: */ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ open_bucket_free_unused(c, wp, ob); ++ ++ wp->ptrs = ptrs; ++ ++ wp->sectors_free = UINT_MAX; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ wp->sectors_free = min(wp->sectors_free, ob->sectors_free); ++ ++ BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); ++ ++ return wp; ++err: ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ if (ptrs.nr < ARRAY_SIZE(ptrs.v)) ++ ob_push(c, &ptrs, ob); ++ else ++ open_bucket_free_unused(c, wp, ob); ++ wp->ptrs = ptrs; ++ ++ mutex_unlock(&wp->lock); ++ ++ if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && ++ try_decrease_writepoints(c, write_points_nr)) ++ goto retry; ++ ++ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || ++ bch2_err_matches(ret, BCH_ERR_freelist_empty)) ++ return cl ? 
ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); ++ ++ if (bch2_err_matches(ret, BCH_ERR_insufficient_devices)) ++ return ERR_PTR(-EROFS); ++ ++ return ERR_PTR(ret); ++} ++ ++struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, ++ unsigned target, ++ unsigned erasure_code, ++ struct write_point_specifier write_point, ++ struct bch_devs_list *devs_have, ++ unsigned nr_replicas, ++ unsigned nr_replicas_required, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct write_point *wp; ++ ++ bch2_trans_do(c, NULL, NULL, 0, ++ PTR_ERR_OR_ZERO(wp = bch2_alloc_sectors_start_trans(&trans, target, ++ erasure_code, ++ write_point, ++ devs_have, ++ nr_replicas, ++ nr_replicas_required, ++ reserve, ++ flags, cl))); ++ return wp; ++ ++} ++ ++struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ ++ return (struct bch_extent_ptr) { ++ .type = 1 << BCH_EXTENT_ENTRY_ptr, ++ .gen = ob->gen, ++ .dev = ob->dev, ++ .offset = bucket_to_sector(ca, ob->bucket) + ++ ca->mi.bucket_size - ++ ob->sectors_free, ++ }; ++} ++ ++/* ++ * Append pointers to the space we just allocated to @k, and mark @sectors space ++ * as allocated out of @ob ++ */ ++void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, ++ struct bkey_i *k, unsigned sectors, ++ bool cached) ++ ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ BUG_ON(sectors > wp->sectors_free); ++ wp->sectors_free -= sectors; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); ++ ++ ptr.cached = cached || ++ (!ca->mi.durability && ++ wp->data_type == BCH_DATA_user); ++ ++ bch2_bkey_append_ptr(k, ptr); ++ ++ BUG_ON(sectors > ob->sectors_free); ++ ob->sectors_free -= sectors; ++ } ++} ++ ++/* ++ * Append pointers to the space we just allocated to @k, and mark @sectors space ++ * as allocated out of @ob ++ */ ++void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) ++{ ++ struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ ob_push(c, !ob->sectors_free ? 
&ptrs : &keep, ob); ++ wp->ptrs = keep; ++ ++ mutex_unlock(&wp->lock); ++ ++ bch2_open_buckets_put(c, &ptrs); ++} ++ ++static inline void writepoint_init(struct write_point *wp, ++ enum bch_data_type type) ++{ ++ mutex_init(&wp->lock); ++ wp->data_type = type; ++} ++ ++void bch2_fs_allocator_foreground_init(struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ struct write_point *wp; ++ ++ mutex_init(&c->write_points_hash_lock); ++ c->write_points_nr = ARRAY_SIZE(c->write_points); ++ ++ /* open bucket 0 is a sentinal NULL: */ ++ spin_lock_init(&c->open_buckets[0].lock); ++ ++ for (ob = c->open_buckets + 1; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { ++ spin_lock_init(&ob->lock); ++ c->open_buckets_nr_free++; ++ ++ ob->freelist = c->open_buckets_freelist; ++ c->open_buckets_freelist = ob - c->open_buckets; ++ } ++ ++ writepoint_init(&c->btree_write_point, BCH_DATA_btree); ++ writepoint_init(&c->rebalance_write_point, BCH_DATA_user); ++ writepoint_init(&c->copygc_write_point, BCH_DATA_user); ++ ++ for (wp = c->write_points; ++ wp < c->write_points + c->write_points_nr; wp++) { ++ writepoint_init(wp, BCH_DATA_user); ++ ++ wp->last_used = sched_clock(); ++ wp->write_point = (unsigned long) wp; ++ hlist_add_head_rcu(&wp->node, ++ writepoint_hash(c, wp->write_point)); ++ } ++} ++ ++void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ ++ for (ob = c->open_buckets; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ++ ob++) { ++ spin_lock(&ob->lock); ++ if (ob->valid && !ob->on_partial_list) { ++ prt_printf(out, "%zu ref %u type %s %u:%llu:%u\n", ++ ob - c->open_buckets, ++ atomic_read(&ob->pin), ++ bch2_data_types[ob->data_type], ++ ob->dev, ob->bucket, ob->gen); ++ } ++ spin_unlock(&ob->lock); ++ } ++} +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +new file mode 100644 +index 000000000000..6de63a351fa8 +--- /dev/null ++++ b/fs/bcachefs/alloc_foreground.h +@@ -0,0 +1,181 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_FOREGROUND_H ++#define _BCACHEFS_ALLOC_FOREGROUND_H ++ ++#include "bcachefs.h" ++#include "alloc_types.h" ++ ++#include ++ ++struct bkey; ++struct bch_dev; ++struct bch_fs; ++struct bch_devs_List; ++ ++extern const char * const bch2_alloc_reserves[]; ++ ++struct dev_alloc_list { ++ unsigned nr; ++ u8 devs[BCH_SB_MEMBERS_MAX]; ++}; ++ ++struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, ++ struct dev_stripe_state *, ++ struct bch_devs_mask *); ++void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); ++ ++long bch2_bucket_alloc_new_fs(struct bch_dev *); ++ ++struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, ++ enum alloc_reserve, bool, ++ struct closure *); ++ ++static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, ++ struct open_bucket *ob) ++{ ++ BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); ++ ++ obs->v[obs->nr++] = ob - c->open_buckets; ++} ++ ++#define open_bucket_for_each(_c, _obs, _ob, _i) \ ++ for ((_i) = 0; \ ++ (_i) < (_obs)->nr && \ ++ ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ ++ (_i)++) ++ ++static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, ++ struct open_buckets *obs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, obs, ob, i) ++ if (ob->ec) ++ return ob; ++ ++ return NULL; ++} ++ ++void bch2_open_bucket_write_error(struct bch_fs *, ++ struct open_buckets *, unsigned); ++ ++void __bch2_open_bucket_put(struct bch_fs *, struct 
open_bucket *); ++ ++static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) ++{ ++ if (atomic_dec_and_test(&ob->pin)) ++ __bch2_open_bucket_put(c, ob); ++} ++ ++static inline void bch2_open_buckets_put(struct bch_fs *c, ++ struct open_buckets *ptrs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, ptrs, ob, i) ++ bch2_open_bucket_put(c, ob); ++ ptrs->nr = 0; ++} ++ ++static inline void bch2_open_bucket_get(struct bch_fs *c, ++ struct write_point *wp, ++ struct open_buckets *ptrs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ ob->data_type = wp->data_type; ++ atomic_inc(&ob->pin); ++ ob_push(c, ptrs, ob); ++ } ++} ++ ++static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c, ++ unsigned dev, u64 bucket) ++{ ++ return c->open_buckets_hash + ++ (jhash_3words(dev, bucket, bucket >> 32, 0) & ++ (OPEN_BUCKETS_COUNT - 1)); ++} ++ ++static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket) ++{ ++ open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket); ++ ++ while (slot) { ++ struct open_bucket *ob = &c->open_buckets[slot]; ++ ++ if (ob->dev == dev && ob->bucket == bucket) ++ return true; ++ ++ slot = ob->hash; ++ } ++ ++ return false; ++} ++ ++static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) ++{ ++ bool ret; ++ ++ if (bch2_bucket_is_open(c, dev, bucket)) ++ return true; ++ ++ spin_lock(&c->freelist_lock); ++ ret = bch2_bucket_is_open(c, dev, bucket); ++ spin_unlock(&c->freelist_lock); ++ ++ return ret; ++} ++ ++int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, ++ struct dev_stripe_state *, struct bch_devs_mask *, ++ unsigned, unsigned *, bool *, enum alloc_reserve, ++ unsigned, struct closure *); ++ ++struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *, ++ unsigned, unsigned, ++ struct write_point_specifier, ++ struct bch_devs_list *, ++ unsigned, unsigned, ++ enum alloc_reserve, ++ unsigned, ++ struct closure *); ++struct write_point *bch2_alloc_sectors_start(struct bch_fs *, ++ unsigned, unsigned, ++ struct write_point_specifier, ++ struct bch_devs_list *, ++ unsigned, unsigned, ++ enum alloc_reserve, ++ unsigned, ++ struct closure *); ++ ++struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); ++void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, ++ struct bkey_i *, unsigned, bool); ++void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); ++ ++void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, ++ struct open_buckets *); ++ ++void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, ++ struct write_point *); ++ ++static inline struct write_point_specifier writepoint_hashed(unsigned long v) ++{ ++ return (struct write_point_specifier) { .v = v | 1 }; ++} ++ ++static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) ++{ ++ return (struct write_point_specifier) { .v = (unsigned long) wp }; ++} ++ ++void bch2_fs_allocator_foreground_init(struct bch_fs *); ++ ++void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); ++ ++#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +new file mode 100644 +index 000000000000..e078584d46f6 +--- /dev/null ++++ b/fs/bcachefs/alloc_types.h +@@ -0,0 +1,87 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_TYPES_H ++#define _BCACHEFS_ALLOC_TYPES_H ++ 
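[Editor's note: the following sketch is not part of the patch. Assuming only the declarations in fs/bcachefs/alloc_foreground.h above, it illustrates the intended calling sequence for the sector allocator: reserve space at a write point, append the resulting pointers to a key, then release the write point. The function name example_alloc_one_extent, the replica counts, and the omitted write-path details are illustrative assumptions, not code from the patch.]

/* Illustrative sketch only -- not part of this patch. */
#include <linux/sched.h>		/* for current */
#include "bcachefs.h"
#include "alloc_foreground.h"

static int example_alloc_one_extent(struct bch_fs *c, struct bkey_i *k,
				    unsigned sectors, struct closure *cl)
{
	struct bch_devs_list devs_have = { .nr = 0 };	/* no devices to avoid */
	struct write_point *wp;

	wp = bch2_alloc_sectors_start(c,
			0,				/* target: any device */
			0,				/* erasure_code: off */
			writepoint_hashed((unsigned long) current),
			&devs_have,
			1,				/* nr_replicas */
			1,				/* nr_replicas_required */
			RESERVE_none,
			0,				/* flags */
			cl);
	if (IS_ERR(wp))		/* -EAGAIN means wait on @cl and retry */
		return PTR_ERR(wp);

	/* Never take more than the write point can currently hand out: */
	sectors = min(sectors, wp->sectors_free);

	/* Add pointers to @k, charging @sectors against each open bucket: */
	bch2_alloc_sectors_append_ptrs(c, wp, k, sectors, false);

	/* Drop now-full buckets and unlock the write point: */
	bch2_alloc_sectors_done(c, wp);
	return 0;
}

[The same start/append/done pattern is what the write paths later in the patch build on; only the reserve, target and replica parameters change.]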
++#include ++#include ++ ++#include "clock_types.h" ++#include "fifo.h" ++ ++struct ec_bucket_buf; ++ ++#define BCH_ALLOC_RESERVES() \ ++ x(btree_movinggc) \ ++ x(btree) \ ++ x(movinggc) \ ++ x(none) ++ ++enum alloc_reserve { ++#define x(name) RESERVE_##name, ++ BCH_ALLOC_RESERVES() ++#undef x ++}; ++ ++#define OPEN_BUCKETS_COUNT 1024 ++ ++#define WRITE_POINT_HASH_NR 32 ++#define WRITE_POINT_MAX 32 ++ ++/* ++ * 0 is never a valid open_bucket_idx_t: ++ */ ++typedef u16 open_bucket_idx_t; ++ ++struct open_bucket { ++ spinlock_t lock; ++ atomic_t pin; ++ open_bucket_idx_t freelist; ++ open_bucket_idx_t hash; ++ ++ /* ++ * When an open bucket has an ec_stripe attached, this is the index of ++ * the block in the stripe this open_bucket corresponds to: ++ */ ++ u8 ec_idx; ++ enum bch_data_type data_type:8; ++ unsigned valid:1; ++ unsigned on_partial_list:1; ++ unsigned alloc_reserve:3; ++ ++ u8 dev; ++ u8 gen; ++ u32 sectors_free; ++ u64 bucket; ++ struct ec_stripe_new *ec; ++}; ++ ++#define OPEN_BUCKET_LIST_MAX 15 ++ ++struct open_buckets { ++ open_bucket_idx_t nr; ++ open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX]; ++}; ++ ++struct dev_stripe_state { ++ u64 next_alloc[BCH_SB_MEMBERS_MAX]; ++}; ++ ++struct write_point { ++ struct hlist_node node; ++ struct mutex lock; ++ u64 last_used; ++ unsigned long write_point; ++ enum bch_data_type data_type; ++ ++ /* calculated based on how many pointers we're actually going to use: */ ++ unsigned sectors_free; ++ ++ struct open_buckets ptrs; ++ struct dev_stripe_state stripe; ++}; ++ ++struct write_point_specifier { ++ unsigned long v; ++}; ++ ++#endif /* _BCACHEFS_ALLOC_TYPES_H */ +diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c +new file mode 100644 +index 000000000000..5a46b25b0587 +--- /dev/null ++++ b/fs/bcachefs/backpointers.c +@@ -0,0 +1,875 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "backpointers.h" ++#include "btree_cache.h" ++#include "btree_update.h" ++#include "error.h" ++ ++#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 ++ ++/* ++ * Convert from pos in backpointer btree to pos of corresponding bucket in alloc ++ * btree: ++ */ ++static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c, ++ struct bpos bp_pos) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode); ++ u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; ++ ++ return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); ++} ++ ++/* ++ * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: ++ */ ++static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, ++ struct bpos bucket, ++ u64 bucket_offset) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); ++ ++ return POS(bucket.inode, ++ (bucket_to_sector(ca, bucket.offset) << ++ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); ++} ++ ++void bch2_extent_ptr_to_bp(struct bch_fs *c, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c k, struct extent_ptr_decoded p, ++ struct bpos *bucket_pos, struct bch_backpointer *bp) ++{ ++ enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user; ++ s64 sectors = level ? 
btree_sectors(c) : k.k->size; ++ u32 bucket_offset; ++ ++ *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); ++ *bp = (struct bch_backpointer) { ++ .btree_id = btree_id, ++ .level = level, ++ .data_type = data_type, ++ .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + ++ p.crc.offset, ++ .bucket_len = ptr_disk_sectors(sectors, p), ++ .pos = k.k->p, ++ }; ++} ++ ++static bool extent_matches_bp(struct bch_fs *c, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c k, ++ struct bpos bucket, ++ struct bch_backpointer bp) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ struct bpos bucket2; ++ struct bch_backpointer bp2; ++ ++ if (p.ptr.cached) ++ continue; ++ ++ bch2_extent_ptr_to_bp(c, btree_id, level, k, p, ++ &bucket2, &bp2); ++ if (!bpos_cmp(bucket, bucket2) && ++ !memcmp(&bp, &bp2, sizeof(bp))) ++ return true; ++ } ++ ++ return false; ++} ++ ++int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); ++ struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); ++ ++ if (bkey_val_bytes(bp.k) < sizeof(*bp.v)) { ++ prt_str(err, "incorrect value size"); ++ return -EINVAL; ++ } ++ ++ if (bpos_cmp(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) { ++ prt_str(err, "backpointer at wrong pos"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) ++{ ++ prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=", ++ bch2_btree_ids[bp->btree_id], ++ bp->level, ++ (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), ++ (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), ++ bp->bucket_len); ++ bch2_bpos_to_text(out, bp->pos); ++} ++ ++void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) ++{ ++ bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); ++} ++ ++void bch2_backpointer_swab(struct bkey_s k) ++{ ++ struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); ++ ++ bp.v->bucket_offset = swab32(bp.v->bucket_offset); ++ bp.v->bucket_len = swab32(bp.v->bucket_len); ++ bch2_bpos_swab(&bp.v->pos); ++} ++ ++#define BACKPOINTER_OFFSET_MAX ((1ULL << 40) - 1) ++ ++static inline int backpointer_cmp(struct bch_backpointer l, struct bch_backpointer r) ++{ ++ return cmp_int(l.bucket_offset, r.bucket_offset); ++} ++ ++static int bch2_backpointer_del_by_offset(struct btree_trans *trans, ++ struct bpos bucket, ++ u64 bp_offset, ++ struct bch_backpointer bp) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ if (bp_offset < BACKPOINTER_OFFSET_MAX) { ++ struct bch_backpointer *bps; ++ struct bkey_i_alloc_v4 *a; ++ unsigned i, nr; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ bucket, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_alloc_v4) { ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ a = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto err; ++ bps = alloc_v4_backpointers(&a->v); ++ nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); ++ ++ for (i = 0; i < nr; i++) { ++ if (bps[i].bucket_offset == bp_offset) ++ goto found; ++ if 
(bps[i].bucket_offset > bp_offset) ++ break; ++ } ++ ++ ret = -ENOENT; ++ goto err; ++found: ++ if (memcmp(&bps[i], &bp, sizeof(bp))) { ++ ret = -ENOENT; ++ goto err; ++ } ++ array_remove_item(bps, nr, i); ++ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); ++ set_alloc_v4_u64s(a); ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++ } else { ++ bp_offset -= BACKPOINTER_OFFSET_MAX; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_backpointers, ++ bucket_pos_to_bp(c, bucket, bp_offset), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_backpointer || ++ memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) { ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_bucket_backpointer_del(struct btree_trans *trans, ++ struct bkey_i_alloc_v4 *a, ++ struct bch_backpointer bp, ++ struct bkey_s_c orig_k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_backpointer *bps = alloc_v4_backpointers(&a->v); ++ unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); ++ struct btree_iter bp_iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ for (i = 0; i < nr; i++) { ++ int cmp = backpointer_cmp(bps[i], bp) ?: ++ memcmp(&bps[i], &bp, sizeof(bp)); ++ if (!cmp) ++ goto found; ++ if (cmp >= 0) ++ break; ++ } ++ ++ goto btree; ++found: ++ array_remove_item(bps, nr, i); ++ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); ++ set_alloc_v4_u64s(a); ++ return 0; ++btree: ++ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, ++ bucket_pos_to_bp(c, a->k.p, bp.bucket_offset), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&bp_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_backpointer || ++ memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) { ++ struct printbuf buf = PRINTBUF; ++ ++ prt_printf(&buf, "backpointer not found when deleting"); ++ prt_newline(&buf); ++ printbuf_indent_add(&buf, 2); ++ ++ prt_printf(&buf, "searching for "); ++ bch2_backpointer_to_text(&buf, &bp); ++ prt_newline(&buf); ++ ++ prt_printf(&buf, "got "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ prt_newline(&buf); ++ ++ prt_str(&buf, "alloc "); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); ++ prt_newline(&buf); ++ ++ prt_printf(&buf, "for "); ++ bch2_bkey_val_to_text(&buf, c, orig_k); ++ ++ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { ++ bch_err(c, "%s", buf.buf); ++ } else { ++ ret = -EIO; ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ } ++ printbuf_exit(&buf); ++ goto err; ++ } ++ ++ ret = bch2_btree_delete_at(trans, &bp_iter, 0); ++err: ++ bch2_trans_iter_exit(trans, &bp_iter); ++ return ret; ++} ++ ++int bch2_bucket_backpointer_add(struct btree_trans *trans, ++ struct bkey_i_alloc_v4 *a, ++ struct bch_backpointer bp, ++ struct bkey_s_c orig_k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca; ++ struct bch_backpointer *bps = alloc_v4_backpointers(&a->v); ++ unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); ++ struct bkey_i_backpointer *bp_k; ++ struct btree_iter bp_iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ /* Check for duplicates: */ ++ for (i = 0; i < nr; i++) { ++ int cmp = backpointer_cmp(bps[i], bp); ++ if (cmp >= 0) ++ break; ++ } ++ ++ if ((i && ++ (bps[i - 1].bucket_offset + ++ bps[i - 1].bucket_len > bp.bucket_offset)) || ++ (i < nr 
&& ++ (bp.bucket_offset + bp.bucket_len > bps[i].bucket_offset))) { ++ struct printbuf buf = PRINTBUF; ++ ++ prt_printf(&buf, "overlapping backpointer found when inserting "); ++ bch2_backpointer_to_text(&buf, &bp); ++ prt_newline(&buf); ++ printbuf_indent_add(&buf, 2); ++ ++ prt_printf(&buf, "into "); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); ++ prt_newline(&buf); ++ ++ prt_printf(&buf, "for "); ++ bch2_bkey_val_to_text(&buf, c, orig_k); ++ ++ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) ++ bch_err(c, "%s", buf.buf); ++ else { ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ printbuf_exit(&buf); ++ return -EIO; ++ } ++ } ++ ++ if (nr < BCH_ALLOC_V4_NR_BACKPOINTERS_MAX) { ++ array_insert_item(bps, nr, i, bp); ++ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); ++ set_alloc_v4_u64s(a); ++ return 0; ++ } ++ ++ /* Overflow: use backpointer btree */ ++ bp_k = bch2_trans_kmalloc(trans, sizeof(*bp_k)); ++ ret = PTR_ERR_OR_ZERO(bp_k); ++ if (ret) ++ return ret; ++ ++ ca = bch_dev_bkey_exists(c, a->k.p.inode); ++ ++ bkey_backpointer_init(&bp_k->k_i); ++ bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset); ++ bp_k->v = bp; ++ ++ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_k->k.p, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&bp_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type) { ++ struct printbuf buf = PRINTBUF; ++ ++ prt_printf(&buf, "existing btree backpointer key found when inserting "); ++ bch2_backpointer_to_text(&buf, &bp); ++ prt_newline(&buf); ++ printbuf_indent_add(&buf, 2); ++ ++ prt_printf(&buf, "found "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ prt_newline(&buf); ++ ++ prt_printf(&buf, "for "); ++ bch2_bkey_val_to_text(&buf, c, orig_k); ++ ++ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) ++ bch_err(c, "%s", buf.buf); ++ else { ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ printbuf_exit(&buf); ++ ret = -EIO; ++ goto err; ++ } ++ } ++ ++ ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0); ++err: ++ bch2_trans_iter_exit(trans, &bp_iter); ++ return ret; ++} ++ ++/* ++ * Find the next backpointer >= *bp_offset: ++ */ ++int bch2_get_next_backpointer(struct btree_trans *trans, ++ struct bpos bucket, int gen, ++ u64 *bp_offset, ++ struct bch_backpointer *dst) ++{ ++ struct bch_fs *c = trans->c; ++ struct bpos bp_pos = ++ bucket_pos_to_bp(c, bucket, ++ max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX); ++ struct bpos bp_end_pos = ++ bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); ++ struct btree_iter alloc_iter, bp_iter = { NULL }; ++ struct bkey_s_c k; ++ struct bkey_s_c_alloc_v4 a; ++ size_t i; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, ++ bucket, BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(&alloc_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ ++ if (k.k->type != KEY_TYPE_alloc_v4) ++ goto done; ++ ++ a = bkey_s_c_to_alloc_v4(k); ++ if (gen >= 0 && a.v->gen != gen) ++ goto done; ++ ++ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) { ++ if (alloc_v4_backpointers_c(a.v)[i].bucket_offset < *bp_offset) ++ continue; ++ ++ *dst = alloc_v4_backpointers_c(a.v)[i]; ++ *bp_offset = dst->bucket_offset; ++ goto out; ++ } ++ ++ for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, ++ bp_pos, 0, k, ret) { ++ if (bpos_cmp(k.k->p, bp_end_pos) >= 0) ++ break; ++ ++ if (k.k->type != KEY_TYPE_backpointer) ++ continue; ++ ++ *dst = 
*bkey_s_c_to_backpointer(k).v; ++ *bp_offset = dst->bucket_offset + BACKPOINTER_OFFSET_MAX; ++ goto out; ++ } ++done: ++ *bp_offset = U64_MAX; ++out: ++ bch2_trans_iter_exit(trans, &bp_iter); ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ return ret; ++} ++ ++static void backpointer_not_found(struct btree_trans *trans, ++ struct bpos bucket, ++ u64 bp_offset, ++ struct bch_backpointer bp, ++ struct bkey_s_c k, ++ const char *thing_it_points_to) ++{ ++ struct bch_fs *c = trans->c; ++ struct printbuf buf = PRINTBUF; ++ ++ prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", ++ thing_it_points_to); ++ prt_printf(&buf, "bucket: "); ++ bch2_bpos_to_text(&buf, bucket); ++ prt_printf(&buf, "\n "); ++ ++ if (bp_offset >= BACKPOINTER_OFFSET_MAX) { ++ struct bpos bp_pos = ++ bucket_pos_to_bp(c, bucket, ++ bp_offset - BACKPOINTER_OFFSET_MAX); ++ prt_printf(&buf, "backpointer pos: "); ++ bch2_bpos_to_text(&buf, bp_pos); ++ prt_printf(&buf, "\n "); ++ } ++ ++ bch2_backpointer_to_text(&buf, &bp); ++ prt_printf(&buf, "\n "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) ++ bch_err(c, "%s", buf.buf); ++ else ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ ++ printbuf_exit(&buf); ++} ++ ++struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos bucket, ++ u64 bp_offset, ++ struct bch_backpointer bp) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ ++ bch2_trans_node_iter_init(trans, iter, ++ bp.btree_id, ++ bp.pos, ++ 0, ++ min(bp.level, c->btree_roots[bp.btree_id].level), ++ 0); ++ k = bch2_btree_iter_peek_slot(iter); ++ if (bkey_err(k)) { ++ bch2_trans_iter_exit(trans, iter); ++ return k; ++ } ++ ++ if (bp.level == c->btree_roots[bp.btree_id].level + 1) ++ k = bkey_i_to_s_c(&c->btree_roots[bp.btree_id].key); ++ ++ if (extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) ++ return k; ++ ++ backpointer_not_found(trans, bucket, bp_offset, bp, k, "extent"); ++ ++ bch2_trans_iter_exit(trans, iter); ++ return bkey_s_c_null; ++} ++ ++struct btree *bch2_backpointer_get_node(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos bucket, ++ u64 bp_offset, ++ struct bch_backpointer bp) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b; ++ struct bkey_s_c k; ++ ++ BUG_ON(!bp.level); ++ ++ bch2_trans_node_iter_init(trans, iter, ++ bp.btree_id, ++ bp.pos, ++ 0, ++ bp.level - 1, ++ 0); ++ b = bch2_btree_iter_peek_node(iter); ++ if (IS_ERR(b)) { ++ bch2_trans_iter_exit(trans, iter); ++ return b; ++ } ++ ++ if (extent_matches_bp(c, bp.btree_id, bp.level, ++ bkey_i_to_s_c(&b->key), ++ bucket, bp)) ++ return b; ++ ++ if (!btree_node_will_make_reachable(b)) ++ backpointer_not_found(trans, bucket, bp_offset, ++ bp, k, "btree node"); ++ ++ bch2_trans_iter_exit(trans, iter); ++ return NULL; ++} ++ ++static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter alloc_iter = { NULL }; ++ struct bch_dev *ca; ++ struct bkey_s_c alloc_k; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, ++ "backpointer for mising device:\n%s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, bp_iter, 0); ++ goto out; ++ } ++ ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ ++ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, ++ bp_pos_to_bucket(c, k.k->p), 0); ++ ++ 
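++	/*
++	 * Editorial note, not part of this patch: k.k->p here is a position in
++	 * the backpointers btree, which (per bp_pos_to_bucket() and
++	 * bucket_pos_to_bp() above) encodes roughly
++	 *
++	 *	POS(dev, (bucket_start_sector << MAX_EXTENT_COMPRESS_RATIO_SHIFT)
++	 *		 + bucket_offset)
++	 *
++	 * so shifting the offset back down and converting sectors to a bucket
++	 * number yields the alloc-btree position being looked up here.
++	 */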
alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); ++ ret = bkey_err(alloc_k); ++ if (ret) ++ goto out; ++ ++ if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c, ++ "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", ++ alloc_iter.pos.inode, alloc_iter.pos.offset, ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, bp_iter, 0); ++ goto out; ++ } ++out: ++fsck_err: ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++/* verify that every backpointer has a corresponding alloc key */ ++int bch2_check_btree_backpointers(struct bch_fs *c) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ ++ return bch2_trans_run(c, ++ for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_backpointers, POS_MIN, 0, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ bch2_check_btree_backpointer(&trans, &iter, k))); ++} ++ ++static int check_bp_exists(struct btree_trans *trans, ++ struct bpos bucket_pos, ++ struct bch_backpointer bp, ++ struct bkey_s_c orig_k) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter alloc_iter, bp_iter = { NULL }; ++ struct printbuf buf = PRINTBUF; ++ struct bkey_s_c alloc_k, bp_k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, bucket_pos, 0); ++ alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); ++ ret = bkey_err(alloc_k); ++ if (ret) ++ goto err; ++ ++ if (alloc_k.k->type == KEY_TYPE_alloc_v4) { ++ struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(alloc_k); ++ const struct bch_backpointer *bps = alloc_v4_backpointers_c(a.v); ++ unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); ++ ++ for (i = 0; i < nr; i++) { ++ int cmp = backpointer_cmp(bps[i], bp) ?: ++ memcmp(&bps[i], &bp, sizeof(bp)); ++ if (!cmp) ++ goto out; ++ if (cmp >= 0) ++ break; ++ } ++ } else { ++ goto missing; ++ } ++ ++ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, ++ bucket_pos_to_bp(c, bucket_pos, bp.bucket_offset), ++ 0); ++ bp_k = bch2_btree_iter_peek_slot(&bp_iter); ++ ret = bkey_err(bp_k); ++ if (ret) ++ goto err; ++ ++ if (bp_k.k->type != KEY_TYPE_backpointer || ++ memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) ++ goto missing; ++out: ++err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &bp_iter); ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ printbuf_exit(&buf); ++ return ret; ++missing: ++ prt_printf(&buf, "missing backpointer for btree=%s l=%u ", ++ bch2_btree_ids[bp.btree_id], bp.level); ++ bch2_bkey_val_to_text(&buf, c, orig_k); ++ prt_printf(&buf, "\nin alloc key "); ++ bch2_bkey_val_to_text(&buf, c, alloc_k); ++ ++ if (c->sb.version < bcachefs_metadata_version_backpointers || ++ c->opts.reconstruct_alloc || ++ fsck_err(c, "%s", buf.buf)) { ++ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, alloc_k); ++ ++ ret = PTR_ERR_OR_ZERO(a) ?: ++ bch2_bucket_backpointer_add(trans, a, bp, orig_k) ?: ++ bch2_trans_update(trans, &alloc_iter, &a->k_i, 0); ++ } ++ ++ goto out; ++} ++ ++static int check_extent_to_backpointers(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_ptrs_c ptrs; ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bkey_s_c k; ++ int ret; ++ ++ k = bch2_btree_iter_peek_all_levels(iter); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ if (!k.k) ++ return 0; ++ ++ ptrs = bch2_bkey_ptrs_c(k); ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ struct bpos bucket_pos; ++ struct bch_backpointer bp; ++ ++ if (p.ptr.cached) ++ continue; ++ ++ 
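++		/*
++		 * Editorial note, not part of this patch: for each remaining
++		 * (non-cached) pointer we compute the backpointer this extent
++		 * should have, then check_bp_exists() verifies it is present
++		 * either inline in the bucket's alloc_v4 key or in the
++		 * backpointers btree, recreating it if fsck allows.
++		 */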
bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, ++ k, p, &bucket_pos, &bp); ++ ++ ret = check_bp_exists(trans, bucket_pos, bp, k); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int check_btree_root_to_backpointers(struct btree_trans *trans, ++ enum btree_id btree_id) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct btree *b; ++ struct bkey_s_c k; ++ struct bkey_ptrs_c ptrs; ++ struct extent_ptr_decoded p; ++ const union bch_extent_entry *entry; ++ int ret; ++ ++ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, ++ c->btree_roots[btree_id].level, 0); ++ b = bch2_btree_iter_peek_node(&iter); ++ ret = PTR_ERR_OR_ZERO(b); ++ if (ret) ++ goto err; ++ ++ BUG_ON(b != btree_node_root(c, b)); ++ ++ k = bkey_i_to_s_c(&b->key); ++ ptrs = bch2_bkey_ptrs_c(k); ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ struct bpos bucket_pos; ++ struct bch_backpointer bp; ++ ++ if (p.ptr.cached) ++ continue; ++ ++ bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1, ++ k, p, &bucket_pos, &bp); ++ ++ ret = check_bp_exists(trans, bucket_pos, bp, k); ++ if (ret) ++ goto err; ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_check_extents_to_backpointers(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ enum btree_id btree_id; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { ++ bch2_trans_node_iter_init(&trans, &iter, btree_id, POS_MIN, 0, ++ 0, ++ BTREE_ITER_ALL_LEVELS| ++ BTREE_ITER_PREFETCH); ++ ++ do { ++ ret = commit_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ check_extent_to_backpointers(&trans, &iter)); ++ if (ret) ++ break; ++ } while (!bch2_btree_iter_advance(&iter)); ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) ++ break; ++ ++ ret = commit_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ check_btree_root_to_backpointers(&trans, btree_id)); ++ if (ret) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int check_one_backpointer(struct btree_trans *trans, ++ struct bpos bucket, ++ u64 *bp_offset) ++{ ++ struct btree_iter iter; ++ struct bch_backpointer bp; ++ struct bkey_s_c k; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ ret = bch2_get_next_backpointer(trans, bucket, -1, ++ bp_offset, &bp); ++ if (ret || *bp_offset == U64_MAX) ++ return ret; ++ ++ k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ if (fsck_err_on(!k.k, trans->c, ++ "%s backpointer points to missing extent\n%s", ++ *bp_offset < BACKPOINTER_OFFSET_MAX ? 
"alloc" : "btree", ++ (bch2_backpointer_to_text(&buf, &bp), buf.buf))) { ++ ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp); ++ if (ret == -ENOENT) ++ bch_err(trans->c, "backpointer at %llu not found", *bp_offset); ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_check_backpointers_to_extents(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ u64 bp_offset = 0; ++ ++ while (!(ret = commit_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ check_one_backpointer(&trans, iter.pos, &bp_offset))) && ++ bp_offset < U64_MAX) ++ bp_offset++; ++ ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret < 0 ? ret : 0; ++} +diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h +new file mode 100644 +index 000000000000..fe42af296e9c +--- /dev/null ++++ b/fs/bcachefs/backpointers.h +@@ -0,0 +1,38 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H ++#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H ++ ++#include "super.h" ++ ++int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, ++ int, struct printbuf *); ++void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); ++void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++void bch2_backpointer_swab(struct bkey_s); ++ ++#define bch2_bkey_ops_backpointer (struct bkey_ops) { \ ++ .key_invalid = bch2_backpointer_invalid, \ ++ .val_to_text = bch2_backpointer_k_to_text, \ ++ .swab = bch2_backpointer_swab, \ ++} ++ ++void bch2_extent_ptr_to_bp(struct bch_fs *, enum btree_id, unsigned, ++ struct bkey_s_c, struct extent_ptr_decoded, ++ struct bpos *, struct bch_backpointer *); ++ ++int bch2_bucket_backpointer_del(struct btree_trans *, struct bkey_i_alloc_v4 *, ++ struct bch_backpointer, struct bkey_s_c); ++int bch2_bucket_backpointer_add(struct btree_trans *, struct bkey_i_alloc_v4 *, ++ struct bch_backpointer, struct bkey_s_c); ++int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, ++ u64 *, struct bch_backpointer *); ++struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, ++ struct bpos, u64, struct bch_backpointer); ++struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, ++ struct bpos, u64, struct bch_backpointer); ++ ++int bch2_check_btree_backpointers(struct bch_fs *); ++int bch2_check_extents_to_backpointers(struct bch_fs *); ++int bch2_check_backpointers_to_extents(struct bch_fs *); ++ ++#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +new file mode 100644 +index 000000000000..8ffdb4dee47a +--- /dev/null ++++ b/fs/bcachefs/bcachefs.h +@@ -0,0 +1,1000 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_H ++#define _BCACHEFS_H ++ ++/* ++ * SOME HIGH LEVEL CODE DOCUMENTATION: ++ * ++ * Bcache mostly works with cache sets, cache devices, and backing devices. ++ * ++ * Support for multiple cache devices hasn't quite been finished off yet, but ++ * it's about 95% plumbed through. A cache set and its cache devices is sort of ++ * like a md raid array and its component devices. 
Most of the code doesn't care ++ * about individual cache devices, the main abstraction is the cache set. ++ * ++ * Multiple cache devices is intended to give us the ability to mirror dirty ++ * cached data and metadata, without mirroring clean cached data. ++ * ++ * Backing devices are different, in that they have a lifetime independent of a ++ * cache set. When you register a newly formatted backing device it'll come up ++ * in passthrough mode, and then you can attach and detach a backing device from ++ * a cache set at runtime - while it's mounted and in use. Detaching implicitly ++ * invalidates any cached data for that backing device. ++ * ++ * A cache set can have multiple (many) backing devices attached to it. ++ * ++ * There's also flash only volumes - this is the reason for the distinction ++ * between struct cached_dev and struct bcache_device. A flash only volume ++ * works much like a bcache device that has a backing device, except the ++ * "cached" data is always dirty. The end result is that we get thin ++ * provisioning with very little additional code. ++ * ++ * Flash only volumes work but they're not production ready because the moving ++ * garbage collector needs more work. More on that later. ++ * ++ * BUCKETS/ALLOCATION: ++ * ++ * Bcache is primarily designed for caching, which means that in normal ++ * operation all of our available space will be allocated. Thus, we need an ++ * efficient way of deleting things from the cache so we can write new things to ++ * it. ++ * ++ * To do this, we first divide the cache device up into buckets. A bucket is the ++ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ ++ * works efficiently. ++ * ++ * Each bucket has a 16 bit priority, and an 8 bit generation associated with ++ * it. The gens and priorities for all the buckets are stored contiguously and ++ * packed on disk (in a linked list of buckets - aside from the superblock, all ++ * of bcache's metadata is stored in buckets). ++ * ++ * The priority is used to implement an LRU. We reset a bucket's priority when ++ * we allocate it or on cache it, and every so often we decrement the priority ++ * of each bucket. It could be used to implement something more sophisticated, ++ * if anyone ever gets around to it. ++ * ++ * The generation is used for invalidating buckets. Each pointer also has an 8 ++ * bit generation embedded in it; for a pointer to be considered valid, its gen ++ * must match the gen of the bucket it points into. Thus, to reuse a bucket all ++ * we have to do is increment its gen (and write its new gen to disk; we batch ++ * this up). ++ * ++ * Bcache is entirely COW - we never write twice to a bucket, even buckets that ++ * contain metadata (including btree nodes). ++ * ++ * THE BTREE: ++ * ++ * Bcache is in large part design around the btree. ++ * ++ * At a high level, the btree is just an index of key -> ptr tuples. ++ * ++ * Keys represent extents, and thus have a size field. Keys also have a variable ++ * number of pointers attached to them (potentially zero, which is handy for ++ * invalidating the cache). ++ * ++ * The key itself is an inode:offset pair. The inode number corresponds to a ++ * backing device or a flash only volume. The offset is the ending offset of the ++ * extent within the inode - not the starting offset; this makes lookups ++ * slightly more convenient. ++ * ++ * Pointers contain the cache device id, the offset on that device, and an 8 bit ++ * generation number. More on the gen later. 
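++ *
++ * (Editorial aside, not part of this patch: the gen rule described above
++ * amounts to a check like the following, where the names are purely
++ * illustrative --
++ *
++ *	static bool example_ptr_valid(u8 ptr_gen, u8 bucket_gen)
++ *	{
++ *		// stale as soon as the bucket has been reused
++ *		return ptr_gen == bucket_gen;
++ *	}
++ *
++ * so invalidating every pointer into a bucket is just an increment of that
++ * bucket's gen, with the 8 bit value allowed to wrap.)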
++ * ++ * Index lookups are not fully abstracted - cache lookups in particular are ++ * still somewhat mixed in with the btree code, but things are headed in that ++ * direction. ++ * ++ * Updates are fairly well abstracted, though. There are two different ways of ++ * updating the btree; insert and replace. ++ * ++ * BTREE_INSERT will just take a list of keys and insert them into the btree - ++ * overwriting (possibly only partially) any extents they overlap with. This is ++ * used to update the index after a write. ++ * ++ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is ++ * overwriting a key that matches another given key. This is used for inserting ++ * data into the cache after a cache miss, and for background writeback, and for ++ * the moving garbage collector. ++ * ++ * There is no "delete" operation; deleting things from the index is ++ * accomplished by either by invalidating pointers (by incrementing a bucket's ++ * gen) or by inserting a key with 0 pointers - which will overwrite anything ++ * previously present at that location in the index. ++ * ++ * This means that there are always stale/invalid keys in the btree. They're ++ * filtered out by the code that iterates through a btree node, and removed when ++ * a btree node is rewritten. ++ * ++ * BTREE NODES: ++ * ++ * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and ++ * free smaller than a bucket - so, that's how big our btree nodes are. ++ * ++ * (If buckets are really big we'll only use part of the bucket for a btree node ++ * - no less than 1/4th - but a bucket still contains no more than a single ++ * btree node. I'd actually like to change this, but for now we rely on the ++ * bucket's gen for deleting btree nodes when we rewrite/split a node.) ++ * ++ * Anyways, btree nodes are big - big enough to be inefficient with a textbook ++ * btree implementation. ++ * ++ * The way this is solved is that btree nodes are internally log structured; we ++ * can append new keys to an existing btree node without rewriting it. This ++ * means each set of keys we write is sorted, but the node is not. ++ * ++ * We maintain this log structure in memory - keeping 1Mb of keys sorted would ++ * be expensive, and we have to distinguish between the keys we have written and ++ * the keys we haven't. So to do a lookup in a btree node, we have to search ++ * each sorted set. But we do merge written sets together lazily, so the cost of ++ * these extra searches is quite low (normally most of the keys in a btree node ++ * will be in one big set, and then there'll be one or two sets that are much ++ * smaller). ++ * ++ * This log structure makes bcache's btree more of a hybrid between a ++ * conventional btree and a compacting data structure, with some of the ++ * advantages of both. ++ * ++ * GARBAGE COLLECTION: ++ * ++ * We can't just invalidate any bucket - it might contain dirty data or ++ * metadata. If it once contained dirty data, other writes might overwrite it ++ * later, leaving no valid pointers into that bucket in the index. ++ * ++ * Thus, the primary purpose of garbage collection is to find buckets to reuse. ++ * It also counts how much valid data it each bucket currently contains, so that ++ * allocation can reuse buckets sooner when they've been mostly overwritten. ++ * ++ * It also does some things that are really internal to the btree ++ * implementation. 
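A lookup in such a log-structured node is, at its simplest, one binary search per sorted set, with later sets taking precedence over earlier ones. A simplified userspace sketch of that idea (toy_set is hypothetical, and extent overlap and whiteout filtering are ignored):

#include <stddef.h>
#include <stdio.h>

/* Toy model: a "node" holds several independently sorted sets of (key, val)
 * pairs; later sets were appended later, so they win on duplicate keys. */
struct toy_set { const int *keys; const int *vals; size_t nr; };

static int bsearch_set(const struct toy_set *s, int key, int *val)
{
        size_t lo = 0, hi = s->nr;

        while (lo < hi) {
                size_t mid = lo + (hi - lo) / 2;

                if (s->keys[mid] < key)
                        lo = mid + 1;
                else
                        hi = mid;
        }
        if (lo < s->nr && s->keys[lo] == key) {
                *val = s->vals[lo];
                return 1;
        }
        return 0;
}

/* Lookup = one binary search per sorted set, newest set first. */
static int node_lookup(const struct toy_set *sets, size_t nr_sets, int key, int *val)
{
        while (nr_sets--)
                if (bsearch_set(&sets[nr_sets], key, val))
                        return 1;
        return 0;
}

int main(void)
{
        const int k0[] = { 1, 5, 9 },  v0[] = { 10, 50, 90 };  /* big, old set */
        const int k1[] = { 5, 7 },     v1[] = { 55, 70 };      /* small, newer set */
        const struct toy_set sets[] = {
                { k0, v0, 3 },
                { k1, v1, 2 },
        };
        int val;

        if (node_lookup(sets, 2, 5, &val))
                printf("key 5 -> %d\n", val);   /* 55: newer set overrides */
        return 0;
}
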
If a btree node contains pointers that are stale by more than ++ * some threshold, it rewrites the btree node to avoid the bucket's generation ++ * wrapping around. It also merges adjacent btree nodes if they're empty enough. ++ * ++ * THE JOURNAL: ++ * ++ * Bcache's journal is not necessary for consistency; we always strictly ++ * order metadata writes so that the btree and everything else is consistent on ++ * disk in the event of an unclean shutdown, and in fact bcache had writeback ++ * caching (with recovery from unclean shutdown) before journalling was ++ * implemented. ++ * ++ * Rather, the journal is purely a performance optimization; we can't complete a ++ * write until we've updated the index on disk, otherwise the cache would be ++ * inconsistent in the event of an unclean shutdown. This means that without the ++ * journal, on random write workloads we constantly have to update all the leaf ++ * nodes in the btree, and those writes will be mostly empty (appending at most ++ * a few keys each) - highly inefficient in terms of amount of metadata writes, ++ * and it puts more strain on the various btree resorting/compacting code. ++ * ++ * The journal is just a log of keys we've inserted; on startup we just reinsert ++ * all the keys in the open journal entries. That means that when we're updating ++ * a node in the btree, we can wait until a 4k block of keys fills up before ++ * writing them out. ++ * ++ * For simplicity, we only journal updates to leaf nodes; updates to parent ++ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth ++ * the complexity to deal with journalling them (in particular, journal replay) ++ * - updates to non leaf nodes just happen synchronously (see btree_split()). ++ */ ++ ++#undef pr_fmt ++#ifdef __KERNEL__ ++#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ ++#else ++#define pr_fmt(fmt) "%s() " fmt "\n", __func__ ++#endif ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "bcachefs_format.h" ++#include "errcode.h" ++#include "fifo.h" ++#include "opts.h" ++#include "util.h" ++ ++#define dynamic_fault(...) 0 ++#define race_fault(...) 0 ++ ++#define bch2_fs_init_fault(name) \ ++ dynamic_fault("bcachefs:bch_fs_init:" name) ++#define bch2_meta_read_fault(name) \ ++ dynamic_fault("bcachefs:meta:read:" name) ++#define bch2_meta_write_fault(name) \ ++ dynamic_fault("bcachefs:meta:write:" name) ++ ++#ifdef __KERNEL__ ++#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) ++#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) ++#else ++#define bch2_fmt(_c, fmt) fmt "\n" ++#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) ++#endif ++ ++#define bch_info(c, fmt, ...) \ ++ printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_notice(c, fmt, ...) \ ++ printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_warn(c, fmt, ...) \ ++ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_warn_ratelimited(c, fmt, ...) \ ++ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_err(c, fmt, ...) \ ++ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) ++ ++#define bch_err_ratelimited(c, fmt, ...) \ ++ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_err_inum_ratelimited(c, _inum, fmt, ...) 
\ ++ printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) ++ ++#define bch_verbose(c, fmt, ...) \ ++do { \ ++ if ((c)->opts.verbose) \ ++ bch_info(c, fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++#define pr_verbose_init(opts, fmt, ...) \ ++do { \ ++ if (opt_get(opts, verbose)) \ ++ pr_info(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++/* Parameters that are useful for debugging, but should always be compiled in: */ ++#define BCH_DEBUG_PARAMS_ALWAYS() \ ++ BCH_DEBUG_PARAM(key_merging_disabled, \ ++ "Disables merging of extents") \ ++ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ ++ "Causes mark and sweep to compact and rewrite every " \ ++ "btree node it traverses") \ ++ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ ++ "Disables rewriting of btree nodes during mark and sweep")\ ++ BCH_DEBUG_PARAM(btree_shrinker_disabled, \ ++ "Disables the shrinker callback for the btree node cache")\ ++ BCH_DEBUG_PARAM(verify_btree_ondisk, \ ++ "Reread btree nodes at various points to verify the " \ ++ "mergesort in the read path against modifications " \ ++ "done in memory") \ ++ BCH_DEBUG_PARAM(verify_all_btree_replicas, \ ++ "When reading btree nodes, read all replicas and " \ ++ "compare them") ++ ++/* Parameters that should only be compiled in in debug mode: */ ++#define BCH_DEBUG_PARAMS_DEBUG() \ ++ BCH_DEBUG_PARAM(expensive_debug_checks, \ ++ "Enables various runtime debugging checks that " \ ++ "significantly affect performance") \ ++ BCH_DEBUG_PARAM(debug_check_iterators, \ ++ "Enables extra verification for btree iterators") \ ++ BCH_DEBUG_PARAM(debug_check_btree_accounting, \ ++ "Verify btree accounting for keys within a node") \ ++ BCH_DEBUG_PARAM(journal_seq_verify, \ ++ "Store the journal sequence number in the version " \ ++ "number of every btree key, and verify that btree " \ ++ "update ordering is preserved during recovery") \ ++ BCH_DEBUG_PARAM(inject_invalid_keys, \ ++ "Store the journal sequence number in the version " \ ++ "number of every btree key, and verify that btree " \ ++ "update ordering is preserved during recovery") \ ++ BCH_DEBUG_PARAM(test_alloc_startup, \ ++ "Force allocator startup to use the slowpath where it" \ ++ "can't find enough free buckets without invalidating" \ ++ "cached data") \ ++ BCH_DEBUG_PARAM(force_reconstruct_read, \ ++ "Force reads to use the reconstruct path, when reading" \ ++ "from erasure coded extents") \ ++ BCH_DEBUG_PARAM(test_restart_gc, \ ++ "Test restarting mark and sweep gc when bucket gens change") ++ ++#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() ++#else ++#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() ++#endif ++ ++#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; ++BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++#ifndef CONFIG_BCACHEFS_DEBUG ++#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name; ++BCH_DEBUG_PARAMS_DEBUG() ++#undef BCH_DEBUG_PARAM ++#endif ++ ++#define BCH_LOCK_TIME_NR 128 ++ ++#define BCH_TIME_STATS() \ ++ x(btree_node_mem_alloc) \ ++ x(btree_node_split) \ ++ x(btree_node_compact) \ ++ x(btree_node_merge) \ ++ x(btree_node_sort) \ ++ x(btree_node_read) \ ++ x(btree_interior_update_foreground) \ ++ x(btree_interior_update_total) \ ++ x(btree_gc) \ ++ x(btree_lock_contended_read) \ ++ x(btree_lock_contended_intent) \ ++ x(btree_lock_contended_write) \ ++ x(data_write) \ ++ x(data_read) \ ++ x(data_promote) \ ++ x(journal_flush_write) \ ++ 
x(journal_noflush_write) \ ++ x(journal_flush_seq) \ ++ x(blocked_journal) \ ++ x(blocked_allocate) \ ++ x(blocked_allocate_open_bucket) ++ ++enum bch_time_stats { ++#define x(name) BCH_TIME_##name, ++ BCH_TIME_STATS() ++#undef x ++ BCH_TIME_STAT_NR ++}; ++ ++#include "alloc_types.h" ++#include "btree_types.h" ++#include "buckets_types.h" ++#include "buckets_waiting_for_journal_types.h" ++#include "clock_types.h" ++#include "ec_types.h" ++#include "journal_types.h" ++#include "keylist_types.h" ++#include "quota_types.h" ++#include "rebalance_types.h" ++#include "replicas_types.h" ++#include "subvolume_types.h" ++#include "super_types.h" ++ ++/* Number of nodes btree coalesce will try to coalesce at once */ ++#define GC_MERGE_NODES 4U ++ ++/* Maximum number of nodes we might need to allocate atomically: */ ++#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) ++ ++/* Size of the freelist we allocate btree nodes from: */ ++#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) ++ ++#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) ++ ++struct btree; ++ ++enum gc_phase { ++ GC_PHASE_NOT_RUNNING, ++ GC_PHASE_START, ++ GC_PHASE_SB, ++ ++ GC_PHASE_BTREE_stripes, ++ GC_PHASE_BTREE_extents, ++ GC_PHASE_BTREE_inodes, ++ GC_PHASE_BTREE_dirents, ++ GC_PHASE_BTREE_xattrs, ++ GC_PHASE_BTREE_alloc, ++ GC_PHASE_BTREE_quotas, ++ GC_PHASE_BTREE_reflink, ++ GC_PHASE_BTREE_subvolumes, ++ GC_PHASE_BTREE_snapshots, ++ GC_PHASE_BTREE_lru, ++ GC_PHASE_BTREE_freespace, ++ GC_PHASE_BTREE_need_discard, ++ GC_PHASE_BTREE_backpointers, ++ ++ GC_PHASE_PENDING_DELETE, ++}; ++ ++struct gc_pos { ++ enum gc_phase phase; ++ struct bpos pos; ++ unsigned level; ++}; ++ ++struct reflink_gc { ++ u64 offset; ++ u32 size; ++ u32 refcount; ++}; ++ ++typedef GENRADIX(struct reflink_gc) reflink_gc_table; ++ ++struct io_count { ++ u64 sectors[2][BCH_DATA_NR]; ++}; ++ ++struct bch_dev { ++ struct kobject kobj; ++ struct percpu_ref ref; ++ struct completion ref_completion; ++ struct percpu_ref io_ref; ++ struct completion io_ref_completion; ++ ++ struct bch_fs *fs; ++ ++ u8 dev_idx; ++ /* ++ * Cached version of this device's member info from superblock ++ * Committed by bch2_write_super() -> bch_fs_mi_update() ++ */ ++ struct bch_member_cpu mi; ++ uuid_le uuid; ++ char name[BDEVNAME_SIZE]; ++ ++ struct bch_sb_handle disk_sb; ++ struct bch_sb *sb_read_scratch; ++ int sb_write_error; ++ dev_t dev; ++ ++ struct bch_devs_mask self; ++ ++ /* biosets used in cloned bios for writing multiple replicas */ ++ struct bio_set replica_set; ++ ++ /* ++ * Buckets: ++ * Per-bucket arrays are protected by c->mark_lock, bucket_lock and ++ * gc_lock, for device resize - holding any is sufficient for access: ++ * Or rcu_read_lock(), but only for ptr_stale(): ++ */ ++ struct bucket_array __rcu *buckets_gc; ++ struct bucket_gens __rcu *bucket_gens; ++ u8 *oldest_gen; ++ unsigned long *buckets_nouse; ++ struct rw_semaphore bucket_lock; ++ ++ struct bch_dev_usage *usage_base; ++ struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR]; ++ struct bch_dev_usage __percpu *usage_gc; ++ ++ /* Allocator: */ ++ u64 new_fs_bucket_idx; ++ u64 bucket_alloc_trans_early_cursor; ++ ++ unsigned nr_open_buckets; ++ unsigned nr_btree_reserve; ++ ++ open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; ++ open_bucket_idx_t open_buckets_partial_nr; ++ ++ size_t inc_gen_needs_gc; ++ size_t inc_gen_really_needs_gc; ++ size_t buckets_waiting_on_journal; ++ ++ atomic64_t rebalance_work; ++ ++ struct journal_device journal; ++ u64 
prev_journal_sector; ++ ++ struct work_struct io_error_work; ++ ++ /* The rest of this all shows up in sysfs */ ++ atomic64_t cur_latency[2]; ++ struct time_stats io_latency[2]; ++ ++#define CONGESTED_MAX 1024 ++ atomic_t congested; ++ u64 congested_last; ++ ++ struct io_count __percpu *io_done; ++}; ++ ++enum { ++ /* startup: */ ++ BCH_FS_STARTED, ++ BCH_FS_MAY_GO_RW, ++ BCH_FS_RW, ++ BCH_FS_WAS_RW, ++ ++ /* shutdown: */ ++ BCH_FS_STOPPING, ++ BCH_FS_EMERGENCY_RO, ++ BCH_FS_WRITE_DISABLE_COMPLETE, ++ BCH_FS_CLEAN_SHUTDOWN, ++ ++ /* fsck passes: */ ++ BCH_FS_TOPOLOGY_REPAIR_DONE, ++ BCH_FS_INITIAL_GC_DONE, /* kill when we enumerate fsck passes */ ++ BCH_FS_CHECK_LRUS_DONE, ++ BCH_FS_CHECK_BACKPOINTERS_DONE, ++ BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, ++ BCH_FS_FSCK_DONE, ++ BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ ++ BCH_FS_NEED_ANOTHER_GC, ++ ++ BCH_FS_HAVE_DELETED_SNAPSHOTS, ++ ++ /* errors: */ ++ BCH_FS_ERROR, ++ BCH_FS_TOPOLOGY_ERROR, ++ BCH_FS_ERRORS_FIXED, ++ BCH_FS_ERRORS_NOT_FIXED, ++}; ++ ++struct btree_debug { ++ unsigned id; ++}; ++ ++struct lock_held_stats { ++ struct time_stats times[BCH_LOCK_TIME_NR]; ++ const char *names[BCH_LOCK_TIME_NR]; ++}; ++ ++struct bch_fs_pcpu { ++ u64 sectors_available; ++}; ++ ++struct journal_seq_blacklist_table { ++ size_t nr; ++ struct journal_seq_blacklist_table_entry { ++ u64 start; ++ u64 end; ++ bool dirty; ++ } entries[0]; ++}; ++ ++struct journal_keys { ++ struct journal_key { ++ enum btree_id btree_id:8; ++ unsigned level:8; ++ bool allocated; ++ bool overwritten; ++ struct bkey_i *k; ++ u32 journal_seq; ++ u32 journal_offset; ++ } *d; ++ /* ++ * Gap buffer: instead of all the empty space in the array being at the ++ * end of the buffer - from @nr to @size - the empty space is at @gap. ++ * This means that sequential insertions are O(n) instead of O(n^2). 
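The gap buffer described in this comment keeps the free space at the insertion point rather than at the end of the array, so a mostly-sequential stream of insertions costs O(n) in total rather than O(n^2). A small standalone sketch of the same layout over plain ints:

#include <stdio.h>
#include <string.h>

/* Toy gap buffer: live entries sit in d[0..gap) and d[gap + free..size),
 * where free = size - nr; inserting at the gap is O(1), and moving the gap
 * costs one memmove proportional to the distance moved. */
struct gap_buf {
        int    d[16];
        size_t gap;     /* index where the free space starts */
        size_t nr;      /* number of live entries */
        size_t size;    /* capacity */
};

static void move_gap(struct gap_buf *b, size_t new_gap)
{
        size_t back = b->size - b->nr;  /* width of the gap */

        if (new_gap < b->gap)
                memmove(b->d + new_gap + back, b->d + new_gap,
                        (b->gap - new_gap) * sizeof(b->d[0]));
        else if (new_gap > b->gap)
                memmove(b->d + b->gap, b->d + b->gap + back,
                        (new_gap - b->gap) * sizeof(b->d[0]));
        b->gap = new_gap;
}

static void insert_at(struct gap_buf *b, size_t idx, int v)
{
        move_gap(b, idx);               /* free for sequential inserts: gap is already there */
        b->d[b->gap++] = v;
        b->nr++;
}

int main(void)
{
        struct gap_buf b = { .size = 16 };
        size_t i;

        for (i = 0; i < 5; i++)
                insert_at(&b, i, (int)i * 10);  /* appends: gap never moves */
        insert_at(&b, 2, 99);                   /* one memmove of 3 entries */

        for (i = 0; i < b.nr; i++)
                printf("%d ", b.d[i < b.gap ? i : i + (b.size - b.nr)]);
        printf("\n");                           /* 0 10 99 20 30 40 */
        return 0;
}
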
++ */ ++ size_t gap; ++ size_t nr; ++ size_t size; ++ u64 journal_seq_base; ++}; ++ ++struct btree_path_buf { ++ struct btree_path *path; ++}; ++ ++#define REPLICAS_DELTA_LIST_MAX (1U << 16) ++ ++struct snapshot_t { ++ u32 parent; ++ u32 children[2]; ++ u32 subvol; /* Nonzero only if a subvolume points to this node: */ ++ u32 equiv; ++}; ++ ++typedef struct { ++ u32 subvol; ++ u64 inum; ++} subvol_inum; ++ ++#define BCACHEFS_ROOT_SUBVOL_INUM \ ++ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) ++ ++struct bch_fs { ++ struct closure cl; ++ ++ struct list_head list; ++ struct kobject kobj; ++ struct kobject counters_kobj; ++ struct kobject internal; ++ struct kobject opts_dir; ++ struct kobject time_stats; ++ unsigned long flags; ++ ++ int minor; ++ struct device *chardev; ++ struct super_block *vfs_sb; ++ dev_t dev; ++ char name[40]; ++ ++ /* ro/rw, add/remove/resize devices: */ ++ struct rw_semaphore state_lock; ++ ++ /* Counts outstanding writes, for clean transition to read-only */ ++ struct percpu_ref writes; ++ struct work_struct read_only_work; ++ ++ struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; ++ ++ struct bch_replicas_cpu replicas; ++ struct bch_replicas_cpu replicas_gc; ++ struct mutex replicas_gc_lock; ++ mempool_t replicas_delta_pool; ++ ++ struct journal_entry_res btree_root_journal_res; ++ struct journal_entry_res replicas_journal_res; ++ struct journal_entry_res clock_journal_res; ++ struct journal_entry_res dev_usage_journal_res; ++ ++ struct bch_disk_groups_cpu __rcu *disk_groups; ++ ++ struct bch_opts opts; ++ ++ /* Updated by bch2_sb_update():*/ ++ struct { ++ uuid_le uuid; ++ uuid_le user_uuid; ++ ++ u16 version; ++ u16 version_min; ++ ++ u8 nr_devices; ++ u8 clean; ++ ++ u8 encryption_type; ++ ++ u64 time_base_lo; ++ u32 time_base_hi; ++ unsigned time_units_per_sec; ++ unsigned nsec_per_time_unit; ++ u64 features; ++ u64 compat; ++ } sb; ++ ++ ++ struct bch_sb_handle disk_sb; ++ ++ unsigned short block_bits; /* ilog2(block_size) */ ++ ++ u16 btree_foreground_merge_threshold; ++ ++ struct closure sb_write; ++ struct mutex sb_lock; ++ ++ /* snapshot.c: */ ++ GENRADIX(struct snapshot_t) snapshots; ++ struct bch_snapshot_table __rcu *snapshot_table; ++ struct mutex snapshot_table_lock; ++ struct work_struct snapshot_delete_work; ++ struct work_struct snapshot_wait_for_pagecache_and_delete_work; ++ snapshot_id_list snapshots_unlinked; ++ struct mutex snapshots_unlinked_lock; ++ ++ /* BTREE CACHE */ ++ struct bio_set btree_bio; ++ struct workqueue_struct *io_complete_wq; ++ ++ struct btree_root btree_roots[BTREE_ID_NR]; ++ struct mutex btree_root_lock; ++ ++ struct btree_cache btree_cache; ++ ++ /* ++ * Cache of allocated btree nodes - if we allocate a btree node and ++ * don't use it, if we free it that space can't be reused until going ++ * _all_ the way through the allocator (which exposes us to a livelock ++ * when allocating btree reserves fail halfway through) - instead, we ++ * can stick them here: ++ */ ++ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; ++ unsigned btree_reserve_cache_nr; ++ struct mutex btree_reserve_cache_lock; ++ ++ mempool_t btree_interior_update_pool; ++ struct list_head btree_interior_update_list; ++ struct list_head btree_interior_updates_unwritten; ++ struct mutex btree_interior_update_lock; ++ struct closure_waitlist btree_interior_update_wait; ++ ++ struct workqueue_struct *btree_interior_update_worker; ++ struct work_struct btree_interior_update_work; ++ ++ /* btree_iter.c: */ ++ struct mutex btree_trans_lock; 
++ struct list_head btree_trans_list; ++ mempool_t btree_paths_pool; ++ mempool_t btree_trans_mem_pool; ++ struct btree_path_buf __percpu *btree_paths_bufs; ++ ++ struct srcu_struct btree_trans_barrier; ++ bool btree_trans_barrier_initialized; ++ ++ struct btree_key_cache btree_key_cache; ++ unsigned btree_key_cache_btrees; ++ ++ struct workqueue_struct *btree_update_wq; ++ struct workqueue_struct *btree_io_complete_wq; ++ /* copygc needs its own workqueue for index updates.. */ ++ struct workqueue_struct *copygc_wq; ++ ++ /* ALLOCATION */ ++ struct bch_devs_mask rw_devs[BCH_DATA_NR]; ++ ++ u64 capacity; /* sectors */ ++ ++ /* ++ * When capacity _decreases_ (due to a disk being removed), we ++ * increment capacity_gen - this invalidates outstanding reservations ++ * and forces them to be revalidated ++ */ ++ u32 capacity_gen; ++ unsigned bucket_size_max; ++ ++ atomic64_t sectors_available; ++ struct mutex sectors_available_lock; ++ ++ struct bch_fs_pcpu __percpu *pcpu; ++ ++ struct percpu_rw_semaphore mark_lock; ++ ++ seqcount_t usage_lock; ++ struct bch_fs_usage *usage_base; ++ struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR]; ++ struct bch_fs_usage __percpu *usage_gc; ++ u64 __percpu *online_reserved; ++ ++ /* single element mempool: */ ++ struct mutex usage_scratch_lock; ++ struct bch_fs_usage_online *usage_scratch; ++ ++ struct io_clock io_clock[2]; ++ ++ /* JOURNAL SEQ BLACKLIST */ ++ struct journal_seq_blacklist_table * ++ journal_seq_blacklist_table; ++ struct work_struct journal_seq_blacklist_gc_work; ++ ++ /* ALLOCATOR */ ++ spinlock_t freelist_lock; ++ struct closure_waitlist freelist_wait; ++ u64 blocked_allocate; ++ u64 blocked_allocate_open_bucket; ++ ++ open_bucket_idx_t open_buckets_freelist; ++ open_bucket_idx_t open_buckets_nr_free; ++ struct closure_waitlist open_buckets_wait; ++ struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; ++ open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; ++ ++ struct write_point btree_write_point; ++ struct write_point rebalance_write_point; ++ ++ struct write_point write_points[WRITE_POINT_MAX]; ++ struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; ++ struct mutex write_points_hash_lock; ++ unsigned write_points_nr; ++ ++ struct buckets_waiting_for_journal buckets_waiting_for_journal; ++ struct work_struct discard_work; ++ struct work_struct invalidate_work; ++ ++ /* GARBAGE COLLECTION */ ++ struct task_struct *gc_thread; ++ atomic_t kick_gc; ++ unsigned long gc_count; ++ ++ enum btree_id gc_gens_btree; ++ struct bpos gc_gens_pos; ++ ++ /* ++ * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] ++ * has been marked by GC. ++ * ++ * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.) ++ * ++ * Protected by gc_pos_lock. Only written to by GC thread, so GC thread ++ * can read without a lock. ++ */ ++ seqcount_t gc_pos_lock; ++ struct gc_pos gc_pos; ++ ++ /* ++ * The allocation code needs gc_mark in struct bucket to be correct, but ++ * it's not while a gc is in progress. 
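capacity_gen, noted a little above, turns "capacity shrank" into a cheap validity test: anything that recorded the old generation must revalidate and re-reserve. A userspace sketch of that pattern with hypothetical toy_fs/toy_reservation types (the real path also maintains the atomic and per-cpu sectors_available counters shown above, omitted here):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_fs {
        uint64_t capacity;          /* sectors */
        uint64_t reserved;          /* sectors handed out in reservations */
        uint32_t capacity_gen;      /* bumped whenever capacity shrinks */
};

struct toy_reservation {
        uint64_t sectors;
        uint32_t gen;
};

static bool reserve(struct toy_fs *fs, struct toy_reservation *res, uint64_t sectors)
{
        if (fs->reserved + sectors > fs->capacity)
                return false;
        fs->reserved += sectors;
        *res = (struct toy_reservation) { .sectors = sectors, .gen = fs->capacity_gen };
        return true;
}

/* Shrinking capacity invalidates everything taken under the old gen. */
static void shrink_capacity(struct toy_fs *fs, uint64_t new_capacity)
{
        fs->capacity = new_capacity;
        fs->reserved = 0;           /* callers must re-reserve */
        fs->capacity_gen++;
}

static bool reservation_valid(const struct toy_fs *fs, const struct toy_reservation *res)
{
        return res->gen == fs->capacity_gen;
}

int main(void)
{
        struct toy_fs fs = { .capacity = 1 << 20 };
        struct toy_reservation res;

        if (!reserve(&fs, &res, 4096))
                return 1;
        printf("valid before shrink: %d\n", reservation_valid(&fs, &res)); /* 1 */

        shrink_capacity(&fs, 1 << 19);
        printf("valid after shrink:  %d\n", reservation_valid(&fs, &res)); /* 0 */
        return 0;
}
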
++ */ ++ struct rw_semaphore gc_lock; ++ struct mutex gc_gens_lock; ++ ++ /* IO PATH */ ++ struct semaphore io_in_flight; ++ struct bio_set bio_read; ++ struct bio_set bio_read_split; ++ struct bio_set bio_write; ++ struct mutex bio_bounce_pages_lock; ++ mempool_t bio_bounce_pages; ++ struct rhashtable promote_table; ++ ++ mempool_t compression_bounce[2]; ++ mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; ++ mempool_t decompress_workspace; ++ ZSTD_parameters zstd_params; ++ ++ struct crypto_shash *sha256; ++ struct crypto_sync_skcipher *chacha20; ++ struct crypto_shash *poly1305; ++ ++ atomic64_t key_version; ++ ++ mempool_t large_bkey_pool; ++ ++ /* REBALANCE */ ++ struct bch_fs_rebalance rebalance; ++ ++ /* COPYGC */ ++ struct task_struct *copygc_thread; ++ copygc_heap copygc_heap; ++ struct write_point copygc_write_point; ++ s64 copygc_wait; ++ bool copygc_running; ++ wait_queue_head_t copygc_running_wq; ++ ++ /* DATA PROGRESS STATS */ ++ struct list_head data_progress_list; ++ struct mutex data_progress_lock; ++ ++ /* STRIPES: */ ++ GENRADIX(struct stripe) stripes; ++ GENRADIX(struct gc_stripe) gc_stripes; ++ ++ ec_stripes_heap ec_stripes_heap; ++ spinlock_t ec_stripes_heap_lock; ++ ++ /* ERASURE CODING */ ++ struct list_head ec_stripe_head_list; ++ struct mutex ec_stripe_head_lock; ++ ++ struct list_head ec_stripe_new_list; ++ struct mutex ec_stripe_new_lock; ++ ++ struct work_struct ec_stripe_create_work; ++ u64 ec_stripe_hint; ++ ++ struct bio_set ec_bioset; ++ ++ struct work_struct ec_stripe_delete_work; ++ struct llist_head ec_stripe_delete_list; ++ ++ /* REFLINK */ ++ u64 reflink_hint; ++ reflink_gc_table reflink_gc_table; ++ size_t reflink_gc_nr; ++ ++ /* VFS IO PATH - fs-io.c */ ++ struct bio_set writepage_bioset; ++ struct bio_set dio_write_bioset; ++ struct bio_set dio_read_bioset; ++ ++ ++ atomic64_t btree_writes_nr; ++ atomic64_t btree_writes_sectors; ++ spinlock_t btree_write_error_lock; ++ ++ /* ERRORS */ ++ struct list_head fsck_errors; ++ struct mutex fsck_error_lock; ++ bool fsck_alloc_err; ++ ++ /* QUOTAS */ ++ struct bch_memquota_type quotas[QTYP_NR]; ++ ++ /* DEBUG JUNK */ ++ struct dentry *fs_debug_dir; ++ struct dentry *btree_debug_dir; ++ struct btree_debug btree_debug[BTREE_ID_NR]; ++ struct btree *verify_data; ++ struct btree_node *verify_ondisk; ++ struct mutex verify_lock; ++ ++ u64 *unused_inode_hints; ++ unsigned inode_shard_bits; ++ ++ /* ++ * A btree node on disk could have too many bsets for an iterator to fit ++ * on the stack - have to dynamically allocate them ++ */ ++ mempool_t fill_iter; ++ ++ mempool_t btree_bounce_pool; ++ ++ struct journal journal; ++ GENRADIX(struct journal_replay *) journal_entries; ++ u64 journal_entries_base_seq; ++ struct journal_keys journal_keys; ++ struct list_head journal_iters; ++ ++ u64 last_bucket_seq_cleanup; ++ ++ /* TODO rewrite as counters - The rest of this all shows up in sysfs */ ++ atomic_long_t read_realloc_races; ++ atomic_long_t extent_migrate_done; ++ atomic_long_t extent_migrate_raced; ++ atomic_long_t bucket_alloc_fail; ++ ++ u64 counters_on_mount[BCH_COUNTER_NR]; ++ u64 __percpu *counters; ++ ++ unsigned btree_gc_periodic:1; ++ unsigned copy_gc_enabled:1; ++ bool promote_whole_extents; ++ ++ struct time_stats times[BCH_TIME_STAT_NR]; ++ ++ struct lock_held_stats lock_held_stats; ++}; ++ ++static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) ++{ ++#ifndef NO_BCACHEFS_FS ++ if (c->vfs_sb) ++ c->vfs_sb->s_bdi->ra_pages = ra_pages; ++#endif ++} ++ ++static inline unsigned 
bucket_bytes(const struct bch_dev *ca) ++{ ++ return ca->mi.bucket_size << 9; ++} ++ ++static inline unsigned block_bytes(const struct bch_fs *c) ++{ ++ return c->opts.block_size; ++} ++ ++static inline unsigned block_sectors(const struct bch_fs *c) ++{ ++ return c->opts.block_size >> 9; ++} ++ ++static inline size_t btree_sectors(const struct bch_fs *c) ++{ ++ return c->opts.btree_node_size >> 9; ++} ++ ++static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) ++{ ++ return c->btree_key_cache_btrees & (1U << btree); ++} ++ ++static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) ++{ ++ struct timespec64 t; ++ s32 rem; ++ ++ time += c->sb.time_base_lo; ++ ++ t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem); ++ t.tv_nsec = rem * c->sb.nsec_per_time_unit; ++ return t; ++} ++ ++static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts) ++{ ++ return (ts.tv_sec * c->sb.time_units_per_sec + ++ (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; ++} ++ ++static inline s64 bch2_current_time(const struct bch_fs *c) ++{ ++ struct timespec64 now; ++ ++ ktime_get_coarse_real_ts64(&now); ++ return timespec_to_bch2_time(c, now); ++} ++ ++static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) ++{ ++ return dev < c->sb.nr_devices && c->devs[dev]; ++} ++ ++#endif /* _BCACHEFS_H */ +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +new file mode 100644 +index 000000000000..147fde1417b0 +--- /dev/null ++++ b/fs/bcachefs/bcachefs_format.h +@@ -0,0 +1,2052 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FORMAT_H ++#define _BCACHEFS_FORMAT_H ++ ++/* ++ * bcachefs on disk data structures ++ * ++ * OVERVIEW: ++ * ++ * There are three main types of on disk data structures in bcachefs (this is ++ * reduced from 5 in bcache) ++ * ++ * - superblock ++ * - journal ++ * - btree ++ * ++ * The btree is the primary structure; most metadata exists as keys in the ++ * various btrees. There are only a small number of btrees, they're not ++ * sharded - we have one btree for extents, another for inodes, et cetera. ++ * ++ * SUPERBLOCK: ++ * ++ * The superblock contains the location of the journal, the list of devices in ++ * the filesystem, and in general any metadata we need in order to decide ++ * whether we can start a filesystem or prior to reading the journal/btree ++ * roots. ++ * ++ * The superblock is extensible, and most of the contents of the superblock are ++ * in variable length, type tagged fields; see struct bch_sb_field. ++ * ++ * Backup superblocks do not reside in a fixed location; also, superblocks do ++ * not have a fixed size. To locate backup superblocks we have struct ++ * bch_sb_layout; we store a copy of this inside every superblock, and also ++ * before the first superblock. ++ * ++ * JOURNAL: ++ * ++ * The journal primarily records btree updates in the order they occurred; ++ * journal replay consists of just iterating over all the keys in the open ++ * journal entries and re-inserting them into the btrees. ++ * ++ * The journal also contains entry types for the btree roots, and blacklisted ++ * journal sequence numbers (see journal_seq_blacklist.c). ++ * ++ * BTREE: ++ * ++ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically ++ * 128k-256k) and log structured. We use struct btree_node for writing the first ++ * entry in a given node (offset 0), and struct btree_node_entry for all ++ * subsequent writes. 
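The bch2_time_to_timespec()/timespec_to_bch2_time() helpers above store inode times as a count of superblock-defined time units past time_base_lo. A standalone restatement of that arithmetic, assuming a superblock configured for nanosecond units and ignoring time_base_hi and negative-value rounding:

#include <inttypes.h>
#include <stdio.h>

struct toy_sb {
        int64_t  time_base_lo;        /* filesystem epoch, in time units */
        uint32_t time_units_per_sec;
        uint32_t nsec_per_time_unit;  /* == NSEC_PER_SEC / time_units_per_sec */
};

struct toy_ts { int64_t sec; int64_t nsec; };

static struct toy_ts fs_time_to_ts(const struct toy_sb *sb, int64_t t)
{
        struct toy_ts ts;

        t += sb->time_base_lo;
        ts.sec  = t / sb->time_units_per_sec;
        ts.nsec = (t % sb->time_units_per_sec) * sb->nsec_per_time_unit;
        return ts;
}

static int64_t ts_to_fs_time(const struct toy_sb *sb, struct toy_ts ts)
{
        return ts.sec * sb->time_units_per_sec +
               ts.nsec / sb->nsec_per_time_unit -
               sb->time_base_lo;
}

int main(void)
{
        struct toy_sb sb = {
                .time_base_lo = 1000000000LL * 1577836800,  /* 2020-01-01 in ns */
                .time_units_per_sec = 1000000000,
                .nsec_per_time_unit = 1,
        };
        int64_t t = 42;                                     /* 42ns past the base */
        struct toy_ts ts = fs_time_to_ts(&sb, t);

        printf("%" PRId64 ".%09" PRId64 " -> %" PRId64 "\n",
               ts.sec, ts.nsec, ts_to_fs_time(&sb, ts));    /* round-trips to 42 */
        return 0;
}
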
++ * ++ * After the header, btree node entries contain a list of keys in sorted order. ++ * Values are stored inline with the keys; since values are variable length (and ++ * keys effectively are variable length too, due to packing) we can't do random ++ * access without building up additional in memory tables in the btree node read ++ * path. ++ * ++ * BTREE KEYS (struct bkey): ++ * ++ * The various btrees share a common format for the key - so as to avoid ++ * switching in fastpath lookup/comparison code - but define their own ++ * structures for the key values. ++ * ++ * The size of a key/value pair is stored as a u8 in units of u64s, so the max ++ * size is just under 2k. The common part also contains a type tag for the ++ * value, and a format field indicating whether the key is packed or not (and ++ * also meant to allow adding new key fields in the future, if desired). ++ * ++ * bkeys, when stored within a btree node, may also be packed. In that case, the ++ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can ++ * be generous with field sizes in the common part of the key format (64 bit ++ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost. ++ */ ++ ++#include ++#include ++#include ++#include ++#include "vstructs.h" ++ ++#define BITMASK(name, type, field, offset, end) \ ++static const unsigned name##_OFFSET = offset; \ ++static const unsigned name##_BITS = (end - offset); \ ++ \ ++static inline __u64 name(const type *k) \ ++{ \ ++ return (k->field >> offset) & ~(~0ULL << (end - offset)); \ ++} \ ++ \ ++static inline void SET_##name(type *k, __u64 v) \ ++{ \ ++ k->field &= ~(~(~0ULL << (end - offset)) << offset); \ ++ k->field |= (v & ~(~0ULL << (end - offset))) << offset; \ ++} ++ ++#define LE_BITMASK(_bits, name, type, field, offset, end) \ ++static const unsigned name##_OFFSET = offset; \ ++static const unsigned name##_BITS = (end - offset); \ ++static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \ ++ \ ++static inline __u64 name(const type *k) \ ++{ \ ++ return (__le##_bits##_to_cpu(k->field) >> offset) & \ ++ ~(~0ULL << (end - offset)); \ ++} \ ++ \ ++static inline void SET_##name(type *k, __u64 v) \ ++{ \ ++ __u##_bits new = __le##_bits##_to_cpu(k->field); \ ++ \ ++ new &= ~(~(~0ULL << (end - offset)) << offset); \ ++ new |= (v & ~(~0ULL << (end - offset))) << offset; \ ++ k->field = __cpu_to_le##_bits(new); \ ++} ++ ++#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e) ++#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e) ++#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e) ++ ++struct bkey_format { ++ __u8 key_u64s; ++ __u8 nr_fields; ++ /* One unused slot for now: */ ++ __u8 bits_per_field[6]; ++ __le64 field_offset[6]; ++}; ++ ++/* Btree keys - all units are in sectors */ ++ ++struct bpos { ++ /* ++ * Word order matches machine byte order - btree code treats a bpos as a ++ * single large integer, for search/comparison purposes ++ * ++ * Note that wherever a bpos is embedded in another on disk data ++ * structure, it has to be byte swabbed when reading in metadata that ++ * wasn't written in native endian order: ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u32 snapshot; ++ __u64 offset; ++ __u64 inode; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ __u64 inode; ++ __u64 offset; /* Points to end of extent - sectors */ ++ __u32 snapshot; ++#else ++#error edit for your odd byteorder. 
++#endif ++} __attribute__((packed, aligned(4))); ++ ++#define KEY_INODE_MAX ((__u64)~0ULL) ++#define KEY_OFFSET_MAX ((__u64)~0ULL) ++#define KEY_SNAPSHOT_MAX ((__u32)~0U) ++#define KEY_SIZE_MAX ((__u32)~0U) ++ ++static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot) ++{ ++ return (struct bpos) { ++ .inode = inode, ++ .offset = offset, ++ .snapshot = snapshot, ++ }; ++} ++ ++#define POS_MIN SPOS(0, 0, 0) ++#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0) ++#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) ++#define POS(_inode, _offset) SPOS(_inode, _offset, 0) ++ ++/* Empty placeholder struct, for container_of() */ ++struct bch_val { ++ __u64 __nothing[0]; ++}; ++ ++struct bversion { ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u64 lo; ++ __u32 hi; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ __u32 hi; ++ __u64 lo; ++#endif ++} __attribute__((packed, aligned(4))); ++ ++struct bkey { ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ ++ /* Format of key (0 for format local to btree node) */ ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 format:7, ++ needs_whiteout:1; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u8 needs_whiteout:1, ++ format:7; ++#else ++#error edit for your odd byteorder. ++#endif ++ ++ /* Type of the value */ ++ __u8 type; ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u8 pad[1]; ++ ++ struct bversion version; ++ __u32 size; /* extent size, in sectors */ ++ struct bpos p; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ struct bpos p; ++ __u32 size; /* extent size, in sectors */ ++ struct bversion version; ++ ++ __u8 pad[1]; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++struct bkey_packed { ++ __u64 _data[0]; ++ ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ ++ /* Format of key (0 for format local to btree node) */ ++ ++ /* ++ * XXX: next incompat on disk format change, switch format and ++ * needs_whiteout - bkey_packed() will be cheaper if format is the high ++ * bits of the bitfield ++ */ ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 format:7, ++ needs_whiteout:1; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u8 needs_whiteout:1, ++ format:7; ++#endif ++ ++ /* Type of the value */ ++ __u8 type; ++ __u8 key_start[0]; ++ ++ /* ++ * We copy bkeys with struct assignment in various places, and while ++ * that shouldn't be done with packed bkeys we can't disallow it in C, ++ * and it's legal to cast a bkey to a bkey_packed - so padding it out ++ * to the same size as struct bkey should hopefully be safest. 
++ */ ++ __u8 pad[sizeof(struct bkey) - 3]; ++} __attribute__((packed, aligned(8))); ++ ++#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) ++#define BKEY_U64s_MAX U8_MAX ++#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) ++ ++#define KEY_PACKED_BITS_START 24 ++ ++#define KEY_FORMAT_LOCAL_BTREE 0 ++#define KEY_FORMAT_CURRENT 1 ++ ++enum bch_bkey_fields { ++ BKEY_FIELD_INODE, ++ BKEY_FIELD_OFFSET, ++ BKEY_FIELD_SNAPSHOT, ++ BKEY_FIELD_SIZE, ++ BKEY_FIELD_VERSION_HI, ++ BKEY_FIELD_VERSION_LO, ++ BKEY_NR_FIELDS, ++}; ++ ++#define bkey_format_field(name, field) \ ++ [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) ++ ++#define BKEY_FORMAT_CURRENT \ ++((struct bkey_format) { \ ++ .key_u64s = BKEY_U64s, \ ++ .nr_fields = BKEY_NR_FIELDS, \ ++ .bits_per_field = { \ ++ bkey_format_field(INODE, p.inode), \ ++ bkey_format_field(OFFSET, p.offset), \ ++ bkey_format_field(SNAPSHOT, p.snapshot), \ ++ bkey_format_field(SIZE, size), \ ++ bkey_format_field(VERSION_HI, version.hi), \ ++ bkey_format_field(VERSION_LO, version.lo), \ ++ }, \ ++}) ++ ++/* bkey with inline value */ ++struct bkey_i { ++ __u64 _data[0]; ++ ++ union { ++ struct { ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ }; ++ struct { ++ struct bkey k; ++ struct bch_val v; ++ }; ++ }; ++}; ++ ++#define KEY(_inode, _offset, _size) \ ++((struct bkey) { \ ++ .u64s = BKEY_U64s, \ ++ .format = KEY_FORMAT_CURRENT, \ ++ .p = POS(_inode, _offset), \ ++ .size = _size, \ ++}) ++ ++static inline void bkey_init(struct bkey *k) ++{ ++ *k = KEY(0, 0, 0); ++} ++ ++#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) ++ ++#define __BKEY_PADDED(key, pad) \ ++ struct { struct bkey_i key; __u64 key ## _pad[pad]; } ++ ++/* ++ * - DELETED keys are used internally to mark keys that should be ignored but ++ * override keys in composition order. Their version number is ignored. ++ * ++ * - DISCARDED keys indicate that the data is all 0s because it has been ++ * discarded. DISCARDs may have a version; if the version is nonzero the key ++ * will be persistent, otherwise the key will be dropped whenever the btree ++ * node is rewritten (like DELETED keys). ++ * ++ * - ERROR: any read of the data returns a read error, as the data was lost due ++ * to a failing device. Like DISCARDED keys, they can be removed (overridden) ++ * by new writes or cluster-wide GC. Node repair can also overwrite them with ++ * the same or a more recent version number, but not with an older version ++ * number. 
++ * ++ * - WHITEOUT: for hash table btrees ++*/ ++#define BCH_BKEY_TYPES() \ ++ x(deleted, 0) \ ++ x(whiteout, 1) \ ++ x(error, 2) \ ++ x(cookie, 3) \ ++ x(hash_whiteout, 4) \ ++ x(btree_ptr, 5) \ ++ x(extent, 6) \ ++ x(reservation, 7) \ ++ x(inode, 8) \ ++ x(inode_generation, 9) \ ++ x(dirent, 10) \ ++ x(xattr, 11) \ ++ x(alloc, 12) \ ++ x(quota, 13) \ ++ x(stripe, 14) \ ++ x(reflink_p, 15) \ ++ x(reflink_v, 16) \ ++ x(inline_data, 17) \ ++ x(btree_ptr_v2, 18) \ ++ x(indirect_inline_data, 19) \ ++ x(alloc_v2, 20) \ ++ x(subvolume, 21) \ ++ x(snapshot, 22) \ ++ x(inode_v2, 23) \ ++ x(alloc_v3, 24) \ ++ x(set, 25) \ ++ x(lru, 26) \ ++ x(alloc_v4, 27) \ ++ x(backpointer, 28) ++ ++enum bch_bkey_type { ++#define x(name, nr) KEY_TYPE_##name = nr, ++ BCH_BKEY_TYPES() ++#undef x ++ KEY_TYPE_MAX, ++}; ++ ++struct bch_deleted { ++ struct bch_val v; ++}; ++ ++struct bch_whiteout { ++ struct bch_val v; ++}; ++ ++struct bch_error { ++ struct bch_val v; ++}; ++ ++struct bch_cookie { ++ struct bch_val v; ++ __le64 cookie; ++}; ++ ++struct bch_hash_whiteout { ++ struct bch_val v; ++}; ++ ++struct bch_set { ++ struct bch_val v; ++}; ++ ++/* Extents */ ++ ++/* ++ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally ++ * preceded by checksum/compression information (bch_extent_crc32 or ++ * bch_extent_crc64). ++ * ++ * One major determining factor in the format of extents is how we handle and ++ * represent extents that have been partially overwritten and thus trimmed: ++ * ++ * If an extent is not checksummed or compressed, when the extent is trimmed we ++ * don't have to remember the extent we originally allocated and wrote: we can ++ * merely adjust ptr->offset to point to the start of the data that is currently ++ * live. The size field in struct bkey records the current (live) size of the ++ * extent, and is also used to mean "size of region on disk that we point to" in ++ * this case. ++ * ++ * Thus an extent that is not checksummed or compressed will consist only of a ++ * list of bch_extent_ptrs, with none of the fields in ++ * bch_extent_crc32/bch_extent_crc64. ++ * ++ * When an extent is checksummed or compressed, it's not possible to read only ++ * the data that is currently live: we have to read the entire extent that was ++ * originally written, and then return only the part of the extent that is ++ * currently live. ++ * ++ * Thus, in addition to the current size of the extent in struct bkey, we need ++ * to store the size of the originally allocated space - this is the ++ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, ++ * when the extent is trimmed, instead of modifying the offset field of the ++ * pointer, we keep a second smaller offset field - "offset into the original ++ * extent of the currently live region". ++ * ++ * The other major determining factor is replication and data migration: ++ * ++ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated ++ * write, we will initially write all the replicas in the same format, with the ++ * same checksum type and compression format - however, when copygc runs later (or ++ * tiering/cache promotion, anything that moves data), it is not in general ++ * going to rewrite all the pointers at once - one of the replicas may be in a ++ * bucket on one device that has very little fragmentation while another lives ++ * in a bucket that has become heavily fragmented, and thus is being rewritten ++ * sooner than the rest. 
++ * ++ * Thus it will only move a subset of the pointers (or in the case of ++ * tiering/cache promotion perhaps add a single pointer without dropping any ++ * current pointers), and if the extent has been partially overwritten it must ++ * write only the currently live portion (or copygc would not be able to reduce ++ * fragmentation!) - which necessitates a different bch_extent_crc format for ++ * the new pointer. ++ * ++ * But in the interests of space efficiency, we don't want to store one ++ * bch_extent_crc for each pointer if we don't have to. ++ * ++ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and ++ * bch_extent_ptrs appended arbitrarily one after the other. We determine the ++ * type of a given entry with a scheme similar to utf8 (except we're encoding a ++ * type, not a size), encoding the type in the position of the first set bit: ++ * ++ * bch_extent_crc32 - 0b1 ++ * bch_extent_ptr - 0b10 ++ * bch_extent_crc64 - 0b100 ++ * ++ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and ++ * bch_extent_crc64 is the least constrained). ++ * ++ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, ++ * until the next bch_extent_crc32/64. ++ * ++ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer ++ * is neither checksummed nor compressed. ++ */ ++ ++/* 128 bits, sufficient for cryptographic MACs: */ ++struct bch_csum { ++ __le64 lo; ++ __le64 hi; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_EXTENT_ENTRY_TYPES() \ ++ x(ptr, 0) \ ++ x(crc32, 1) \ ++ x(crc64, 2) \ ++ x(crc128, 3) \ ++ x(stripe_ptr, 4) ++#define BCH_EXTENT_ENTRY_MAX 5 ++ ++enum bch_extent_entry_type { ++#define x(f, n) BCH_EXTENT_ENTRY_##f = n, ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++}; ++ ++/* Compressed/uncompressed size are stored biased by 1: */ ++struct bch_extent_crc32 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u32 type:2, ++ _compressed_size:7, ++ _uncompressed_size:7, ++ offset:7, ++ _unused:1, ++ csum_type:4, ++ compression_type:4; ++ __u32 csum; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u32 csum; ++ __u32 compression_type:4, ++ csum_type:4, ++ _unused:1, ++ offset:7, ++ _uncompressed_size:7, ++ _compressed_size:7, ++ type:2; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++#define CRC32_SIZE_MAX (1U << 7) ++#define CRC32_NONCE_MAX 0 ++ ++struct bch_extent_crc64 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:3, ++ _compressed_size:9, ++ _uncompressed_size:9, ++ offset:9, ++ nonce:10, ++ csum_type:4, ++ compression_type:4, ++ csum_hi:16; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 csum_hi:16, ++ compression_type:4, ++ csum_type:4, ++ nonce:10, ++ offset:9, ++ _uncompressed_size:9, ++ _compressed_size:9, ++ type:3; ++#endif ++ __u64 csum_lo; ++} __attribute__((packed, aligned(8))); ++ ++#define CRC64_SIZE_MAX (1U << 9) ++#define CRC64_NONCE_MAX ((1U << 10) - 1) ++ ++struct bch_extent_crc128 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:4, ++ _compressed_size:13, ++ _uncompressed_size:13, ++ offset:13, ++ nonce:13, ++ csum_type:4, ++ compression_type:4; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 compression_type:4, ++ csum_type:4, ++ nonce:13, ++ offset:13, ++ _uncompressed_size:13, ++ _compressed_size:13, ++ type:4; ++#endif ++ struct bch_csum csum; ++} __attribute__((packed, aligned(8))); ++ ++#define CRC128_SIZE_MAX (1U << 13) ++#define CRC128_NONCE_MAX ((1U << 13) - 1) ++ ++/* ++ * @reservation - pointer hasn't been written to, just reserved ++ */ 
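With the encoding described above, decoding an extent entry's type is just "find the lowest set bit of the entry's first word", since an entry of type n stores 1 << n in its type bitfield and all of its other fields sit above that bit. A toy standalone decoder following the BCH_EXTENT_ENTRY_TYPES() numbering from the enum above (the find-first-set builtin used here is a GCC/Clang extension):

#include <stdio.h>

enum toy_entry_type {
        TOY_ENTRY_ptr        = 0,
        TOY_ENTRY_crc32      = 1,
        TOY_ENTRY_crc64      = 2,
        TOY_ENTRY_crc128     = 3,
        TOY_ENTRY_stripe_ptr = 4,
};

static unsigned long make_entry(enum toy_entry_type t)
{
        return 1UL << t;        /* the entry's other fields would live above bit t */
}

static enum toy_entry_type entry_type(unsigned long first_word)
{
        /* the lowest set bit is always the type bit; undefined for 0 */
        return (enum toy_entry_type) __builtin_ctzl(first_word);
}

int main(void)
{
        printf("%d %d %d\n",
               entry_type(make_entry(TOY_ENTRY_ptr)),
               entry_type(make_entry(TOY_ENTRY_crc64)),
               entry_type(make_entry(TOY_ENTRY_stripe_ptr)));  /* 0 2 4 */
        return 0;
}
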
++struct bch_extent_ptr { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:1, ++ cached:1, ++ unused:1, ++ reservation:1, ++ offset:44, /* 8 petabytes */ ++ dev:8, ++ gen:8; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 gen:8, ++ dev:8, ++ offset:44, ++ reservation:1, ++ unused:1, ++ cached:1, ++ type:1; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++struct bch_extent_stripe_ptr { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:5, ++ block:8, ++ redundancy:4, ++ idx:47; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 idx:47, ++ redundancy:4, ++ block:8, ++ type:5; ++#endif ++}; ++ ++struct bch_extent_reservation { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:6, ++ unused:22, ++ replicas:4, ++ generation:32; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 generation:32, ++ replicas:4, ++ unused:22, ++ type:6; ++#endif ++}; ++ ++union bch_extent_entry { ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 ++ unsigned long type; ++#elif __BITS_PER_LONG == 32 ++ struct { ++ unsigned long pad; ++ unsigned long type; ++ }; ++#else ++#error edit for your odd byteorder. ++#endif ++ ++#define x(f, n) struct bch_extent_##f f; ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++}; ++ ++struct bch_btree_ptr { ++ struct bch_val v; ++ ++ __u64 _data[0]; ++ struct bch_extent_ptr start[]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_btree_ptr_v2 { ++ struct bch_val v; ++ ++ __u64 mem_ptr; ++ __le64 seq; ++ __le16 sectors_written; ++ __le16 flags; ++ struct bpos min_key; ++ __u64 _data[0]; ++ struct bch_extent_ptr start[]; ++} __attribute__((packed, aligned(8))); ++ ++LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); ++ ++struct bch_extent { ++ struct bch_val v; ++ ++ __u64 _data[0]; ++ union bch_extent_entry start[]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_reservation { ++ struct bch_val v; ++ ++ __le32 generation; ++ __u8 nr_replicas; ++ __u8 pad[3]; ++} __attribute__((packed, aligned(8))); ++ ++/* Maximum size (in u64s) a single pointer could be: */ ++#define BKEY_EXTENT_PTR_U64s_MAX\ ++ ((sizeof(struct bch_extent_crc128) + \ ++ sizeof(struct bch_extent_ptr)) / sizeof(u64)) ++ ++/* Maximum possible size of an entire extent value: */ ++#define BKEY_EXTENT_VAL_U64s_MAX \ ++ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) ++ ++/* * Maximum possible size of an entire extent, key + value: */ ++#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) ++ ++/* Btree pointers don't carry around checksums: */ ++#define BKEY_BTREE_PTR_VAL_U64s_MAX \ ++ ((sizeof(struct bch_btree_ptr_v2) + \ ++ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64)) ++#define BKEY_BTREE_PTR_U64s_MAX \ ++ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) ++ ++/* Inodes */ ++ ++#define BLOCKDEV_INODE_MAX 4096 ++ ++#define BCACHEFS_ROOT_INO 4096 ++ ++struct bch_inode { ++ struct bch_val v; ++ ++ __le64 bi_hash_seed; ++ __le32 bi_flags; ++ __le16 bi_mode; ++ __u8 fields[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_inode_v2 { ++ struct bch_val v; ++ ++ __le64 bi_journal_seq; ++ __le64 bi_hash_seed; ++ __le64 bi_flags; ++ __le16 bi_mode; ++ __u8 fields[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_inode_generation { ++ struct bch_val v; ++ ++ __le32 bi_generation; ++ __le32 pad; ++} __attribute__((packed, aligned(8))); ++ ++/* ++ * bi_subvol and bi_parent_subvol are only set for subvolume roots: ++ */ ++ ++#define BCH_INODE_FIELDS() \ ++ x(bi_atime, 96) \ ++ x(bi_ctime, 96) \ ++ x(bi_mtime, 
96) \ ++ x(bi_otime, 96) \ ++ x(bi_size, 64) \ ++ x(bi_sectors, 64) \ ++ x(bi_uid, 32) \ ++ x(bi_gid, 32) \ ++ x(bi_nlink, 32) \ ++ x(bi_generation, 32) \ ++ x(bi_dev, 32) \ ++ x(bi_data_checksum, 8) \ ++ x(bi_compression, 8) \ ++ x(bi_project, 32) \ ++ x(bi_background_compression, 8) \ ++ x(bi_data_replicas, 8) \ ++ x(bi_promote_target, 16) \ ++ x(bi_foreground_target, 16) \ ++ x(bi_background_target, 16) \ ++ x(bi_erasure_code, 16) \ ++ x(bi_fields_set, 16) \ ++ x(bi_dir, 64) \ ++ x(bi_dir_offset, 64) \ ++ x(bi_subvol, 32) \ ++ x(bi_parent_subvol, 32) ++ ++/* subset of BCH_INODE_FIELDS */ ++#define BCH_INODE_OPTS() \ ++ x(data_checksum, 8) \ ++ x(compression, 8) \ ++ x(project, 32) \ ++ x(background_compression, 8) \ ++ x(data_replicas, 8) \ ++ x(promote_target, 16) \ ++ x(foreground_target, 16) \ ++ x(background_target, 16) \ ++ x(erasure_code, 16) ++ ++enum inode_opt_id { ++#define x(name, ...) \ ++ Inode_opt_##name, ++ BCH_INODE_OPTS() ++#undef x ++ Inode_opt_nr, ++}; ++ ++enum { ++ /* ++ * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL ++ * flags) ++ */ ++ __BCH_INODE_SYNC = 0, ++ __BCH_INODE_IMMUTABLE = 1, ++ __BCH_INODE_APPEND = 2, ++ __BCH_INODE_NODUMP = 3, ++ __BCH_INODE_NOATIME = 4, ++ ++ __BCH_INODE_I_SIZE_DIRTY= 5, ++ __BCH_INODE_I_SECTORS_DIRTY= 6, ++ __BCH_INODE_UNLINKED = 7, ++ __BCH_INODE_BACKPTR_UNTRUSTED = 8, ++ ++ /* bits 20+ reserved for packed fields below: */ ++}; ++ ++#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) ++#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) ++#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) ++#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) ++#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) ++#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) ++#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) ++#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) ++#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED) ++ ++LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); ++LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); ++LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); ++ ++LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); ++LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); ++ ++/* Dirents */ ++ ++/* ++ * Dirents (and xattrs) have to implement string lookups; since our b-tree ++ * doesn't support arbitrary length strings for the key, we instead index by a ++ * 64 bit hash (currently truncated sha1) of the string, stored in the offset ++ * field of the key - using linear probing to resolve hash collisions. This also ++ * provides us with the readdir cookie posix requires. 
++ * ++ * Linear probing requires us to use whiteouts for deletions, in the event of a ++ * collision: ++ */ ++ ++struct bch_dirent { ++ struct bch_val v; ++ ++ /* Target inode number: */ ++ union { ++ __le64 d_inum; ++ struct { /* DT_SUBVOL */ ++ __le32 d_child_subvol; ++ __le32 d_parent_subvol; ++ }; ++ }; ++ ++ /* ++ * Copy of mode bits 12-15 from the target inode - so userspace can get ++ * the filetype without having to do a stat() ++ */ ++ __u8 d_type; ++ ++ __u8 d_name[]; ++} __attribute__((packed, aligned(8))); ++ ++#define DT_SUBVOL 16 ++#define BCH_DT_MAX 17 ++ ++#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(u64) - \ ++ sizeof(struct bkey) - \ ++ offsetof(struct bch_dirent, d_name))) ++ ++/* Xattrs */ ++ ++#define KEY_TYPE_XATTR_INDEX_USER 0 ++#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 ++#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 ++#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 ++#define KEY_TYPE_XATTR_INDEX_SECURITY 4 ++ ++struct bch_xattr { ++ struct bch_val v; ++ __u8 x_type; ++ __u8 x_name_len; ++ __le16 x_val_len; ++ __u8 x_name[]; ++} __attribute__((packed, aligned(8))); ++ ++/* Bucket/allocation information: */ ++ ++struct bch_alloc { ++ struct bch_val v; ++ __u8 fields; ++ __u8 gen; ++ __u8 data[]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_ALLOC_FIELDS_V1() \ ++ x(read_time, 16) \ ++ x(write_time, 16) \ ++ x(data_type, 8) \ ++ x(dirty_sectors, 16) \ ++ x(cached_sectors, 16) \ ++ x(oldest_gen, 8) \ ++ x(stripe, 32) \ ++ x(stripe_redundancy, 8) ++ ++enum { ++#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, ++ BCH_ALLOC_FIELDS_V1() ++#undef x ++}; ++ ++struct bch_alloc_v2 { ++ struct bch_val v; ++ __u8 nr_fields; ++ __u8 gen; ++ __u8 oldest_gen; ++ __u8 data_type; ++ __u8 data[]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_ALLOC_FIELDS_V2() \ ++ x(read_time, 64) \ ++ x(write_time, 64) \ ++ x(dirty_sectors, 32) \ ++ x(cached_sectors, 32) \ ++ x(stripe, 32) \ ++ x(stripe_redundancy, 8) ++ ++struct bch_alloc_v3 { ++ struct bch_val v; ++ __le64 journal_seq; ++ __le32 flags; ++ __u8 nr_fields; ++ __u8 gen; ++ __u8 oldest_gen; ++ __u8 data_type; ++ __u8 data[]; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) ++LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) ++ ++struct bch_alloc_v4 { ++ struct bch_val v; ++ __u64 journal_seq; ++ __u32 flags; ++ __u8 gen; ++ __u8 oldest_gen; ++ __u8 data_type; ++ __u8 stripe_redundancy; ++ __u32 dirty_sectors; ++ __u32 cached_sectors; ++ __u64 io_time[2]; ++ __u32 stripe; ++ __u32 nr_external_backpointers; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_ALLOC_V4_U64s_V0 6 ++#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(u64)) ++ ++BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) ++BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) ++BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) ++BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) ++ ++#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40 ++ ++struct bch_backpointer { ++ struct bch_val v; ++ __u8 btree_id; ++ __u8 level; ++ __u8 data_type; ++ __u64 bucket_offset:40; ++ __u32 bucket_len; ++ struct bpos pos; ++} __attribute__((packed, aligned(8))); ++ ++/* Quotas: */ ++ ++enum quota_types { ++ QTYP_USR = 0, ++ QTYP_GRP = 1, ++ QTYP_PRJ = 2, ++ QTYP_NR = 3, ++}; ++ ++enum quota_counters { ++ Q_SPC = 0, ++ Q_INO = 1, ++ Q_COUNTERS = 2, ++}; ++ ++struct 
bch_quota_counter { ++ __le64 hardlimit; ++ __le64 softlimit; ++}; ++ ++struct bch_quota { ++ struct bch_val v; ++ struct bch_quota_counter c[Q_COUNTERS]; ++} __attribute__((packed, aligned(8))); ++ ++/* Erasure coding */ ++ ++struct bch_stripe { ++ struct bch_val v; ++ __le16 sectors; ++ __u8 algorithm; ++ __u8 nr_blocks; ++ __u8 nr_redundant; ++ ++ __u8 csum_granularity_bits; ++ __u8 csum_type; ++ __u8 pad; ++ ++ struct bch_extent_ptr ptrs[]; ++} __attribute__((packed, aligned(8))); ++ ++/* Reflink: */ ++ ++struct bch_reflink_p { ++ struct bch_val v; ++ __le64 idx; ++ /* ++ * A reflink pointer might point to an indirect extent which is then ++ * later split (by copygc or rebalance). If we only pointed to part of ++ * the original indirect extent, and then one of the fragments is ++ * outside the range we point to, we'd leak a refcount: so when creating ++ * reflink pointers, we need to store pad values to remember the full ++ * range we were taking a reference on. ++ */ ++ __le32 front_pad; ++ __le32 back_pad; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_reflink_v { ++ struct bch_val v; ++ __le64 refcount; ++ union bch_extent_entry start[0]; ++ __u64 _data[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_indirect_inline_data { ++ struct bch_val v; ++ __le64 refcount; ++ u8 data[0]; ++}; ++ ++/* Inline data */ ++ ++struct bch_inline_data { ++ struct bch_val v; ++ u8 data[0]; ++}; ++ ++/* Subvolumes: */ ++ ++#define SUBVOL_POS_MIN POS(0, 1) ++#define SUBVOL_POS_MAX POS(0, S32_MAX) ++#define BCACHEFS_ROOT_SUBVOL 1 ++ ++struct bch_subvolume { ++ struct bch_val v; ++ __le32 flags; ++ __le32 snapshot; ++ __le64 inode; ++}; ++ ++LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) ++/* ++ * We need to know whether a subvolume is a snapshot so we can know whether we ++ * can delete it (or whether it should just be rm -rf'd) ++ */ ++LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) ++LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) ++ ++/* Snapshots */ ++ ++struct bch_snapshot { ++ struct bch_val v; ++ __le32 flags; ++ __le32 parent; ++ __le32 children[2]; ++ __le32 subvol; ++ __le32 pad; ++}; ++ ++LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) ++ ++/* True if a subvolume points to this snapshot node: */ ++LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) ++ ++/* LRU btree: */ ++ ++struct bch_lru { ++ struct bch_val v; ++ __le64 idx; ++} __attribute__((packed, aligned(8))); ++ ++#define LRU_ID_STRIPES (1U << 16) ++ ++/* Optional/variable size superblock sections: */ ++ ++struct bch_sb_field { ++ __u64 _data[0]; ++ __le32 u64s; ++ __le32 type; ++}; ++ ++#define BCH_SB_FIELDS() \ ++ x(journal, 0) \ ++ x(members, 1) \ ++ x(crypt, 2) \ ++ x(replicas_v0, 3) \ ++ x(quota, 4) \ ++ x(disk_groups, 5) \ ++ x(clean, 6) \ ++ x(replicas, 7) \ ++ x(journal_seq_blacklist, 8) \ ++ x(journal_v2, 9) \ ++ x(counters, 10) ++ ++enum bch_sb_field_type { ++#define x(f, nr) BCH_SB_FIELD_##f = nr, ++ BCH_SB_FIELDS() ++#undef x ++ BCH_SB_FIELD_NR ++}; ++ ++/* ++ * Most superblock fields are replicated in all device's superblocks - a few are ++ * not: ++ */ ++#define BCH_SINGLE_DEVICE_SB_FIELDS \ ++ ((1U << BCH_SB_FIELD_journal)| \ ++ (1U << BCH_SB_FIELD_journal_v2)) ++ ++/* BCH_SB_FIELD_journal: */ ++ ++struct bch_sb_field_journal { ++ struct bch_sb_field field; ++ __le64 buckets[0]; ++}; ++ ++struct bch_sb_field_journal_v2 { ++ struct bch_sb_field field; ++ ++ struct bch_sb_field_journal_v2_entry { ++ 
__le64 start; ++ __le64 nr; ++ } d[0]; ++}; ++ ++/* BCH_SB_FIELD_members: */ ++ ++#define BCH_MIN_NR_NBUCKETS (1 << 6) ++ ++struct bch_member { ++ uuid_le uuid; ++ __le64 nbuckets; /* device size */ ++ __le16 first_bucket; /* index of first bucket used */ ++ __le16 bucket_size; /* sectors */ ++ __le32 pad; ++ __le64 last_mount; /* time_t */ ++ ++ __le64 flags[2]; ++}; ++ ++LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) ++/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ ++LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) ++LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) ++LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) ++LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) ++LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, ++ struct bch_member, flags[0], 30, 31) ++ ++#if 0 ++LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); ++LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); ++#endif ++ ++#define BCH_MEMBER_STATES() \ ++ x(rw, 0) \ ++ x(ro, 1) \ ++ x(failed, 2) \ ++ x(spare, 3) ++ ++enum bch_member_state { ++#define x(t, n) BCH_MEMBER_STATE_##t = n, ++ BCH_MEMBER_STATES() ++#undef x ++ BCH_MEMBER_STATE_NR ++}; ++ ++struct bch_sb_field_members { ++ struct bch_sb_field field; ++ struct bch_member members[0]; ++}; ++ ++/* BCH_SB_FIELD_crypt: */ ++ ++struct nonce { ++ __le32 d[4]; ++}; ++ ++struct bch_key { ++ __le64 key[4]; ++}; ++ ++#define BCH_KEY_MAGIC \ ++ (((u64) 'b' << 0)|((u64) 'c' << 8)| \ ++ ((u64) 'h' << 16)|((u64) '*' << 24)| \ ++ ((u64) '*' << 32)|((u64) 'k' << 40)| \ ++ ((u64) 'e' << 48)|((u64) 'y' << 56)) ++ ++struct bch_encrypted_key { ++ __le64 magic; ++ struct bch_key key; ++}; ++ ++/* ++ * If this field is present in the superblock, it stores an encryption key which ++ * is used encrypt all other data/metadata. The key will normally be encrypted ++ * with the key userspace provides, but if encryption has been turned off we'll ++ * just store the master key unencrypted in the superblock so we can access the ++ * previously encrypted data. 
++ */ ++struct bch_sb_field_crypt { ++ struct bch_sb_field field; ++ ++ __le64 flags; ++ __le64 kdf_flags; ++ struct bch_encrypted_key key; ++}; ++ ++LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); ++ ++enum bch_kdf_types { ++ BCH_KDF_SCRYPT = 0, ++ BCH_KDF_NR = 1, ++}; ++ ++/* stored as base 2 log of scrypt params: */ ++LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); ++LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); ++LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); ++ ++/* BCH_SB_FIELD_replicas: */ ++ ++#define BCH_DATA_TYPES() \ ++ x(free, 0) \ ++ x(sb, 1) \ ++ x(journal, 2) \ ++ x(btree, 3) \ ++ x(user, 4) \ ++ x(cached, 5) \ ++ x(parity, 6) \ ++ x(stripe, 7) \ ++ x(need_gc_gens, 8) \ ++ x(need_discard, 9) ++ ++enum bch_data_type { ++#define x(t, n) BCH_DATA_##t, ++ BCH_DATA_TYPES() ++#undef x ++ BCH_DATA_NR ++}; ++ ++static inline bool data_type_is_empty(enum bch_data_type type) ++{ ++ switch (type) { ++ case BCH_DATA_free: ++ case BCH_DATA_need_gc_gens: ++ case BCH_DATA_need_discard: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool data_type_is_hidden(enum bch_data_type type) ++{ ++ switch (type) { ++ case BCH_DATA_sb: ++ case BCH_DATA_journal: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++struct bch_replicas_entry_v0 { ++ __u8 data_type; ++ __u8 nr_devs; ++ __u8 devs[]; ++} __attribute__((packed)); ++ ++struct bch_sb_field_replicas_v0 { ++ struct bch_sb_field field; ++ struct bch_replicas_entry_v0 entries[]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_replicas_entry { ++ __u8 data_type; ++ __u8 nr_devs; ++ __u8 nr_required; ++ __u8 devs[]; ++} __attribute__((packed)); ++ ++#define replicas_entry_bytes(_i) \ ++ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) ++ ++struct bch_sb_field_replicas { ++ struct bch_sb_field field; ++ struct bch_replicas_entry entries[0]; ++} __attribute__((packed, aligned(8))); ++ ++/* BCH_SB_FIELD_quota: */ ++ ++struct bch_sb_quota_counter { ++ __le32 timelimit; ++ __le32 warnlimit; ++}; ++ ++struct bch_sb_quota_type { ++ __le64 flags; ++ struct bch_sb_quota_counter c[Q_COUNTERS]; ++}; ++ ++struct bch_sb_field_quota { ++ struct bch_sb_field field; ++ struct bch_sb_quota_type q[QTYP_NR]; ++} __attribute__((packed, aligned(8))); ++ ++/* BCH_SB_FIELD_disk_groups: */ ++ ++#define BCH_SB_LABEL_SIZE 32 ++ ++struct bch_disk_group { ++ __u8 label[BCH_SB_LABEL_SIZE]; ++ __le64 flags[2]; ++} __attribute__((packed, aligned(8))); ++ ++LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) ++LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) ++LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) ++ ++struct bch_sb_field_disk_groups { ++ struct bch_sb_field field; ++ struct bch_disk_group entries[0]; ++} __attribute__((packed, aligned(8))); ++ ++/* BCH_SB_FIELD_counters */ ++ ++#define BCH_PERSISTENT_COUNTERS() \ ++ x(io_read, 0) \ ++ x(io_write, 1) \ ++ x(io_move, 2) \ ++ x(bucket_invalidate, 3) \ ++ x(bucket_discard, 4) ++ ++enum bch_persistent_counters { ++#define x(t, n, ...) 
BCH_COUNTER_##t, ++ BCH_PERSISTENT_COUNTERS() ++#undef x ++ BCH_COUNTER_NR ++}; ++ ++struct bch_sb_field_counters { ++ struct bch_sb_field field; ++ __le64 d[0]; ++}; ++ ++/* ++ * On clean shutdown, store btree roots and current journal sequence number in ++ * the superblock: ++ */ ++struct jset_entry { ++ __le16 u64s; ++ __u8 btree_id; ++ __u8 level; ++ __u8 type; /* designates what this jset holds */ ++ __u8 pad[3]; ++ ++ union { ++ struct bkey_i start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++struct bch_sb_field_clean { ++ struct bch_sb_field field; ++ ++ __le32 flags; ++ __le16 _read_clock; /* no longer used */ ++ __le16 _write_clock; ++ __le64 journal_seq; ++ ++ union { ++ struct jset_entry start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++struct journal_seq_blacklist_entry { ++ __le64 start; ++ __le64 end; ++}; ++ ++struct bch_sb_field_journal_seq_blacklist { ++ struct bch_sb_field field; ++ ++ union { ++ struct journal_seq_blacklist_entry start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++/* Superblock: */ ++ ++/* ++ * New versioning scheme: ++ * One common version number for all on disk data structures - superblock, btree ++ * nodes, journal entries ++ */ ++#define BCH_JSET_VERSION_OLD 2 ++#define BCH_BSET_VERSION_OLD 3 ++ ++#define BCH_METADATA_VERSIONS() \ ++ x(bkey_renumber, 10) \ ++ x(inode_btree_change, 11) \ ++ x(snapshot, 12) \ ++ x(inode_backpointers, 13) \ ++ x(btree_ptr_sectors_written, 14) \ ++ x(snapshot_2, 15) \ ++ x(reflink_p_fix, 16) \ ++ x(subvol_dirent, 17) \ ++ x(inode_v2, 18) \ ++ x(freespace, 19) \ ++ x(alloc_v4, 20) \ ++ x(new_data_types, 21) \ ++ x(backpointers, 22) ++ ++enum bcachefs_metadata_version { ++ bcachefs_metadata_version_min = 9, ++#define x(t, n) bcachefs_metadata_version_##t = n, ++ BCH_METADATA_VERSIONS() ++#undef x ++ bcachefs_metadata_version_max ++}; ++ ++#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) ++ ++#define BCH_SB_SECTOR 8 ++#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ ++ ++struct bch_sb_layout { ++ uuid_le magic; /* bcachefs superblock UUID */ ++ __u8 layout_type; ++ __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ ++ __u8 nr_superblocks; ++ __u8 pad[5]; ++ __le64 sb_offset[61]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_SB_LAYOUT_SECTOR 7 ++ ++/* ++ * @offset - sector where this sb was written ++ * @version - on disk format version ++ * @version_min - Oldest metadata version this filesystem contains; so we can ++ * safely drop compatibility code and refuse to mount filesystems ++ * we'd need it for ++ * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC) ++ * @seq - incremented each time superblock is written ++ * @uuid - used for generating various magic numbers and identifying ++ * member devices, never changes ++ * @user_uuid - user visible UUID, may be changed ++ * @label - filesystem label ++ * @seq - identifies most recent superblock, incremented each time ++ * superblock is written ++ * @features - enabled incompatible features ++ */ ++struct bch_sb { ++ struct bch_csum csum; ++ __le16 version; ++ __le16 version_min; ++ __le16 pad[2]; ++ uuid_le magic; ++ uuid_le uuid; ++ uuid_le user_uuid; ++ __u8 label[BCH_SB_LABEL_SIZE]; ++ __le64 offset; ++ __le64 seq; ++ ++ __le16 block_size; ++ __u8 dev_idx; ++ __u8 nr_devices; ++ __le32 u64s; ++ ++ __le64 time_base_lo; ++ __le32 time_base_hi; ++ __le32 time_precision; ++ ++ __le64 flags[8]; ++ __le64 features[2]; ++ __le64 compat[2]; ++ ++ struct bch_sb_layout layout; ++ ++ union { ++ struct bch_sb_field start[0]; ++ __le64 _data[0]; ++ }; 
++} __attribute__((packed, aligned(8))); ++ ++/* ++ * Flags: ++ * BCH_SB_INITALIZED - set on first mount ++ * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect ++ * behaviour of mount/recovery path: ++ * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits ++ * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 ++ * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides ++ * DATA/META_CSUM_TYPE. Also indicates encryption ++ * algorithm in use, if/when we get more than one ++ */ ++ ++LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16); ++ ++LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); ++LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); ++LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); ++LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); ++ ++LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); ++ ++LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); ++LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); ++ ++LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); ++LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); ++ ++LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); ++LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); ++ ++LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); ++LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); ++LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); ++LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); ++ ++LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); ++LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62); ++ ++LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); ++ ++LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); ++LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); ++LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); ++ ++LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); ++LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); ++ ++/* ++ * Max size of an extent that may require bouncing to read or write ++ * (checksummed, compressed): 64k ++ */ ++LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, ++ struct bch_sb, flags[1], 14, 20); ++ ++LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); ++LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); ++ ++LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); ++LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); ++LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); ++ ++LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, ++ struct bch_sb, flags[2], 0, 4); ++LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); ++ ++LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); ++LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); ++LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); ++LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); ++LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); ++LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); ++LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); ++/* Obsolete, always enabled: */ 
++LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); ++ ++/* ++ * Features: ++ * ++ * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist ++ * reflink: gates KEY_TYPE_reflink ++ * inline_data: gates KEY_TYPE_inline_data ++ * new_siphash: gates BCH_STR_HASH_siphash ++ * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE ++ */ ++#define BCH_SB_FEATURES() \ ++ x(lz4, 0) \ ++ x(gzip, 1) \ ++ x(zstd, 2) \ ++ x(atomic_nlink, 3) \ ++ x(ec, 4) \ ++ x(journal_seq_blacklist_v3, 5) \ ++ x(reflink, 6) \ ++ x(new_siphash, 7) \ ++ x(inline_data, 8) \ ++ x(new_extent_overwrite, 9) \ ++ x(incompressible, 10) \ ++ x(btree_ptr_v2, 11) \ ++ x(extents_above_btree_updates, 12) \ ++ x(btree_updates_journalled, 13) \ ++ x(reflink_inline_data, 14) \ ++ x(new_varint, 15) \ ++ x(journal_no_flush, 16) \ ++ x(alloc_v2, 17) \ ++ x(extents_across_btree_nodes, 18) ++ ++#define BCH_SB_FEATURES_ALWAYS \ ++ ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ ++ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ ++ (1ULL << BCH_FEATURE_btree_updates_journalled)|\ ++ (1ULL << BCH_FEATURE_alloc_v2)|\ ++ (1ULL << BCH_FEATURE_extents_across_btree_nodes)) ++ ++#define BCH_SB_FEATURES_ALL \ ++ (BCH_SB_FEATURES_ALWAYS| \ ++ (1ULL << BCH_FEATURE_new_siphash)| \ ++ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ ++ (1ULL << BCH_FEATURE_new_varint)| \ ++ (1ULL << BCH_FEATURE_journal_no_flush)) ++ ++enum bch_sb_feature { ++#define x(f, n) BCH_FEATURE_##f, ++ BCH_SB_FEATURES() ++#undef x ++ BCH_FEATURE_NR, ++}; ++ ++#define BCH_SB_COMPAT() \ ++ x(alloc_info, 0) \ ++ x(alloc_metadata, 1) \ ++ x(extents_above_btree_updates_done, 2) \ ++ x(bformat_overflow_done, 3) ++ ++enum bch_sb_compat { ++#define x(f, n) BCH_COMPAT_##f, ++ BCH_SB_COMPAT() ++#undef x ++ BCH_COMPAT_NR, ++}; ++ ++/* options: */ ++ ++#define BCH_REPLICAS_MAX 4U ++ ++#define BCH_BKEY_PTRS_MAX 16U ++ ++#define BCH_ERROR_ACTIONS() \ ++ x(continue, 0) \ ++ x(ro, 1) \ ++ x(panic, 2) ++ ++enum bch_error_actions { ++#define x(t, n) BCH_ON_ERROR_##t = n, ++ BCH_ERROR_ACTIONS() ++#undef x ++ BCH_ON_ERROR_NR ++}; ++ ++#define BCH_STR_HASH_TYPES() \ ++ x(crc32c, 0) \ ++ x(crc64, 1) \ ++ x(siphash_old, 2) \ ++ x(siphash, 3) ++ ++enum bch_str_hash_type { ++#define x(t, n) BCH_STR_HASH_##t = n, ++ BCH_STR_HASH_TYPES() ++#undef x ++ BCH_STR_HASH_NR ++}; ++ ++#define BCH_STR_HASH_OPTS() \ ++ x(crc32c, 0) \ ++ x(crc64, 1) \ ++ x(siphash, 2) ++ ++enum bch_str_hash_opts { ++#define x(t, n) BCH_STR_HASH_OPT_##t = n, ++ BCH_STR_HASH_OPTS() ++#undef x ++ BCH_STR_HASH_OPT_NR ++}; ++ ++#define BCH_CSUM_TYPES() \ ++ x(none, 0) \ ++ x(crc32c_nonzero, 1) \ ++ x(crc64_nonzero, 2) \ ++ x(chacha20_poly1305_80, 3) \ ++ x(chacha20_poly1305_128, 4) \ ++ x(crc32c, 5) \ ++ x(crc64, 6) \ ++ x(xxhash, 7) ++ ++enum bch_csum_type { ++#define x(t, n) BCH_CSUM_##t = n, ++ BCH_CSUM_TYPES() ++#undef x ++ BCH_CSUM_NR ++}; ++ ++static const unsigned bch_crc_bytes[] = { ++ [BCH_CSUM_none] = 0, ++ [BCH_CSUM_crc32c_nonzero] = 4, ++ [BCH_CSUM_crc32c] = 4, ++ [BCH_CSUM_crc64_nonzero] = 8, ++ [BCH_CSUM_crc64] = 8, ++ [BCH_CSUM_xxhash] = 8, ++ [BCH_CSUM_chacha20_poly1305_80] = 10, ++ [BCH_CSUM_chacha20_poly1305_128] = 16, ++}; ++ ++static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) ++{ ++ switch (type) { ++ case BCH_CSUM_chacha20_poly1305_80: ++ case BCH_CSUM_chacha20_poly1305_128: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++#define BCH_CSUM_OPTS() \ ++ x(none, 0) \ ++ x(crc32c, 1) \ ++ x(crc64, 2) \ ++ x(xxhash, 3) ++ ++enum bch_csum_opts { 
++#define x(t, n) BCH_CSUM_OPT_##t = n, ++ BCH_CSUM_OPTS() ++#undef x ++ BCH_CSUM_OPT_NR ++}; ++ ++#define BCH_COMPRESSION_TYPES() \ ++ x(none, 0) \ ++ x(lz4_old, 1) \ ++ x(gzip, 2) \ ++ x(lz4, 3) \ ++ x(zstd, 4) \ ++ x(incompressible, 5) ++ ++enum bch_compression_type { ++#define x(t, n) BCH_COMPRESSION_TYPE_##t = n, ++ BCH_COMPRESSION_TYPES() ++#undef x ++ BCH_COMPRESSION_TYPE_NR ++}; ++ ++#define BCH_COMPRESSION_OPTS() \ ++ x(none, 0) \ ++ x(lz4, 1) \ ++ x(gzip, 2) \ ++ x(zstd, 3) ++ ++enum bch_compression_opts { ++#define x(t, n) BCH_COMPRESSION_OPT_##t = n, ++ BCH_COMPRESSION_OPTS() ++#undef x ++ BCH_COMPRESSION_OPT_NR ++}; ++ ++/* ++ * Magic numbers ++ * ++ * The various other data structures have their own magic numbers, which are ++ * xored with the first part of the cache set's UUID ++ */ ++ ++#define BCACHE_MAGIC \ ++ UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \ ++ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) ++ ++#define BCACHEFS_STATFS_MAGIC 0xca451a4e ++ ++#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) ++#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) ++ ++static inline __le64 __bch2_sb_magic(struct bch_sb *sb) ++{ ++ __le64 ret; ++ memcpy(&ret, &sb->uuid, sizeof(ret)); ++ return ret; ++} ++ ++static inline __u64 __jset_magic(struct bch_sb *sb) ++{ ++ return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); ++} ++ ++static inline __u64 __bset_magic(struct bch_sb *sb) ++{ ++ return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); ++} ++ ++/* Journal */ ++ ++#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) ++ ++#define BCH_JSET_ENTRY_TYPES() \ ++ x(btree_keys, 0) \ ++ x(btree_root, 1) \ ++ x(prio_ptrs, 2) \ ++ x(blacklist, 3) \ ++ x(blacklist_v2, 4) \ ++ x(usage, 5) \ ++ x(data_usage, 6) \ ++ x(clock, 7) \ ++ x(dev_usage, 8) \ ++ x(log, 9) \ ++ x(overwrite, 10) ++ ++enum { ++#define x(f, nr) BCH_JSET_ENTRY_##f = nr, ++ BCH_JSET_ENTRY_TYPES() ++#undef x ++ BCH_JSET_ENTRY_NR ++}; ++ ++/* ++ * Journal sequence numbers can be blacklisted: bsets record the max sequence ++ * number of all the journal entries they contain updates for, so that on ++ * recovery we can ignore those bsets that contain index updates newer that what ++ * made it into the journal. ++ * ++ * This means that we can't reuse that journal_seq - we have to skip it, and ++ * then record that we skipped it so that the next time we crash and recover we ++ * don't think there was a missing journal entry. 
++ */ ++struct jset_entry_blacklist { ++ struct jset_entry entry; ++ __le64 seq; ++}; ++ ++struct jset_entry_blacklist_v2 { ++ struct jset_entry entry; ++ __le64 start; ++ __le64 end; ++}; ++ ++#define BCH_FS_USAGE_TYPES() \ ++ x(reserved, 0) \ ++ x(inodes, 1) \ ++ x(key_version, 2) ++ ++enum { ++#define x(f, nr) BCH_FS_USAGE_##f = nr, ++ BCH_FS_USAGE_TYPES() ++#undef x ++ BCH_FS_USAGE_NR ++}; ++ ++struct jset_entry_usage { ++ struct jset_entry entry; ++ __le64 v; ++} __attribute__((packed)); ++ ++struct jset_entry_data_usage { ++ struct jset_entry entry; ++ __le64 v; ++ struct bch_replicas_entry r; ++} __attribute__((packed)); ++ ++struct jset_entry_clock { ++ struct jset_entry entry; ++ __u8 rw; ++ __u8 pad[7]; ++ __le64 time; ++} __attribute__((packed)); ++ ++struct jset_entry_dev_usage_type { ++ __le64 buckets; ++ __le64 sectors; ++ __le64 fragmented; ++} __attribute__((packed)); ++ ++struct jset_entry_dev_usage { ++ struct jset_entry entry; ++ __le32 dev; ++ __u32 pad; ++ ++ __le64 buckets_ec; ++ __le64 _buckets_unavailable; /* No longer used */ ++ ++ struct jset_entry_dev_usage_type d[]; ++} __attribute__((packed)); ++ ++static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) ++{ ++ return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) / ++ sizeof(struct jset_entry_dev_usage_type); ++} ++ ++struct jset_entry_log { ++ struct jset_entry entry; ++ u8 d[]; ++} __attribute__((packed)); ++ ++/* ++ * On disk format for a journal entry: ++ * seq is monotonically increasing; every journal entry has its own unique ++ * sequence number. ++ * ++ * last_seq is the oldest journal entry that still has keys the btree hasn't ++ * flushed to disk yet. ++ * ++ * version is for on disk format changes. ++ */ ++struct jset { ++ struct bch_csum csum; ++ ++ __le64 magic; ++ __le64 seq; ++ __le32 version; ++ __le32 flags; ++ ++ __le32 u64s; /* size of d[] in u64s */ ++ ++ __u8 encrypted_start[0]; ++ ++ __le16 _read_clock; /* no longer used */ ++ __le16 _write_clock; ++ ++ /* Sequence number of oldest dirty journal entry */ ++ __le64 last_seq; ++ ++ ++ union { ++ struct jset_entry start[0]; ++ __u64 _data[0]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); ++LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); ++LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); ++ ++#define BCH_JOURNAL_BUCKETS_MIN 8 ++ ++/* Btree: */ ++ ++#define BCH_BTREE_IDS() \ ++ x(extents, 0) \ ++ x(inodes, 1) \ ++ x(dirents, 2) \ ++ x(xattrs, 3) \ ++ x(alloc, 4) \ ++ x(quotas, 5) \ ++ x(stripes, 6) \ ++ x(reflink, 7) \ ++ x(subvolumes, 8) \ ++ x(snapshots, 9) \ ++ x(lru, 10) \ ++ x(freespace, 11) \ ++ x(need_discard, 12) \ ++ x(backpointers, 13) ++ ++enum btree_id { ++#define x(kwd, val) BTREE_ID_##kwd = val, ++ BCH_BTREE_IDS() ++#undef x ++ BTREE_ID_NR ++}; ++ ++#define BTREE_MAX_DEPTH 4U ++ ++/* Btree nodes */ ++ ++/* ++ * Btree nodes ++ * ++ * On disk a btree node is a list/log of these; within each set the keys are ++ * sorted ++ */ ++struct bset { ++ __le64 seq; ++ ++ /* ++ * Highest journal entry this bset contains keys for. ++ * If on recovery we don't see that journal entry, this bset is ignored: ++ * this allows us to preserve the order of all index updates after a ++ * crash, since the journal records a total order of all index updates ++ * and anything that didn't make it to the journal doesn't get used. 
++ */ ++ __le64 journal_seq; ++ ++ __le32 flags; ++ __le16 version; ++ __le16 u64s; /* count of d[] in u64s */ ++ ++ union { ++ struct bkey_packed start[0]; ++ __u64 _data[0]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); ++ ++LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); ++LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, ++ struct bset, flags, 5, 6); ++ ++/* Sector offset within the btree node: */ ++LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32); ++ ++struct btree_node { ++ struct bch_csum csum; ++ __le64 magic; ++ ++ /* this flags field is encrypted, unlike bset->flags: */ ++ __le64 flags; ++ ++ /* Closed interval: */ ++ struct bpos min_key; ++ struct bpos max_key; ++ struct bch_extent_ptr _ptr; /* not used anymore */ ++ struct bkey_format format; ++ ++ union { ++ struct bset keys; ++ struct { ++ __u8 pad[22]; ++ __le16 u64s; ++ __u64 _data[0]; ++ ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); ++LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); ++LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, ++ struct btree_node, flags, 8, 9); ++/* 9-32 unused */ ++LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); ++ ++struct btree_node_entry { ++ struct bch_csum csum; ++ ++ union { ++ struct bset keys; ++ struct { ++ __u8 pad[22]; ++ __le16 u64s; ++ __u64 _data[0]; ++ ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++#endif /* _BCACHEFS_FORMAT_H */ +diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h +new file mode 100644 +index 000000000000..b2edabf58260 +--- /dev/null ++++ b/fs/bcachefs/bcachefs_ioctl.h +@@ -0,0 +1,368 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_IOCTL_H ++#define _BCACHEFS_IOCTL_H ++ ++#include ++#include ++#include "bcachefs_format.h" ++ ++/* ++ * Flags common to multiple ioctls: ++ */ ++#define BCH_FORCE_IF_DATA_LOST (1 << 0) ++#define BCH_FORCE_IF_METADATA_LOST (1 << 1) ++#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) ++#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) ++ ++#define BCH_FORCE_IF_LOST \ ++ (BCH_FORCE_IF_DATA_LOST| \ ++ BCH_FORCE_IF_METADATA_LOST) ++#define BCH_FORCE_IF_DEGRADED \ ++ (BCH_FORCE_IF_DATA_DEGRADED| \ ++ BCH_FORCE_IF_METADATA_DEGRADED) ++ ++/* ++ * If cleared, ioctl that refer to a device pass it as a pointer to a pathname ++ * (e.g. 
/dev/sda1); if set, the dev field is the device's index within the ++ * filesystem: ++ */ ++#define BCH_BY_INDEX (1 << 4) ++ ++/* ++ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem ++ * wide superblock: ++ */ ++#define BCH_READ_DEV (1 << 5) ++ ++/* global control dev: */ ++ ++/* These are currently broken, and probably unnecessary: */ ++#if 0 ++#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) ++#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) ++ ++struct bch_ioctl_assemble { ++ __u32 flags; ++ __u32 nr_devs; ++ __u64 pad; ++ __u64 devs[]; ++}; ++ ++struct bch_ioctl_incremental { ++ __u32 flags; ++ __u64 pad; ++ __u64 dev; ++}; ++#endif ++ ++/* filesystem ioctls: */ ++ ++#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) ++ ++/* These only make sense when we also have incremental assembly */ ++#if 0 ++#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) ++#define BCH_IOCTL_STOP _IO(0xbc, 3) ++#endif ++ ++#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) ++#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) ++#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage) ++#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage) ++#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) ++#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) ++#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) ++#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal) ++ ++#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume) ++#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume) ++ ++/* ioctl below act on a particular file, not the filesystem as a whole: */ ++ ++#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) ++ ++/* ++ * BCH_IOCTL_QUERY_UUID: get filesystem UUID ++ * ++ * Returns user visible UUID, not internal UUID (which may not ever be changed); ++ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with ++ * this UUID. ++ */ ++struct bch_ioctl_query_uuid { ++ uuid_le uuid; ++}; ++ ++#if 0 ++struct bch_ioctl_start { ++ __u32 flags; ++ __u32 pad; ++}; ++#endif ++ ++/* ++ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem ++ * ++ * The specified device must not be open or in use. On success, the new device ++ * will be an online member of the filesystem just like any other member. ++ * ++ * The device must first be prepared by userspace by formatting with a bcachefs ++ * superblock, which is only used for passing in superblock options/parameters ++ * for that device (in struct bch_member). The new device's superblock should ++ * not claim to be a member of any existing filesystem - UUIDs on it will be ++ * ignored. ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem ++ * ++ * Any data present on @dev will be permanently deleted, and @dev will be ++ * removed from its slot in the filesystem's list of member devices. The device ++ * may be either offline or offline. 
++ * ++ * Will fail removing @dev would leave us with insufficient read write devices ++ * or degraded/unavailable data, unless the approprate BCH_FORCE_IF_* flags are ++ * set. ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem ++ * but is not open (e.g. because we started in degraded mode), bring it online ++ * ++ * all existing data on @dev will be available once the device is online, ++ * exactly as if @dev was present when the filesystem was first mounted ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that ++ * block device, without removing it from the filesystem (so it can be brought ++ * back online later) ++ * ++ * Data present on @dev will be unavailable while @dev is offline (unless ++ * replicated), but will still be intact and untouched if @dev is brought back ++ * online ++ * ++ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would ++ * leave us with insufficient read write devices or degraded/unavailable data, ++ * unless the approprate BCH_FORCE_IF_* flags are set. ++ */ ++ ++struct bch_ioctl_disk { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem ++ * ++ * @new_state - one of the bch_member_state states (rw, ro, failed, ++ * spare) ++ * ++ * Will refuse to change member state if we would then have insufficient devices ++ * to write to, or if it would result in degraded data (when @new_state is ++ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set. ++ */ ++struct bch_ioctl_disk_set_state { ++ __u32 flags; ++ __u8 new_state; ++ __u8 pad[3]; ++ __u64 dev; ++}; ++ ++enum bch_data_ops { ++ BCH_DATA_OP_SCRUB = 0, ++ BCH_DATA_OP_REREPLICATE = 1, ++ BCH_DATA_OP_MIGRATE = 2, ++ BCH_DATA_OP_REWRITE_OLD_NODES = 3, ++ BCH_DATA_OP_NR = 4, ++}; ++ ++/* ++ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g. ++ * scrub, rereplicate, migrate). ++ * ++ * This ioctl kicks off a job in the background, and returns a file descriptor. ++ * Reading from the file descriptor returns a struct bch_ioctl_data_event, ++ * indicating current progress, and closing the file descriptor will stop the ++ * job. The file descriptor is O_CLOEXEC. 
++ */ ++struct bch_ioctl_data { ++ __u16 op; ++ __u8 start_btree; ++ __u8 end_btree; ++ __u32 flags; ++ ++ struct bpos start_pos; ++ struct bpos end_pos; ++ ++ union { ++ struct { ++ __u32 dev; ++ __u32 pad; ++ } migrate; ++ struct { ++ __u64 pad[8]; ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++enum bch_data_event { ++ BCH_DATA_EVENT_PROGRESS = 0, ++ /* XXX: add an event for reporting errors */ ++ BCH_DATA_EVENT_NR = 1, ++}; ++ ++struct bch_ioctl_data_progress { ++ __u8 data_type; ++ __u8 btree_id; ++ __u8 pad[2]; ++ struct bpos pos; ++ ++ __u64 sectors_done; ++ __u64 sectors_total; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_ioctl_data_event { ++ __u8 type; ++ __u8 pad[7]; ++ union { ++ struct bch_ioctl_data_progress p; ++ __u64 pad2[15]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_replicas_usage { ++ __u64 sectors; ++ struct bch_replicas_entry r; ++} __attribute__((packed)); ++ ++static inline struct bch_replicas_usage * ++replicas_usage_next(struct bch_replicas_usage *u) ++{ ++ return (void *) u + replicas_entry_bytes(&u->r) + 8; ++} ++ ++/* ++ * BCH_IOCTL_FS_USAGE: query filesystem disk space usage ++ * ++ * Returns disk space usage broken out by data type, number of replicas, and ++ * by component device ++ * ++ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries ++ * ++ * On success, @replica_entries_bytes will be changed to indicate the number of ++ * bytes actually used. ++ * ++ * Returns -ERANGE if @replica_entries_bytes was too small ++ */ ++struct bch_ioctl_fs_usage { ++ __u64 capacity; ++ __u64 used; ++ __u64 online_reserved; ++ __u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ ++ __u32 replica_entries_bytes; ++ __u32 pad; ++ ++ struct bch_replicas_usage replicas[0]; ++}; ++ ++/* ++ * BCH_IOCTL_DEV_USAGE: query device disk space usage ++ * ++ * Returns disk space usage broken out by data type - both by buckets and ++ * sectors. 
++ */ ++struct bch_ioctl_dev_usage { ++ __u64 dev; ++ __u32 flags; ++ __u8 state; ++ __u8 pad[7]; ++ ++ __u32 bucket_size; ++ __u64 nr_buckets; ++ ++ __u64 buckets_ec; ++ ++ struct bch_ioctl_dev_usage_type { ++ __u64 buckets; ++ __u64 sectors; ++ __u64 fragmented; ++ } d[BCH_DATA_NR]; ++}; ++ ++/* ++ * BCH_IOCTL_READ_SUPER: read filesystem superblock ++ * ++ * Equivalent to reading the superblock directly from the block device, except ++ * avoids racing with the kernel writing the superblock or having to figure out ++ * which block device to read ++ * ++ * @sb - buffer to read into ++ * @size - size of userspace allocated buffer ++ * @dev - device to read superblock for, if BCH_READ_DEV flag is ++ * specified ++ * ++ * Returns -ERANGE if buffer provided is too small ++ */ ++struct bch_ioctl_read_super { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++ __u64 size; ++ __u64 sb; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to ++ * determine if disk is a (online) member - if so, returns device's index ++ * ++ * Returns -ENOENT if not found ++ */ ++struct bch_ioctl_disk_get_idx { ++ __u64 dev; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device ++ * ++ * @dev - member to resize ++ * @nbuckets - new number of buckets ++ */ ++struct bch_ioctl_disk_resize { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++ __u64 nbuckets; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device ++ * ++ * @dev - member to resize ++ * @nbuckets - new number of buckets ++ */ ++struct bch_ioctl_disk_resize_journal { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++ __u64 nbuckets; ++}; ++ ++struct bch_ioctl_subvolume { ++ __u32 flags; ++ __u32 dirfd; ++ __u16 mode; ++ __u16 pad[3]; ++ __u64 dst_ptr; ++ __u64 src_ptr; ++}; ++ ++#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0) ++#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1) ++ ++#endif /* _BCACHEFS_IOCTL_H */ +diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c +new file mode 100644 +index 000000000000..cc0689635164 +--- /dev/null ++++ b/fs/bcachefs/bkey.c +@@ -0,0 +1,1175 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey.h" ++#include "bkey_methods.h" ++#include "bset.h" ++#include "util.h" ++ ++#undef EBUG_ON ++ ++#ifdef DEBUG_BKEYS ++#define EBUG_ON(cond) BUG_ON(cond) ++#else ++#define EBUG_ON(cond) ++#endif ++ ++const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, ++ const struct bkey_packed *); ++ ++void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits) ++{ ++ unsigned bit = high_bit_offset, done = 0; ++ ++ while (1) { ++ while (bit < 64) { ++ if (done && !(done % 8)) ++ *out++ = ' '; ++ *out++ = *p & (1ULL << (63 - bit)) ? 
'1' : '0'; ++ bit++; ++ done++; ++ if (done == nr_bits) { ++ *out++ = '\0'; ++ return; ++ } ++ } ++ ++ p = next_word(p); ++ bit = 0; ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++static void bch2_bkey_pack_verify(const struct bkey_packed *packed, ++ const struct bkey *unpacked, ++ const struct bkey_format *format) ++{ ++ struct bkey tmp; ++ ++ BUG_ON(bkeyp_val_u64s(format, packed) != ++ bkey_val_u64s(unpacked)); ++ ++ BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); ++ ++ tmp = __bch2_bkey_unpack_key(format, packed); ++ ++ if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ char buf3[160], buf4[160]; ++ ++ bch2_bkey_to_text(&buf1, unpacked); ++ bch2_bkey_to_text(&buf2, &tmp); ++ bch2_to_binary(buf3, (void *) unpacked, 80); ++ bch2_to_binary(buf4, high_word(format, packed), 80); ++ ++ panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n", ++ format->key_u64s, ++ format->bits_per_field[0], ++ format->bits_per_field[1], ++ format->bits_per_field[2], ++ format->bits_per_field[3], ++ format->bits_per_field[4], ++ buf1.buf, buf2.buf, buf3, buf4); ++ } ++} ++ ++#else ++static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, ++ const struct bkey *unpacked, ++ const struct bkey_format *format) {} ++#endif ++ ++struct pack_state { ++ const struct bkey_format *format; ++ unsigned bits; /* bits remaining in current word */ ++ u64 w; /* current word */ ++ u64 *p; /* pointer to next word */ ++}; ++ ++__always_inline ++static struct pack_state pack_state_init(const struct bkey_format *format, ++ struct bkey_packed *k) ++{ ++ u64 *p = high_word(format, k); ++ ++ return (struct pack_state) { ++ .format = format, ++ .bits = 64 - high_bit_offset, ++ .w = 0, ++ .p = p, ++ }; ++} ++ ++__always_inline ++static void pack_state_finish(struct pack_state *state, ++ struct bkey_packed *k) ++{ ++ EBUG_ON(state->p < k->_data); ++ EBUG_ON(state->p >= k->_data + state->format->key_u64s); ++ ++ *state->p = state->w; ++} ++ ++struct unpack_state { ++ const struct bkey_format *format; ++ unsigned bits; /* bits remaining in current word */ ++ u64 w; /* current word */ ++ const u64 *p; /* pointer to next word */ ++}; ++ ++__always_inline ++static struct unpack_state unpack_state_init(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ const u64 *p = high_word(format, k); ++ ++ return (struct unpack_state) { ++ .format = format, ++ .bits = 64 - high_bit_offset, ++ .w = *p << high_bit_offset, ++ .p = p, ++ }; ++} ++ ++__always_inline ++static u64 get_inc_field(struct unpack_state *state, unsigned field) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); ++ ++ if (bits >= state->bits) { ++ v = state->w >> (64 - bits); ++ bits -= state->bits; ++ ++ state->p = next_word(state->p); ++ state->w = *state->p; ++ state->bits = 64; ++ } ++ ++ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ ++ v |= (state->w >> 1) >> (63 - bits); ++ state->w <<= bits; ++ state->bits -= bits; ++ ++ return v + offset; ++} ++ ++__always_inline ++static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(state->format->field_offset[field]); ++ ++ if (v < offset) ++ return false; ++ ++ v -= offset; ++ ++ if (fls64(v) > bits) ++ return false; ++ ++ if (bits > state->bits) { ++ bits -= state->bits; ++ /* avoid shift by 64 
if bits is 0 - bits is never 64 here: */ ++ state->w |= (v >> 1) >> (bits - 1); ++ ++ *state->p = state->w; ++ state->p = next_word(state->p); ++ state->w = 0; ++ state->bits = 64; ++ } ++ ++ state->bits -= bits; ++ state->w |= v << state->bits; ++ ++ return true; ++} ++ ++/* ++ * Note: does NOT set out->format (we don't know what it should be here!) ++ * ++ * Also: doesn't work on extents - it doesn't preserve the invariant that ++ * if k is packed bkey_start_pos(k) will successfully pack ++ */ ++static bool bch2_bkey_transform_key(const struct bkey_format *out_f, ++ struct bkey_packed *out, ++ const struct bkey_format *in_f, ++ const struct bkey_packed *in) ++{ ++ struct pack_state out_s = pack_state_init(out_f, out); ++ struct unpack_state in_s = unpack_state_init(in_f, in); ++ u64 *w = out->_data; ++ unsigned i; ++ ++ *w = 0; ++ ++ for (i = 0; i < BKEY_NR_FIELDS; i++) ++ if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) ++ return false; ++ ++ /* Can't happen because the val would be too big to unpack: */ ++ EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); ++ ++ pack_state_finish(&out_s, out); ++ out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; ++ out->needs_whiteout = in->needs_whiteout; ++ out->type = in->type; ++ ++ return true; ++} ++ ++bool bch2_bkey_transform(const struct bkey_format *out_f, ++ struct bkey_packed *out, ++ const struct bkey_format *in_f, ++ const struct bkey_packed *in) ++{ ++ if (!bch2_bkey_transform_key(out_f, out, in_f, in)) ++ return false; ++ ++ memcpy_u64s((u64 *) out + out_f->key_u64s, ++ (u64 *) in + in_f->key_u64s, ++ (in->u64s - in_f->key_u64s)); ++ return true; ++} ++ ++#define bkey_fields() \ ++ x(BKEY_FIELD_INODE, p.inode) \ ++ x(BKEY_FIELD_OFFSET, p.offset) \ ++ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ ++ x(BKEY_FIELD_SIZE, size) \ ++ x(BKEY_FIELD_VERSION_HI, version.hi) \ ++ x(BKEY_FIELD_VERSION_LO, version.lo) ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, ++ const struct bkey_packed *in) ++{ ++ struct unpack_state state = unpack_state_init(format, in); ++ struct bkey out; ++ ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ EBUG_ON(in->u64s < format->key_u64s); ++ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); ++ EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); ++ ++ out.u64s = BKEY_U64s + in->u64s - format->key_u64s; ++ out.format = KEY_FORMAT_CURRENT; ++ out.needs_whiteout = in->needs_whiteout; ++ out.type = in->type; ++ out.pad[0] = 0; ++ ++#define x(id, field) out.field = get_inc_field(&state, id); ++ bkey_fields() ++#undef x ++ ++ return out; ++} ++ ++#ifndef HAVE_BCACHEFS_COMPILED_UNPACK ++struct bpos __bkey_unpack_pos(const struct bkey_format *format, ++ const struct bkey_packed *in) ++{ ++ struct unpack_state state = unpack_state_init(format, in); ++ struct bpos out; ++ ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ EBUG_ON(in->u64s < format->key_u64s); ++ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); ++ ++ out.inode = get_inc_field(&state, BKEY_FIELD_INODE); ++ out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); ++ out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); ++ ++ return out; ++} ++#endif ++ ++/** ++ * bch2_bkey_pack_key -- pack just the key, not the value ++ */ ++bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, ++ const struct bkey_format *format) ++{ ++ struct pack_state state = pack_state_init(format, out); ++ u64 *w = out->_data; ++ ++ EBUG_ON((void *) in == (void *) out); ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ 
EBUG_ON(in->format != KEY_FORMAT_CURRENT); ++ ++ *w = 0; ++ ++#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; ++ bkey_fields() ++#undef x ++ ++ /* ++ * Extents - we have to guarantee that if an extent is packed, a trimmed ++ * version will also pack: ++ */ ++ if (bkey_start_offset(in) < ++ le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET])) ++ return false; ++ ++ pack_state_finish(&state, out); ++ out->u64s = format->key_u64s + in->u64s - BKEY_U64s; ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ out->needs_whiteout = in->needs_whiteout; ++ out->type = in->type; ++ ++ bch2_bkey_pack_verify(out, in, format); ++ return true; ++} ++ ++/** ++ * bch2_bkey_unpack -- unpack the key and the value ++ */ ++void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, ++ const struct bkey_packed *src) ++{ ++ __bkey_unpack_key(b, &dst->k, src); ++ ++ memcpy_u64s(&dst->v, ++ bkeyp_val(&b->format, src), ++ bkeyp_val_u64s(&b->format, src)); ++} ++ ++/** ++ * bch2_bkey_pack -- pack the key and the value ++ */ ++bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, ++ const struct bkey_format *format) ++{ ++ struct bkey_packed tmp; ++ ++ if (!bch2_bkey_pack_key(&tmp, &in->k, format)) ++ return false; ++ ++ memmove_u64s((u64 *) out + format->key_u64s, ++ &in->v, ++ bkey_val_u64s(&in->k)); ++ memcpy_u64s(out, &tmp, format->key_u64s); ++ ++ return true; ++} ++ ++__always_inline ++static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(state->format->field_offset[field]); ++ bool ret = true; ++ ++ EBUG_ON(v < offset); ++ v -= offset; ++ ++ if (fls64(v) > bits) { ++ v = ~(~0ULL << bits); ++ ret = false; ++ } ++ ++ if (bits > state->bits) { ++ bits -= state->bits; ++ state->w |= (v >> 1) >> (bits - 1); ++ ++ *state->p = state->w; ++ state->p = next_word(state->p); ++ state->w = 0; ++ state->bits = 64; ++ } ++ ++ state->bits -= bits; ++ state->w |= v << state->bits; ++ ++ return ret; ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++static bool bkey_packed_successor(struct bkey_packed *out, ++ const struct btree *b, ++ struct bkey_packed k) ++{ ++ const struct bkey_format *f = &b->format; ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned first_bit, offset; ++ u64 *p; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); ++ ++ if (!nr_key_bits) ++ return false; ++ ++ *out = k; ++ ++ first_bit = high_bit_offset + nr_key_bits - 1; ++ p = nth_word(high_word(f, out), first_bit >> 6); ++ offset = 63 - (first_bit & 63); ++ ++ while (nr_key_bits) { ++ unsigned bits = min(64 - offset, nr_key_bits); ++ u64 mask = (~0ULL >> (64 - bits)) << offset; ++ ++ if ((*p & mask) != mask) { ++ *p += 1ULL << offset; ++ EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0); ++ return true; ++ } ++ ++ *p &= ~mask; ++ p = prev_word(p); ++ nr_key_bits -= bits; ++ offset = 0; ++ } ++ ++ return false; ++} ++#endif ++ ++/* ++ * Returns a packed key that compares <= in ++ * ++ * This is used in bset_search_tree(), where we need a packed pos in order to be ++ * able to compare against the keys in the auxiliary search tree - and it's ++ * legal to use a packed pos that isn't equivalent to the original pos, ++ * _provided_ it compares <= to the original pos. 
++ */ ++enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, ++ struct bpos in, ++ const struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ struct pack_state state = pack_state_init(f, out); ++ u64 *w = out->_data; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bpos orig = in; ++#endif ++ bool exact = true; ++ unsigned i; ++ ++ /* ++ * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3 ++ * byte header, but pack_pos() won't if the len/version fields are big ++ * enough - we need to make sure to zero them out: ++ */ ++ for (i = 0; i < f->key_u64s; i++) ++ w[i] = 0; ++ ++ if (unlikely(in.snapshot < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { ++ if (!in.offset-- && ++ !in.inode--) ++ return BKEY_PACK_POS_FAIL; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (unlikely(in.offset < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { ++ if (!in.inode--) ++ return BKEY_PACK_POS_FAIL; ++ in.offset = KEY_OFFSET_MAX; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (unlikely(in.inode < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) ++ return BKEY_PACK_POS_FAIL; ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) { ++ in.offset = KEY_OFFSET_MAX; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) { ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)) ++ exact = false; ++ ++ pack_state_finish(&state, out); ++ out->u64s = f->key_u64s; ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ out->type = KEY_TYPE_deleted; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ if (exact) { ++ BUG_ON(bkey_cmp_left_packed(b, out, &orig)); ++ } else { ++ struct bkey_packed successor; ++ ++ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); ++ BUG_ON(bkey_packed_successor(&successor, b, *out) && ++ bkey_cmp_left_packed(b, &successor, &orig) < 0); ++ } ++#endif ++ ++ return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; ++} ++ ++void bch2_bkey_format_init(struct bkey_format_state *s) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) ++ s->field_min[i] = U64_MAX; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_max); i++) ++ s->field_max[i] = 0; ++ ++ /* Make sure we can store a size of 0: */ ++ s->field_min[BKEY_FIELD_SIZE] = 0; ++} ++ ++static void __bkey_format_add(struct bkey_format_state *s, ++ unsigned field, u64 v) ++{ ++ s->field_min[field] = min(s->field_min[field], v); ++ s->field_max[field] = max(s->field_max[field], v); ++} ++ ++/* ++ * Changes @format so that @k can be successfully packed with @format ++ */ ++void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) ++{ ++#define x(id, field) __bkey_format_add(s, id, k->field); ++ bkey_fields() ++#undef x ++ __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); ++} ++ ++void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) ++{ ++ unsigned field = 0; ++ ++ __bkey_format_add(s, field++, p.inode); ++ __bkey_format_add(s, field++, p.offset); ++ __bkey_format_add(s, field++, p.snapshot); ++} ++ ++/* ++ * We don't want it to be possible for the packed format to represent fields ++ * bigger than a u64... 
that will cause confusion and issues (like with ++ * bkey_packed_successor()) ++ */ ++static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, ++ unsigned bits, u64 offset) ++{ ++ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; ++ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); ++ ++ bits = min(bits, unpacked_bits); ++ ++ offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1)); ++ ++ f->bits_per_field[i] = bits; ++ f->field_offset[i] = cpu_to_le64(offset); ++} ++ ++struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) ++{ ++ unsigned i, bits = KEY_PACKED_BITS_START; ++ struct bkey_format ret = { ++ .nr_fields = BKEY_NR_FIELDS, ++ }; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { ++ s->field_min[i] = min(s->field_min[i], s->field_max[i]); ++ ++ set_format_field(&ret, i, ++ fls64(s->field_max[i] - s->field_min[i]), ++ s->field_min[i]); ++ ++ bits += ret.bits_per_field[i]; ++ } ++ ++ /* allow for extent merging: */ ++ if (ret.bits_per_field[BKEY_FIELD_SIZE]) { ++ ret.bits_per_field[BKEY_FIELD_SIZE] += 4; ++ bits += 4; ++ } ++ ++ ret.key_u64s = DIV_ROUND_UP(bits, 64); ++ ++ /* if we have enough spare bits, round fields up to nearest byte */ ++ bits = ret.key_u64s * 64 - bits; ++ ++ for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { ++ unsigned r = round_up(ret.bits_per_field[i], 8) - ++ ret.bits_per_field[i]; ++ ++ if (r <= bits) { ++ set_format_field(&ret, i, ++ ret.bits_per_field[i] + r, ++ le64_to_cpu(ret.field_offset[i])); ++ bits -= r; ++ } ++ } ++ ++ EBUG_ON(bch2_bkey_format_validate(&ret)); ++ return ret; ++} ++ ++const char *bch2_bkey_format_validate(struct bkey_format *f) ++{ ++ unsigned i, bits = KEY_PACKED_BITS_START; ++ ++ if (f->nr_fields != BKEY_NR_FIELDS) ++ return "incorrect number of fields"; ++ ++ /* ++ * Verify that the packed format can't represent fields larger than the ++ * unpacked format: ++ */ ++ for (i = 0; i < f->nr_fields; i++) { ++ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; ++ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); ++ u64 packed_max = f->bits_per_field[i] ++ ? 
~((~0ULL << 1) << (f->bits_per_field[i] - 1)) ++ : 0; ++ u64 field_offset = le64_to_cpu(f->field_offset[i]); ++ ++ if (packed_max + field_offset < packed_max || ++ packed_max + field_offset > unpacked_max) ++ return "field too large"; ++ ++ bits += f->bits_per_field[i]; ++ } ++ ++ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) ++ return "incorrect key_u64s"; ++ ++ return NULL; ++} ++ ++/* ++ * Most significant differing bit ++ * Bits are indexed from 0 - return is [0, nr_key_bits) ++ */ ++__pure ++unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, ++ const struct bkey_packed *l_k, ++ const struct bkey_packed *r_k) ++{ ++ const u64 *l = high_word(&b->format, l_k); ++ const u64 *r = high_word(&b->format, r_k); ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned word_bits = 64 - high_bit_offset; ++ u64 l_v, r_v; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); ++ ++ /* for big endian, skip past header */ ++ l_v = *l & (~0ULL >> high_bit_offset); ++ r_v = *r & (~0ULL >> high_bit_offset); ++ ++ while (nr_key_bits) { ++ if (nr_key_bits < word_bits) { ++ l_v >>= word_bits - nr_key_bits; ++ r_v >>= word_bits - nr_key_bits; ++ nr_key_bits = 0; ++ } else { ++ nr_key_bits -= word_bits; ++ } ++ ++ if (l_v != r_v) ++ return fls64(l_v ^ r_v) - 1 + nr_key_bits; ++ ++ l = next_word(l); ++ r = next_word(r); ++ ++ l_v = *l; ++ r_v = *r; ++ word_bits = 64; ++ } ++ ++ return 0; ++} ++ ++/* ++ * First set bit ++ * Bits are indexed from 0 - return is [0, nr_key_bits) ++ */ ++__pure ++unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) ++{ ++ const u64 *p = high_word(&b->format, k); ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned ret = 0, offset; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); ++ ++ offset = nr_key_bits; ++ while (offset > 64) { ++ p = next_word(p); ++ offset -= 64; ++ } ++ ++ offset = 64 - offset; ++ ++ while (nr_key_bits) { ++ unsigned bits = nr_key_bits + offset < 64 ++ ? 
nr_key_bits ++ : 64 - offset; ++ ++ u64 mask = (~0ULL >> (64 - bits)) << offset; ++ ++ if (*p & mask) ++ return ret + __ffs64(*p & mask) - offset; ++ ++ p = prev_word(p); ++ nr_key_bits -= bits; ++ ret += bits; ++ offset = 0; ++ } ++ ++ return 0; ++} ++ ++#ifdef CONFIG_X86_64 ++ ++static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, ++ unsigned nr_key_bits) ++{ ++ long d0, d1, d2, d3; ++ int cmp; ++ ++ /* we shouldn't need asm for this, but gcc is being retarded: */ ++ ++ asm(".intel_syntax noprefix;" ++ "xor eax, eax;" ++ "xor edx, edx;" ++ "1:;" ++ "mov r8, [rdi];" ++ "mov r9, [rsi];" ++ "sub ecx, 64;" ++ "jl 2f;" ++ ++ "cmp r8, r9;" ++ "jnz 3f;" ++ ++ "lea rdi, [rdi - 8];" ++ "lea rsi, [rsi - 8];" ++ "jmp 1b;" ++ ++ "2:;" ++ "not ecx;" ++ "shr r8, 1;" ++ "shr r9, 1;" ++ "shr r8, cl;" ++ "shr r9, cl;" ++ "cmp r8, r9;" ++ ++ "3:\n" ++ "seta al;" ++ "setb dl;" ++ "sub eax, edx;" ++ ".att_syntax prefix;" ++ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) ++ : "0" (l), "1" (r), "3" (nr_key_bits) ++ : "r8", "r9", "cc", "memory"); ++ ++ return cmp; ++} ++ ++#define I(_x) (*(out)++ = (_x)) ++#define I1(i0) I(i0) ++#define I2(i0, i1) (I1(i0), I(i1)) ++#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) ++#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) ++#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) ++ ++static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, ++ enum bch_bkey_fields field, ++ unsigned dst_offset, unsigned dst_size, ++ bool *eax_zeroed) ++{ ++ unsigned bits = format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(format->field_offset[field]); ++ unsigned i, byte, bit_offset, align, shl, shr; ++ ++ if (!bits && !offset) { ++ if (!*eax_zeroed) { ++ /* xor eax, eax */ ++ I2(0x31, 0xc0); ++ } ++ ++ *eax_zeroed = true; ++ goto set_field; ++ } ++ ++ if (!bits) { ++ /* just return offset: */ ++ ++ switch (dst_size) { ++ case 8: ++ if (offset > S32_MAX) { ++ /* mov [rdi + dst_offset], offset */ ++ I3(0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ ++ I3(0xc7, 0x47, dst_offset + 4); ++ memcpy(out, (void *) &offset + 4, 4); ++ out += 4; ++ } else { ++ /* mov [rdi + dst_offset], offset */ ++ /* sign extended */ ++ I4(0x48, 0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } ++ break; ++ case 4: ++ /* mov [rdi + dst_offset], offset */ ++ I3(0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ break; ++ default: ++ BUG(); ++ } ++ ++ return out; ++ } ++ ++ bit_offset = format->key_u64s * 64; ++ for (i = 0; i <= field; i++) ++ bit_offset -= format->bits_per_field[i]; ++ ++ byte = bit_offset / 8; ++ bit_offset -= byte * 8; ++ ++ *eax_zeroed = false; ++ ++ if (bit_offset == 0 && bits == 8) { ++ /* movzx eax, BYTE PTR [rsi + imm8] */ ++ I4(0x0f, 0xb6, 0x46, byte); ++ } else if (bit_offset == 0 && bits == 16) { ++ /* movzx eax, WORD PTR [rsi + imm8] */ ++ I4(0x0f, 0xb7, 0x46, byte); ++ } else if (bit_offset + bits <= 32) { ++ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); ++ byte -= align; ++ bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 32); ++ ++ /* mov eax, [rsi + imm8] */ ++ I3(0x8b, 0x46, byte); ++ ++ if (bit_offset) { ++ /* shr eax, imm8 */ ++ I3(0xc1, 0xe8, bit_offset); ++ } ++ ++ if (bit_offset + bits < 32) { ++ unsigned mask = ~0U >> (32 - bits); ++ ++ /* and eax, imm32 */ ++ I1(0x25); ++ memcpy(out, &mask, 4); ++ out += 4; ++ } ++ } else if (bit_offset + bits <= 64) { ++ align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); ++ byte -= align; ++ 
bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 64); ++ ++ /* mov rax, [rsi + imm8] */ ++ I4(0x48, 0x8b, 0x46, byte); ++ ++ shl = 64 - bit_offset - bits; ++ shr = bit_offset + shl; ++ ++ if (shl) { ++ /* shl rax, imm8 */ ++ I4(0x48, 0xc1, 0xe0, shl); ++ } ++ ++ if (shr) { ++ /* shr rax, imm8 */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ } ++ } else { ++ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); ++ byte -= align; ++ bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 96); ++ ++ /* mov rax, [rsi + byte] */ ++ I4(0x48, 0x8b, 0x46, byte); ++ ++ /* mov edx, [rsi + byte + 8] */ ++ I3(0x8b, 0x56, byte + 8); ++ ++ /* bits from next word: */ ++ shr = bit_offset + bits - 64; ++ BUG_ON(shr > bit_offset); ++ ++ /* shr rax, bit_offset */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ ++ /* shl rdx, imm8 */ ++ I4(0x48, 0xc1, 0xe2, 64 - shr); ++ ++ /* or rax, rdx */ ++ I3(0x48, 0x09, 0xd0); ++ ++ shr = bit_offset - shr; ++ ++ if (shr) { ++ /* shr rax, imm8 */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ } ++ } ++ ++ /* rax += offset: */ ++ if (offset > S32_MAX) { ++ /* mov rdx, imm64 */ ++ I2(0x48, 0xba); ++ memcpy(out, &offset, 8); ++ out += 8; ++ /* add %rdx, %rax */ ++ I3(0x48, 0x01, 0xd0); ++ } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { ++ /* add rax, imm32 */ ++ I2(0x48, 0x05); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } else if (offset) { ++ /* add eax, imm32 */ ++ I1(0x05); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } ++set_field: ++ switch (dst_size) { ++ case 8: ++ /* mov [rdi + dst_offset], rax */ ++ I4(0x48, 0x89, 0x47, dst_offset); ++ break; ++ case 4: ++ /* mov [rdi + dst_offset], eax */ ++ I3(0x89, 0x47, dst_offset); ++ break; ++ default: ++ BUG(); ++ } ++ ++ return out; ++} ++ ++int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) ++{ ++ bool eax_zeroed = false; ++ u8 *out = _out; ++ ++ /* ++ * rdi: dst - unpacked key ++ * rsi: src - packed key ++ */ ++ ++ /* k->u64s, k->format, k->type */ ++ ++ /* mov eax, [rsi] */ ++ I2(0x8b, 0x06); ++ ++ /* add eax, BKEY_U64s - format->key_u64s */ ++ I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); ++ ++ /* and eax, imm32: mask out k->pad: */ ++ I5(0x25, 0xff, 0xff, 0xff, 0); ++ ++ /* mov [rdi], eax */ ++ I2(0x89, 0x07); ++ ++#define x(id, field) \ ++ out = compile_bkey_field(format, out, id, \ ++ offsetof(struct bkey, field), \ ++ sizeof(((struct bkey *) NULL)->field), \ ++ &eax_zeroed); ++ bkey_fields() ++#undef x ++ ++ /* retq */ ++ I1(0xc3); ++ ++ return (void *) out - _out; ++} ++ ++#else ++static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, ++ unsigned nr_key_bits) ++{ ++ u64 l_v, r_v; ++ ++ if (!nr_key_bits) ++ return 0; ++ ++ /* for big endian, skip past header */ ++ nr_key_bits += high_bit_offset; ++ l_v = *l & (~0ULL >> high_bit_offset); ++ r_v = *r & (~0ULL >> high_bit_offset); ++ ++ while (1) { ++ if (nr_key_bits < 64) { ++ l_v >>= 64 - nr_key_bits; ++ r_v >>= 64 - nr_key_bits; ++ nr_key_bits = 0; ++ } else { ++ nr_key_bits -= 64; ++ } ++ ++ if (!nr_key_bits || l_v != r_v) ++ break; ++ ++ l = next_word(l); ++ r = next_word(r); ++ ++ l_v = *l; ++ r_v = *r; ++ } ++ ++ return cmp_int(l_v, r_v); ++} ++#endif ++ ++__pure ++int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, ++ const struct bkey_packed *r, ++ const struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ int ret; ++ ++ EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); ++ ++ ret = __bkey_cmp_bits(high_word(f, l), ++ high_word(f, r), ++ 
b->nr_key_bits); ++ ++ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), ++ bkey_unpack_pos(b, r))); ++ return ret; ++} ++ ++__pure __flatten ++int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r); ++} ++ ++__pure __flatten ++int bch2_bkey_cmp_packed(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r) ++{ ++ struct bkey unpacked; ++ ++ if (likely(bkey_packed(l) && bkey_packed(r))) ++ return __bch2_bkey_cmp_packed_format_checked(l, r, b); ++ ++ if (bkey_packed(l)) { ++ __bkey_unpack_key_format_checked(b, &unpacked, l); ++ l = (void*) &unpacked; ++ } else if (bkey_packed(r)) { ++ __bkey_unpack_key_format_checked(b, &unpacked, r); ++ r = (void*) &unpacked; ++ } ++ ++ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); ++} ++ ++__pure __flatten ++int __bch2_bkey_cmp_left_packed(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ const struct bkey *l_unpacked; ++ ++ return unlikely(l_unpacked = packed_to_bkey_c(l)) ++ ? bpos_cmp(l_unpacked->p, *r) ++ : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); ++} ++ ++void bch2_bpos_swab(struct bpos *p) ++{ ++ u8 *l = (u8 *) p; ++ u8 *h = ((u8 *) &p[1]) - 1; ++ ++ while (l < h) { ++ swap(*l, *h); ++ l++; ++ --h; ++ } ++} ++ ++void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) ++{ ++ const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current; ++ u8 *l = k->key_start; ++ u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; ++ ++ while (l < h) { ++ swap(*l, *h); ++ l++; ++ --h; ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_bkey_pack_test(void) ++{ ++ struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); ++ struct bkey_packed p; ++ ++ struct bkey_format test_format = { ++ .key_u64s = 3, ++ .nr_fields = BKEY_NR_FIELDS, ++ .bits_per_field = { ++ 13, ++ 64, ++ 32, ++ }, ++ }; ++ ++ struct unpack_state in_s = ++ unpack_state_init(&bch2_bkey_format_current, (void *) &t); ++ struct pack_state out_s = pack_state_init(&test_format, &p); ++ unsigned i; ++ ++ for (i = 0; i < out_s.format->nr_fields; i++) { ++ u64 a, v = get_inc_field(&in_s, i); ++ ++ switch (i) { ++#define x(id, field) case id: a = t.field; break; ++ bkey_fields() ++#undef x ++ default: ++ BUG(); ++ } ++ ++ if (a != v) ++ panic("got %llu actual %llu i %u\n", v, a, i); ++ ++ if (!set_inc_field(&out_s, i, v)) ++ panic("failed at %u\n", i); ++ } ++ ++ BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); ++} ++#endif +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +new file mode 100644 +index 000000000000..7dee3d8e0a3d +--- /dev/null ++++ b/fs/bcachefs/bkey.h +@@ -0,0 +1,566 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_H ++#define _BCACHEFS_BKEY_H ++ ++#include ++#include "bcachefs_format.h" ++ ++#include "util.h" ++#include "vstructs.h" ++ ++#ifdef CONFIG_X86_64 ++#define HAVE_BCACHEFS_COMPILED_UNPACK 1 ++#endif ++ ++void bch2_to_binary(char *, const u64 *, unsigned); ++ ++/* bkey with split value, const */ ++struct bkey_s_c { ++ const struct bkey *k; ++ const struct bch_val *v; ++}; ++ ++/* bkey with split value */ ++struct bkey_s { ++ union { ++ struct { ++ struct bkey *k; ++ struct bch_val *v; ++ }; ++ struct bkey_s_c s_c; ++ }; ++}; ++ ++#define bkey_next(_k) vstruct_next(_k) ++ ++#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) ++ ++static inline size_t bkey_val_bytes(const struct bkey *k) ++{ ++ 
return bkey_val_u64s(k) * sizeof(u64); ++} ++ ++static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) ++{ ++ k->u64s = BKEY_U64s + val_u64s; ++} ++ ++static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) ++{ ++ k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); ++} ++ ++#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) ++ ++#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) ++ ++#define bkey_whiteout(_k) \ ++ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) ++ ++enum bkey_lr_packed { ++ BKEY_PACKED_BOTH, ++ BKEY_PACKED_RIGHT, ++ BKEY_PACKED_LEFT, ++ BKEY_PACKED_NONE, ++}; ++ ++#define bkey_lr_packed(_l, _r) \ ++ ((_l)->format + ((_r)->format << 1)) ++ ++#define bkey_copy(_dst, _src) \ ++do { \ ++ BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \ ++ !type_is(_dst, struct bkey_packed *)); \ ++ BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \ ++ !type_is(_src, struct bkey_packed *)); \ ++ EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \ ++ (u64 *) (_dst) < (u64 *) (_src) + \ ++ ((struct bkey *) (_src))->u64s); \ ++ \ ++ memcpy_u64s_small((_dst), (_src), \ ++ ((struct bkey *) (_src))->u64s); \ ++} while (0) ++ ++struct btree; ++ ++struct bkey_format_state { ++ u64 field_min[BKEY_NR_FIELDS]; ++ u64 field_max[BKEY_NR_FIELDS]; ++}; ++ ++void bch2_bkey_format_init(struct bkey_format_state *); ++void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *); ++void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); ++struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); ++const char *bch2_bkey_format_validate(struct bkey_format *); ++ ++__pure ++unsigned bch2_bkey_greatest_differing_bit(const struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); ++__pure ++unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); ++ ++__pure ++int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, ++ const struct bkey_packed *, ++ const struct btree *); ++ ++__pure ++int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, ++ const struct bkey_packed *, ++ const struct bpos *); ++ ++__pure ++int bch2_bkey_cmp_packed(const struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); ++ ++__pure ++int __bch2_bkey_cmp_left_packed(const struct btree *, ++ const struct bkey_packed *, ++ const struct bpos *); ++ ++static inline __pure ++int bkey_cmp_left_packed(const struct btree *b, ++ const struct bkey_packed *l, const struct bpos *r) ++{ ++ return __bch2_bkey_cmp_left_packed(b, l, r); ++} ++ ++/* ++ * we prefer to pass bpos by ref, but it's often enough terribly convenient to ++ * pass it by by val... as much as I hate c++, const ref would be nice here: ++ */ ++__pure __flatten ++static inline int bkey_cmp_left_packed_byval(const struct btree *b, ++ const struct bkey_packed *l, ++ struct bpos r) ++{ ++ return bkey_cmp_left_packed(b, l, &r); ++} ++ ++static __always_inline int bpos_cmp(struct bpos l, struct bpos r) ++{ ++ return cmp_int(l.inode, r.inode) ?: ++ cmp_int(l.offset, r.offset) ?: ++ cmp_int(l.snapshot, r.snapshot); ++} ++ ++static __always_inline int bkey_cmp(struct bpos l, struct bpos r) ++{ ++ return cmp_int(l.inode, r.inode) ?: ++ cmp_int(l.offset, r.offset); ++} ++ ++static inline struct bpos bpos_min(struct bpos l, struct bpos r) ++{ ++ return bpos_cmp(l, r) < 0 ? l : r; ++} ++ ++static inline struct bpos bpos_max(struct bpos l, struct bpos r) ++{ ++ return bpos_cmp(l, r) > 0 ? 
l : r; ++} ++ ++void bch2_bpos_swab(struct bpos *); ++void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); ++ ++static __always_inline int bversion_cmp(struct bversion l, struct bversion r) ++{ ++ return cmp_int(l.hi, r.hi) ?: ++ cmp_int(l.lo, r.lo); ++} ++ ++#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) ++#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) ++ ++static __always_inline int bversion_zero(struct bversion v) ++{ ++ return !bversion_cmp(v, ZERO_VERSION); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++/* statement expressions confusing unlikely()? */ ++#define bkey_packed(_k) \ ++ ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ ++ (_k)->format != KEY_FORMAT_CURRENT; }) ++#else ++#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) ++#endif ++ ++/* ++ * It's safe to treat an unpacked bkey as a packed one, but not the reverse ++ */ ++static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) ++{ ++ return (struct bkey_packed *) k; ++} ++ ++static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) ++{ ++ return (const struct bkey_packed *) k; ++} ++ ++static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) ++{ ++ return bkey_packed(k) ? NULL : (struct bkey_i *) k; ++} ++ ++static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) ++{ ++ return bkey_packed(k) ? NULL : (const struct bkey *) k; ++} ++ ++static inline unsigned bkey_format_key_bits(const struct bkey_format *format) ++{ ++ return format->bits_per_field[BKEY_FIELD_INODE] + ++ format->bits_per_field[BKEY_FIELD_OFFSET] + ++ format->bits_per_field[BKEY_FIELD_SNAPSHOT]; ++} ++ ++static inline struct bpos bpos_successor(struct bpos p) ++{ ++ if (!++p.snapshot && ++ !++p.offset && ++ !++p.inode) ++ BUG(); ++ ++ return p; ++} ++ ++static inline struct bpos bpos_predecessor(struct bpos p) ++{ ++ if (!p.snapshot-- && ++ !p.offset-- && ++ !p.inode--) ++ BUG(); ++ ++ return p; ++} ++ ++static inline struct bpos bpos_nosnap_successor(struct bpos p) ++{ ++ p.snapshot = 0; ++ ++ if (!++p.offset && ++ !++p.inode) ++ BUG(); ++ ++ return p; ++} ++ ++static inline struct bpos bpos_nosnap_predecessor(struct bpos p) ++{ ++ p.snapshot = 0; ++ ++ if (!p.offset-- && ++ !p.inode--) ++ BUG(); ++ ++ return p; ++} ++ ++static inline u64 bkey_start_offset(const struct bkey *k) ++{ ++ return k->p.offset - k->size; ++} ++ ++static inline struct bpos bkey_start_pos(const struct bkey *k) ++{ ++ return (struct bpos) { ++ .inode = k->p.inode, ++ .offset = bkey_start_offset(k), ++ .snapshot = k->p.snapshot, ++ }; ++} ++ ++/* Packed helpers */ ++ ++static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ unsigned ret = bkey_packed(k) ? 
format->key_u64s : BKEY_U64s; ++ ++ EBUG_ON(k->u64s < ret); ++ return ret; ++} ++ ++static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return bkeyp_key_u64s(format, k) * sizeof(u64); ++} ++ ++static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return k->u64s - bkeyp_key_u64s(format, k); ++} ++ ++static inline size_t bkeyp_val_bytes(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return bkeyp_val_u64s(format, k) * sizeof(u64); ++} ++ ++static inline void set_bkeyp_val_u64s(const struct bkey_format *format, ++ struct bkey_packed *k, unsigned val_u64s) ++{ ++ k->u64s = bkeyp_key_u64s(format, k) + val_u64s; ++} ++ ++#define bkeyp_val(_format, _k) \ ++ ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) ++ ++extern const struct bkey_format bch2_bkey_format_current; ++ ++bool bch2_bkey_transform(const struct bkey_format *, ++ struct bkey_packed *, ++ const struct bkey_format *, ++ const struct bkey_packed *); ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, ++ const struct bkey_packed *); ++ ++#ifndef HAVE_BCACHEFS_COMPILED_UNPACK ++struct bpos __bkey_unpack_pos(const struct bkey_format *, ++ const struct bkey_packed *); ++#endif ++ ++bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, ++ const struct bkey_format *); ++ ++enum bkey_pack_pos_ret { ++ BKEY_PACK_POS_EXACT, ++ BKEY_PACK_POS_SMALLER, ++ BKEY_PACK_POS_FAIL, ++}; ++ ++enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, ++ const struct btree *); ++ ++static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, ++ const struct btree *b) ++{ ++ return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; ++} ++ ++void bch2_bkey_unpack(const struct btree *, struct bkey_i *, ++ const struct bkey_packed *); ++bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, ++ const struct bkey_format *); ++ ++static inline u64 bkey_field_max(const struct bkey_format *f, ++ enum bch_bkey_fields nr) ++{ ++ return f->bits_per_field[nr] < 64 ++ ? (le64_to_cpu(f->field_offset[nr]) + ++ ~(~0ULL << f->bits_per_field[nr])) ++ : U64_MAX; ++} ++ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ ++int bch2_compile_bkey_format(const struct bkey_format *, void *); ++ ++#else ++ ++static inline int bch2_compile_bkey_format(const struct bkey_format *format, ++ void *out) { return 0; } ++ ++#endif ++ ++static inline void bkey_reassemble(struct bkey_i *dst, ++ struct bkey_s_c src) ++{ ++ dst->k = *src.k; ++ memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); ++} ++ ++#define bkey_s_null ((struct bkey_s) { .k = NULL }) ++#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) ++ ++#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) ++#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) ++ ++static inline struct bkey_s bkey_to_s(struct bkey *k) ++{ ++ return (struct bkey_s) { .k = k, .v = NULL }; ++} ++ ++static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) ++{ ++ return (struct bkey_s_c) { .k = k, .v = NULL }; ++} ++ ++static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) ++{ ++ return (struct bkey_s) { .k = &k->k, .v = &k->v }; ++} ++ ++static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) ++{ ++ return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; ++} ++ ++/* ++ * For a given type of value (e.g. 
struct bch_extent), generates the types for ++ * bkey + bch_extent - inline, split, split const - and also all the conversion ++ * functions, which also check that the value is of the correct type. ++ * ++ * We use anonymous unions for upcasting - e.g. converting from e.g. a ++ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion ++ * functions. ++ */ ++#define x(name, ...) \ ++struct bkey_i_##name { \ ++ union { \ ++ struct bkey k; \ ++ struct bkey_i k_i; \ ++ }; \ ++ struct bch_##name v; \ ++}; \ ++ \ ++struct bkey_s_c_##name { \ ++ union { \ ++ struct { \ ++ const struct bkey *k; \ ++ const struct bch_##name *v; \ ++ }; \ ++ struct bkey_s_c s_c; \ ++ }; \ ++}; \ ++ \ ++struct bkey_s_##name { \ ++ union { \ ++ struct { \ ++ struct bkey *k; \ ++ struct bch_##name *v; \ ++ }; \ ++ struct bkey_s_c_##name c; \ ++ struct bkey_s s; \ ++ struct bkey_s_c s_c; \ ++ }; \ ++}; \ ++ \ ++static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return container_of(&k->k, struct bkey_i_##name, k); \ ++} \ ++ \ ++static inline const struct bkey_i_##name * \ ++bkey_i_to_##name##_c(const struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return container_of(&k->k, struct bkey_i_##name, k); \ ++} \ ++ \ ++static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ ++{ \ ++ EBUG_ON(k.k->type != KEY_TYPE_##name); \ ++ return (struct bkey_s_##name) { \ ++ .k = k.k, \ ++ .v = container_of(k.v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ ++{ \ ++ EBUG_ON(k.k->type != KEY_TYPE_##name); \ ++ return (struct bkey_s_c_##name) { \ ++ .k = k.k, \ ++ .v = container_of(k.v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ ++{ \ ++ return (struct bkey_s_##name) { \ ++ .k = &k->k, \ ++ .v = &k->v, \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name \ ++name##_i_to_s_c(const struct bkey_i_##name *k) \ ++{ \ ++ return (struct bkey_s_c_##name) { \ ++ .k = &k->k, \ ++ .v = &k->v, \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return (struct bkey_s_##name) { \ ++ .k = &k->k, \ ++ .v = container_of(&k->v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name \ ++bkey_i_to_s_c_##name(const struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return (struct bkey_s_c_##name) { \ ++ .k = &k->k, \ ++ .v = container_of(&k->v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ ++{ \ ++ struct bkey_i_##name *k = \ ++ container_of(&_k->k, struct bkey_i_##name, k); \ ++ \ ++ bkey_init(&k->k); \ ++ memset(&k->v, 0, sizeof(k->v)); \ ++ k->k.type = KEY_TYPE_##name; \ ++ set_bkey_val_bytes(&k->k, sizeof(k->v)); \ ++ \ ++ return k; \ ++} ++ ++BCH_BKEY_TYPES(); ++#undef x ++ ++/* byte order helpers */ ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ ++static inline unsigned high_word_offset(const struct bkey_format *f) ++{ ++ return f->key_u64s - 1; ++} ++ ++#define high_bit_offset 0 ++#define nth_word(p, n) ((p) - (n)) ++ ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ ++static inline unsigned high_word_offset(const struct bkey_format *f) ++{ ++ return 0; ++} ++ ++#define high_bit_offset KEY_PACKED_BITS_START ++#define nth_word(p, n) ((p) 
+ (n)) ++ ++#else ++#error edit for your odd byteorder. ++#endif ++ ++#define high_word(f, k) ((k)->_data + high_word_offset(f)) ++#define next_word(p) nth_word(p, 1) ++#define prev_word(p) nth_word(p, -1) ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_bkey_pack_test(void); ++#else ++static inline void bch2_bkey_pack_test(void) {} ++#endif ++ ++#endif /* _BCACHEFS_BKEY_H */ +diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h +new file mode 100644 +index 000000000000..0d7c67a959af +--- /dev/null ++++ b/fs/bcachefs/bkey_buf.h +@@ -0,0 +1,60 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_BUF_H ++#define _BCACHEFS_BKEY_BUF_H ++ ++#include "bcachefs.h" ++ ++struct bkey_buf { ++ struct bkey_i *k; ++ u64 onstack[12]; ++}; ++ ++static inline void bch2_bkey_buf_realloc(struct bkey_buf *s, ++ struct bch_fs *c, unsigned u64s) ++{ ++ if (s->k == (void *) s->onstack && ++ u64s > ARRAY_SIZE(s->onstack)) { ++ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); ++ memcpy(s->k, s->onstack, sizeof(s->onstack)); ++ } ++} ++ ++static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s, ++ struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_buf_realloc(s, c, k.k->u64s); ++ bkey_reassemble(s->k, k); ++} ++ ++static inline void bch2_bkey_buf_copy(struct bkey_buf *s, ++ struct bch_fs *c, ++ struct bkey_i *src) ++{ ++ bch2_bkey_buf_realloc(s, c, src->k.u64s); ++ bkey_copy(s->k, src); ++} ++ ++static inline void bch2_bkey_buf_unpack(struct bkey_buf *s, ++ struct bch_fs *c, ++ struct btree *b, ++ struct bkey_packed *src) ++{ ++ bch2_bkey_buf_realloc(s, c, BKEY_U64s + ++ bkeyp_val_u64s(&b->format, src)); ++ bch2_bkey_unpack(b, s->k, src); ++} ++ ++static inline void bch2_bkey_buf_init(struct bkey_buf *s) ++{ ++ s->k = (void *) s->onstack; ++} ++ ++static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c) ++{ ++ if (s->k != (void *) s->onstack) ++ mempool_free(s->k, &c->large_bkey_pool); ++ s->k = NULL; ++} ++ ++#endif /* _BCACHEFS_BKEY_BUF_H */ +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +new file mode 100644 +index 000000000000..e0cbac8811af +--- /dev/null ++++ b/fs/bcachefs/bkey_methods.c +@@ -0,0 +1,503 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "backpointers.h" ++#include "bkey_methods.h" ++#include "btree_types.h" ++#include "alloc_background.h" ++#include "dirent.h" ++#include "ec.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "lru.h" ++#include "quota.h" ++#include "reflink.h" ++#include "subvolume.h" ++#include "xattr.h" ++ ++const char * const bch2_bkey_types[] = { ++#define x(name, nr) #name, ++ BCH_BKEY_TYPES() ++#undef x ++ NULL ++}; ++ ++static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ return 0; ++} ++ ++#define bch2_bkey_ops_deleted (struct bkey_ops) { \ ++ .key_invalid = deleted_key_invalid, \ ++} ++ ++#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ ++ .key_invalid = deleted_key_invalid, \ ++} ++ ++static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if (bkey_val_bytes(k.k)) { ++ prt_printf(err, "incorrect value size (%zu != 0)", ++ bkey_val_bytes(k.k)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++#define bch2_bkey_ops_error (struct bkey_ops) { \ ++ .key_invalid = empty_val_key_invalid, \ ++} ++ ++static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if 
(bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) { ++ prt_printf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_cookie)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++#define bch2_bkey_ops_cookie (struct bkey_ops) { \ ++ .key_invalid = key_type_cookie_invalid, \ ++} ++ ++#define bch2_bkey_ops_hash_whiteout (struct bkey_ops) { \ ++ .key_invalid = empty_val_key_invalid, \ ++} ++ ++static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ return 0; ++} ++ ++static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); ++ unsigned datalen = bkey_inline_data_bytes(k.k); ++ ++ prt_printf(out, "datalen %u: %*phN", ++ datalen, min(datalen, 32U), d.v->data); ++} ++ ++#define bch2_bkey_ops_inline_data (struct bkey_ops) { \ ++ .key_invalid = key_type_inline_data_invalid, \ ++ .val_to_text = key_type_inline_data_to_text, \ ++} ++ ++static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if (bkey_val_bytes(k.k)) { ++ prt_printf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_cookie)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) ++{ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ return true; ++} ++ ++#define bch2_bkey_ops_set (struct bkey_ops) { \ ++ .key_invalid = key_type_set_invalid, \ ++ .key_merge = key_type_set_merge, \ ++} ++ ++const struct bkey_ops bch2_bkey_ops[] = { ++#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, ++ BCH_BKEY_TYPES() ++#undef x ++}; ++ ++int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if (k.k->type >= KEY_TYPE_MAX) { ++ prt_printf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX); ++ return -EINVAL; ++ } ++ ++ return bch2_bkey_ops[k.k->type].key_invalid(c, k, rw, err); ++} ++ ++static unsigned bch2_key_types_allowed[] = { ++ [BKEY_TYPE_extents] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_whiteout)| ++ (1U << KEY_TYPE_error)| ++ (1U << KEY_TYPE_cookie)| ++ (1U << KEY_TYPE_extent)| ++ (1U << KEY_TYPE_reservation)| ++ (1U << KEY_TYPE_reflink_p)| ++ (1U << KEY_TYPE_inline_data), ++ [BKEY_TYPE_inodes] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_whiteout)| ++ (1U << KEY_TYPE_inode)| ++ (1U << KEY_TYPE_inode_v2)| ++ (1U << KEY_TYPE_inode_generation), ++ [BKEY_TYPE_dirents] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_whiteout)| ++ (1U << KEY_TYPE_hash_whiteout)| ++ (1U << KEY_TYPE_dirent), ++ [BKEY_TYPE_xattrs] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_whiteout)| ++ (1U << KEY_TYPE_cookie)| ++ (1U << KEY_TYPE_hash_whiteout)| ++ (1U << KEY_TYPE_xattr), ++ [BKEY_TYPE_alloc] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_alloc)| ++ (1U << KEY_TYPE_alloc_v2)| ++ (1U << KEY_TYPE_alloc_v3)| ++ (1U << KEY_TYPE_alloc_v4), ++ [BKEY_TYPE_quotas] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_quota), ++ [BKEY_TYPE_stripes] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_stripe), ++ [BKEY_TYPE_reflink] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_reflink_v)| ++ (1U << KEY_TYPE_indirect_inline_data), ++ [BKEY_TYPE_subvolumes] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_subvolume), ++ [BKEY_TYPE_snapshots] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_snapshot), ++ 
[BKEY_TYPE_lru] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_lru), ++ [BKEY_TYPE_freespace] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_set), ++ [BKEY_TYPE_need_discard] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_set), ++ [BKEY_TYPE_backpointers] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_backpointer), ++ [BKEY_TYPE_btree] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_btree_ptr)| ++ (1U << KEY_TYPE_btree_ptr_v2), ++}; ++ ++int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, ++ enum btree_node_type type, ++ int rw, struct printbuf *err) ++{ ++ if (k.k->u64s < BKEY_U64s) { ++ prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); ++ return -EINVAL; ++ } ++ ++ if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) { ++ prt_printf(err, "invalid key type for btree %s (%s)", ++ bch2_btree_ids[type], bch2_bkey_types[type]); ++ return -EINVAL; ++ } ++ ++ if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { ++ if (k.k->size == 0) { ++ prt_printf(err, "size == 0"); ++ return -EINVAL; ++ } ++ ++ if (k.k->size > k.k->p.offset) { ++ prt_printf(err, "size greater than offset (%u > %llu)", ++ k.k->size, k.k->p.offset); ++ return -EINVAL; ++ } ++ } else { ++ if (k.k->size) { ++ prt_printf(err, "size != 0"); ++ return -EINVAL; ++ } ++ } ++ ++ if (type != BKEY_TYPE_btree && ++ !btree_type_has_snapshots(type) && ++ k.k->p.snapshot) { ++ prt_printf(err, "nonzero snapshot"); ++ return -EINVAL; ++ } ++ ++ if (type != BKEY_TYPE_btree && ++ btree_type_has_snapshots(type) && ++ !k.k->p.snapshot) { ++ prt_printf(err, "snapshot == 0"); ++ return -EINVAL; ++ } ++ ++ if (type != BKEY_TYPE_btree && ++ !bkey_cmp(k.k->p, POS_MAX)) { ++ prt_printf(err, "key at POS_MAX"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, ++ enum btree_node_type type, ++ int rw, struct printbuf *err) ++{ ++ return __bch2_bkey_invalid(c, k, type, rw, err) ?: ++ bch2_bkey_val_invalid(c, k, rw, err); ++} ++ ++int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, ++ struct printbuf *err) ++{ ++ if (bpos_cmp(k.k->p, b->data->min_key) < 0) { ++ prt_printf(err, "key before start of btree node"); ++ return -EINVAL; ++ } ++ ++ if (bpos_cmp(k.k->p, b->data->max_key) > 0) { ++ prt_printf(err, "key past end of btree node"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) ++{ ++ if (!bpos_cmp(pos, POS_MIN)) ++ prt_printf(out, "POS_MIN"); ++ else if (!bpos_cmp(pos, POS_MAX)) ++ prt_printf(out, "POS_MAX"); ++ else if (!bpos_cmp(pos, SPOS_MAX)) ++ prt_printf(out, "SPOS_MAX"); ++ else { ++ if (pos.inode == U64_MAX) ++ prt_printf(out, "U64_MAX"); ++ else ++ prt_printf(out, "%llu", pos.inode); ++ prt_printf(out, ":"); ++ if (pos.offset == U64_MAX) ++ prt_printf(out, "U64_MAX"); ++ else ++ prt_printf(out, "%llu", pos.offset); ++ prt_printf(out, ":"); ++ if (pos.snapshot == U32_MAX) ++ prt_printf(out, "U32_MAX"); ++ else ++ prt_printf(out, "%u", pos.snapshot); ++ } ++} ++ ++void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) ++{ ++ if (k) { ++ prt_printf(out, "u64s %u type ", k->u64s); ++ ++ if (k->type < KEY_TYPE_MAX) ++ prt_printf(out, "%s ", bch2_bkey_types[k->type]); ++ else ++ prt_printf(out, "%u ", k->type); ++ ++ bch2_bpos_to_text(out, k->p); ++ ++ prt_printf(out, " len %u ver %llu", k->size, k->version.lo); ++ } else { ++ prt_printf(out, "(null)"); ++ } ++} ++ ++void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ 
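/*
 * A minimal usage sketch (not taken from the patch itself) of how these
 * *_to_text helpers are typically consumed -- the same printbuf pattern
 * that bch2_dump_bset() uses further down in this patch; `c` and `k` are
 * assumed to already be in scope:
 *
 *	struct printbuf buf = PRINTBUF;
 *
 *	bch2_bkey_val_to_text(&buf, c, k);
 *	printk(KERN_ERR "bad key: %s\n", buf.buf);
 *	printbuf_exit(&buf);
 */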
++ if (k.k->type < KEY_TYPE_MAX) { ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ if (likely(ops->val_to_text)) ++ ops->val_to_text(out, c, k); ++ } else { ++ prt_printf(out, "(invalid type %u)", k.k->type); ++ } ++} ++ ++void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_to_text(out, k.k); ++ ++ if (bkey_val_bytes(k.k)) { ++ prt_printf(out, ": "); ++ bch2_val_to_text(out, c, k); ++ } ++} ++ ++void bch2_bkey_swab_val(struct bkey_s k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ if (ops->swab) ++ ops->swab(k); ++} ++ ++bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ return ops->key_normalize ++ ? ops->key_normalize(c, k) ++ : false; ++} ++ ++bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; ++ ++ return bch2_bkey_maybe_mergable(l.k, r.k) && ops->key_merge(c, l, r); ++} ++ ++static const struct old_bkey_type { ++ u8 btree_node_type; ++ u8 old; ++ u8 new; ++} bkey_renumber_table[] = { ++ {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr }, ++ {BKEY_TYPE_extents, 128, KEY_TYPE_extent }, ++ {BKEY_TYPE_extents, 129, KEY_TYPE_extent }, ++ {BKEY_TYPE_extents, 130, KEY_TYPE_reservation }, ++ {BKEY_TYPE_inodes, 128, KEY_TYPE_inode }, ++ {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation }, ++ {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent }, ++ {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout }, ++ {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr }, ++ {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout }, ++ {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc }, ++ {BKEY_TYPE_quotas, 128, KEY_TYPE_quota }, ++}; ++ ++void bch2_bkey_renumber(enum btree_node_type btree_node_type, ++ struct bkey_packed *k, ++ int write) ++{ ++ const struct old_bkey_type *i; ++ ++ for (i = bkey_renumber_table; ++ i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); ++ i++) ++ if (btree_node_type == i->btree_node_type && ++ k->type == (write ? i->new : i->old)) { ++ k->type = write ? i->old : i->new; ++ break; ++ } ++} ++ ++void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct bkey_format *f, ++ struct bkey_packed *k) ++{ ++ const struct bkey_ops *ops; ++ struct bkey uk; ++ struct bkey_s u; ++ unsigned nr_compat = 5; ++ int i; ++ ++ /* ++ * Do these operations in reverse order in the write path: ++ */ ++ ++ for (i = 0; i < nr_compat; i++) ++ switch (!write ? 
i : nr_compat - 1 - i) { ++ case 0: ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bkey_swab_key(f, k); ++ break; ++ case 1: ++ if (version < bcachefs_metadata_version_bkey_renumber) ++ bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); ++ break; ++ case 2: ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_inodes) { ++ if (!bkey_packed(k)) { ++ struct bkey_i *u = packed_to_bkey(k); ++ swap(u->k.p.inode, u->k.p.offset); ++ } else if (f->bits_per_field[BKEY_FIELD_INODE] && ++ f->bits_per_field[BKEY_FIELD_OFFSET]) { ++ struct bkey_format tmp = *f, *in = f, *out = &tmp; ++ ++ swap(tmp.bits_per_field[BKEY_FIELD_INODE], ++ tmp.bits_per_field[BKEY_FIELD_OFFSET]); ++ swap(tmp.field_offset[BKEY_FIELD_INODE], ++ tmp.field_offset[BKEY_FIELD_OFFSET]); ++ ++ if (!write) ++ swap(in, out); ++ ++ uk = __bch2_bkey_unpack_key(in, k); ++ swap(uk.p.inode, uk.p.offset); ++ BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); ++ } ++ } ++ break; ++ case 3: ++ if (version < bcachefs_metadata_version_snapshot && ++ (level || btree_type_has_snapshots(btree_id))) { ++ struct bkey_i *u = packed_to_bkey(k); ++ ++ if (u) { ++ u->k.p.snapshot = write ++ ? 0 : U32_MAX; ++ } else { ++ u64 min_packed = f->field_offset[BKEY_FIELD_SNAPSHOT]; ++ u64 max_packed = min_packed + ++ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); ++ ++ uk = __bch2_bkey_unpack_key(f, k); ++ uk.p.snapshot = write ++ ? min_packed : min_t(u64, U32_MAX, max_packed); ++ ++ BUG_ON(!bch2_bkey_pack_key(k, &uk, f)); ++ } ++ } ++ ++ break; ++ case 4: ++ if (!bkey_packed(k)) { ++ u = bkey_i_to_s(packed_to_bkey(k)); ++ } else { ++ uk = __bch2_bkey_unpack_key(f, k); ++ u.k = &uk; ++ u.v = bkeyp_val(f, k); ++ } ++ ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bkey_swab_val(u); ++ ++ ops = &bch2_bkey_ops[k->type]; ++ ++ if (ops->compat) ++ ops->compat(btree_id, version, big_endian, write, u); ++ break; ++ default: ++ BUG(); ++ } ++} +diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h +new file mode 100644 +index 000000000000..db894b40d2ca +--- /dev/null ++++ b/fs/bcachefs/bkey_methods.h +@@ -0,0 +1,175 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_METHODS_H ++#define _BCACHEFS_BKEY_METHODS_H ++ ++#include "bkey.h" ++ ++struct bch_fs; ++struct btree; ++struct btree_trans; ++struct bkey; ++enum btree_node_type; ++ ++extern const char * const bch2_bkey_types[]; ++ ++/* ++ * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If ++ * invalid, entire key will be deleted. ++ * ++ * When invalid, error string is returned via @err. @rw indicates whether key is ++ * being read or written; more aggressive checks can be enabled when rw == WRITE. 
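/*
 * An illustrative sketch of a hook honouring this contract, modelled on the
 * empty-value checks in bkey_methods.c earlier in this patch (the "example_"
 * name is hypothetical, not part of bcachefs):
 *
 *	static int example_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
 *				       int rw, struct printbuf *err)
 *	{
 *		if (bkey_val_bytes(k.k)) {
 *			prt_printf(err, "incorrect value size (%zu != 0)",
 *				   bkey_val_bytes(k.k));
 *			return -EINVAL;
 *		}
 *
 *		if (rw == WRITE) {
 *			// stricter, write-only checks can go here
 *		}
 *
 *		return 0;
 *	}
 */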
++*/ ++struct bkey_ops { ++ int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err); ++ void (*val_to_text)(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ void (*swab)(struct bkey_s); ++ bool (*key_normalize)(struct bch_fs *, struct bkey_s); ++ bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); ++ int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned, ++ struct bkey_s_c, struct bkey_i *, unsigned); ++ int (*atomic_trigger)(struct btree_trans *, struct bkey_s_c, ++ struct bkey_s_c, unsigned); ++ void (*compat)(enum btree_id id, unsigned version, ++ unsigned big_endian, int write, ++ struct bkey_s); ++}; ++ ++extern const struct bkey_ops bch2_bkey_ops[]; ++ ++int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, ++ enum btree_node_type, int, struct printbuf *); ++int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, ++ enum btree_node_type, int, struct printbuf *); ++int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); ++ ++void bch2_bpos_to_text(struct printbuf *, struct bpos); ++void bch2_bkey_to_text(struct printbuf *, const struct bkey *); ++void bch2_val_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++void bch2_bkey_swab_val(struct bkey_s); ++ ++bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); ++ ++static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r) ++{ ++ return l->type == r->type && ++ !bversion_cmp(l->version, r->version) && ++ !bpos_cmp(l->p, bkey_start_pos(r)) && ++ (u64) l->size + r->size <= KEY_SIZE_MAX && ++ bch2_bkey_ops[l->type].key_merge && ++ !bch2_key_merging_disabled; ++} ++ ++bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); ++ ++static inline int bch2_mark_key(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_s_c new, ++ unsigned flags) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type]; ++ ++ return ops->atomic_trigger ++ ? 
ops->atomic_trigger(trans, old, new, flags) ++ : 0; ++} ++ ++enum btree_update_flags { ++ __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, ++ __BTREE_UPDATE_KEY_CACHE_RECLAIM, ++ ++ __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ ++ ++ __BTREE_TRIGGER_INSERT, ++ __BTREE_TRIGGER_OVERWRITE, ++ ++ __BTREE_TRIGGER_GC, ++ __BTREE_TRIGGER_BUCKET_INVALIDATE, ++ __BTREE_TRIGGER_NOATOMIC, ++}; ++ ++#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ++#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) ++ ++#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) ++ ++#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) ++#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) ++ ++#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) ++#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) ++#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) ++ ++#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ ++ ((1U << KEY_TYPE_alloc)| \ ++ (1U << KEY_TYPE_alloc_v2)| \ ++ (1U << KEY_TYPE_alloc_v3)| \ ++ (1U << KEY_TYPE_alloc_v4)| \ ++ (1U << KEY_TYPE_stripe)| \ ++ (1U << KEY_TYPE_inode)| \ ++ (1U << KEY_TYPE_inode_v2)| \ ++ (1U << KEY_TYPE_snapshot)) ++ ++static inline int bch2_trans_mark_key(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type]; ++ ++ return ops->trans_trigger ++ ? ops->trans_trigger(trans, btree_id, level, old, new, flags) ++ : 0; ++} ++ ++static inline int bch2_trans_mark_old(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, unsigned flags) ++{ ++ struct bkey_i deleted; ++ ++ bkey_init(&deleted.k); ++ deleted.k.p = old.k->p; ++ ++ return bch2_trans_mark_key(trans, btree_id, level, old, &deleted, ++ BTREE_TRIGGER_OVERWRITE|flags); ++} ++ ++static inline int bch2_trans_mark_new(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_i *new, unsigned flags) ++{ ++ struct bkey_i deleted; ++ ++ bkey_init(&deleted.k); ++ deleted.k.p = new->k.p; ++ ++ return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, ++ BTREE_TRIGGER_INSERT|flags); ++} ++ ++void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); ++ ++void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, ++ int, struct bkey_format *, struct bkey_packed *); ++ ++static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct bkey_format *f, ++ struct bkey_packed *k) ++{ ++ if (version < bcachefs_metadata_version_current || ++ big_endian != CPU_BIG_ENDIAN) ++ __bch2_bkey_compat(level, btree_id, version, ++ big_endian, write, f, k); ++ ++} ++ ++#endif /* _BCACHEFS_BKEY_METHODS_H */ +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +new file mode 100644 +index 000000000000..b1385a77da11 +--- /dev/null ++++ b/fs/bcachefs/bkey_sort.c +@@ -0,0 +1,198 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "bkey_buf.h" ++#include "bkey_sort.h" ++#include "bset.h" ++#include "extents.h" ++ ++typedef int (*sort_cmp_fn)(struct btree *, ++ struct bkey_packed *, ++ struct bkey_packed *); ++ ++static inline bool sort_iter_end(struct sort_iter *iter) ++{ ++ return !iter->used; ++} ++ ++static inline void sort_iter_sift(struct sort_iter *iter, unsigned 
from, ++ sort_cmp_fn cmp) ++{ ++ unsigned i; ++ ++ for (i = from; ++ i + 1 < iter->used && ++ cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; ++ i++) ++ swap(iter->data[i], iter->data[i + 1]); ++} ++ ++static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ unsigned i = iter->used; ++ ++ while (i--) ++ sort_iter_sift(iter, i, cmp); ++} ++ ++static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) ++{ ++ return !sort_iter_end(iter) ? iter->data->k : NULL; ++} ++ ++static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ struct sort_iter_set *i = iter->data; ++ ++ BUG_ON(!iter->used); ++ ++ i->k = bkey_next(i->k); ++ ++ BUG_ON(i->k > i->end); ++ ++ if (i->k == i->end) ++ array_remove_item(iter->data, iter->used, 0); ++ else ++ sort_iter_sift(iter, 0, cmp); ++} ++ ++static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, ++ sort_cmp_fn cmp) ++{ ++ struct bkey_packed *ret = sort_iter_peek(iter); ++ ++ if (ret) ++ sort_iter_advance(iter, cmp); ++ ++ return ret; ++} ++ ++/* ++ * If keys compare equal, compare by pointer order: ++ */ ++static inline int key_sort_fix_overlapping_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ return bch2_bkey_cmp_packed(b, l, r) ?: ++ cmp_int((unsigned long) l, (unsigned long) r); ++} ++ ++static inline bool should_drop_next_key(struct sort_iter *iter) ++{ ++ /* ++ * key_sort_cmp() ensures that when keys compare equal the older key ++ * comes first; so if l->k compares equal to r->k then l->k is older ++ * and should be dropped. ++ */ ++ return iter->used >= 2 && ++ !bch2_bkey_cmp_packed(iter->b, ++ iter->data[0].k, ++ iter->data[1].k); ++} ++ ++struct btree_nr_keys ++bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, ++ struct sort_iter *iter) ++{ ++ struct bkey_packed *out = dst->start; ++ struct bkey_packed *k; ++ struct btree_nr_keys nr; ++ ++ memset(&nr, 0, sizeof(nr)); ++ ++ sort_iter_sort(iter, key_sort_fix_overlapping_cmp); ++ ++ while ((k = sort_iter_peek(iter))) { ++ if (!bkey_deleted(k) && ++ !should_drop_next_key(iter)) { ++ bkey_copy(out, k); ++ btree_keys_account_key_add(&nr, 0, out); ++ out = bkey_next(out); ++ } ++ ++ sort_iter_advance(iter, key_sort_fix_overlapping_cmp); ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ return nr; ++} ++ ++/* Sort + repack in a new format: */ ++struct btree_nr_keys ++bch2_sort_repack(struct bset *dst, struct btree *src, ++ struct btree_node_iter *src_iter, ++ struct bkey_format *out_f, ++ bool filter_whiteouts) ++{ ++ struct bkey_format *in_f = &src->format; ++ struct bkey_packed *in, *out = vstruct_last(dst); ++ struct btree_nr_keys nr; ++ bool transform = memcmp(out_f, &src->format, sizeof(*out_f)); ++ ++ memset(&nr, 0, sizeof(nr)); ++ ++ while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { ++ if (filter_whiteouts && bkey_deleted(in)) ++ continue; ++ ++ if (!transform) ++ bkey_copy(out, in); ++ else if (bch2_bkey_transform(out_f, out, bkey_packed(in) ++ ? 
in_f : &bch2_bkey_format_current, in)) ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ else ++ bch2_bkey_unpack(src, (void *) out, in); ++ ++ btree_keys_account_key_add(&nr, 0, out); ++ out = bkey_next(out); ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ return nr; ++} ++ ++static inline int sort_keys_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ return bch2_bkey_cmp_packed(b, l, r) ?: ++ (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: ++ (int) l->needs_whiteout - (int) r->needs_whiteout; ++} ++ ++unsigned bch2_sort_keys(struct bkey_packed *dst, ++ struct sort_iter *iter, ++ bool filter_whiteouts) ++{ ++ const struct bkey_format *f = &iter->b->format; ++ struct bkey_packed *in, *next, *out = dst; ++ ++ sort_iter_sort(iter, sort_keys_cmp); ++ ++ while ((in = sort_iter_next(iter, sort_keys_cmp))) { ++ bool needs_whiteout = false; ++ ++ if (bkey_deleted(in) && ++ (filter_whiteouts || !in->needs_whiteout)) ++ continue; ++ ++ while ((next = sort_iter_peek(iter)) && ++ !bch2_bkey_cmp_packed(iter->b, in, next)) { ++ BUG_ON(in->needs_whiteout && ++ next->needs_whiteout); ++ needs_whiteout |= in->needs_whiteout; ++ in = sort_iter_next(iter, sort_keys_cmp); ++ } ++ ++ if (bkey_deleted(in)) { ++ memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); ++ set_bkeyp_val_u64s(f, out, 0); ++ } else { ++ bkey_copy(out, in); ++ } ++ out->needs_whiteout |= needs_whiteout; ++ out = bkey_next(out); ++ } ++ ++ return (u64 *) out - (u64 *) dst; ++} +diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h +new file mode 100644 +index 000000000000..79cf11d1b4e7 +--- /dev/null ++++ b/fs/bcachefs/bkey_sort.h +@@ -0,0 +1,44 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_SORT_H ++#define _BCACHEFS_BKEY_SORT_H ++ ++struct sort_iter { ++ struct btree *b; ++ unsigned used; ++ unsigned size; ++ ++ struct sort_iter_set { ++ struct bkey_packed *k, *end; ++ } data[MAX_BSETS + 1]; ++}; ++ ++static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) ++{ ++ iter->b = b; ++ iter->used = 0; ++ iter->size = ARRAY_SIZE(iter->data); ++} ++ ++static inline void sort_iter_add(struct sort_iter *iter, ++ struct bkey_packed *k, ++ struct bkey_packed *end) ++{ ++ BUG_ON(iter->used >= iter->size); ++ ++ if (k != end) ++ iter->data[iter->used++] = (struct sort_iter_set) { k, end }; ++} ++ ++struct btree_nr_keys ++bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, ++ struct sort_iter *); ++ ++struct btree_nr_keys ++bch2_sort_repack(struct bset *, struct btree *, ++ struct btree_node_iter *, ++ struct bkey_format *, bool); ++ ++unsigned bch2_sort_keys(struct bkey_packed *, ++ struct sort_iter *, bool); ++ ++#endif /* _BCACHEFS_BKEY_SORT_H */ +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +new file mode 100644 +index 000000000000..fa60ef84e4ef +--- /dev/null ++++ b/fs/bcachefs/bset.c +@@ -0,0 +1,1598 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for working with individual keys, and sorted sets of keys with in a ++ * btree node ++ * ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "btree_cache.h" ++#include "bset.h" ++#include "eytzinger.h" ++#include "util.h" ++ ++#include ++#include ++#include ++#include ++ ++/* hack.. 
*/ ++#include "alloc_types.h" ++#include ++ ++static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, ++ struct btree *); ++ ++static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) ++{ ++ unsigned n = ARRAY_SIZE(iter->data); ++ ++ while (n && __btree_node_iter_set_end(iter, n - 1)) ++ --n; ++ ++ return n; ++} ++ ++struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) ++{ ++ unsigned offset = __btree_node_key_to_offset(b, k); ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ if (offset <= t->end_offset) { ++ EBUG_ON(offset < btree_bkey_first_offset(t)); ++ return t; ++ } ++ ++ BUG(); ++} ++ ++/* ++ * There are never duplicate live keys in the btree - but including keys that ++ * have been flagged as deleted (and will be cleaned up later) we _will_ see ++ * duplicates. ++ * ++ * Thus the sort order is: usual key comparison first, but for keys that compare ++ * equal the deleted key(s) come first, and the (at most one) live version comes ++ * last. ++ * ++ * The main reason for this is insertion: to handle overwrites, we first iterate ++ * over keys that compare equal to our insert key, and then insert immediately ++ * prior to the first key greater than the key we're inserting - our insert ++ * position will be after all keys that compare equal to our insert key, which ++ * by the time we actually do the insert will all be deleted. ++ */ ++ ++void bch2_dump_bset(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned set) ++{ ++ struct bkey_packed *_k, *_n; ++ struct bkey uk, n; ++ struct bkey_s_c k; ++ struct printbuf buf = PRINTBUF; ++ ++ if (!i->u64s) ++ return; ++ ++ for (_k = i->start; ++ _k < vstruct_last(i); ++ _k = _n) { ++ _n = bkey_next(_k); ++ ++ k = bkey_disassemble(b, _k, &uk); ++ ++ printbuf_reset(&buf); ++ if (c) ++ bch2_bkey_val_to_text(&buf, c, k); ++ else ++ bch2_bkey_to_text(&buf, k.k); ++ printk(KERN_ERR "block %u key %5zu: %s\n", set, ++ _k->_data - i->_data, buf.buf); ++ ++ if (_n == vstruct_last(i)) ++ continue; ++ ++ n = bkey_unpack_key(b, _n); ++ ++ if (bpos_cmp(n.p, k.k->p) < 0) { ++ printk(KERN_ERR "Key skipped backwards\n"); ++ continue; ++ } ++ ++ if (!bkey_deleted(k.k) && ++ !bpos_cmp(n.p, k.k->p)) ++ printk(KERN_ERR "Duplicate keys\n"); ++ } ++ ++ printbuf_exit(&buf); ++} ++ ++void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ console_lock(); ++ for_each_bset(b, t) ++ bch2_dump_bset(c, b, bset(b, t), t - b->set); ++ console_unlock(); ++} ++ ++void bch2_dump_btree_node_iter(struct btree *b, ++ struct btree_node_iter *iter) ++{ ++ struct btree_node_iter_set *set; ++ struct printbuf buf = PRINTBUF; ++ ++ printk(KERN_ERR "btree node iter with %u/%u sets:\n", ++ __btree_node_iter_used(iter), b->nsets); ++ ++ btree_node_iter_for_each(iter, set) { ++ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ struct bkey uk = bkey_unpack_key(b, k); ++ ++ printbuf_reset(&buf); ++ bch2_bkey_to_text(&buf, &uk); ++ printk(KERN_ERR "set %zu key %u: %s\n", ++ t - b->set, set->k, buf.buf); ++ } ++ ++ printbuf_exit(&buf); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++void __bch2_verify_btree_nr_keys(struct btree *b) ++{ ++ struct bset_tree *t; ++ struct bkey_packed *k; ++ struct btree_nr_keys nr = { 0 }; ++ ++ for_each_bset(b, t) ++ bset_tree_for_each_key(b, t, k) ++ if (!bkey_deleted(k)) ++ btree_keys_account_key_add(&nr, t - b->set, k); ++ ++ BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); ++} ++ ++static void 
bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, ++ struct btree *b) ++{ ++ struct btree_node_iter iter = *_iter; ++ const struct bkey_packed *k, *n; ++ ++ k = bch2_btree_node_iter_peek_all(&iter, b); ++ __bch2_btree_node_iter_advance(&iter, b); ++ n = bch2_btree_node_iter_peek_all(&iter, b); ++ ++ bkey_unpack_key(b, k); ++ ++ if (n && ++ bkey_iter_cmp(b, k, n) > 0) { ++ struct btree_node_iter_set *set; ++ struct bkey ku = bkey_unpack_key(b, k); ++ struct bkey nu = bkey_unpack_key(b, n); ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ ++ bch2_dump_btree_node(NULL, b); ++ bch2_bkey_to_text(&buf1, &ku); ++ bch2_bkey_to_text(&buf2, &nu); ++ printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", ++ buf1.buf, buf2.buf); ++ printk(KERN_ERR "iter was:"); ++ ++ btree_node_iter_for_each(_iter, set) { ++ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ printk(" [%zi %zi]", t - b->set, ++ k->_data - bset(b, t)->_data); ++ } ++ panic("\n"); ++ } ++} ++ ++void bch2_btree_node_iter_verify(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct btree_node_iter_set *set, *s2; ++ struct bkey_packed *k, *p; ++ struct bset_tree *t; ++ ++ if (bch2_btree_node_iter_end(iter)) ++ return; ++ ++ /* Verify no duplicates: */ ++ btree_node_iter_for_each(iter, set) { ++ BUG_ON(set->k > set->end); ++ btree_node_iter_for_each(iter, s2) ++ BUG_ON(set != s2 && set->end == s2->end); ++ } ++ ++ /* Verify that set->end is correct: */ ++ btree_node_iter_for_each(iter, set) { ++ for_each_bset(b, t) ++ if (set->end == t->end_offset) ++ goto found; ++ BUG(); ++found: ++ BUG_ON(set->k < btree_bkey_first_offset(t) || ++ set->k >= t->end_offset); ++ } ++ ++ /* Verify iterator is sorted: */ ++ btree_node_iter_for_each(iter, set) ++ BUG_ON(set != iter->data && ++ btree_node_iter_cmp(b, set[-1], set[0]) > 0); ++ ++ k = bch2_btree_node_iter_peek_all(iter, b); ++ ++ for_each_bset(b, t) { ++ if (iter->data[0].end == t->end_offset) ++ continue; ++ ++ p = bch2_bkey_prev_all(b, t, ++ bch2_btree_node_iter_bset_pos(iter, b, t)); ++ ++ BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); ++ } ++} ++ ++void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, ++ struct bkey_packed *insert, unsigned clobber_u64s) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, where); ++ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); ++ struct bkey_packed *next = (void *) (where->_data + clobber_u64s); ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++#if 0 ++ BUG_ON(prev && ++ bkey_iter_cmp(b, prev, insert) > 0); ++#else ++ if (prev && ++ bkey_iter_cmp(b, prev, insert) > 0) { ++ struct bkey k1 = bkey_unpack_key(b, prev); ++ struct bkey k2 = bkey_unpack_key(b, insert); ++ ++ bch2_dump_btree_node(NULL, b); ++ bch2_bkey_to_text(&buf1, &k1); ++ bch2_bkey_to_text(&buf2, &k2); ++ ++ panic("prev > insert:\n" ++ "prev key %s\n" ++ "insert key %s\n", ++ buf1.buf, buf2.buf); ++ } ++#endif ++#if 0 ++ BUG_ON(next != btree_bkey_last(b, t) && ++ bkey_iter_cmp(b, insert, next) > 0); ++#else ++ if (next != btree_bkey_last(b, t) && ++ bkey_iter_cmp(b, insert, next) > 0) { ++ struct bkey k1 = bkey_unpack_key(b, insert); ++ struct bkey k2 = bkey_unpack_key(b, next); ++ ++ bch2_dump_btree_node(NULL, b); ++ bch2_bkey_to_text(&buf1, &k1); ++ bch2_bkey_to_text(&buf2, &k2); ++ ++ panic("insert > next:\n" ++ "insert key %s\n" ++ "next key %s\n", ++ buf1.buf, buf2.buf); ++ } ++#endif ++} ++ ++#else ++ ++static inline void 
bch2_btree_node_iter_next_check(struct btree_node_iter *iter, ++ struct btree *b) {} ++ ++#endif ++ ++/* Auxiliary search trees */ ++ ++#define BFLOAT_FAILED_UNPACKED U8_MAX ++#define BFLOAT_FAILED U8_MAX ++ ++struct bkey_float { ++ u8 exponent; ++ u8 key_offset; ++ u16 mantissa; ++}; ++#define BKEY_MANTISSA_BITS 16 ++ ++static unsigned bkey_float_byte_offset(unsigned idx) ++{ ++ return idx * sizeof(struct bkey_float); ++} ++ ++struct ro_aux_tree { ++ struct bkey_float f[0]; ++}; ++ ++struct rw_aux_tree { ++ u16 offset; ++ struct bpos k; ++}; ++ ++static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) ++{ ++ BUG_ON(t->aux_data_offset == U16_MAX); ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ return t->aux_data_offset; ++ case BSET_RO_AUX_TREE: ++ return t->aux_data_offset + ++ DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + ++ t->size * sizeof(u8), 8); ++ case BSET_RW_AUX_TREE: ++ return t->aux_data_offset + ++ DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); ++ default: ++ BUG(); ++ } ++} ++ ++static unsigned bset_aux_tree_buf_start(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return t == b->set ++ ? DIV_ROUND_UP(b->unpack_fn_len, 8) ++ : bset_aux_tree_buf_end(t - 1); ++} ++ ++static void *__aux_tree_base(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return b->aux_data + t->aux_data_offset * 8; ++} ++ ++static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); ++ ++ return __aux_tree_base(b, t); ++} ++ ++static u8 *ro_aux_tree_prev(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); ++ ++ return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); ++} ++ ++static struct bkey_float *bkey_float(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned idx) ++{ ++ return ro_aux_tree_base(b, t)->f + idx; ++} ++ ++static void bset_aux_tree_verify(const struct btree *b) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ const struct bset_tree *t; ++ ++ for_each_bset(b, t) { ++ if (t->aux_data_offset == U16_MAX) ++ continue; ++ ++ BUG_ON(t != b->set && ++ t[-1].aux_data_offset == U16_MAX); ++ ++ BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); ++ BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); ++ BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); ++ } ++#endif ++} ++ ++void bch2_btree_keys_init(struct btree *b) ++{ ++ unsigned i; ++ ++ b->nsets = 0; ++ memset(&b->nr, 0, sizeof(b->nr)); ++ ++ for (i = 0; i < MAX_BSETS; i++) ++ b->set[i].data_offset = U16_MAX; ++ ++ bch2_bset_set_no_aux_tree(b, b->set); ++} ++ ++/* Binary tree stuff for auxiliary search trees */ ++ ++/* ++ * Cacheline/offset <-> bkey pointer arithmetic: ++ * ++ * t->tree is a binary search tree in an array; each node corresponds to a key ++ * in one cacheline in t->set (BSET_CACHELINE bytes). ++ * ++ * This means we don't have to store the full index of the key that a node in ++ * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and ++ * then bkey_float->m gives us the offset within that cacheline, in units of 8 ++ * bytes. ++ * ++ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to ++ * make this work. ++ * ++ * To construct the bfloat for an arbitrary key we need to know what the key ++ * immediately preceding it is: we have to check if the two keys differ in the ++ * bits we're going to store in bkey_float->mantissa. 
t->prev[j] stores the size ++ * of the previous key so we can walk backwards to it from t->tree[j]'s key. ++ */ ++ ++static inline void *bset_cacheline(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline) ++{ ++ return (void *) round_down((unsigned long) btree_bkey_first(b, t), ++ L1_CACHE_BYTES) + ++ cacheline * BSET_CACHELINE; ++} ++ ++static struct bkey_packed *cacheline_to_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ unsigned offset) ++{ ++ return bset_cacheline(b, t, cacheline) + offset * 8; ++} ++ ++static unsigned bkey_to_cacheline(const struct btree *b, ++ const struct bset_tree *t, ++ const struct bkey_packed *k) ++{ ++ return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; ++} ++ ++static ssize_t __bkey_to_cacheline_offset(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ const struct bkey_packed *k) ++{ ++ return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); ++} ++ ++static unsigned bkey_to_cacheline_offset(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ const struct bkey_packed *k) ++{ ++ size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); ++ ++ EBUG_ON(m > U8_MAX); ++ return m; ++} ++ ++static inline struct bkey_packed *tree_to_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned j) ++{ ++ return cacheline_to_bkey(b, t, ++ __eytzinger1_to_inorder(j, t->size - 1, t->extra), ++ bkey_float(b, t, j)->key_offset); ++} ++ ++static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned j) ++{ ++ unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; ++ ++ return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); ++} ++ ++static struct rw_aux_tree *rw_aux_tree(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); ++ ++ return __aux_tree_base(b, t); ++} ++ ++/* ++ * For the write set - the one we're currently inserting keys into - we don't ++ * maintain a full search tree, we just keep a simple lookup table in t->prev. 
++ */ ++static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, ++ struct bset_tree *t, ++ unsigned j) ++{ ++ return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); ++} ++ ++static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, ++ unsigned j, struct bkey_packed *k) ++{ ++ EBUG_ON(k >= btree_bkey_last(b, t)); ++ ++ rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { ++ .offset = __btree_node_key_to_offset(b, k), ++ .k = bkey_unpack_pos(b, k), ++ }; ++} ++ ++static void bch2_bset_verify_rw_aux_tree(struct btree *b, ++ struct bset_tree *t) ++{ ++ struct bkey_packed *k = btree_bkey_first(b, t); ++ unsigned j = 0; ++ ++ if (!bch2_expensive_debug_checks) ++ return; ++ ++ BUG_ON(bset_has_ro_aux_tree(t)); ++ ++ if (!bset_has_rw_aux_tree(t)) ++ return; ++ ++ BUG_ON(t->size < 1); ++ BUG_ON(rw_aux_to_bkey(b, t, j) != k); ++ ++ goto start; ++ while (1) { ++ if (rw_aux_to_bkey(b, t, j) == k) { ++ BUG_ON(bpos_cmp(rw_aux_tree(b, t)[j].k, ++ bkey_unpack_pos(b, k))); ++start: ++ if (++j == t->size) ++ break; ++ ++ BUG_ON(rw_aux_tree(b, t)[j].offset <= ++ rw_aux_tree(b, t)[j - 1].offset); ++ } ++ ++ k = bkey_next(k); ++ BUG_ON(k >= btree_bkey_last(b, t)); ++ } ++} ++ ++/* returns idx of first entry >= offset: */ ++static unsigned rw_aux_tree_bsearch(struct btree *b, ++ struct bset_tree *t, ++ unsigned offset) ++{ ++ unsigned bset_offs = offset - btree_bkey_first_offset(t); ++ unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); ++ unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0; ++ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); ++ EBUG_ON(!t->size); ++ EBUG_ON(idx > t->size); ++ ++ while (idx < t->size && ++ rw_aux_tree(b, t)[idx].offset < offset) ++ idx++; ++ ++ while (idx && ++ rw_aux_tree(b, t)[idx - 1].offset >= offset) ++ idx--; ++ ++ EBUG_ON(idx < t->size && ++ rw_aux_tree(b, t)[idx].offset < offset); ++ EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); ++ EBUG_ON(idx + 1 < t->size && ++ rw_aux_tree(b, t)[idx].offset == ++ rw_aux_tree(b, t)[idx + 1].offset); ++ ++ return idx; ++} ++ ++static inline unsigned bkey_mantissa(const struct bkey_packed *k, ++ const struct bkey_float *f, ++ unsigned idx) ++{ ++ u64 v; ++ ++ EBUG_ON(!bkey_packed(k)); ++ ++ v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); ++ ++ /* ++ * In little endian, we're shifting off low bits (and then the bits we ++ * want are at the low end), in big endian we're shifting off high bits ++ * (and then the bits we want are at the high end, so we shift them ++ * back down): ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ v >>= f->exponent & 7; ++#else ++ v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; ++#endif ++ return (u16) v; ++} ++ ++__always_inline ++static inline void make_bfloat(struct btree *b, struct bset_tree *t, ++ unsigned j, ++ struct bkey_packed *min_key, ++ struct bkey_packed *max_key) ++{ ++ struct bkey_float *f = bkey_float(b, t, j); ++ struct bkey_packed *m = tree_to_bkey(b, t, j); ++ struct bkey_packed *l = is_power_of_2(j) ++ ? min_key ++ : tree_to_prev_bkey(b, t, j >> ffs(j)); ++ struct bkey_packed *r = is_power_of_2(j + 1) ++ ? max_key ++ : tree_to_bkey(b, t, j >> (ffz(j) + 1)); ++ unsigned mantissa; ++ int shift, exponent, high_bit; ++ ++ /* ++ * for failed bfloats, the lookup code falls back to comparing against ++ * the original key. 
++ */ ++ ++ if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || ++ !b->nr_key_bits) { ++ f->exponent = BFLOAT_FAILED_UNPACKED; ++ return; ++ } ++ ++ /* ++ * The greatest differing bit of l and r is the first bit we must ++ * include in the bfloat mantissa we're creating in order to do ++ * comparisons - that bit always becomes the high bit of ++ * bfloat->mantissa, and thus the exponent we're calculating here is ++ * the position of what will become the low bit in bfloat->mantissa: ++ * ++ * Note that this may be negative - we may be running off the low end ++ * of the key: we handle this later: ++ */ ++ high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), ++ min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); ++ exponent = high_bit - (BKEY_MANTISSA_BITS - 1); ++ ++ /* ++ * Then we calculate the actual shift value, from the start of the key ++ * (k->_data), to get the key bits starting at exponent: ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; ++ ++ EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); ++#else ++ shift = high_bit_offset + ++ b->nr_key_bits - ++ exponent - ++ BKEY_MANTISSA_BITS; ++ ++ EBUG_ON(shift < KEY_PACKED_BITS_START); ++#endif ++ EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); ++ ++ f->exponent = shift; ++ mantissa = bkey_mantissa(m, f, j); ++ ++ /* ++ * If we've got garbage bits, set them to all 1s - it's legal for the ++ * bfloat to compare larger than the original key, but not smaller: ++ */ ++ if (exponent < 0) ++ mantissa |= ~(~0U << -exponent); ++ ++ f->mantissa = mantissa; ++} ++ ++/* bytes remaining - only valid for last bset: */ ++static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) ++{ ++ bset_aux_tree_verify(b); ++ ++ return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); ++} ++ ++static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t) ++{ ++ return __bset_tree_capacity(b, t) / ++ (sizeof(struct bkey_float) + sizeof(u8)); ++} ++ ++static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t) ++{ ++ return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); ++} ++ ++static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bkey_packed *k; ++ ++ t->size = 1; ++ t->extra = BSET_RW_AUX_TREE_VAL; ++ rw_aux_tree(b, t)[0].offset = ++ __btree_node_key_to_offset(b, btree_bkey_first(b, t)); ++ ++ bset_tree_for_each_key(b, t, k) { ++ if (t->size == bset_rw_tree_capacity(b, t)) ++ break; ++ ++ if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > ++ L1_CACHE_BYTES) ++ rw_aux_tree_set(b, t, t->size++, k); ++ } ++} ++ ++static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); ++ struct bkey_i min_key, max_key; ++ unsigned j, cacheline = 1; ++ ++ t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), ++ bset_ro_tree_capacity(b, t)); ++retry: ++ if (t->size < 2) { ++ t->size = 0; ++ t->extra = BSET_NO_AUX_TREE_VAL; ++ return; ++ } ++ ++ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; ++ ++ /* First we figure out where the first key in each cacheline is */ ++ eytzinger1_for_each(j, t->size - 1) { ++ while (bkey_to_cacheline(b, t, k) < cacheline) ++ prev = k, k = bkey_next(k); ++ ++ if (k >= btree_bkey_last(b, t)) { ++ /* XXX: this path sucks */ ++ t->size--; ++ goto retry; ++ } ++ ++ ro_aux_tree_prev(b, t)[j] 
= prev->u64s; ++ bkey_float(b, t, j)->key_offset = ++ bkey_to_cacheline_offset(b, t, cacheline++, k); ++ ++ EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); ++ EBUG_ON(tree_to_bkey(b, t, j) != k); ++ } ++ ++ while (k != btree_bkey_last(b, t)) ++ prev = k, k = bkey_next(k); ++ ++ if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { ++ bkey_init(&min_key.k); ++ min_key.k.p = b->data->min_key; ++ } ++ ++ if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { ++ bkey_init(&max_key.k); ++ max_key.k.p = b->data->max_key; ++ } ++ ++ /* Then we build the tree */ ++ eytzinger1_for_each(j, t->size - 1) ++ make_bfloat(b, t, j, ++ bkey_to_packed(&min_key), ++ bkey_to_packed(&max_key)); ++} ++ ++static void bset_alloc_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bset_tree *i; ++ ++ for (i = b->set; i != t; i++) ++ BUG_ON(bset_has_rw_aux_tree(i)); ++ ++ bch2_bset_set_no_aux_tree(b, t); ++ ++ /* round up to next cacheline: */ ++ t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), ++ SMP_CACHE_BYTES / sizeof(u64)); ++ ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, ++ bool writeable) ++{ ++ if (writeable ++ ? bset_has_rw_aux_tree(t) ++ : bset_has_ro_aux_tree(t)) ++ return; ++ ++ bset_alloc_tree(b, t); ++ ++ if (!__bset_tree_capacity(b, t)) ++ return; ++ ++ if (writeable) ++ __build_rw_aux_tree(b, t); ++ else ++ __build_ro_aux_tree(b, t); ++ ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_init_first(struct btree *b, struct bset *i) ++{ ++ struct bset_tree *t; ++ ++ BUG_ON(b->nsets); ++ ++ memset(i, 0, sizeof(*i)); ++ get_random_bytes(&i->seq, sizeof(i->seq)); ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ t = &b->set[b->nsets++]; ++ set_btree_bset(b, t, i); ++} ++ ++void bch2_bset_init_next(struct bch_fs *c, struct btree *b, ++ struct btree_node_entry *bne) ++{ ++ struct bset *i = &bne->keys; ++ struct bset_tree *t; ++ ++ BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); ++ BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); ++ BUG_ON(b->nsets >= MAX_BSETS); ++ ++ memset(i, 0, sizeof(*i)); ++ i->seq = btree_bset_first(b)->seq; ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ t = &b->set[b->nsets++]; ++ set_btree_bset(b, t, i); ++} ++ ++/* ++ * find _some_ key in the same bset as @k that precedes @k - not necessarily the ++ * immediate predecessor: ++ */ ++static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ struct bkey_packed *p; ++ unsigned offset; ++ int j; ++ ++ EBUG_ON(k < btree_bkey_first(b, t) || ++ k > btree_bkey_last(b, t)); ++ ++ if (k == btree_bkey_first(b, t)) ++ return NULL; ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ p = btree_bkey_first(b, t); ++ break; ++ case BSET_RO_AUX_TREE: ++ j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); ++ ++ do { ++ p = j ? tree_to_bkey(b, t, ++ __inorder_to_eytzinger1(j--, ++ t->size - 1, t->extra)) ++ : btree_bkey_first(b, t); ++ } while (p >= k); ++ break; ++ case BSET_RW_AUX_TREE: ++ offset = __btree_node_key_to_offset(b, k); ++ j = rw_aux_tree_bsearch(b, t, offset); ++ p = j ? 
rw_aux_to_bkey(b, t, j - 1) ++ : btree_bkey_first(b, t); ++ break; ++ } ++ ++ return p; ++} ++ ++struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k, ++ unsigned min_key_type) ++{ ++ struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; ++ ++ while ((p = __bkey_prev(b, t, k)) && !ret) { ++ for (i = p; i != k; i = bkey_next(i)) ++ if (i->type >= min_key_type) ++ ret = i; ++ ++ k = p; ++ } ++ ++ if (bch2_expensive_debug_checks) { ++ BUG_ON(ret >= orig_k); ++ ++ for (i = ret ++ ? bkey_next(ret) ++ : btree_bkey_first(b, t); ++ i != orig_k; ++ i = bkey_next(i)) ++ BUG_ON(i->type >= min_key_type); ++ } ++ ++ return ret; ++} ++ ++/* Insert */ ++ ++static void bch2_bset_fix_lookup_table(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *_where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ int shift = new_u64s - clobber_u64s; ++ unsigned l, j, where = __btree_node_key_to_offset(b, _where); ++ ++ EBUG_ON(bset_has_ro_aux_tree(t)); ++ ++ if (!bset_has_rw_aux_tree(t)) ++ return; ++ ++ /* returns first entry >= where */ ++ l = rw_aux_tree_bsearch(b, t, where); ++ ++ if (!l) /* never delete first entry */ ++ l++; ++ else if (l < t->size && ++ where < t->end_offset && ++ rw_aux_tree(b, t)[l].offset == where) ++ rw_aux_tree_set(b, t, l++, _where); ++ ++ /* l now > where */ ++ ++ for (j = l; ++ j < t->size && ++ rw_aux_tree(b, t)[j].offset < where + clobber_u64s; ++ j++) ++ ; ++ ++ if (j < t->size && ++ rw_aux_tree(b, t)[j].offset + shift == ++ rw_aux_tree(b, t)[l - 1].offset) ++ j++; ++ ++ memmove(&rw_aux_tree(b, t)[l], ++ &rw_aux_tree(b, t)[j], ++ (void *) &rw_aux_tree(b, t)[t->size] - ++ (void *) &rw_aux_tree(b, t)[j]); ++ t->size -= j - l; ++ ++ for (j = l; j < t->size; j++) ++ rw_aux_tree(b, t)[j].offset += shift; ++ ++ EBUG_ON(l < t->size && ++ rw_aux_tree(b, t)[l].offset == ++ rw_aux_tree(b, t)[l - 1].offset); ++ ++ if (t->size < bset_rw_tree_capacity(b, t) && ++ (l < t->size ++ ? rw_aux_tree(b, t)[l].offset ++ : t->end_offset) - ++ rw_aux_tree(b, t)[l - 1].offset > ++ L1_CACHE_BYTES / sizeof(u64)) { ++ struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); ++ struct bkey_packed *end = l < t->size ++ ? 
rw_aux_to_bkey(b, t, l) ++ : btree_bkey_last(b, t); ++ struct bkey_packed *k = start; ++ ++ while (1) { ++ k = bkey_next(k); ++ if (k == end) ++ break; ++ ++ if ((void *) k - (void *) start >= L1_CACHE_BYTES) { ++ memmove(&rw_aux_tree(b, t)[l + 1], ++ &rw_aux_tree(b, t)[l], ++ (void *) &rw_aux_tree(b, t)[t->size] - ++ (void *) &rw_aux_tree(b, t)[l]); ++ t->size++; ++ rw_aux_tree_set(b, t, l, k); ++ break; ++ } ++ } ++ } ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_insert(struct btree *b, ++ struct btree_node_iter *iter, ++ struct bkey_packed *where, ++ struct bkey_i *insert, ++ unsigned clobber_u64s) ++{ ++ struct bkey_format *f = &b->format; ++ struct bset_tree *t = bset_tree_last(b); ++ struct bkey_packed packed, *src = bkey_to_packed(insert); ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); ++ ++ if (bch2_bkey_pack_key(&packed, &insert->k, f)) ++ src = &packed; ++ ++ if (!bkey_deleted(&insert->k)) ++ btree_keys_account_key_add(&b->nr, t - b->set, src); ++ ++ if (src->u64s != clobber_u64s) { ++ u64 *src_p = where->_data + clobber_u64s; ++ u64 *dst_p = where->_data + src->u64s; ++ ++ EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < ++ (int) clobber_u64s - src->u64s); ++ ++ memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); ++ le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); ++ set_btree_bset_end(b, t); ++ } ++ ++ memcpy_u64s(where, src, ++ bkeyp_key_u64s(f, src)); ++ memcpy_u64s(bkeyp_val(f, where), &insert->v, ++ bkeyp_val_u64s(f, src)); ++ ++ if (src->u64s != clobber_u64s) ++ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); ++ ++ bch2_verify_btree_nr_keys(b); ++} ++ ++void bch2_bset_delete(struct btree *b, ++ struct bkey_packed *where, ++ unsigned clobber_u64s) ++{ ++ struct bset_tree *t = bset_tree_last(b); ++ u64 *src_p = where->_data + clobber_u64s; ++ u64 *dst_p = where->_data; ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ ++ EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); ++ ++ memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); ++ le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); ++ set_btree_bset_end(b, t); ++ ++ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); ++} ++ ++/* Lookup */ ++ ++__flatten ++static struct bkey_packed *bset_search_write_set(const struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search) ++{ ++ unsigned l = 0, r = t->size; ++ ++ while (l + 1 != r) { ++ unsigned m = (l + r) >> 1; ++ ++ if (bpos_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) ++ l = m; ++ else ++ r = m; ++ } ++ ++ return rw_aux_to_bkey(b, t, l); ++} ++ ++static inline void prefetch_four_cachelines(void *p) ++{ ++#ifdef CONFIG_X86_64 ++ asm("prefetcht0 (-127 + 64 * 0)(%0);" ++ "prefetcht0 (-127 + 64 * 1)(%0);" ++ "prefetcht0 (-127 + 64 * 2)(%0);" ++ "prefetcht0 (-127 + 64 * 3)(%0);" ++ : ++ : "r" (p + 127)); ++#else ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ prefetch(p + L1_CACHE_BYTES * 3); ++#endif ++} ++ ++static inline bool bkey_mantissa_bits_dropped(const struct btree *b, ++ const struct bkey_float *f, ++ unsigned idx) ++{ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; ++ ++ return f->exponent > key_bits_start; ++#else ++ unsigned key_bits_end = high_bit_offset + b->nr_key_bits; ++ ++ return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; ++#endif ++} ++ 
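The bset_search_tree() that follows walks the bkey_float array as an implicit binary tree: slot n's children live in slots 2*n and 2*n + 1, which is also why its loop can usefully prefetch &base->f[n << 4], the start of the sixteen descendants four levels further down. Below is a minimal user-space sketch of that eytzinger-style descent, assuming plain unsigned keys in place of packed bkeys; the helper name eytzinger_lower_bound and the test values are this sketch's own, not part of the patch.

/*
 * Illustrative sketch only, not part of the patch: lower-bound search over an
 * array stored in eytzinger (breadth-first) order, the layout that
 * bset_search_tree() relies on.  Slot 0 is unused, the root is slot 1, and
 * slot i's children are slots 2*i and 2*i + 1.
 */
#include <assert.h>

/* returns the slot of the first element >= key, or 0 if every element is < key */
static unsigned eytzinger_lower_bound(const unsigned *tree, unsigned n,
				      unsigned key)
{
	unsigned i = 1;

	while (i <= n)
		i = 2 * i + (tree[i] < key);	/* go right iff tree[i] < key */

	/*
	 * i encodes the path taken: strip the trailing "went right" bits and
	 * the one "went left" bit to land back on the answer node.
	 */
	return i >> (__builtin_ctz(~i) + 1);
}

int main(void)
{
	/* the sorted set {1, 2, 3, 5, 8} laid out in eytzinger order */
	const unsigned tree[] = { 0, 5, 2, 8, 1, 3 };

	assert(eytzinger_lower_bound(tree, 5, 4) == 1);	/* first key >= 4 is 5, in slot 1 */
	assert(eytzinger_lower_bound(tree, 5, 3) == 5);	/* 3 itself, in slot 5 */
	assert(eytzinger_lower_bound(tree, 5, 9) == 0);	/* nothing >= 9 */
	return 0;
}

For key 4 the walk visits slots 1, 2 and 5 and returns slot 1, whose value 5 is the first element not less than 4. The real search descends on 16-bit mantissas and falls back to a full key comparison whenever a node was flagged BFLOAT_FAILED; it also recovers the answer by converting the final position back to an inorder cacheline index rather than with the bit trick used here.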
++__flatten ++static struct bkey_packed *bset_search_tree(const struct btree *b, ++ const struct bset_tree *t, ++ const struct bpos *search, ++ const struct bkey_packed *packed_search) ++{ ++ struct ro_aux_tree *base = ro_aux_tree_base(b, t); ++ struct bkey_float *f; ++ struct bkey_packed *k; ++ unsigned inorder, n = 1, l, r; ++ int cmp; ++ ++ do { ++ if (likely(n << 4 < t->size)) ++ prefetch(&base->f[n << 4]); ++ ++ f = &base->f[n]; ++ if (unlikely(f->exponent >= BFLOAT_FAILED)) ++ goto slowpath; ++ ++ l = f->mantissa; ++ r = bkey_mantissa(packed_search, f, n); ++ ++ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) ++ goto slowpath; ++ ++ n = n * 2 + (l < r); ++ continue; ++slowpath: ++ k = tree_to_bkey(b, t, n); ++ cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); ++ if (!cmp) ++ return k; ++ ++ n = n * 2 + (cmp < 0); ++ } while (n < t->size); ++ ++ inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); ++ ++ /* ++ * n would have been the node we recursed to - the low bit tells us if ++ * we recursed left or recursed right. ++ */ ++ if (likely(!(n & 1))) { ++ --inorder; ++ if (unlikely(!inorder)) ++ return btree_bkey_first(b, t); ++ ++ f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)]; ++ } ++ ++ return cacheline_to_bkey(b, t, inorder, f->key_offset); ++} ++ ++static __always_inline __flatten ++struct bkey_packed *__bch2_bset_search(struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ const struct bkey_packed *lossy_packed_search) ++{ ++ ++ /* ++ * First, we search for a cacheline, then lastly we do a linear search ++ * within that cacheline. ++ * ++ * To search for the cacheline, there's three different possibilities: ++ * * The set is too small to have a search tree, so we just do a linear ++ * search over the whole set. ++ * * The set is the one we're currently inserting into; keeping a full ++ * auxiliary search tree up to date would be too expensive, so we ++ * use a much simpler lookup table to do a binary search - ++ * bset_search_write_set(). 
++ * * Or we use the auxiliary search tree we constructed earlier - ++ * bset_search_tree() ++ */ ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ return btree_bkey_first(b, t); ++ case BSET_RW_AUX_TREE: ++ return bset_search_write_set(b, t, search); ++ case BSET_RO_AUX_TREE: ++ return bset_search_tree(b, t, search, lossy_packed_search); ++ default: ++ unreachable(); ++ } ++} ++ ++static __always_inline __flatten ++struct bkey_packed *bch2_bset_search_linear(struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ struct bkey_packed *packed_search, ++ const struct bkey_packed *lossy_packed_search, ++ struct bkey_packed *m) ++{ ++ if (lossy_packed_search) ++ while (m != btree_bkey_last(b, t) && ++ bkey_iter_cmp_p_or_unp(b, m, ++ lossy_packed_search, search) < 0) ++ m = bkey_next(m); ++ ++ if (!packed_search) ++ while (m != btree_bkey_last(b, t) && ++ bkey_iter_pos_cmp(b, m, search) < 0) ++ m = bkey_next(m); ++ ++ if (bch2_expensive_debug_checks) { ++ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); ++ ++ BUG_ON(prev && ++ bkey_iter_cmp_p_or_unp(b, prev, ++ packed_search, search) >= 0); ++ } ++ ++ return m; ++} ++ ++/* Btree node iterator */ ++ ++static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, ++ struct btree *b, ++ const struct bkey_packed *k, ++ const struct bkey_packed *end) ++{ ++ if (k != end) { ++ struct btree_node_iter_set *pos; ++ ++ btree_node_iter_for_each(iter, pos) ++ ; ++ ++ BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); ++ *pos = (struct btree_node_iter_set) { ++ __btree_node_key_to_offset(b, k), ++ __btree_node_key_to_offset(b, end) ++ }; ++ } ++} ++ ++void bch2_btree_node_iter_push(struct btree_node_iter *iter, ++ struct btree *b, ++ const struct bkey_packed *k, ++ const struct bkey_packed *end) ++{ ++ __bch2_btree_node_iter_push(iter, b, k, end); ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++noinline __flatten __attribute__((cold)) ++static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, ++ struct btree *b, struct bpos *search) ++{ ++ struct bkey_packed *k; ++ ++ trace_bkey_pack_pos_fail(search); ++ ++ bch2_btree_node_iter_init_from_start(iter, b); ++ ++ while ((k = bch2_btree_node_iter_peek(iter, b)) && ++ bkey_iter_pos_cmp(b, k, search) < 0) ++ bch2_btree_node_iter_advance(iter, b); ++} ++ ++/** ++ * bch_btree_node_iter_init - initialize a btree node iterator, starting from a ++ * given position ++ * ++ * Main entry point to the lookup code for individual btree nodes: ++ * ++ * NOTE: ++ * ++ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate ++ * keys. This doesn't matter for most code, but it does matter for lookups. ++ * ++ * Some adjacent keys with a string of equal keys: ++ * i j k k k k l m ++ * ++ * If you search for k, the lookup code isn't guaranteed to return you any ++ * specific k. The lookup code is conceptually doing a binary search and ++ * iterating backwards is very expensive so if the pivot happens to land at the ++ * last k that's what you'll get. ++ * ++ * This works out ok, but it's something to be aware of: ++ * ++ * - For non extents, we guarantee that the live key comes last - see ++ * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't ++ * see will only be deleted keys you don't care about. ++ * ++ * - For extents, deleted keys sort last (see the comment at the top of this ++ * file). 
But when you're searching for extents, you actually want the first ++ * key strictly greater than your search key - an extent that compares equal ++ * to the search key is going to have 0 sectors after the search key. ++ * ++ * But this does mean that we can't just search for ++ * bpos_successor(start_of_range) to get the first extent that overlaps with ++ * the range we want - if we're unlucky and there's an extent that ends ++ * exactly where we searched, then there could be a deleted key at the same ++ * position and we'd get that when we search instead of the preceding extent ++ * we needed. ++ * ++ * So we've got to search for start_of_range, then after the lookup iterate ++ * past any extents that compare equal to the position we searched for. ++ */ ++__flatten ++void bch2_btree_node_iter_init(struct btree_node_iter *iter, ++ struct btree *b, struct bpos *search) ++{ ++ struct bkey_packed p, *packed_search = NULL; ++ struct btree_node_iter_set *pos = iter->data; ++ struct bkey_packed *k[MAX_BSETS]; ++ unsigned i; ++ ++ EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0); ++ EBUG_ON(bpos_cmp(*search, b->data->max_key) > 0); ++ bset_aux_tree_verify(b); ++ ++ memset(iter, 0, sizeof(*iter)); ++ ++ switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { ++ case BKEY_PACK_POS_EXACT: ++ packed_search = &p; ++ break; ++ case BKEY_PACK_POS_SMALLER: ++ packed_search = NULL; ++ break; ++ case BKEY_PACK_POS_FAIL: ++ btree_node_iter_init_pack_failed(iter, b, search); ++ return; ++ } ++ ++ for (i = 0; i < b->nsets; i++) { ++ k[i] = __bch2_bset_search(b, b->set + i, search, &p); ++ prefetch_four_cachelines(k[i]); ++ } ++ ++ for (i = 0; i < b->nsets; i++) { ++ struct bset_tree *t = b->set + i; ++ struct bkey_packed *end = btree_bkey_last(b, t); ++ ++ k[i] = bch2_bset_search_linear(b, t, search, ++ packed_search, &p, k[i]); ++ if (k[i] != end) ++ *pos++ = (struct btree_node_iter_set) { ++ __btree_node_key_to_offset(b, k[i]), ++ __btree_node_key_to_offset(b, end) ++ }; ++ } ++ ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ memset(iter, 0, sizeof(*iter)); ++ ++ for_each_bset(b, t) ++ __bch2_btree_node_iter_push(iter, b, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bset_tree *t) ++{ ++ struct btree_node_iter_set *set; ++ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == t->end_offset) ++ return __btree_node_offset_to_key(b, set->k); ++ ++ return btree_bkey_last(b, t); ++} ++ ++static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, ++ struct btree *b, ++ unsigned first) ++{ ++ bool ret; ++ ++ if ((ret = (btree_node_iter_cmp(b, ++ iter->data[first], ++ iter->data[first + 1]) > 0))) ++ swap(iter->data[first], iter->data[first + 1]); ++ return ret; ++} ++ ++void bch2_btree_node_iter_sort(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ /* unrolled bubble sort: */ ++ ++ if (!__btree_node_iter_set_end(iter, 2)) { ++ btree_node_iter_sort_two(iter, b, 0); ++ btree_node_iter_sort_two(iter, b, 1); ++ } ++ ++ if (!__btree_node_iter_set_end(iter, 1)) ++ btree_node_iter_sort_two(iter, b, 0); ++} ++ ++void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, ++ struct btree_node_iter_set *set) ++{ ++ struct btree_node_iter_set *last = ++ iter->data + ARRAY_SIZE(iter->data) - 1; ++ ++ 
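++	/* slide the sets after @set down one slot, then clear the now-unused last slot */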
memmove(&set[0], &set[1], (void *) last - (void *) set); ++ *last = (struct btree_node_iter_set) { 0, 0 }; ++} ++ ++static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; ++ ++ EBUG_ON(iter->data->k > iter->data->end); ++ ++ if (unlikely(__btree_node_iter_set_end(iter, 0))) { ++ bch2_btree_node_iter_set_drop(iter, iter->data); ++ return; ++ } ++ ++ if (__btree_node_iter_set_end(iter, 1)) ++ return; ++ ++ if (!btree_node_iter_sort_two(iter, b, 0)) ++ return; ++ ++ if (__btree_node_iter_set_end(iter, 2)) ++ return; ++ ++ btree_node_iter_sort_two(iter, b, 1); ++} ++ ++void bch2_btree_node_iter_advance(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ if (bch2_expensive_debug_checks) { ++ bch2_btree_node_iter_verify(iter, b); ++ bch2_btree_node_iter_next_check(iter, b); ++ } ++ ++ __bch2_btree_node_iter_advance(iter, b); ++} ++ ++/* ++ * Expensive: ++ */ ++struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct bkey_packed *k, *prev = NULL; ++ struct btree_node_iter_set *set; ++ struct bset_tree *t; ++ unsigned end = 0; ++ ++ if (bch2_expensive_debug_checks) ++ bch2_btree_node_iter_verify(iter, b); ++ ++ for_each_bset(b, t) { ++ k = bch2_bkey_prev_all(b, t, ++ bch2_btree_node_iter_bset_pos(iter, b, t)); ++ if (k && ++ (!prev || bkey_iter_cmp(b, k, prev) > 0)) { ++ prev = k; ++ end = t->end_offset; ++ } ++ } ++ ++ if (!prev) ++ return NULL; ++ ++ /* ++ * We're manually memmoving instead of just calling sort() to ensure the ++ * prev we picked ends up in slot 0 - sort won't necessarily put it ++ * there because of duplicate deleted keys: ++ */ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == end) ++ goto found; ++ ++ BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); ++found: ++ BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); ++ ++ memmove(&iter->data[1], ++ &iter->data[0], ++ (void *) set - (void *) &iter->data[0]); ++ ++ iter->data[0].k = __btree_node_key_to_offset(b, prev); ++ iter->data[0].end = end; ++ ++ if (bch2_expensive_debug_checks) ++ bch2_btree_node_iter_verify(iter, b); ++ return prev; ++} ++ ++struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct bkey_packed *prev; ++ ++ do { ++ prev = bch2_btree_node_iter_prev_all(iter, b); ++ } while (prev && bkey_deleted(prev)); ++ ++ return prev; ++} ++ ++struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bkey *u) ++{ ++ struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); ++ ++ return k ? 
bkey_disassemble(b, k, u) : bkey_s_c_null; ++} ++ ++/* Mergesort */ ++ ++void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) { ++ enum bset_aux_tree_type type = bset_aux_tree_type(t); ++ size_t j; ++ ++ stats->sets[type].nr++; ++ stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * ++ sizeof(u64); ++ ++ if (bset_has_ro_aux_tree(t)) { ++ stats->floats += t->size - 1; ++ ++ for (j = 1; j < t->size; j++) ++ stats->failed += ++ bkey_float(b, t, j)->exponent == ++ BFLOAT_FAILED; ++ } ++ } ++} ++ ++void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, ++ struct bkey_packed *k) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ struct bkey uk; ++ unsigned j, inorder; ++ ++ if (!bset_has_ro_aux_tree(t)) ++ return; ++ ++ inorder = bkey_to_cacheline(b, t, k); ++ if (!inorder || inorder >= t->size) ++ return; ++ ++ j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra); ++ if (k != tree_to_bkey(b, t, j)) ++ return; ++ ++ switch (bkey_float(b, t, j)->exponent) { ++ case BFLOAT_FAILED: ++ uk = bkey_unpack_key(b, k); ++ prt_printf(out, ++ " failed unpacked at depth %u\n" ++ "\t", ++ ilog2(j)); ++ bch2_bpos_to_text(out, uk.p); ++ prt_printf(out, "\n"); ++ break; ++ } ++} +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +new file mode 100644 +index 000000000000..0d46534c3dcd +--- /dev/null ++++ b/fs/bcachefs/bset.h +@@ -0,0 +1,615 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BSET_H ++#define _BCACHEFS_BSET_H ++ ++#include ++#include ++ ++#include "bcachefs.h" ++#include "bkey.h" ++#include "bkey_methods.h" ++#include "btree_types.h" ++#include "util.h" /* for time_stats */ ++#include "vstructs.h" ++ ++/* ++ * BKEYS: ++ * ++ * A bkey contains a key, a size field, a variable number of pointers, and some ++ * ancillary flag bits. ++ * ++ * We use two different functions for validating bkeys, bkey_invalid and ++ * bkey_deleted(). ++ * ++ * The one exception to the rule that ptr_invalid() filters out invalid keys is ++ * that it also filters out keys of size 0 - these are keys that have been ++ * completely overwritten. It'd be safe to delete these in memory while leaving ++ * them on disk, just unnecessary work - so we filter them out when resorting ++ * instead. ++ * ++ * We can't filter out stale keys when we're resorting, because garbage ++ * collection needs to find them to ensure bucket gens don't wrap around - ++ * unless we're rewriting the btree node those stale keys still exist on disk. ++ * ++ * We also implement functions here for removing some number of sectors from the ++ * front or the back of a bkey - this is mainly used for fixing overlapping ++ * extents, by removing the overlapping sectors from the older key. ++ * ++ * BSETS: ++ * ++ * A bset is an array of bkeys laid out contiguously in memory in sorted order, ++ * along with a header. A btree node is made up of a number of these, written at ++ * different times. ++ * ++ * There could be many of them on disk, but we never allow there to be more than ++ * 4 in memory - we lazily resort as needed. ++ * ++ * We implement code here for creating and maintaining auxiliary search trees ++ * (described below) for searching an individial bset, and on top of that we ++ * implement a btree iterator. ++ * ++ * BTREE ITERATOR: ++ * ++ * Most of the code in bcache doesn't care about an individual bset - it needs ++ * to search entire btree nodes and iterate over them in sorted order. 
++ * ++ * The btree iterator code serves both functions; it iterates through the keys ++ * in a btree node in sorted order, starting from either keys after a specific ++ * point (if you pass it a search key) or the start of the btree node. ++ * ++ * AUXILIARY SEARCH TREES: ++ * ++ * Since keys are variable length, we can't use a binary search on a bset - we ++ * wouldn't be able to find the start of the next key. But binary searches are ++ * slow anyways, due to terrible cache behaviour; bcache originally used binary ++ * searches and that code topped out at under 50k lookups/second. ++ * ++ * So we need to construct some sort of lookup table. Since we only insert keys ++ * into the last (unwritten) set, most of the keys within a given btree node are ++ * usually in sets that are mostly constant. We use two different types of ++ * lookup tables to take advantage of this. ++ * ++ * Both lookup tables share in common that they don't index every key in the ++ * set; they index one key every BSET_CACHELINE bytes, and then a linear search ++ * is used for the rest. ++ * ++ * For sets that have been written to disk and are no longer being inserted ++ * into, we construct a binary search tree in an array - traversing a binary ++ * search tree in an array gives excellent locality of reference and is very ++ * fast, since both children of any node are adjacent to each other in memory ++ * (and their grandchildren, and great grandchildren...) - this means ++ * prefetching can be used to great effect. ++ * ++ * It's quite useful performance wise to keep these nodes small - not just ++ * because they're more likely to be in L2, but also because we can prefetch ++ * more nodes on a single cacheline and thus prefetch more iterations in advance ++ * when traversing this tree. ++ * ++ * Nodes in the auxiliary search tree must contain both a key to compare against ++ * (we don't want to fetch the key from the set, that would defeat the purpose), ++ * and a pointer to the key. We use a few tricks to compress both of these. ++ * ++ * To compress the pointer, we take advantage of the fact that one node in the ++ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have ++ * a function (to_inorder()) that takes the index of a node in a binary tree and ++ * returns what its index would be in an inorder traversal, so we only have to ++ * store the low bits of the offset. ++ * ++ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To ++ * compress that, we take advantage of the fact that when we're traversing the ++ * search tree at every iteration we know that both our search key and the key ++ * we're looking for lie within some range - bounded by our previous ++ * comparisons. (We special case the start of a search so that this is true even ++ * at the root of the tree). ++ * ++ * So we know the key we're looking for is between a and b, and a and b don't ++ * differ higher than bit 50, we don't need to check anything higher than bit ++ * 50. ++ * ++ * We don't usually need the rest of the bits, either; we only need enough bits ++ * to partition the key range we're currently checking. Consider key n - the ++ * key our auxiliary search tree node corresponds to, and key p, the key ++ * immediately preceding n. The lowest bit we need to store in the auxiliary ++ * search tree is the highest bit that differs between n and p. ++ * ++ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the ++ * comparison. 
But we'd really like our nodes in the auxiliary search tree to be ++ * of fixed size. ++ * ++ * The solution is to make them fixed size, and when we're constructing a node ++ * check if p and n differed in the bits we needed them to. If they don't we ++ * flag that node, and when doing lookups we fall back to comparing against the ++ * real key. As long as this doesn't happen too often (and it seems to reliably ++ * happen a bit less than 1% of the time), we win - even on failures, that key ++ * is then more likely to be in cache than if we were doing binary searches all ++ * the way, since we're touching so much less memory. ++ * ++ * The keys in the auxiliary search tree are stored in (software) floating ++ * point, with an exponent and a mantissa. The exponent needs to be big enough ++ * to address all the bits in the original key, but the number of bits in the ++ * mantissa is somewhat arbitrary; more bits just gets us fewer failures. ++ * ++ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys ++ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. ++ * We need one node per 128 bytes in the btree node, which means the auxiliary ++ * search trees take up 3% as much memory as the btree itself. ++ * ++ * Constructing these auxiliary search trees is moderately expensive, and we ++ * don't want to be constantly rebuilding the search tree for the last set ++ * whenever we insert another key into it. For the unwritten set, we use a much ++ * simpler lookup table - it's just a flat array, so index i in the lookup table ++ * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing ++ * within each byte range works the same as with the auxiliary search trees. ++ * ++ * These are much easier to keep up to date when we insert a key - we do it ++ * somewhat lazily; when we shift a key up we usually just increment the pointer ++ * to it, only when it would overflow do we go to the trouble of finding the ++ * first key in that range of bytes again. ++ */ ++ ++enum bset_aux_tree_type { ++ BSET_NO_AUX_TREE, ++ BSET_RO_AUX_TREE, ++ BSET_RW_AUX_TREE, ++}; ++ ++#define BSET_TREE_NR_TYPES 3 ++ ++#define BSET_NO_AUX_TREE_VAL (U16_MAX) ++#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1) ++ ++static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t) ++{ ++ switch (t->extra) { ++ case BSET_NO_AUX_TREE_VAL: ++ EBUG_ON(t->size); ++ return BSET_NO_AUX_TREE; ++ case BSET_RW_AUX_TREE_VAL: ++ EBUG_ON(!t->size); ++ return BSET_RW_AUX_TREE; ++ default: ++ EBUG_ON(!t->size); ++ return BSET_RO_AUX_TREE; ++ } ++} ++ ++/* ++ * BSET_CACHELINE was originally intended to match the hardware cacheline size - ++ * it used to be 64, but I realized the lookup code would touch slightly less ++ * memory if it was 128. ++ * ++ * It defines the number of bytes (in struct bset) per struct bkey_float in ++ * the auxiliary search tree - when we're done searching the bset_float tree we ++ * have this many bytes left that we do a linear search over. ++ * ++ * Since (after level 5) every level of the bset_tree is on a new cacheline, ++ * we're touching one fewer cacheline in the bset tree in exchange for one more ++ * cacheline in the linear search - but the linear search might stop before it ++ * gets to the second cacheline.
++ */ ++ ++#define BSET_CACHELINE 256 ++ ++static inline size_t btree_keys_cachelines(const struct btree *b) ++{ ++ return (1U << b->byte_order) / BSET_CACHELINE; ++} ++ ++static inline size_t btree_aux_data_bytes(const struct btree *b) ++{ ++ return btree_keys_cachelines(b) * 8; ++} ++ ++static inline size_t btree_aux_data_u64s(const struct btree *b) ++{ ++ return btree_aux_data_bytes(b) / sizeof(u64); ++} ++ ++typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); ++ ++static inline void ++__bkey_unpack_key_format_checked(const struct btree *b, ++ struct bkey *dst, ++ const struct bkey_packed *src) ++{ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ { ++ compiled_unpack_fn unpack_fn = b->aux_data; ++ unpack_fn(dst, src); ++ ++ if (bch2_expensive_debug_checks) { ++ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); ++ ++ BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); ++ } ++ } ++#else ++ *dst = __bch2_bkey_unpack_key(&b->format, src); ++#endif ++} ++ ++static inline struct bkey ++bkey_unpack_key_format_checked(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ struct bkey dst; ++ ++ __bkey_unpack_key_format_checked(b, &dst, src); ++ return dst; ++} ++ ++static inline void __bkey_unpack_key(const struct btree *b, ++ struct bkey *dst, ++ const struct bkey_packed *src) ++{ ++ if (likely(bkey_packed(src))) ++ __bkey_unpack_key_format_checked(b, dst, src); ++ else ++ *dst = *packed_to_bkey_c(src); ++} ++ ++/** ++ * bkey_unpack_key -- unpack just the key, not the value ++ */ ++static inline struct bkey bkey_unpack_key(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ return likely(bkey_packed(src)) ++ ? bkey_unpack_key_format_checked(b, src) ++ : *packed_to_bkey_c(src); ++} ++ ++static inline struct bpos ++bkey_unpack_pos_format_checked(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ return bkey_unpack_key_format_checked(b, src).p; ++#else ++ return __bkey_unpack_pos(&b->format, src); ++#endif ++} ++ ++static inline struct bpos bkey_unpack_pos(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ return likely(bkey_packed(src)) ++ ? 
bkey_unpack_pos_format_checked(b, src) ++ : packed_to_bkey_c(src)->p; ++} ++ ++/* Disassembled bkeys */ ++ ++static inline struct bkey_s_c bkey_disassemble(struct btree *b, ++ const struct bkey_packed *k, ++ struct bkey *u) ++{ ++ __bkey_unpack_key(b, u, k); ++ ++ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; ++} ++ ++/* non const version: */ ++static inline struct bkey_s __bkey_disassemble(struct btree *b, ++ struct bkey_packed *k, ++ struct bkey *u) ++{ ++ __bkey_unpack_key(b, u, k); ++ ++ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; ++} ++ ++#define for_each_bset(_b, _t) \ ++ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) ++ ++#define bset_tree_for_each_key(_b, _t, _k) \ ++ for (_k = btree_bkey_first(_b, _t); \ ++ _k != btree_bkey_last(_b, _t); \ ++ _k = bkey_next(_k)) ++ ++static inline bool bset_has_ro_aux_tree(struct bset_tree *t) ++{ ++ return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; ++} ++ ++static inline bool bset_has_rw_aux_tree(struct bset_tree *t) ++{ ++ return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; ++} ++ ++static inline void bch2_bset_set_no_aux_tree(struct btree *b, ++ struct bset_tree *t) ++{ ++ BUG_ON(t < b->set); ++ ++ for (; t < b->set + ARRAY_SIZE(b->set); t++) { ++ t->size = 0; ++ t->extra = BSET_NO_AUX_TREE_VAL; ++ t->aux_data_offset = U16_MAX; ++ } ++} ++ ++static inline void btree_node_set_format(struct btree *b, ++ struct bkey_format f) ++{ ++ int len; ++ ++ b->format = f; ++ b->nr_key_bits = bkey_format_key_bits(&f); ++ ++ len = bch2_compile_bkey_format(&b->format, b->aux_data); ++ BUG_ON(len < 0 || len > U8_MAX); ++ ++ b->unpack_fn_len = len; ++ ++ bch2_bset_set_no_aux_tree(b, b->set); ++} ++ ++static inline struct bset *bset_next_set(struct btree *b, ++ unsigned block_bytes) ++{ ++ struct bset *i = btree_bset_last(b); ++ ++ EBUG_ON(!is_power_of_2(block_bytes)); ++ ++ return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); ++} ++ ++void bch2_btree_keys_init(struct btree *); ++ ++void bch2_bset_init_first(struct btree *, struct bset *); ++void bch2_bset_init_next(struct bch_fs *, struct btree *, ++ struct btree_node_entry *); ++void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); ++ ++void bch2_bset_insert(struct btree *, struct btree_node_iter *, ++ struct bkey_packed *, struct bkey_i *, unsigned); ++void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); ++ ++/* Bkey utility code */ ++ ++/* packed or unpacked */ ++static inline int bkey_cmp_p_or_unp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r_packed, ++ const struct bpos *r) ++{ ++ EBUG_ON(r_packed && !bkey_packed(r_packed)); ++ ++ if (unlikely(!bkey_packed(l))) ++ return bpos_cmp(packed_to_bkey_c(l)->p, *r); ++ ++ if (likely(r_packed)) ++ return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); ++ ++ return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); ++} ++ ++struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); ++ ++struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, ++ struct bkey_packed *, unsigned); ++ ++static inline struct bkey_packed * ++bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) ++{ ++ return bch2_bkey_prev_filter(b, t, k, 0); ++} ++ ++static inline struct bkey_packed * ++bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) ++{ ++ return bch2_bkey_prev_filter(b, t, k, 1); ++} ++ ++/* Btree key iteration */ ++ ++void bch2_btree_node_iter_push(struct 
btree_node_iter *, struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); ++void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, ++ struct bpos *); ++void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, ++ struct btree *); ++struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, ++ struct btree *, ++ struct bset_tree *); ++ ++void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); ++void bch2_btree_node_iter_set_drop(struct btree_node_iter *, ++ struct btree_node_iter_set *); ++void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); ++ ++#define btree_node_iter_for_each(_iter, _set) \ ++ for (_set = (_iter)->data; \ ++ _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \ ++ (_set)->k != (_set)->end; \ ++ _set++) ++ ++static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter, ++ unsigned i) ++{ ++ return iter->data[i].k == iter->data[i].end; ++} ++ ++static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) ++{ ++ return __btree_node_iter_set_end(iter, 0); ++} ++ ++/* ++ * When keys compare equal, deleted keys compare first: ++ * ++ * XXX: only need to compare pointers for keys that are both within a ++ * btree_node_iterator - we need to break ties for prev() to work correctly ++ */ ++static inline int bkey_iter_cmp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r) ++{ ++ return bch2_bkey_cmp_packed(b, l, r) ++ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ++ ?: cmp_int(l, r); ++} ++ ++static inline int btree_node_iter_cmp(const struct btree *b, ++ struct btree_node_iter_set l, ++ struct btree_node_iter_set r) ++{ ++ return bkey_iter_cmp(b, ++ __btree_node_offset_to_key(b, l.k), ++ __btree_node_offset_to_key(b, r.k)); ++} ++ ++/* These assume r (the search key) is not a deleted key: */ ++static inline int bkey_iter_pos_cmp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ return bkey_cmp_left_packed(b, l, r) ++ ?: -((int) bkey_deleted(l)); ++} ++ ++static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r_packed, ++ const struct bpos *r) ++{ ++ return bkey_cmp_p_or_unp(b, l, r_packed, r) ++ ?: -((int) bkey_deleted(l)); ++} ++ ++static inline struct bkey_packed * ++__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ return __btree_node_offset_to_key(b, iter->data->k); ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b) ++{ ++ return !bch2_btree_node_iter_end(iter) ++ ? 
__btree_node_offset_to_key(b, iter->data->k) ++ : NULL; ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) ++{ ++ struct bkey_packed *k; ++ ++ while ((k = bch2_btree_node_iter_peek_all(iter, b)) && ++ bkey_deleted(k)) ++ bch2_btree_node_iter_advance(iter, b); ++ ++ return k; ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) ++{ ++ struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); ++ ++ if (ret) ++ bch2_btree_node_iter_advance(iter, b); ++ ++ return ret; ++} ++ ++struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, ++ struct btree *); ++struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *, ++ struct btree *); ++ ++struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, ++ struct btree *, ++ struct bkey *); ++ ++#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ ++ for (bch2_btree_node_iter_init_from_start((iter), (b)); \ ++ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ ++ bch2_btree_node_iter_advance(iter, b)) ++ ++/* Accounting: */ ++ ++static inline void btree_keys_account_key(struct btree_nr_keys *n, ++ unsigned bset, ++ struct bkey_packed *k, ++ int sign) ++{ ++ n->live_u64s += k->u64s * sign; ++ n->bset_u64s[bset] += k->u64s * sign; ++ ++ if (bkey_packed(k)) ++ n->packed_keys += sign; ++ else ++ n->unpacked_keys += sign; ++} ++ ++static inline void btree_keys_account_val_delta(struct btree *b, ++ struct bkey_packed *k, ++ int delta) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ ++ b->nr.live_u64s += delta; ++ b->nr.bset_u64s[t - b->set] += delta; ++} ++ ++#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ ++ btree_keys_account_key(_nr, _bset_idx, _k, 1) ++#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ ++ btree_keys_account_key(_nr, _bset_idx, _k, -1) ++ ++#define btree_account_key_add(_b, _k) \ ++ btree_keys_account_key(&(_b)->nr, \ ++ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) ++#define btree_account_key_drop(_b, _k) \ ++ btree_keys_account_key(&(_b)->nr, \ ++ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) ++ ++struct bset_stats { ++ struct { ++ size_t nr, bytes; ++ } sets[BSET_TREE_NR_TYPES]; ++ ++ size_t floats; ++ size_t failed; ++}; ++ ++void bch2_btree_keys_stats(struct btree *, struct bset_stats *); ++void bch2_bfloat_to_text(struct printbuf *, struct btree *, ++ struct bkey_packed *); ++ ++/* Debug stuff */ ++ ++void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); ++void bch2_dump_btree_node(struct bch_fs *, struct btree *); ++void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++void __bch2_verify_btree_nr_keys(struct btree *); ++void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); ++void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, ++ struct bkey_packed *, unsigned); ++ ++#else ++ ++static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} ++static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, ++ struct btree *b) {} ++static inline void bch2_verify_insert_pos(struct btree *b, ++ struct bkey_packed *where, ++ struct bkey_packed *insert, ++ unsigned clobber_u64s) {} ++#endif ++ ++static inline void bch2_verify_btree_nr_keys(struct btree *b) ++{ ++ if (bch2_debug_check_btree_accounting) ++ __bch2_verify_btree_nr_keys(b); ++} ++ ++#endif /* 
_BCACHEFS_BSET_H */ +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +new file mode 100644 +index 000000000000..579a8f8c6a65 +--- /dev/null ++++ b/fs/bcachefs/btree_cache.c +@@ -0,0 +1,1170 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_buf.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "debug.h" ++#include "errcode.h" ++#include "error.h" ++ ++#include ++#include ++#include ++ ++struct lock_class_key bch2_btree_node_lock_key; ++ ++const char * const bch2_btree_node_flags[] = { ++#define x(f) #f, ++ BTREE_FLAGS() ++#undef x ++ NULL ++}; ++ ++void bch2_recalc_btree_reserve(struct bch_fs *c) ++{ ++ unsigned i, reserve = 16; ++ ++ if (!c->btree_roots[0].b) ++ reserve += 8; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].b) ++ reserve += min_t(unsigned, 1, ++ c->btree_roots[i].b->c.level) * 8; ++ ++ c->btree_cache.reserve = reserve; ++} ++ ++static inline unsigned btree_cache_can_free(struct btree_cache *bc) ++{ ++ return max_t(int, 0, bc->used - bc->reserve); ++} ++ ++static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) ++{ ++ if (b->c.lock.readers) ++ list_move(&b->list, &bc->freed_pcpu); ++ else ++ list_move(&b->list, &bc->freed_nonpcpu); ++} ++ ++static void btree_node_data_free(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ ++ EBUG_ON(btree_node_write_in_flight(b)); ++ ++ kvpfree(b->data, btree_bytes(c)); ++ b->data = NULL; ++#ifdef __KERNEL__ ++ vfree(b->aux_data); ++#else ++ munmap(b->aux_data, btree_aux_data_bytes(b)); ++#endif ++ b->aux_data = NULL; ++ ++ bc->used--; ++ ++ btree_node_to_freedlist(bc, b); ++} ++ ++static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, ++ const void *obj) ++{ ++ const struct btree *b = obj; ++ const u64 *v = arg->key; ++ ++ return b->hash_val == *v ? 
0 : 1; ++} ++ ++static const struct rhashtable_params bch_btree_cache_params = { ++ .head_offset = offsetof(struct btree, hash), ++ .key_offset = offsetof(struct btree, hash_val), ++ .key_len = sizeof(u64), ++ .obj_cmpfn = bch2_btree_cache_cmp_fn, ++}; ++ ++static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) ++{ ++ BUG_ON(b->data || b->aux_data); ++ ++ b->data = kvpmalloc(btree_bytes(c), gfp); ++ if (!b->data) ++ return -ENOMEM; ++#ifdef __KERNEL__ ++ b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); ++#else ++ b->aux_data = mmap(NULL, btree_aux_data_bytes(b), ++ PROT_READ|PROT_WRITE|PROT_EXEC, ++ MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); ++ if (b->aux_data == MAP_FAILED) ++ b->aux_data = NULL; ++#endif ++ if (!b->aux_data) { ++ kvpfree(b->data, btree_bytes(c)); ++ b->data = NULL; ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static struct btree *__btree_node_mem_alloc(struct bch_fs *c) ++{ ++ struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL); ++ if (!b) ++ return NULL; ++ ++ bkey_btree_ptr_init(&b->key); ++ __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key); ++ INIT_LIST_HEAD(&b->list); ++ INIT_LIST_HEAD(&b->write_blocked); ++ b->byte_order = ilog2(btree_bytes(c)); ++ return b; ++} ++ ++struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b = __btree_node_mem_alloc(c); ++ if (!b) ++ return NULL; ++ ++ if (btree_node_data_alloc(c, b, GFP_KERNEL)) { ++ kfree(b); ++ return NULL; ++ } ++ ++ bc->used++; ++ list_add(&b->list, &bc->freeable); ++ return b; ++} ++ ++/* Btree in memory cache - hash table */ ++ ++void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) ++{ ++ int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); ++ BUG_ON(ret); ++ ++ /* Cause future lookups for this node to fail: */ ++ b->hash_val = 0; ++ ++ six_lock_wakeup_all(&b->c.lock); ++} ++ ++int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) ++{ ++ BUG_ON(b->hash_val); ++ b->hash_val = btree_ptr_hash_val(&b->key); ++ ++ return rhashtable_lookup_insert_fast(&bc->table, &b->hash, ++ bch_btree_cache_params); ++} ++ ++int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, ++ unsigned level, enum btree_id id) ++{ ++ int ret; ++ ++ b->c.level = level; ++ b->c.btree_id = id; ++ ++ mutex_lock(&bc->lock); ++ ret = __bch2_btree_node_hash_insert(bc, b); ++ if (!ret) ++ list_add(&b->list, &bc->live); ++ mutex_unlock(&bc->lock); ++ ++ return ret; ++} ++ ++__flatten ++static inline struct btree *btree_cache_find(struct btree_cache *bc, ++ const struct bkey_i *k) ++{ ++ u64 v = btree_ptr_hash_val(k); ++ ++ return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); ++} ++ ++/* ++ * this version is for btree nodes that have already been freed (we're not ++ * reaping a real btree node) ++ */ ++static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ int ret = 0; ++ ++ lockdep_assert_held(&bc->lock); ++wait_on_io: ++ if (b->flags & ((1U << BTREE_NODE_dirty)| ++ (1U << BTREE_NODE_read_in_flight)| ++ (1U << BTREE_NODE_write_in_flight))) { ++ if (!flush) ++ return -ENOMEM; ++ ++ /* XXX: waiting on IO with btree cache lock held */ ++ bch2_btree_node_wait_on_read(b); ++ bch2_btree_node_wait_on_write(b); ++ } ++ ++ if (!six_trylock_intent(&b->c.lock)) ++ return -ENOMEM; ++ ++ if (!six_trylock_write(&b->c.lock)) ++ goto out_unlock_intent; ++ ++ /* recheck 
under lock */ ++ if (b->flags & ((1U << BTREE_NODE_read_in_flight)| ++ (1U << BTREE_NODE_write_in_flight))) { ++ if (!flush) ++ goto out_unlock; ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ goto wait_on_io; ++ } ++ ++ if (btree_node_noevict(b) || ++ btree_node_write_blocked(b) || ++ btree_node_will_make_reachable(b)) ++ goto out_unlock; ++ ++ if (btree_node_dirty(b)) { ++ if (!flush) ++ goto out_unlock; ++ /* ++ * Using the underscore version because we don't want to compact ++ * bsets after the write, since this node is about to be evicted ++ * - unless btree verify mode is enabled, since it runs out of ++ * the post write cleanup: ++ */ ++ if (bch2_verify_btree_ondisk) ++ bch2_btree_node_write(c, b, SIX_LOCK_intent, 0); ++ else ++ __bch2_btree_node_write(c, b, 0); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ goto wait_on_io; ++ } ++out: ++ if (b->hash_val && !ret) ++ trace_btree_node_reap(c, b); ++ return ret; ++out_unlock: ++ six_unlock_write(&b->c.lock); ++out_unlock_intent: ++ six_unlock_intent(&b->c.lock); ++ ret = -ENOMEM; ++ goto out; ++} ++ ++static int btree_node_reclaim(struct bch_fs *c, struct btree *b) ++{ ++ return __btree_node_reclaim(c, b, false); ++} ++ ++static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) ++{ ++ return __btree_node_reclaim(c, b, true); ++} ++ ++static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_cache.shrink); ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b, *t; ++ unsigned long nr = sc->nr_to_scan; ++ unsigned long can_free = 0; ++ unsigned long touched = 0; ++ unsigned long freed = 0; ++ unsigned i, flags; ++ unsigned long ret = SHRINK_STOP; ++ ++ if (bch2_btree_shrinker_disabled) ++ return SHRINK_STOP; ++ ++ /* Return -1 if we can't do anything right now */ ++ if (sc->gfp_mask & __GFP_FS) ++ mutex_lock(&bc->lock); ++ else if (!mutex_trylock(&bc->lock)) ++ goto out_norestore; ++ ++ flags = memalloc_nofs_save(); ++ ++ /* ++ * It's _really_ critical that we don't free too many btree nodes - we ++ * have to always leave ourselves a reserve. 
The reserve is how we ++ * guarantee that allocating memory for a new btree node can always ++ * succeed, so that inserting keys into the btree can always succeed and ++ * IO can always make forward progress: ++ */ ++ can_free = btree_cache_can_free(bc); ++ nr = min_t(unsigned long, nr, can_free); ++ ++ i = 0; ++ list_for_each_entry_safe(b, t, &bc->freeable, list) { ++ /* ++ * Leave a few nodes on the freeable list, so that a btree split ++ * won't have to hit the system allocator: ++ */ ++ if (++i <= 3) ++ continue; ++ ++ touched++; ++ ++ if (touched >= nr) ++ break; ++ ++ if (!btree_node_reclaim(c, b)) { ++ btree_node_data_free(c, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ freed++; ++ } ++ } ++restart: ++ list_for_each_entry_safe(b, t, &bc->live, list) { ++ /* tweak this */ ++ if (btree_node_accessed(b)) { ++ clear_btree_node_accessed(b); ++ goto touched; ++ } ++ ++ if (!btree_node_reclaim(c, b)) { ++ /* can't call bch2_btree_node_hash_remove under lock */ ++ freed++; ++ if (&t->list != &bc->live) ++ list_move_tail(&bc->live, &t->list); ++ ++ btree_node_data_free(c, b); ++ mutex_unlock(&bc->lock); ++ ++ bch2_btree_node_hash_remove(bc, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ ++ if (freed >= nr) ++ goto out; ++ ++ if (sc->gfp_mask & __GFP_FS) ++ mutex_lock(&bc->lock); ++ else if (!mutex_trylock(&bc->lock)) ++ goto out; ++ goto restart; ++ } else { ++ continue; ++ } ++touched: ++ touched++; ++ ++ if (touched >= nr) { ++ /* Save position */ ++ if (&t->list != &bc->live) ++ list_move_tail(&bc->live, &t->list); ++ break; ++ } ++ } ++ ++ mutex_unlock(&bc->lock); ++out: ++ ret = freed; ++ memalloc_nofs_restore(flags); ++out_norestore: ++ trace_btree_cache_scan(sc->nr_to_scan, can_free, ret); ++ return ret; ++} ++ ++static unsigned long bch2_btree_cache_count(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_cache.shrink); ++ struct btree_cache *bc = &c->btree_cache; ++ ++ if (bch2_btree_shrinker_disabled) ++ return 0; ++ ++ return btree_cache_can_free(bc); ++} ++ ++static void bch2_btree_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_cache.shrink); ++ ++ bch2_btree_cache_to_text(out, c); ++} ++ ++void bch2_fs_btree_cache_exit(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ unsigned i, flags; ++ ++ if (bc->shrink.list.next) ++ unregister_shrinker(&bc->shrink); ++ ++ /* vfree() can allocate memory: */ ++ flags = memalloc_nofs_save(); ++ mutex_lock(&bc->lock); ++ ++ if (c->verify_data) ++ list_move(&c->verify_data->list, &bc->live); ++ ++ kvpfree(c->verify_ondisk, btree_bytes(c)); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].b) ++ list_add(&c->btree_roots[i].b->list, &bc->live); ++ ++ list_splice(&bc->freeable, &bc->live); ++ ++ while (!list_empty(&bc->live)) { ++ b = list_first_entry(&bc->live, struct btree, list); ++ ++ BUG_ON(btree_node_read_in_flight(b) || ++ btree_node_write_in_flight(b)); ++ ++ if (btree_node_dirty(b)) ++ bch2_btree_complete_write(c, b, btree_current_write(b)); ++ clear_btree_node_dirty_acct(c, b); ++ ++ btree_node_data_free(c, b); ++ } ++ ++ BUG_ON(atomic_read(&c->btree_cache.dirty)); ++ ++ list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); ++ ++ while (!list_empty(&bc->freed_nonpcpu)) { ++ b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); ++ list_del(&b->list); ++ 
six_lock_pcpu_free(&b->c.lock); ++ kfree(b); ++ } ++ ++ mutex_unlock(&bc->lock); ++ memalloc_nofs_restore(flags); ++ ++ if (bc->table_init_done) ++ rhashtable_destroy(&bc->table); ++} ++ ++int bch2_fs_btree_cache_init(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ unsigned i; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ ret = rhashtable_init(&bc->table, &bch_btree_cache_params); ++ if (ret) ++ goto out; ++ ++ bc->table_init_done = true; ++ ++ bch2_recalc_btree_reserve(c); ++ ++ for (i = 0; i < bc->reserve; i++) ++ if (!__bch2_btree_node_mem_alloc(c)) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ list_splice_init(&bc->live, &bc->freeable); ++ ++ mutex_init(&c->verify_lock); ++ ++ bc->shrink.count_objects = bch2_btree_cache_count; ++ bc->shrink.scan_objects = bch2_btree_cache_scan; ++ bc->shrink.to_text = bch2_btree_cache_shrinker_to_text; ++ bc->shrink.seeks = 4; ++ ret = register_shrinker(&bc->shrink); ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++void bch2_fs_btree_cache_init_early(struct btree_cache *bc) ++{ ++ mutex_init(&bc->lock); ++ INIT_LIST_HEAD(&bc->live); ++ INIT_LIST_HEAD(&bc->freeable); ++ INIT_LIST_HEAD(&bc->freed_pcpu); ++ INIT_LIST_HEAD(&bc->freed_nonpcpu); ++} ++ ++/* ++ * We can only have one thread cannibalizing other cached btree nodes at a time, ++ * or we'll deadlock. We use an open coded mutex to ensure that, which a ++ * cannibalize_bucket() will take. This means every time we unlock the root of ++ * the btree, we need to release this lock if we have it held. ++ */ ++void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ ++ if (bc->alloc_lock == current) { ++ trace_btree_node_cannibalize_unlock(c); ++ bc->alloc_lock = NULL; ++ closure_wake_up(&bc->alloc_wait); ++ } ++} ++ ++int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct task_struct *old; ++ ++ old = cmpxchg(&bc->alloc_lock, NULL, current); ++ if (old == NULL || old == current) ++ goto success; ++ ++ if (!cl) { ++ trace_btree_node_cannibalize_lock_fail(c); ++ return -ENOMEM; ++ } ++ ++ closure_wait(&bc->alloc_wait, cl); ++ ++ /* Try again, after adding ourselves to waitlist */ ++ old = cmpxchg(&bc->alloc_lock, NULL, current); ++ if (old == NULL || old == current) { ++ /* We raced */ ++ closure_wake_up(&bc->alloc_wait); ++ goto success; ++ } ++ ++ trace_btree_node_cannibalize_lock_fail(c); ++ return -EAGAIN; ++ ++success: ++ trace_btree_node_cannibalize_lock(c); ++ return 0; ++} ++ ++static struct btree *btree_node_cannibalize(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ list_for_each_entry_reverse(b, &bc->live, list) ++ if (!btree_node_reclaim(c, b)) ++ return b; ++ ++ while (1) { ++ list_for_each_entry_reverse(b, &bc->live, list) ++ if (!btree_node_write_and_reclaim(c, b)) ++ return b; ++ ++ /* ++ * Rare case: all nodes were intent-locked. ++ * Just busy-wait. ++ */ ++ WARN_ONCE(1, "btree cache cannibalize failed\n"); ++ cond_resched(); ++ } ++} ++ ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct list_head *freed = pcpu_read_locks ++ ? 
&bc->freed_pcpu ++ : &bc->freed_nonpcpu; ++ struct btree *b, *b2; ++ u64 start_time = local_clock(); ++ unsigned flags; ++ ++ flags = memalloc_nofs_save(); ++ mutex_lock(&bc->lock); ++ ++ /* ++ * We never free struct btree itself, just the memory that holds the on ++ * disk node. Check the freed list before allocating a new one: ++ */ ++ list_for_each_entry(b, freed, list) ++ if (!btree_node_reclaim(c, b)) { ++ list_del_init(&b->list); ++ goto got_node; ++ } ++ ++ b = __btree_node_mem_alloc(c); ++ if (!b) ++ goto err_locked; ++ ++ if (pcpu_read_locks) ++ six_lock_pcpu_alloc(&b->c.lock); ++ ++ BUG_ON(!six_trylock_intent(&b->c.lock)); ++ BUG_ON(!six_trylock_write(&b->c.lock)); ++got_node: ++ ++ /* ++ * btree_free() doesn't free memory; it sticks the node on the end of ++ * the list. Check if there's any freed nodes there: ++ */ ++ list_for_each_entry(b2, &bc->freeable, list) ++ if (!btree_node_reclaim(c, b2)) { ++ swap(b->data, b2->data); ++ swap(b->aux_data, b2->aux_data); ++ btree_node_to_freedlist(bc, b2); ++ six_unlock_write(&b2->c.lock); ++ six_unlock_intent(&b2->c.lock); ++ goto got_mem; ++ } ++ ++ mutex_unlock(&bc->lock); ++ ++ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) ++ goto err; ++ ++ mutex_lock(&bc->lock); ++ bc->used++; ++got_mem: ++ mutex_unlock(&bc->lock); ++ ++ BUG_ON(btree_node_hashed(b)); ++ BUG_ON(btree_node_dirty(b)); ++ BUG_ON(btree_node_write_in_flight(b)); ++out: ++ b->flags = 0; ++ b->written = 0; ++ b->nsets = 0; ++ b->sib_u64s[0] = 0; ++ b->sib_u64s[1] = 0; ++ b->whiteout_u64s = 0; ++ bch2_btree_keys_init(b); ++ set_btree_node_accessed(b); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], ++ start_time); ++ ++ memalloc_nofs_restore(flags); ++ return b; ++err: ++ mutex_lock(&bc->lock); ++err_locked: ++ /* Try to cannibalize another cached btree node: */ ++ if (bc->alloc_lock == current) { ++ b2 = btree_node_cannibalize(c); ++ bch2_btree_node_hash_remove(bc, b2); ++ ++ if (b) { ++ swap(b->data, b2->data); ++ swap(b->aux_data, b2->aux_data); ++ btree_node_to_freedlist(bc, b2); ++ six_unlock_write(&b2->c.lock); ++ six_unlock_intent(&b2->c.lock); ++ } else { ++ b = b2; ++ list_del_init(&b->list); ++ } ++ ++ mutex_unlock(&bc->lock); ++ ++ trace_btree_node_cannibalize(c); ++ goto out; ++ } ++ ++ mutex_unlock(&bc->lock); ++ memalloc_nofs_restore(flags); ++ return ERR_PTR(-ENOMEM); ++} ++ ++/* Slowpath, don't want it inlined into btree_iter_traverse() */ ++static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, ++ struct btree_trans *trans, ++ struct btree_path *path, ++ const struct bkey_i *k, ++ enum btree_id btree_id, ++ unsigned level, ++ enum six_lock_type lock_type, ++ bool sync) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ u32 seq; ++ ++ BUG_ON(level + 1 >= BTREE_MAX_DEPTH); ++ /* ++ * Parent node must be locked, else we could read in a btree node that's ++ * been freed: ++ */ ++ if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { ++ trace_trans_restart_relock_parent_for_fill(trans->fn, ++ _THIS_IP_, btree_id, &path->pos); ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); ++ } ++ ++ b = bch2_btree_node_mem_alloc(c, level != 0); ++ ++ if (trans && b == ERR_PTR(-ENOMEM)) { ++ trans->memory_allocation_failure = true; ++ trace_trans_restart_memory_allocation_failure(trans->fn, ++ _THIS_IP_, btree_id, &path->pos); ++ ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); ++ } ++ ++ if (IS_ERR(b)) ++ return b; 
++ ++ bkey_copy(&b->key, k); ++ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { ++ /* raced with another fill: */ ++ ++ /* mark as unhashed... */ ++ b->hash_val = 0; ++ ++ mutex_lock(&bc->lock); ++ list_add(&b->list, &bc->freeable); ++ mutex_unlock(&bc->lock); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ return NULL; ++ } ++ ++ set_btree_node_read_in_flight(b); ++ ++ six_unlock_write(&b->c.lock); ++ seq = b->c.lock.state.seq; ++ six_unlock_intent(&b->c.lock); ++ ++ /* Unlock before doing IO: */ ++ if (trans && sync) ++ bch2_trans_unlock(trans); ++ ++ bch2_btree_node_read(c, b, sync); ++ ++ if (!sync) ++ return NULL; ++ ++ if (trans) { ++ int ret = bch2_trans_relock(trans) ?: ++ bch2_btree_path_relock_intent(trans, path); ++ if (ret) { ++ BUG_ON(!trans->restarted); ++ return ERR_PTR(ret); ++ } ++ } ++ ++ if (!six_relock_type(&b->c.lock, lock_type, seq)) { ++ trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_, ++ btree_id, &path->pos); ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); ++ } ++ ++ return b; ++} ++ ++static int lock_node_check_fn(struct six_lock *lock, void *p) ++{ ++ struct btree *b = container_of(lock, struct btree, c.lock); ++ const struct bkey_i *k = p; ++ ++ if (b->hash_val != btree_ptr_hash_val(k)) ++ return BCH_ERR_lock_fail_node_reused; ++ return 0; ++} ++ ++static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) ++{ ++ struct printbuf buf = PRINTBUF; ++ ++ if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) ++ return; ++ ++ prt_printf(&buf, ++ "btree node header doesn't match ptr\n" ++ "btree %s level %u\n" ++ "ptr: ", ++ bch2_btree_ids[b->c.btree_id], b->c.level); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); ++ ++ prt_printf(&buf, "\nheader: btree %s level %llu\n" ++ "min ", ++ bch2_btree_ids[BTREE_NODE_ID(b->data)], ++ BTREE_NODE_LEVEL(b->data)); ++ bch2_bpos_to_text(&buf, b->data->min_key); ++ ++ prt_printf(&buf, "\nmax "); ++ bch2_bpos_to_text(&buf, b->data->max_key); ++ ++ bch2_fs_inconsistent(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++} ++ ++static inline void btree_check_header(struct bch_fs *c, struct btree *b) ++{ ++ if (b->c.btree_id != BTREE_NODE_ID(b->data) || ++ b->c.level != BTREE_NODE_LEVEL(b->data) || ++ bpos_cmp(b->data->max_key, b->key.k.p) || ++ (b->key.k.type == KEY_TYPE_btree_ptr_v2 && ++ bpos_cmp(b->data->min_key, ++ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key))) ++ btree_bad_header(c, b); ++} ++ ++/** ++ * bch_btree_node_get - find a btree node in the cache and lock it, reading it ++ * in from disk if necessary. ++ * ++ * If IO is necessary and running under generic_make_request, returns -EAGAIN. ++ * ++ * The btree node will have either a read or a write lock held, depending on ++ * the @write parameter. 
++ */ ++struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, ++ const struct bkey_i *k, unsigned level, ++ enum six_lock_type lock_type, ++ unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ struct bset_tree *t; ++ int ret; ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = btree_node_mem_ptr(k); ++ ++ /* ++ * Check b->hash_val _before_ calling btree_node_lock() - this might not ++ * be the node we want anymore, and trying to lock the wrong node could ++ * cause an unneccessary transaction restart: ++ */ ++ if (likely(c->opts.btree_node_mem_ptr_optimization && ++ b && ++ b->hash_val == btree_ptr_hash_val(k))) ++ goto lock_node; ++retry: ++ b = btree_cache_find(bc, k); ++ if (unlikely(!b)) { ++ /* ++ * We must have the parent locked to call bch2_btree_node_fill(), ++ * else we could read in a btree node from disk that's been ++ * freed: ++ */ ++ b = bch2_btree_node_fill(c, trans, path, k, path->btree_id, ++ level, lock_type, true); ++ ++ /* We raced and found the btree node in the cache */ ++ if (!b) ++ goto retry; ++ ++ if (IS_ERR(b)) ++ return b; ++ } else { ++lock_node: ++ /* ++ * There's a potential deadlock with splits and insertions into ++ * interior nodes we have to avoid: ++ * ++ * The other thread might be holding an intent lock on the node ++ * we want, and they want to update its parent node so they're ++ * going to upgrade their intent lock on the parent node to a ++ * write lock. ++ * ++ * But if we're holding a read lock on the parent, and we're ++ * trying to get the intent lock they're holding, we deadlock. ++ * ++ * So to avoid this we drop the read locks on parent nodes when ++ * we're starting to take intent locks - and handle the race. ++ * ++ * The race is that they might be about to free the node we ++ * want, and dropping our read lock on the parent node lets them ++ * update the parent marking the node we want as freed, and then ++ * free it: ++ * ++ * To guard against this, btree nodes are evicted from the cache ++ * when they're freed - and b->hash_val is zeroed out, which we ++ * check for after we lock the node. 
++ * ++ * Then, bch2_btree_node_relock() on the parent will fail - because ++ * the parent was modified, when the pointer to the node we want ++ * was removed - and we'll bail out: ++ */ ++ if (btree_node_read_locked(path, level + 1)) ++ btree_node_unlock(trans, path, level + 1); ++ ++ ret = btree_node_lock(trans, path, b, k->k.p, level, lock_type, ++ lock_node_check_fn, (void *) k, trace_ip); ++ if (unlikely(ret)) { ++ if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) ++ goto retry; ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ return ERR_PTR(ret); ++ BUG(); ++ } ++ ++ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || ++ b->c.level != level || ++ race_fault())) { ++ six_unlock_type(&b->c.lock, lock_type); ++ if (bch2_btree_node_relock(trans, path, level + 1)) ++ goto retry; ++ ++ trace_trans_restart_btree_node_reused(trans->fn, ++ trace_ip, ++ path->btree_id, ++ &path->pos); ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); ++ } ++ } ++ ++ if (unlikely(btree_node_read_in_flight(b))) { ++ u32 seq = b->c.lock.state.seq; ++ ++ six_unlock_type(&b->c.lock, lock_type); ++ bch2_trans_unlock(trans); ++ ++ bch2_btree_node_wait_on_read(b); ++ ++ /* ++ * should_be_locked is not set on this path yet, so we need to ++ * relock it specifically: ++ */ ++ if (trans) { ++ int ret = bch2_trans_relock(trans) ?: ++ bch2_btree_path_relock_intent(trans, path); ++ if (ret) { ++ BUG_ON(!trans->restarted); ++ return ERR_PTR(ret); ++ } ++ } ++ ++ if (!six_relock_type(&b->c.lock, lock_type, seq)) ++ goto retry; ++ } ++ ++ prefetch(b->aux_data); ++ ++ for_each_bset(b, t) { ++ void *p = (u64 *) b->aux_data + t->aux_data_offset; ++ ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ } ++ ++ /* avoid atomic set bit if it's not needed: */ ++ if (!btree_node_accessed(b)) ++ set_btree_node_accessed(b); ++ ++ if (unlikely(btree_node_read_error(b))) { ++ six_unlock_type(&b->c.lock, lock_type); ++ return ERR_PTR(-EIO); ++ } ++ ++ EBUG_ON(b->c.btree_id != path->btree_id); ++ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); ++ btree_check_header(c, b); ++ ++ return b; ++} ++ ++struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, ++ const struct bkey_i *k, ++ enum btree_id btree_id, ++ unsigned level, ++ bool nofill) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ struct bset_tree *t; ++ int ret; ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ if (c->opts.btree_node_mem_ptr_optimization) { ++ b = btree_node_mem_ptr(k); ++ if (b) ++ goto lock_node; ++ } ++retry: ++ b = btree_cache_find(bc, k); ++ if (unlikely(!b)) { ++ if (nofill) ++ goto out; ++ ++ b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id, ++ level, SIX_LOCK_read, true); ++ ++ /* We raced and found the btree node in the cache */ ++ if (!b) ++ goto retry; ++ ++ if (IS_ERR(b) && ++ !bch2_btree_cache_cannibalize_lock(c, NULL)) ++ goto retry; ++ ++ if (IS_ERR(b)) ++ goto out; ++ } else { ++lock_node: ++ ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); ++ if (ret) ++ goto retry; ++ ++ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || ++ b->c.btree_id != btree_id || ++ b->c.level != level)) { ++ six_unlock_read(&b->c.lock); ++ goto retry; ++ } ++ } ++ ++ /* XXX: waiting on IO with btree locks held: */ ++ __bch2_btree_node_wait_on_read(b); ++ ++ prefetch(b->aux_data); ++ ++ for_each_bset(b, t) { ++ void *p = (u64 *) b->aux_data + t->aux_data_offset; ++ ++ prefetch(p + L1_CACHE_BYTES * 0); ++ 
prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ } ++ ++ /* avoid atomic set bit if it's not needed: */ ++ if (!btree_node_accessed(b)) ++ set_btree_node_accessed(b); ++ ++ if (unlikely(btree_node_read_error(b))) { ++ six_unlock_read(&b->c.lock); ++ b = ERR_PTR(-EIO); ++ goto out; ++ } ++ ++ EBUG_ON(b->c.btree_id != btree_id); ++ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); ++ btree_check_header(c, b); ++out: ++ bch2_btree_cache_cannibalize_unlock(c); ++ return b; ++} ++ ++int bch2_btree_node_prefetch(struct bch_fs *c, ++ struct btree_trans *trans, ++ struct btree_path *path, ++ const struct bkey_i *k, ++ enum btree_id btree_id, unsigned level) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ BUG_ON(trans && !btree_node_locked(path, level + 1)); ++ BUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = btree_cache_find(bc, k); ++ if (b) ++ return 0; ++ ++ b = bch2_btree_node_fill(c, trans, path, k, btree_id, ++ level, SIX_LOCK_read, false); ++ return PTR_ERR_OR_ZERO(b); ++} ++ ++void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ b = btree_cache_find(bc, k); ++ if (!b) ++ return; ++wait_on_io: ++ /* not allowed to wait on io with btree locks held: */ ++ ++ /* XXX we're called from btree_gc which will be holding other btree ++ * nodes locked ++ * */ ++ __bch2_btree_node_wait_on_read(b); ++ __bch2_btree_node_wait_on_write(b); ++ ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); ++ ++ if (btree_node_dirty(b)) { ++ __bch2_btree_node_write(c, b, 0); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ goto wait_on_io; ++ } ++ ++ BUG_ON(btree_node_dirty(b)); ++ ++ mutex_lock(&bc->lock); ++ btree_node_data_free(c, b); ++ bch2_btree_node_hash_remove(bc, b); ++ mutex_unlock(&bc->lock); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++} ++ ++void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, ++ struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ struct bset_stats stats; ++ ++ memset(&stats, 0, sizeof(stats)); ++ ++ bch2_btree_keys_stats(b, &stats); ++ ++ prt_printf(out, "l %u ", b->c.level); ++ bch2_bpos_to_text(out, b->data->min_key); ++ prt_printf(out, " - "); ++ bch2_bpos_to_text(out, b->data->max_key); ++ prt_printf(out, ":\n" ++ " ptrs: "); ++ bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ ++ prt_printf(out, "\n" ++ " format: u64s %u fields %u %u %u %u %u\n" ++ " unpack fn len: %u\n" ++ " bytes used %zu/%zu (%zu%% full)\n" ++ " sib u64s: %u, %u (merge threshold %u)\n" ++ " nr packed keys %u\n" ++ " nr unpacked keys %u\n" ++ " floats %zu\n" ++ " failed unpacked %zu\n", ++ f->key_u64s, ++ f->bits_per_field[0], ++ f->bits_per_field[1], ++ f->bits_per_field[2], ++ f->bits_per_field[3], ++ f->bits_per_field[4], ++ b->unpack_fn_len, ++ b->nr.live_u64s * sizeof(u64), ++ btree_bytes(c) - sizeof(struct btree_node), ++ b->nr.live_u64s * 100 / btree_max_u64s(c), ++ b->sib_u64s[0], ++ b->sib_u64s[1], ++ c->btree_foreground_merge_threshold, ++ b->nr.packed_keys, ++ b->nr.unpacked_keys, ++ stats.floats, ++ stats.failed); ++} ++ ++void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); ++ prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); ++ prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); ++} +diff --git a/fs/bcachefs/btree_cache.h 
b/fs/bcachefs/btree_cache.h +new file mode 100644 +index 000000000000..25906127c023 +--- /dev/null ++++ b/fs/bcachefs/btree_cache.h +@@ -0,0 +1,107 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_CACHE_H ++#define _BCACHEFS_BTREE_CACHE_H ++ ++#include "bcachefs.h" ++#include "btree_types.h" ++ ++extern struct lock_class_key bch2_btree_node_lock_key; ++ ++extern const char * const bch2_btree_node_flags[]; ++ ++struct btree_iter; ++ ++void bch2_recalc_btree_reserve(struct bch_fs *); ++ ++void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); ++int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); ++int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, ++ unsigned, enum btree_id); ++ ++void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); ++int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); ++ ++struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool); ++ ++struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, ++ const struct bkey_i *, unsigned, ++ enum six_lock_type, unsigned long); ++ ++struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, ++ enum btree_id, unsigned, bool); ++ ++int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *, ++ const struct bkey_i *, enum btree_id, unsigned); ++ ++void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *); ++ ++void bch2_fs_btree_cache_exit(struct bch_fs *); ++int bch2_fs_btree_cache_init(struct bch_fs *); ++void bch2_fs_btree_cache_init_early(struct btree_cache *); ++ ++static inline u64 btree_ptr_hash_val(const struct bkey_i *k) ++{ ++ switch (k->k.type) { ++ case KEY_TYPE_btree_ptr: ++ return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); ++ case KEY_TYPE_btree_ptr_v2: ++ return bkey_i_to_btree_ptr_v2_c(k)->v.seq; ++ default: ++ return 0; ++ } ++} ++ ++static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k) ++{ ++ return k->k.type == KEY_TYPE_btree_ptr_v2 ++ ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr ++ : NULL; ++} ++ ++/* is btree node in hash table? 
*/ ++static inline bool btree_node_hashed(struct btree *b) ++{ ++ return b->hash_val != 0; ++} ++ ++#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ ++ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ ++ &(_c)->btree_cache.table), \ ++ _iter = 0; _iter < (_tbl)->size; _iter++) \ ++ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) ++ ++static inline size_t btree_bytes(struct bch_fs *c) ++{ ++ return c->opts.btree_node_size; ++} ++ ++static inline size_t btree_max_u64s(struct bch_fs *c) ++{ ++ return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); ++} ++ ++static inline size_t btree_pages(struct bch_fs *c) ++{ ++ return btree_bytes(c) / PAGE_SIZE; ++} ++ ++static inline unsigned btree_blocks(struct bch_fs *c) ++{ ++ return btree_sectors(c) >> c->block_bits; ++} ++ ++#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) ++ ++#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) ++#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ ++ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ ++ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2)) ++ ++#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) ++ ++void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, ++ struct btree *); ++void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *); ++ ++#endif /* _BCACHEFS_BTREE_CACHE_H */ +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +new file mode 100644 +index 000000000000..2f563365ea4c +--- /dev/null ++++ b/fs/bcachefs/btree_gc.c +@@ -0,0 +1,2098 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2010 Kent Overstreet ++ * Copyright (C) 2014 Datera Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "bkey_methods.h" ++#include "bkey_buf.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "debug.h" ++#include "ec.h" ++#include "error.h" ++#include "extents.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "recovery.h" ++#include "reflink.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define DROP_THIS_NODE 10 ++#define DROP_PREV_NODE 11 ++ ++static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) ++{ ++ preempt_disable(); ++ write_seqcount_begin(&c->gc_pos_lock); ++ c->gc_pos = new_pos; ++ write_seqcount_end(&c->gc_pos_lock); ++ preempt_enable(); ++} ++ ++static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) ++{ ++ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); ++ __gc_pos_set(c, new_pos); ++} ++ ++/* ++ * Missing: if an interior btree node is empty, we need to do something - ++ * perhaps just kill it ++ */ ++static int bch2_gc_check_topology(struct bch_fs *c, ++ struct btree *b, ++ struct bkey_buf *prev, ++ struct bkey_buf cur, ++ bool is_last) ++{ ++ struct bpos node_start = b->data->min_key; ++ struct bpos node_end = b->data->max_key; ++ struct bpos expected_start = bkey_deleted(&prev->k->k) ++ ? 
node_start ++ : bpos_successor(prev->k->k.p); ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; ++ int ret = 0; ++ ++ if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); ++ ++ if (bpos_cmp(expected_start, bp->v.min_key)) { ++ bch2_topology_error(c); ++ ++ if (bkey_deleted(&prev->k->k)) { ++ prt_printf(&buf1, "start of node: "); ++ bch2_bpos_to_text(&buf1, node_start); ++ } else { ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); ++ } ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); ++ ++ if (__fsck_err(c, ++ FSCK_CAN_FIX| ++ FSCK_CAN_IGNORE| ++ FSCK_NO_RATELIMIT, ++ "btree node with incorrect min_key at btree %s level %u:\n" ++ " prev %s\n" ++ " cur %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1.buf, buf2.buf) && ++ !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { ++ bch_info(c, "Halting mark and sweep to start topology repair pass"); ++ ret = -BCH_ERR_need_topology_repair; ++ goto err; ++ } else { ++ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); ++ } ++ } ++ } ++ ++ if (is_last && bpos_cmp(cur.k->k.p, node_end)) { ++ bch2_topology_error(c); ++ ++ printbuf_reset(&buf1); ++ printbuf_reset(&buf2); ++ ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); ++ bch2_bpos_to_text(&buf2, node_end); ++ ++ if (__fsck_err(c, ++ FSCK_CAN_FIX| ++ FSCK_CAN_IGNORE| ++ FSCK_NO_RATELIMIT, ++ "btree node with incorrect max_key at btree %s level %u:\n" ++ " %s\n" ++ " expected %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1.buf, buf2.buf) && ++ !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { ++ bch_info(c, "Halting mark and sweep to start topology repair pass"); ++ ret = -BCH_ERR_need_topology_repair; ++ goto err; ++ } else { ++ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); ++ } ++ } ++ ++ bch2_bkey_buf_copy(prev, c, cur.k); ++err: ++fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) ++{ ++ switch (b->key.k.type) { ++ case KEY_TYPE_btree_ptr: { ++ struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key); ++ ++ dst->k.p = src->k.p; ++ dst->v.mem_ptr = 0; ++ dst->v.seq = b->data->keys.seq; ++ dst->v.sectors_written = 0; ++ dst->v.flags = 0; ++ dst->v.min_key = b->data->min_key; ++ set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k)); ++ memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k)); ++ break; ++ } ++ case KEY_TYPE_btree_ptr_v2: ++ bkey_copy(&dst->k_i, &b->key); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static void bch2_btree_node_update_key_early(struct bch_fs *c, ++ enum btree_id btree, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new) ++{ ++ struct btree *b; ++ struct bkey_buf tmp; ++ int ret; ++ ++ bch2_bkey_buf_init(&tmp); ++ bch2_bkey_buf_reassemble(&tmp, c, old); ++ ++ b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true); ++ if (!IS_ERR_OR_NULL(b)) { ++ mutex_lock(&c->btree_cache.lock); ++ ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ bkey_copy(&b->key, new); ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ ++ mutex_unlock(&c->btree_cache.lock); ++ six_unlock_read(&b->c.lock); ++ } ++ ++ bch2_bkey_buf_exit(&tmp, c); ++} ++ ++static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) ++{ ++ struct bkey_i_btree_ptr_v2 *new; ++ int ret; ++ ++ new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ btree_ptr_to_v2(b, 
new); ++ b->data->min_key = new_min; ++ new->v.min_key = new_min; ++ SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); ++ ++ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); ++ if (ret) { ++ kfree(new); ++ return ret; ++ } ++ ++ bch2_btree_node_drop_keys_outside_node(b); ++ bkey_copy(&b->key, &new->k_i); ++ return 0; ++} ++ ++static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) ++{ ++ struct bkey_i_btree_ptr_v2 *new; ++ int ret; ++ ++ ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); ++ if (ret) ++ return ret; ++ ++ new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ btree_ptr_to_v2(b, new); ++ b->data->max_key = new_max; ++ new->k.p = new_max; ++ SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); ++ ++ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); ++ if (ret) { ++ kfree(new); ++ return ret; ++ } ++ ++ bch2_btree_node_drop_keys_outside_node(b); ++ ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ bkey_copy(&b->key, &new->k_i); ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ mutex_unlock(&c->btree_cache.lock); ++ return 0; ++} ++ ++static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, ++ struct btree *prev, struct btree *cur) ++{ ++ struct bpos expected_start = !prev ++ ? b->data->min_key ++ : bpos_successor(prev->key.k.p); ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; ++ int ret = 0; ++ ++ if (!prev) { ++ prt_printf(&buf1, "start of node: "); ++ bch2_bpos_to_text(&buf1, b->data->min_key); ++ } else { ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); ++ } ++ ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); ++ ++ if (prev && ++ bpos_cmp(expected_start, cur->data->min_key) > 0 && ++ BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { ++ /* cur overwrites prev: */ ++ ++ if (mustfix_fsck_err_on(bpos_cmp(prev->data->min_key, ++ cur->data->min_key) >= 0, c, ++ "btree node overwritten by next node at btree %s level %u:\n" ++ " node %s\n" ++ " next %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1.buf, buf2.buf)) { ++ ret = DROP_PREV_NODE; ++ goto out; ++ } ++ ++ if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p, ++ bpos_predecessor(cur->data->min_key)), c, ++ "btree node with incorrect max_key at btree %s level %u:\n" ++ " node %s\n" ++ " next %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1.buf, buf2.buf)) ++ ret = set_node_max(c, prev, ++ bpos_predecessor(cur->data->min_key)); ++ } else { ++ /* prev overwrites cur: */ ++ ++ if (mustfix_fsck_err_on(bpos_cmp(expected_start, ++ cur->data->max_key) >= 0, c, ++ "btree node overwritten by prev node at btree %s level %u:\n" ++ " prev %s\n" ++ " node %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1.buf, buf2.buf)) { ++ ret = DROP_THIS_NODE; ++ goto out; ++ } ++ ++ if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c, ++ "btree node with incorrect min_key at btree %s level %u:\n" ++ " prev %s\n" ++ " node %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1.buf, buf2.buf)) ++ ret = set_node_min(c, cur, expected_start); ++ } ++out: ++fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++static int btree_repair_node_end(struct bch_fs *c, struct btree *b, ++ struct btree *child) ++{ ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; ++ int ret = 0; ++ ++ bch2_bkey_val_to_text(&buf1, c, 
bkey_i_to_s_c(&child->key)); ++ bch2_bpos_to_text(&buf2, b->key.k.p); ++ ++ if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c, ++ "btree node with incorrect max_key at btree %s level %u:\n" ++ " %s\n" ++ " expected %s", ++ bch2_btree_ids[b->c.btree_id], b->c.level, ++ buf1.buf, buf2.buf)) { ++ ret = set_node_max(c, child, b->key.k.p); ++ if (ret) ++ goto err; ++ } ++err: ++fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_and_journal_iter iter; ++ struct bkey_s_c k; ++ struct bkey_buf prev_k, cur_k; ++ struct btree *prev = NULL, *cur = NULL; ++ bool have_child, dropped_children = false; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ if (!b->c.level) ++ return 0; ++again: ++ prev = NULL; ++ have_child = dropped_children = false; ++ bch2_bkey_buf_init(&prev_k); ++ bch2_bkey_buf_init(&cur_k); ++ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); ++ BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); ++ ++ bch2_btree_and_journal_iter_advance(&iter); ++ bch2_bkey_buf_reassemble(&cur_k, c, k); ++ ++ cur = bch2_btree_node_get_noiter(c, cur_k.k, ++ b->c.btree_id, b->c.level - 1, ++ false); ++ ret = PTR_ERR_OR_ZERO(cur); ++ ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); ++ ++ if (mustfix_fsck_err_on(ret == -EIO, c, ++ "Topology repair: unreadable btree node at btree %s level %u:\n" ++ " %s", ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level - 1, ++ buf.buf)) { ++ bch2_btree_node_evict(c, cur_k.k); ++ ret = bch2_journal_key_delete(c, b->c.btree_id, ++ b->c.level, cur_k.k->k.p); ++ cur = NULL; ++ if (ret) ++ break; ++ continue; ++ } ++ ++ if (ret) { ++ bch_err(c, "%s: error getting btree node: %s", ++ __func__, bch2_err_str(ret)); ++ break; ++ } ++ ++ ret = btree_repair_node_boundaries(c, b, prev, cur); ++ ++ if (ret == DROP_THIS_NODE) { ++ six_unlock_read(&cur->c.lock); ++ bch2_btree_node_evict(c, cur_k.k); ++ ret = bch2_journal_key_delete(c, b->c.btree_id, ++ b->c.level, cur_k.k->k.p); ++ cur = NULL; ++ if (ret) ++ break; ++ continue; ++ } ++ ++ if (prev) ++ six_unlock_read(&prev->c.lock); ++ prev = NULL; ++ ++ if (ret == DROP_PREV_NODE) { ++ bch2_btree_node_evict(c, prev_k.k); ++ ret = bch2_journal_key_delete(c, b->c.btree_id, ++ b->c.level, prev_k.k->k.p); ++ if (ret) ++ break; ++ ++ bch2_btree_and_journal_iter_exit(&iter); ++ bch2_bkey_buf_exit(&prev_k, c); ++ bch2_bkey_buf_exit(&cur_k, c); ++ goto again; ++ } else if (ret) ++ break; ++ ++ prev = cur; ++ cur = NULL; ++ bch2_bkey_buf_copy(&prev_k, c, cur_k.k); ++ } ++ ++ if (!ret && !IS_ERR_OR_NULL(prev)) { ++ BUG_ON(cur); ++ ret = btree_repair_node_end(c, b, prev); ++ } ++ ++ if (!IS_ERR_OR_NULL(prev)) ++ six_unlock_read(&prev->c.lock); ++ prev = NULL; ++ if (!IS_ERR_OR_NULL(cur)) ++ six_unlock_read(&cur->c.lock); ++ cur = NULL; ++ ++ if (ret) ++ goto err; ++ ++ bch2_btree_and_journal_iter_exit(&iter); ++ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ bch2_bkey_buf_reassemble(&cur_k, c, k); ++ bch2_btree_and_journal_iter_advance(&iter); ++ ++ cur = bch2_btree_node_get_noiter(c, cur_k.k, ++ b->c.btree_id, b->c.level - 1, ++ false); ++ ret = PTR_ERR_OR_ZERO(cur); ++ ++ if (ret) { ++ bch_err(c, "%s: error getting btree node: %s", ++ __func__, bch2_err_str(ret)); ++ 
goto err; ++ } ++ ++ ret = bch2_btree_repair_topology_recurse(c, cur); ++ six_unlock_read(&cur->c.lock); ++ cur = NULL; ++ ++ if (ret == DROP_THIS_NODE) { ++ bch2_btree_node_evict(c, cur_k.k); ++ ret = bch2_journal_key_delete(c, b->c.btree_id, ++ b->c.level, cur_k.k->k.p); ++ dropped_children = true; ++ } ++ ++ if (ret) ++ goto err; ++ ++ have_child = true; ++ } ++ ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); ++ ++ if (mustfix_fsck_err_on(!have_child, c, ++ "empty interior btree node at btree %s level %u\n" ++ " %s", ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level, buf.buf)) ++ ret = DROP_THIS_NODE; ++err: ++fsck_err: ++ if (!IS_ERR_OR_NULL(prev)) ++ six_unlock_read(&prev->c.lock); ++ if (!IS_ERR_OR_NULL(cur)) ++ six_unlock_read(&cur->c.lock); ++ ++ bch2_btree_and_journal_iter_exit(&iter); ++ bch2_bkey_buf_exit(&prev_k, c); ++ bch2_bkey_buf_exit(&cur_k, c); ++ ++ if (!ret && dropped_children) ++ goto again; ++ ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int bch2_repair_topology(struct bch_fs *c) ++{ ++ struct btree *b; ++ unsigned i; ++ int ret = 0; ++ ++ for (i = 0; i < BTREE_ID_NR && !ret; i++) { ++ b = c->btree_roots[i].b; ++ if (btree_node_fake(b)) ++ continue; ++ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ ret = bch2_btree_repair_topology_recurse(c, b); ++ six_unlock_read(&b->c.lock); ++ ++ if (ret == DROP_THIS_NODE) { ++ bch_err(c, "empty btree root - repair unimplemented"); ++ ret = -BCH_ERR_fsck_repair_unimplemented; ++ } ++ } ++ ++ return ret; ++} ++ ++static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, ++ unsigned level, bool is_root, ++ struct bkey_s_c *k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p = { 0 }; ++ bool do_update = false; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ /* ++ * XXX ++ * use check_bucket_ref here ++ */ ++ bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); ++ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); ++ ++ if (c->opts.reconstruct_alloc || ++ fsck_err_on(!g->gen_valid, c, ++ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), ++ bch2_data_types[ptr_data_type(k->k, &p.ptr)], ++ p.ptr.gen, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { ++ if (!p.ptr.cached) { ++ g->gen_valid = true; ++ g->gen = p.ptr.gen; ++ } else { ++ do_update = true; ++ } ++ } ++ ++ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c, ++ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), ++ bch2_data_types[ptr_data_type(k->k, &p.ptr)], ++ p.ptr.gen, g->gen, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { ++ if (!p.ptr.cached) { ++ g->gen_valid = true; ++ g->gen = p.ptr.gen; ++ g->data_type = 0; ++ g->dirty_sectors = 0; ++ g->cached_sectors = 0; ++ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); ++ } else { ++ do_update = true; ++ } ++ } ++ ++ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, ++ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, ++ bch2_data_types[ptr_data_type(k->k, &p.ptr)], ++ p.ptr.gen, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) ++ 
do_update = true; ++ ++ if (fsck_err_on(!p.ptr.cached && ++ gen_cmp(p.ptr.gen, g->gen) < 0, c, ++ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), ++ bch2_data_types[ptr_data_type(k->k, &p.ptr)], ++ p.ptr.gen, g->gen, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) ++ do_update = true; ++ ++ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) ++ continue; ++ ++ if (fsck_err_on(g->data_type && ++ g->data_type != data_type, c, ++ "bucket %u:%zu different types of data in same bucket: %s, %s\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), ++ bch2_data_types[g->data_type], ++ bch2_data_types[data_type], ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { ++ if (data_type == BCH_DATA_btree) { ++ g->data_type = data_type; ++ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); ++ } else { ++ do_update = true; ++ } ++ } ++ ++ if (p.has_ec) { ++ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); ++ ++ if (fsck_err_on(!m || !m->alive, c, ++ "pointer to nonexistent stripe %llu\n" ++ "while marking %s", ++ (u64) p.ec.idx, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) ++ do_update = true; ++ ++ if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c, ++ "pointer does not match stripe %llu\n" ++ "while marking %s", ++ (u64) p.ec.idx, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) ++ do_update = true; ++ } ++ } ++ ++ if (do_update) { ++ struct bkey_ptrs ptrs; ++ union bch_extent_entry *entry; ++ struct bch_extent_ptr *ptr; ++ struct bkey_i *new; ++ ++ if (is_root) { ++ bch_err(c, "cannot update btree roots yet"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); ++ if (!new) { ++ bch_err(c, "%s: error allocating new key", __func__); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ bkey_reassemble(new, *k); ++ ++ if (level) { ++ /* ++ * We don't want to drop btree node pointers - if the ++ * btree node isn't there anymore, the read path will ++ * sort it out: ++ */ ++ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_GC_BUCKET(ca, ptr); ++ ++ ptr->gen = g->gen; ++ } ++ } else { ++ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_GC_BUCKET(ca, ptr); ++ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); ++ ++ (ptr->cached && ++ (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || ++ (!ptr->cached && ++ gen_cmp(ptr->gen, g->gen) < 0) || ++ gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || ++ (g->data_type && ++ g->data_type != data_type); ++ })); ++again: ++ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { ++ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, ++ entry->stripe_ptr.idx); ++ union bch_extent_entry *next_ptr; ++ ++ bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) ++ if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) ++ goto found; ++ next_ptr = NULL; ++found: ++ if (!next_ptr) { ++ bch_err(c, "aieee, found stripe ptr with no data ptr"); ++ continue; ++ } ++ ++ if (!m || !m->alive || ++ !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], ++ &next_ptr->ptr, ++ m->sectors)) { ++ bch2_bkey_extent_entry_drop(new, entry); ++ goto again; ++ } ++ } 
++ } ++ } ++ ++ ret = bch2_journal_key_insert_take(c, btree_id, level, new); ++ if (ret) { ++ kfree(new); ++ goto err; ++ } ++ ++ if (level) ++ bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new); ++ ++ if (c->opts.verbose) { ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, *k); ++ bch_info(c, "updated %s", buf.buf); ++ ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); ++ bch_info(c, "new key %s", buf.buf); ++ } ++ ++ *k = bkey_i_to_s_c(new); ++ } ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++/* marking of btree keys/nodes: */ ++ ++static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, ++ unsigned level, bool is_root, ++ struct bkey_s_c *k, ++ bool initial) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey deleted = KEY(0, 0, 0); ++ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; ++ unsigned flags = ++ BTREE_TRIGGER_GC| ++ (initial ? BTREE_TRIGGER_NOATOMIC : 0); ++ int ret = 0; ++ ++ deleted.p = k->k->p; ++ ++ if (initial) { ++ BUG_ON(bch2_journal_seq_verify && ++ k->k->version.lo > atomic64_read(&c->journal.seq)); ++ ++ ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); ++ if (ret) ++ goto err; ++ ++ if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, ++ "key version number higher than recorded: %llu > %llu", ++ k->k->version.lo, ++ atomic64_read(&c->key_version))) ++ atomic64_set(&c->key_version, k->k->version.lo); ++ } ++ ++ ret = commit_do(trans, NULL, NULL, 0, ++ bch2_mark_key(trans, old, *k, flags)); ++fsck_err: ++err: ++ if (ret) ++ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); ++ return ret; ++} ++ ++static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_node_iter iter; ++ struct bkey unpacked; ++ struct bkey_s_c k; ++ struct bkey_buf prev, cur; ++ int ret = 0; ++ ++ if (!btree_node_type_needs_gc(btree_node_type(b))) ++ return 0; ++ ++ bch2_btree_node_iter_init_from_start(&iter, b); ++ bch2_bkey_buf_init(&prev); ++ bch2_bkey_buf_init(&cur); ++ bkey_init(&prev.k->k); ++ ++ while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, ++ &k, initial); ++ if (ret) ++ break; ++ ++ bch2_btree_node_iter_advance(&iter, b); ++ ++ if (b->c.level) { ++ bch2_bkey_buf_reassemble(&cur, c, k); ++ ++ ret = bch2_gc_check_topology(c, b, &prev, cur, ++ bch2_btree_node_iter_end(&iter)); ++ if (ret) ++ break; ++ } ++ } ++ ++ bch2_bkey_buf_exit(&cur, c); ++ bch2_bkey_buf_exit(&prev, c); ++ return ret; ++} ++ ++static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, ++ bool initial, bool metadata_only) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct btree *b; ++ unsigned depth = metadata_only ? 
1 : 0; ++ int ret = 0; ++ ++ gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); ++ ++ __for_each_btree_node(trans, iter, btree_id, POS_MIN, ++ 0, depth, BTREE_ITER_PREFETCH, b, ret) { ++ bch2_verify_btree_nr_keys(b); ++ ++ gc_pos_set(c, gc_pos_btree_node(b)); ++ ++ ret = btree_gc_mark_node(trans, b, initial); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ if (ret) ++ return ret; ++ ++ mutex_lock(&c->btree_root_lock); ++ b = c->btree_roots[btree_id].b; ++ if (!btree_node_fake(b)) { ++ struct bkey_s_c k = bkey_i_to_s_c(&b->key); ++ ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, ++ true, &k, initial); ++ } ++ gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); ++ mutex_unlock(&c->btree_root_lock); ++ ++ return ret; ++} ++ ++static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b, ++ unsigned target_depth) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_and_journal_iter iter; ++ struct bkey_s_c k; ++ struct bkey_buf cur, prev; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); ++ bch2_bkey_buf_init(&prev); ++ bch2_bkey_buf_init(&cur); ++ bkey_init(&prev.k->k); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); ++ BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); ++ ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, ++ false, &k, true); ++ if (ret) { ++ bch_err(c, "%s: error from bch2_gc_mark_key: %s", ++ __func__, bch2_err_str(ret)); ++ goto fsck_err; ++ } ++ ++ if (b->c.level) { ++ bch2_bkey_buf_reassemble(&cur, c, k); ++ k = bkey_i_to_s_c(cur.k); ++ ++ bch2_btree_and_journal_iter_advance(&iter); ++ ++ ret = bch2_gc_check_topology(c, b, ++ &prev, cur, ++ !bch2_btree_and_journal_iter_peek(&iter).k); ++ if (ret) ++ goto fsck_err; ++ } else { ++ bch2_btree_and_journal_iter_advance(&iter); ++ } ++ } ++ ++ if (b->c.level > target_depth) { ++ bch2_btree_and_journal_iter_exit(&iter); ++ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ struct btree *child; ++ ++ bch2_bkey_buf_reassemble(&cur, c, k); ++ bch2_btree_and_journal_iter_advance(&iter); ++ ++ child = bch2_btree_node_get_noiter(c, cur.k, ++ b->c.btree_id, b->c.level - 1, ++ false); ++ ret = PTR_ERR_OR_ZERO(child); ++ ++ if (ret == -EIO) { ++ bch2_topology_error(c); ++ ++ if (__fsck_err(c, ++ FSCK_CAN_FIX| ++ FSCK_CAN_IGNORE| ++ FSCK_NO_RATELIMIT, ++ "Unreadable btree node at btree %s level %u:\n" ++ " %s", ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level - 1, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && ++ !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { ++ ret = -BCH_ERR_need_topology_repair; ++ bch_info(c, "Halting mark and sweep to start topology repair pass"); ++ goto fsck_err; ++ } else { ++ /* Continue marking when opted to not ++ * fix the error: */ ++ ret = 0; ++ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); ++ continue; ++ } ++ } else if (ret) { ++ bch_err(c, "%s: error getting btree node: %s", ++ __func__, bch2_err_str(ret)); ++ break; ++ } ++ ++ ret = bch2_gc_btree_init_recurse(trans, child, ++ target_depth); ++ six_unlock_read(&child->c.lock); ++ ++ if (ret) ++ break; ++ } ++ } ++fsck_err: ++ bch2_bkey_buf_exit(&cur, c); ++ bch2_bkey_buf_exit(&prev, c); ++ bch2_btree_and_journal_iter_exit(&iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int bch2_gc_btree_init(struct btree_trans *trans, ++ enum 
btree_id btree_id, ++ bool metadata_only) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b; ++ unsigned target_depth = metadata_only ? 1 : 0; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ b = c->btree_roots[btree_id].b; ++ ++ if (btree_node_fake(b)) ++ return 0; ++ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ printbuf_reset(&buf); ++ bch2_bpos_to_text(&buf, b->data->min_key); ++ if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, ++ "btree root with incorrect min_key: %s", buf.buf)) { ++ bch_err(c, "repair unimplemented"); ++ ret = -BCH_ERR_fsck_repair_unimplemented; ++ goto fsck_err; ++ } ++ ++ printbuf_reset(&buf); ++ bch2_bpos_to_text(&buf, b->data->max_key); ++ if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c, ++ "btree root with incorrect max_key: %s", buf.buf)) { ++ bch_err(c, "repair unimplemented"); ++ ret = -BCH_ERR_fsck_repair_unimplemented; ++ goto fsck_err; ++ } ++ ++ if (b->c.level >= target_depth) ++ ret = bch2_gc_btree_init_recurse(trans, b, target_depth); ++ ++ if (!ret) { ++ struct bkey_s_c k = bkey_i_to_s_c(&b->key); ++ ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, ++ &k, true); ++ } ++fsck_err: ++ six_unlock_read(&b->c.lock); ++ ++ if (ret < 0) ++ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) ++{ ++ return (int) btree_id_to_gc_phase(l) - ++ (int) btree_id_to_gc_phase(r); ++} ++ ++static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) ++{ ++ struct btree_trans trans; ++ enum btree_id ids[BTREE_ID_NR]; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ if (initial) ++ trans.is_initial_gc = true; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ ids[i] = i; ++ bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); ++ ++ for (i = 0; i < BTREE_ID_NR && !ret; i++) ++ ret = initial ++ ? 
bch2_gc_btree_init(&trans, ids[i], metadata_only) ++ : bch2_gc_btree(&trans, ids[i], initial, metadata_only); ++ ++ if (ret < 0) ++ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, ++ u64 start, u64 end, ++ enum bch_data_type type, ++ unsigned flags) ++{ ++ u64 b = sector_to_bucket(ca, start); ++ ++ do { ++ unsigned sectors = ++ min_t(u64, bucket_to_sector(ca, b + 1), end) - start; ++ ++ bch2_mark_metadata_bucket(c, ca, b, type, sectors, ++ gc_phase(GC_PHASE_SB), flags); ++ b++; ++ start += sectors; ++ } while (start < end); ++} ++ ++static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, ++ unsigned flags) ++{ ++ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; ++ unsigned i; ++ u64 b; ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ ++ if (offset == BCH_SB_SECTOR) ++ mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, ++ BCH_DATA_sb, flags); ++ ++ mark_metadata_sectors(c, ca, offset, ++ offset + (1 << layout->sb_max_size_bits), ++ BCH_DATA_sb, flags); ++ } ++ ++ for (i = 0; i < ca->journal.nr; i++) { ++ b = ca->journal.buckets[i]; ++ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, ++ ca->mi.bucket_size, ++ gc_phase(GC_PHASE_SB), flags); ++ } ++} ++ ++static void bch2_mark_superblocks(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ mutex_lock(&c->sb_lock); ++ gc_pos_set(c, gc_phase(GC_PHASE_SB)); ++ ++ for_each_online_member(ca, c, i) ++ bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); ++ mutex_unlock(&c->sb_lock); ++} ++ ++#if 0 ++/* Also see bch2_pending_btree_node_free_insert_done() */ ++static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) ++{ ++ struct btree_update *as; ++ struct pending_btree_node_free *d; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); ++ ++ for_each_pending_btree_node_free(c, as, d) ++ if (d->index_update_done) ++ bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++#endif ++ ++static void bch2_gc_free(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ genradix_free(&c->reflink_gc_table); ++ genradix_free(&c->gc_stripes); ++ ++ for_each_member_device(ca, c, i) { ++ kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), ++ sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket)); ++ ca->buckets_gc = NULL; ++ ++ free_percpu(ca->usage_gc); ++ ca->usage_gc = NULL; ++ } ++ ++ free_percpu(c->usage_gc); ++ c->usage_gc = NULL; ++} ++ ++static int bch2_gc_done(struct bch_fs *c, ++ bool initial, bool metadata_only) ++{ ++ struct bch_dev *ca = NULL; ++ struct printbuf buf = PRINTBUF; ++ bool verify = !metadata_only && ++ !c->opts.reconstruct_alloc && ++ (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); ++ unsigned i, dev; ++ int ret = 0; ++ ++ percpu_down_write(&c->mark_lock); ++ ++#define copy_field(_f, _msg, ...) \ ++ if (dst->_f != src->_f && \ ++ (!verify || \ ++ fsck_err(c, _msg ": got %llu, should be %llu" \ ++ , ##__VA_ARGS__, dst->_f, src->_f))) \ ++ dst->_f = src->_f ++#define copy_stripe_field(_f, _msg, ...) \ ++ if (dst->_f != src->_f && \ ++ (!verify || \ ++ fsck_err(c, "stripe %zu has wrong "_msg \ ++ ": got %u, should be %u", \ ++ iter.pos, ##__VA_ARGS__, \ ++ dst->_f, src->_f))) \ ++ dst->_f = src->_f ++#define copy_dev_field(_f, _msg, ...) 
\ ++ copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) ++#define copy_fs_field(_f, _msg, ...) \ ++ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) ++ ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ bch2_fs_usage_acc_to_base(c, i); ++ ++ for_each_member_device(ca, c, dev) { ++ struct bch_dev_usage *dst = ca->usage_base; ++ struct bch_dev_usage *src = (void *) ++ bch2_acc_percpu_u64s((void *) ca->usage_gc, ++ dev_usage_u64s()); ++ ++ copy_dev_field(buckets_ec, "buckets_ec"); ++ ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); ++ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); ++ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); ++ } ++ }; ++ ++ { ++ unsigned nr = fs_usage_u64s(c); ++ struct bch_fs_usage *dst = c->usage_base; ++ struct bch_fs_usage *src = (void *) ++ bch2_acc_percpu_u64s((void *) c->usage_gc, nr); ++ ++ copy_fs_field(hidden, "hidden"); ++ copy_fs_field(btree, "btree"); ++ ++ if (!metadata_only) { ++ copy_fs_field(data, "data"); ++ copy_fs_field(cached, "cached"); ++ copy_fs_field(reserved, "reserved"); ++ copy_fs_field(nr_inodes,"nr_inodes"); ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ copy_fs_field(persistent_reserved[i], ++ "persistent_reserved[%i]", i); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ if (metadata_only && ++ (e->data_type == BCH_DATA_user || ++ e->data_type == BCH_DATA_cached)) ++ continue; ++ ++ printbuf_reset(&buf); ++ bch2_replicas_entry_to_text(&buf, e); ++ ++ copy_fs_field(replicas[i], "%s", buf.buf); ++ } ++ } ++ ++#undef copy_fs_field ++#undef copy_dev_field ++#undef copy_stripe_field ++#undef copy_field ++fsck_err: ++ if (ca) ++ percpu_ref_put(&ca->ref); ++ if (ret) ++ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); ++ ++ percpu_up_write(&c->mark_lock); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int bch2_gc_start(struct bch_fs *c, ++ bool metadata_only) ++{ ++ struct bch_dev *ca = NULL; ++ unsigned i; ++ ++ BUG_ON(c->usage_gc); ++ ++ c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), ++ sizeof(u64), GFP_KERNEL); ++ if (!c->usage_gc) { ++ bch_err(c, "error allocating c->usage_gc"); ++ return -ENOMEM; ++ } ++ ++ for_each_member_device(ca, c, i) { ++ BUG_ON(ca->buckets_gc); ++ BUG_ON(ca->usage_gc); ++ ++ ca->usage_gc = alloc_percpu(struct bch_dev_usage); ++ if (!ca->usage_gc) { ++ bch_err(c, "error allocating ca->usage_gc"); ++ percpu_ref_put(&ca->ref); ++ return -ENOMEM; ++ } ++ ++ this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets, ++ ca->mi.nbuckets - ca->mi.first_bucket); ++ } ++ ++ return 0; ++} ++ ++/* returns true if not equal */ ++static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, ++ struct bch_alloc_v4 r) ++{ ++ return l.gen != r.gen || ++ l.oldest_gen != r.oldest_gen || ++ l.data_type != r.data_type || ++ l.dirty_sectors != r.dirty_sectors || ++ l.cached_sectors != r.cached_sectors || ++ l.stripe_redundancy != r.stripe_redundancy || ++ l.stripe != r.stripe; ++} ++ ++static int bch2_alloc_write_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ bool metadata_only) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); ++ struct bucket gc, *b; ++ struct bkey_i_alloc_v4 *a; ++ struct bch_alloc_v4 old, new; ++ enum bch_data_type type; ++ int ret; ++ ++ if (bkey_cmp(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) ++ 
return 1; ++ ++ bch2_alloc_to_v4(k, &old); ++ new = old; ++ ++ percpu_down_read(&c->mark_lock); ++ b = gc_bucket(ca, iter->pos.offset); ++ ++ /* ++ * b->data_type doesn't yet include need_discard & need_gc_gen states - ++ * fix that here: ++ */ ++ type = __alloc_data_type(b->dirty_sectors, ++ b->cached_sectors, ++ b->stripe, ++ old, ++ b->data_type); ++ if (b->data_type != type) { ++ struct bch_dev_usage *u; ++ ++ preempt_disable(); ++ u = this_cpu_ptr(ca->usage_gc); ++ u->d[b->data_type].buckets--; ++ b->data_type = type; ++ u->d[b->data_type].buckets++; ++ preempt_enable(); ++ } ++ ++ gc = *b; ++ percpu_up_read(&c->mark_lock); ++ ++ if (metadata_only && ++ gc.data_type != BCH_DATA_sb && ++ gc.data_type != BCH_DATA_journal && ++ gc.data_type != BCH_DATA_btree) ++ return 0; ++ ++ if (gen_after(old.gen, gc.gen)) ++ return 0; ++ ++#define copy_bucket_field(_f) \ ++ if (c->opts.reconstruct_alloc || \ ++ fsck_err_on(new._f != gc._f, c, \ ++ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ++ ": got %u, should be %u", \ ++ iter->pos.inode, iter->pos.offset, \ ++ gc.gen, \ ++ bch2_data_types[gc.data_type], \ ++ new._f, gc._f)) \ ++ new._f = gc._f; \ ++ ++ copy_bucket_field(gen); ++ copy_bucket_field(data_type); ++ copy_bucket_field(dirty_sectors); ++ copy_bucket_field(cached_sectors); ++ copy_bucket_field(stripe_redundancy); ++ copy_bucket_field(stripe); ++#undef copy_bucket_field ++ ++ if (!bch2_alloc_v4_cmp(old, new)) ++ return 0; ++ ++ a = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ return ret; ++ ++ a->v = new; ++ ++ /* ++ * The trigger normally makes sure this is set, but we're not running ++ * triggers: ++ */ ++ if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ]) ++ a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); ++ ++ ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); ++fsck_err: ++ return ret; ++} ++ ++static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_member_device(ca, c, i) { ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, ++ POS(ca->dev_idx, ca->mi.first_bucket), ++ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW, ++ bch2_alloc_write_key(&trans, &iter, k, metadata_only)); ++ ++ if (ret < 0) { ++ bch_err(c, "error writing alloc info: %s", bch2_err_str(ret)); ++ percpu_ref_put(&ca->ref); ++ break; ++ } ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret < 0 ? 
ret : 0; ++} ++ ++static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) ++{ ++ struct bch_dev *ca; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bucket *g; ++ struct bch_alloc_v4 a; ++ unsigned i; ++ int ret; ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket), ++ GFP_KERNEL|__GFP_ZERO); ++ if (!buckets) { ++ percpu_ref_put(&ca->ref); ++ bch_err(c, "error allocating ca->buckets[gc]"); ++ return -ENOMEM; ++ } ++ ++ buckets->first_bucket = ca->mi.first_bucket; ++ buckets->nbuckets = ca->mi.nbuckets; ++ rcu_assign_pointer(ca->buckets_gc, buckets); ++ }; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ g = gc_bucket(ca, k.k->p.offset); ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ g->gen_valid = 1; ++ g->gen = a.gen; ++ ++ if (metadata_only && ++ (a.data_type == BCH_DATA_user || ++ a.data_type == BCH_DATA_cached || ++ a.data_type == BCH_DATA_parity)) { ++ g->data_type = a.data_type; ++ g->dirty_sectors = a.dirty_sectors; ++ g->cached_sectors = a.cached_sectors; ++ g->stripe = a.stripe; ++ g->stripe_redundancy = a.stripe_redundancy; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret)); ++ ++ return ret; ++} ++ ++static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *buckets = gc_bucket_array(ca); ++ struct bucket *g; ++ ++ for_each_bucket(g, buckets) { ++ if (metadata_only && ++ (g->data_type == BCH_DATA_user || ++ g->data_type == BCH_DATA_cached || ++ g->data_type == BCH_DATA_parity)) ++ continue; ++ g->data_type = 0; ++ g->dirty_sectors = 0; ++ g->cached_sectors = 0; ++ } ++ }; ++} ++ ++static int bch2_gc_write_reflink_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ size_t *idx) ++{ ++ struct bch_fs *c = trans->c; ++ const __le64 *refcount = bkey_refcount_c(k); ++ struct printbuf buf = PRINTBUF; ++ struct reflink_gc *r; ++ int ret = 0; ++ ++ if (!refcount) ++ return 0; ++ ++ while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && ++ r->offset < k.k->p.offset) ++ ++*idx; ++ ++ if (!r || ++ r->offset != k.k->p.offset || ++ r->size != k.k->size) { ++ bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); ++ return -EINVAL; ++ } ++ ++ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, ++ "reflink key has wrong refcount:\n" ++ " %s\n" ++ " should be %u", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ r->refcount)) { ++ struct bkey_i *new; ++ ++ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(new, k); ++ ++ if (!r->refcount) ++ new->k.type = KEY_TYPE_deleted; ++ else ++ *bkey_refcount(new) = cpu_to_le64(r->refcount); ++ ++ ret = bch2_trans_update(trans, iter, new, 0); ++ } ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ size_t idx = 0; ++ int ret = 0; ++ ++ if (metadata_only) ++ return 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = 
for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_reflink, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_gc_write_reflink_key(&trans, &iter, k, &idx)); ++ ++ c->reflink_gc_nr = 0; ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int bch2_gc_reflink_start(struct bch_fs *c, ++ bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct reflink_gc *r; ++ int ret = 0; ++ ++ if (metadata_only) ++ return 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ c->reflink_gc_nr = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ const __le64 *refcount = bkey_refcount_c(k); ++ ++ if (!refcount) ++ continue; ++ ++ r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, ++ GFP_KERNEL); ++ if (!r) { ++ ret = -ENOMEM; ++ break; ++ } ++ ++ r->offset = k.k->p.offset; ++ r->size = k.k->size; ++ r->refcount = 0; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only) ++{ ++ struct genradix_iter iter; ++ struct reflink_gc *r; ++ ++ genradix_for_each(&c->reflink_gc_table, iter, r) ++ r->refcount = 0; ++} ++ ++static int bch2_gc_write_stripes_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct printbuf buf = PRINTBUF; ++ const struct bch_stripe *s; ++ struct gc_stripe *m; ++ unsigned i; ++ int ret = 0; ++ ++ if (k.k->type != KEY_TYPE_stripe) ++ return 0; ++ ++ s = bkey_s_c_to_stripe(k).v; ++ m = genradix_ptr(&c->gc_stripes, k.k->p.offset); ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) ++ goto inconsistent; ++ return 0; ++inconsistent: ++ if (fsck_err_on(true, c, ++ "stripe has wrong block sector count %u:\n" ++ " %s\n" ++ " should be %u", i, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ m ? m->block_sectors[i] : 0)) { ++ struct bkey_i_stripe *new; ++ ++ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(&new->k_i, k); ++ ++ for (i = 0; i < new->v.nr_blocks; i++) ++ stripe_blockcount_set(&new->v, i, m ? 
m->block_sectors[i] : 0); ++ ++ ret = bch2_trans_update(trans, iter, &new->k_i, 0); ++ } ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ if (metadata_only) ++ return 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_stripes, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_gc_write_stripes_key(&trans, &iter, k)); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) ++{ ++ genradix_free(&c->gc_stripes); ++} ++ ++/** ++ * bch2_gc - walk _all_ references to buckets, and recompute them: ++ * ++ * Order matters here: ++ * - Concurrent GC relies on the fact that we have a total ordering for ++ * everything that GC walks - see gc_will_visit_node(), ++ * gc_will_visit_root() ++ * ++ * - also, references move around in the course of index updates and ++ * various other crap: everything needs to agree on the ordering ++ * references are allowed to move around in - e.g., we're allowed to ++ * start with a reference owned by an open_bucket (the allocator) and ++ * move it to the btree, but not the reverse. ++ * ++ * This is necessary to ensure that gc doesn't miss references that ++ * move around - if references move backwards in the ordering GC ++ * uses, GC could skip past them ++ */ ++int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) ++{ ++ unsigned iter = 0; ++ int ret; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ down_write(&c->gc_lock); ++ ++ bch2_btree_interior_updates_flush(c); ++ ++ ret = bch2_gc_start(c, metadata_only) ?: ++ bch2_gc_alloc_start(c, metadata_only) ?: ++ bch2_gc_reflink_start(c, metadata_only); ++ if (ret) ++ goto out; ++again: ++ gc_pos_set(c, gc_phase(GC_PHASE_START)); ++ ++ bch2_mark_superblocks(c); ++ ++ if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) && ++ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) && ++ c->opts.fix_errors != FSCK_OPT_NO) { ++ bch_info(c, "Starting topology repair pass"); ++ ret = bch2_repair_topology(c); ++ if (ret) ++ goto out; ++ bch_info(c, "Topology repair pass done"); ++ ++ set_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags); ++ } ++ ++ ret = bch2_gc_btrees(c, initial, metadata_only); ++ ++ if (ret == -BCH_ERR_need_topology_repair && ++ !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) && ++ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { ++ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); ++ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, true); ++ ret = 0; ++ } ++ ++ if (ret == -BCH_ERR_need_topology_repair) ++ ret = -BCH_ERR_fsck_errors_not_fixed; ++ ++ if (ret) ++ goto out; ++ ++#if 0 ++ bch2_mark_pending_btree_node_frees(c); ++#endif ++ c->gc_count++; ++ ++ if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || ++ (!iter && bch2_test_restart_gc)) { ++ if (iter++ > 2) { ++ bch_info(c, "Unable to fix bucket gens, looping"); ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ /* ++ * XXX: make sure gens we fixed got saved ++ */ ++ bch_info(c, "Second GC pass needed, restarting:"); ++ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); ++ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); ++ ++ bch2_gc_stripes_reset(c, metadata_only); ++ bch2_gc_alloc_reset(c, metadata_only); ++ bch2_gc_reflink_reset(c, metadata_only); ++ ++ /* flush fsck errors, reset counters */ ++ bch2_flush_fsck_errs(c); ++ goto again; ++ } ++out: ++ 
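++ /*
++ * GC walk finished: if it succeeded, write the recomputed stripe,
++ * reflink and alloc info back and reconcile the in-memory usage
++ * counters with the journal blocked, then free the gc-only state:
++ */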
if (!ret) { ++ bch2_journal_block(&c->journal); ++ ++ ret = bch2_gc_stripes_done(c, metadata_only) ?: ++ bch2_gc_reflink_done(c, metadata_only) ?: ++ bch2_gc_alloc_done(c, metadata_only) ?: ++ bch2_gc_done(c, initial, metadata_only); ++ ++ bch2_journal_unblock(&c->journal); ++ } ++ ++ percpu_down_write(&c->mark_lock); ++ /* Indicates that gc is no longer in progress: */ ++ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); ++ ++ bch2_gc_free(c); ++ percpu_up_write(&c->mark_lock); ++ ++ up_write(&c->gc_lock); ++ ++ /* ++ * At startup, allocations can happen directly instead of via the ++ * allocator thread - issue wakeup in case they blocked on gc_lock: ++ */ ++ closure_wake_up(&c->freelist_wait); ++ return ret; ++} ++ ++static int gc_btree_gens_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ struct bkey_i *u; ++ int ret; ++ ++ percpu_down_read(&c->mark_lock); ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (ptr_stale(ca, ptr) > 16) { ++ percpu_up_read(&c->mark_lock); ++ goto update; ++ } ++ } ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; ++ ++ if (gen_after(*gen, ptr->gen)) ++ *gen = ptr->gen; ++ } ++ percpu_up_read(&c->mark_lock); ++ return 0; ++update: ++ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(u, k); ++ ++ bch2_extent_normalize(c, bkey_i_to_s(u)); ++ return bch2_trans_update(trans, iter, u, 0); ++} ++ ++static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); ++ struct bch_alloc_v4 a; ++ struct bkey_i_alloc_v4 *a_mut; ++ int ret; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (a.oldest_gen == ca->oldest_gen[iter->pos.offset]) ++ return 0; ++ ++ a_mut = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a_mut); ++ if (ret) ++ return ret; ++ ++ a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; ++ a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type); ++ ++ return bch2_trans_update(trans, iter, &a_mut->k_i, 0); ++} ++ ++int bch2_gc_gens(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_dev *ca; ++ u64 b, start_time = local_clock(); ++ unsigned i; ++ int ret; ++ ++ /* ++ * Ideally we would be using state_lock and not gc_lock here, but that ++ * introduces a deadlock in the RO path - we currently take the state ++ * lock at the start of going RO, thus the gc thread may get stuck: ++ */ ++ if (!mutex_trylock(&c->gc_gens_lock)) ++ return 0; ++ ++ trace_gc_gens_start(c); ++ down_read(&c->gc_lock); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_gens *gens; ++ ++ BUG_ON(ca->oldest_gen); ++ ++ ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL); ++ if (!ca->oldest_gen) { ++ percpu_ref_put(&ca->ref); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ gens = bucket_gens(ca); ++ ++ for (b = gens->first_bucket; ++ b < gens->nbuckets; b++) ++ ca->oldest_gen[b] = gens->b[b]; ++ } ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if ((1 << i) & BTREE_ID_HAS_PTRS) { ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ ++ c->gc_gens_btree = i; ++ c->gc_gens_pos = POS_MIN; 
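++ /*
++ * Walk this btree and have gc_btree_gens_key() record the oldest
++ * pointer generation seen for each bucket, rewriting keys whose
++ * pointers have gone excessively stale:
++ */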
++ ret = for_each_btree_key_commit(&trans, iter, i, ++ POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, ++ k, ++ NULL, NULL, ++ BTREE_INSERT_NOFAIL, ++ gc_btree_gens_key(&trans, &iter, k)); ++ if (ret) { ++ bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ } ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, ++ POS_MIN, ++ BTREE_ITER_PREFETCH, ++ k, ++ NULL, NULL, ++ BTREE_INSERT_NOFAIL, ++ bch2_alloc_write_oldest_gen(&trans, &iter, k)); ++ if (ret) { ++ bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ c->gc_gens_btree = 0; ++ c->gc_gens_pos = POS_MIN; ++ ++ c->gc_count++; ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); ++ trace_gc_gens_end(c); ++err: ++ for_each_member_device(ca, c, i) { ++ kvfree(ca->oldest_gen); ++ ca->oldest_gen = NULL; ++ } ++ ++ bch2_trans_exit(&trans); ++ up_read(&c->gc_lock); ++ mutex_unlock(&c->gc_gens_lock); ++ return ret; ++} ++ ++static int bch2_gc_thread(void *arg) ++{ ++ struct bch_fs *c = arg; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ unsigned long last = atomic64_read(&clock->now); ++ unsigned last_kick = atomic_read(&c->kick_gc); ++ int ret; ++ ++ set_freezable(); ++ ++ while (1) { ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ if (kthread_should_stop()) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ if (atomic_read(&c->kick_gc) != last_kick) ++ break; ++ ++ if (c->btree_gc_periodic) { ++ unsigned long next = last + c->capacity / 16; ++ ++ if (atomic64_read(&clock->now) >= next) ++ break; ++ ++ bch2_io_clock_schedule_timeout(clock, next); ++ } else { ++ schedule(); ++ } ++ ++ try_to_freeze(); ++ } ++ __set_current_state(TASK_RUNNING); ++ ++ last = atomic64_read(&clock->now); ++ last_kick = atomic_read(&c->kick_gc); ++ ++ /* ++ * Full gc is currently incompatible with btree key cache: ++ */ ++#if 0 ++ ret = bch2_gc(c, false, false); ++#else ++ ret = bch2_gc_gens(c); ++#endif ++ if (ret < 0) ++ bch_err(c, "btree gc failed: %s", bch2_err_str(ret)); ++ ++ debug_check_no_locks_held(); ++ } ++ ++ return 0; ++} ++ ++void bch2_gc_thread_stop(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ p = c->gc_thread; ++ c->gc_thread = NULL; ++ ++ if (p) { ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++int bch2_gc_thread_start(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ if (c->gc_thread) ++ return 0; ++ ++ p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); ++ if (IS_ERR(p)) { ++ bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p))); ++ return PTR_ERR(p); ++ } ++ ++ get_task_struct(p); ++ c->gc_thread = p; ++ wake_up_process(p); ++ return 0; ++} +diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +new file mode 100644 +index 000000000000..95d803b5743d +--- /dev/null ++++ b/fs/bcachefs/btree_gc.h +@@ -0,0 +1,112 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_GC_H ++#define _BCACHEFS_BTREE_GC_H ++ ++#include "btree_types.h" ++ ++int bch2_gc(struct bch_fs *, bool, bool); ++int bch2_gc_gens(struct bch_fs *); ++void bch2_gc_thread_stop(struct bch_fs *); ++int bch2_gc_thread_start(struct bch_fs *); ++ ++/* ++ * For concurrent mark and sweep (with other index updates), we define a total ++ * ordering of _all_ references GC walks: ++ * ++ * Note that some references will have the same GC position as others - e.g. 
++ * everything within the same btree node; in those cases we're relying on ++ * whatever locking exists for where those references live, i.e. the write lock ++ * on a btree node. ++ * ++ * That locking is also required to ensure GC doesn't pass the updater in ++ * between the updater adding/removing the reference and updating the GC marks; ++ * without that, we would at best double count sometimes. ++ * ++ * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ ++ * be held that prevents GC from passing the position the updater is at. ++ * ++ * (What about the start of gc, when we're clearing all the marks? GC clears the ++ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc ++ * position inside its cmpxchg loop, so crap magically works). ++ */ ++ ++/* Position of (the start of) a gc phase: */ ++static inline struct gc_pos gc_phase(enum gc_phase phase) ++{ ++ return (struct gc_pos) { ++ .phase = phase, ++ .pos = POS_MIN, ++ .level = 0, ++ }; ++} ++ ++static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) ++{ ++ return cmp_int(l.phase, r.phase) ?: ++ bpos_cmp(l.pos, r.pos) ?: ++ cmp_int(l.level, r.level); ++} ++ ++static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) ++{ ++ switch (id) { ++#define x(name, v) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; ++ BCH_BTREE_IDS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline struct gc_pos gc_pos_btree(enum btree_id id, ++ struct bpos pos, unsigned level) ++{ ++ return (struct gc_pos) { ++ .phase = btree_id_to_gc_phase(id), ++ .pos = pos, ++ .level = level, ++ }; ++} ++ ++/* ++ * GC position of the pointers within a btree node: note, _not_ for &b->key ++ * itself, that lives in the parent node: ++ */ ++static inline struct gc_pos gc_pos_btree_node(struct btree *b) ++{ ++ return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); ++} ++ ++/* ++ * GC position of the pointer to a btree root: we don't use ++ * gc_pos_pointer_to_btree_node() here to avoid a potential race with ++ * btree_split() increasing the tree depth - the new root will have level > the ++ * old root and thus have a greater gc position than the old root, but that ++ * would be incorrect since once gc has marked the root it's not coming back. 
++ */ ++static inline struct gc_pos gc_pos_btree_root(enum btree_id id) ++{ ++ return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH); ++} ++ ++static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) ++{ ++ unsigned seq; ++ bool ret; ++ ++ do { ++ seq = read_seqcount_begin(&c->gc_pos_lock); ++ ret = gc_pos_cmp(pos, c->gc_pos) <= 0; ++ } while (read_seqcount_retry(&c->gc_pos_lock, seq)); ++ ++ return ret; ++} ++ ++static inline void bch2_do_gc_gens(struct bch_fs *c) ++{ ++ atomic_inc(&c->kick_gc); ++ if (c->gc_thread) ++ wake_up_process(c->gc_thread); ++} ++ ++#endif /* _BCACHEFS_BTREE_GC_H */ +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +new file mode 100644 +index 000000000000..ae731b3a3908 +--- /dev/null ++++ b/fs/bcachefs/btree_io.c +@@ -0,0 +1,2150 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "bkey_sort.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "debug.h" ++#include "error.h" ++#include "extents.h" ++#include "io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++void bch2_btree_node_io_unlock(struct btree *b) ++{ ++ EBUG_ON(!btree_node_write_in_flight(b)); ++ ++ clear_btree_node_write_in_flight_inner(b); ++ clear_btree_node_write_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); ++} ++ ++void bch2_btree_node_io_lock(struct btree *b) ++{ ++ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ ++ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++void __bch2_btree_node_wait_on_read(struct btree *b) ++{ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++void __bch2_btree_node_wait_on_write(struct btree *b) ++{ ++ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++void bch2_btree_node_wait_on_read(struct btree *b) ++{ ++ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++void bch2_btree_node_wait_on_write(struct btree *b) ++{ ++ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ ++ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++static void verify_no_dups(struct btree *b, ++ struct bkey_packed *start, ++ struct bkey_packed *end) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bkey_packed *k, *p; ++ ++ if (start == end) ++ return; ++ ++ for (p = start, k = bkey_next(start); ++ k != end; ++ p = k, k = bkey_next(k)) { ++ struct bkey l = bkey_unpack_key(b, p); ++ struct bkey r = bkey_unpack_key(b, k); ++ ++ BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0); ++ } ++#endif ++} ++ ++static void set_needs_whiteout(struct bset *i, int v) ++{ ++ struct bkey_packed *k; ++ ++ for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) ++ k->needs_whiteout = v; ++} ++ ++static void btree_bounce_free(struct bch_fs *c, size_t size, ++ bool used_mempool, void *p) ++{ ++ if (used_mempool) ++ mempool_free(p, &c->btree_bounce_pool); ++ else ++ vpfree(p, size); ++} ++ ++static void *btree_bounce_alloc(struct bch_fs *c, size_t size, ++ bool *used_mempool) ++{ ++ unsigned flags = memalloc_nofs_save(); ++ void *p; ++ ++ BUG_ON(size > btree_bytes(c)); ++ ++ *used_mempool = false; ++ p = 
vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); ++ if (!p) { ++ *used_mempool = true; ++ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); ++ } ++ memalloc_nofs_restore(flags); ++ return p; ++} ++ ++static void sort_bkey_ptrs(const struct btree *bt, ++ struct bkey_packed **ptrs, unsigned nr) ++{ ++ unsigned n = nr, a = nr / 2, b, c, d; ++ ++ if (!a) ++ return; ++ ++ /* Heap sort: see lib/sort.c: */ ++ while (1) { ++ if (a) ++ a--; ++ else if (--n) ++ swap(ptrs[0], ptrs[n]); ++ else ++ break; ++ ++ for (b = a; c = 2 * b + 1, (d = c + 1) < n;) ++ b = bch2_bkey_cmp_packed(bt, ++ ptrs[c], ++ ptrs[d]) >= 0 ? c : d; ++ if (d == n) ++ b = c; ++ ++ while (b != a && ++ bch2_bkey_cmp_packed(bt, ++ ptrs[a], ++ ptrs[b]) >= 0) ++ b = (b - 1) / 2; ++ c = b; ++ while (b != a) { ++ b = (b - 1) / 2; ++ swap(ptrs[b], ptrs[c]); ++ } ++ } ++} ++ ++static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) ++{ ++ struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; ++ bool used_mempool = false; ++ size_t bytes = b->whiteout_u64s * sizeof(u64); ++ ++ if (!b->whiteout_u64s) ++ return; ++ ++ new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); ++ ++ ptrs = ptrs_end = ((void *) new_whiteouts + bytes); ++ ++ for (k = unwritten_whiteouts_start(c, b); ++ k != unwritten_whiteouts_end(c, b); ++ k = bkey_next(k)) ++ *--ptrs = k; ++ ++ sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); ++ ++ k = new_whiteouts; ++ ++ while (ptrs != ptrs_end) { ++ bkey_copy(k, *ptrs); ++ k = bkey_next(k); ++ ptrs++; ++ } ++ ++ verify_no_dups(b, new_whiteouts, ++ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); ++ ++ memcpy_u64s(unwritten_whiteouts_start(c, b), ++ new_whiteouts, b->whiteout_u64s); ++ ++ btree_bounce_free(c, bytes, used_mempool, new_whiteouts); ++} ++ ++static bool should_compact_bset(struct btree *b, struct bset_tree *t, ++ bool compacting, enum compact_mode mode) ++{ ++ if (!bset_dead_u64s(b, t)) ++ return false; ++ ++ switch (mode) { ++ case COMPACT_LAZY: ++ return should_compact_bset_lazy(b, t) || ++ (compacting && !bset_written(b, bset(b, t))); ++ case COMPACT_ALL: ++ return true; ++ default: ++ BUG(); ++ } ++} ++ ++static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) ++{ ++ struct bset_tree *t; ++ bool ret = false; ++ ++ for_each_bset(b, t) { ++ struct bset *i = bset(b, t); ++ struct bkey_packed *k, *n, *out, *start, *end; ++ struct btree_node_entry *src = NULL, *dst = NULL; ++ ++ if (t != b->set && !bset_written(b, i)) { ++ src = container_of(i, struct btree_node_entry, keys); ++ dst = max(write_block(b), ++ (void *) btree_bkey_last(b, t - 1)); ++ } ++ ++ if (src != dst) ++ ret = true; ++ ++ if (!should_compact_bset(b, t, ret, mode)) { ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src) + ++ le16_to_cpu(src->keys.u64s) * ++ sizeof(u64)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ continue; ++ } ++ ++ start = btree_bkey_first(b, t); ++ end = btree_bkey_last(b, t); ++ ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ ++ out = i->start; ++ ++ for (k = start; k != end; k = n) { ++ n = bkey_next(k); ++ ++ if (!bkey_deleted(k)) { ++ bkey_copy(out, k); ++ out = bkey_next(out); ++ } else { ++ BUG_ON(k->needs_whiteout); ++ } ++ } ++ ++ i->u64s = cpu_to_le16((u64 *) out - i->_data); ++ set_btree_bset_end(b, t); ++ bch2_bset_set_no_aux_tree(b, t); ++ ret = true; ++ } ++ ++ bch2_verify_btree_nr_keys(b); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ return ret; ++} ++ ++bool bch2_compact_whiteouts(struct bch_fs *c, struct 
btree *b, ++ enum compact_mode mode) ++{ ++ return bch2_drop_whiteouts(b, mode); ++} ++ ++static void btree_node_sort(struct bch_fs *c, struct btree *b, ++ unsigned start_idx, ++ unsigned end_idx, ++ bool filter_whiteouts) ++{ ++ struct btree_node *out; ++ struct sort_iter sort_iter; ++ struct bset_tree *t; ++ struct bset *start_bset = bset(b, &b->set[start_idx]); ++ bool used_mempool = false; ++ u64 start_time, seq = 0; ++ unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1; ++ bool sorting_entire_node = start_idx == 0 && ++ end_idx == b->nsets; ++ ++ sort_iter_init(&sort_iter, b); ++ ++ for (t = b->set + start_idx; ++ t < b->set + end_idx; ++ t++) { ++ u64s += le16_to_cpu(bset(b, t)->u64s); ++ sort_iter_add(&sort_iter, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ } ++ ++ bytes = sorting_entire_node ++ ? btree_bytes(c) ++ : __vstruct_bytes(struct btree_node, u64s); ++ ++ out = btree_bounce_alloc(c, bytes, &used_mempool); ++ ++ start_time = local_clock(); ++ ++ u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts); ++ ++ out->keys.u64s = cpu_to_le16(u64s); ++ ++ BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); ++ ++ if (sorting_entire_node) ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], ++ start_time); ++ ++ /* Make sure we preserve bset journal_seq: */ ++ for (t = b->set + start_idx; t < b->set + end_idx; t++) ++ seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); ++ start_bset->journal_seq = cpu_to_le64(seq); ++ ++ if (sorting_entire_node) { ++ unsigned u64s = le16_to_cpu(out->keys.u64s); ++ ++ BUG_ON(bytes != btree_bytes(c)); ++ ++ /* ++ * Our temporary buffer is the same size as the btree node's ++ * buffer, we can just swap buffers instead of doing a big ++ * memcpy() ++ */ ++ *out = *b->data; ++ out->keys.u64s = cpu_to_le16(u64s); ++ swap(out, b->data); ++ set_btree_bset(b, b->set, &b->data->keys); ++ } else { ++ start_bset->u64s = out->keys.u64s; ++ memcpy_u64s(start_bset->start, ++ out->keys.start, ++ le16_to_cpu(out->keys.u64s)); ++ } ++ ++ for (i = start_idx + 1; i < end_idx; i++) ++ b->nr.bset_u64s[start_idx] += ++ b->nr.bset_u64s[i]; ++ ++ b->nsets -= shift; ++ ++ for (i = start_idx + 1; i < b->nsets; i++) { ++ b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; ++ b->set[i] = b->set[i + shift]; ++ } ++ ++ for (i = b->nsets; i < MAX_BSETS; i++) ++ b->nr.bset_u64s[i] = 0; ++ ++ set_btree_bset_end(b, &b->set[start_idx]); ++ bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); ++ ++ btree_bounce_free(c, bytes, used_mempool, out); ++ ++ bch2_verify_btree_nr_keys(b); ++} ++ ++void bch2_btree_sort_into(struct bch_fs *c, ++ struct btree *dst, ++ struct btree *src) ++{ ++ struct btree_nr_keys nr; ++ struct btree_node_iter src_iter; ++ u64 start_time = local_clock(); ++ ++ BUG_ON(dst->nsets != 1); ++ ++ bch2_bset_set_no_aux_tree(dst, dst->set); ++ ++ bch2_btree_node_iter_init_from_start(&src_iter, src); ++ ++ nr = bch2_sort_repack(btree_bset_first(dst), ++ src, &src_iter, ++ &dst->format, ++ true); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], ++ start_time); ++ ++ set_btree_bset_end(dst, dst->set); ++ ++ dst->nr.live_u64s += nr.live_u64s; ++ dst->nr.bset_u64s[0] += nr.bset_u64s[0]; ++ dst->nr.packed_keys += nr.packed_keys; ++ dst->nr.unpacked_keys += nr.unpacked_keys; ++ ++ bch2_verify_btree_nr_keys(dst); ++} ++ ++#define SORT_CRIT (4096 / sizeof(u64)) ++ ++/* ++ * We're about to add another bset to the btree node, so if there's currently ++ * too many bsets - sort some of them together: ++ */ ++static bool 
btree_node_compact(struct bch_fs *c, struct btree *b) ++{ ++ unsigned unwritten_idx; ++ bool ret = false; ++ ++ for (unwritten_idx = 0; ++ unwritten_idx < b->nsets; ++ unwritten_idx++) ++ if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) ++ break; ++ ++ if (b->nsets - unwritten_idx > 1) { ++ btree_node_sort(c, b, unwritten_idx, ++ b->nsets, false); ++ ret = true; ++ } ++ ++ if (unwritten_idx > 1) { ++ btree_node_sort(c, b, 0, unwritten_idx, false); ++ ret = true; ++ } ++ ++ return ret; ++} ++ ++void bch2_btree_build_aux_trees(struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ bch2_bset_build_aux_tree(b, t, ++ !bset_written(b, bset(b, t)) && ++ t == bset_tree_last(b)); ++} ++ ++/* ++ * @bch_btree_init_next - initialize a new (unwritten) bset that can then be ++ * inserted into ++ * ++ * Safe to call if there already is an unwritten bset - will only add a new bset ++ * if @b doesn't already have one. ++ * ++ * Returns true if we sorted (i.e. invalidated iterators ++ */ ++void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_node_entry *bne; ++ bool reinit_iter = false; ++ ++ EBUG_ON(!(b->c.lock.state.seq & 1)); ++ BUG_ON(bset_written(b, bset(b, &b->set[1]))); ++ ++ if (b->nsets == MAX_BSETS && ++ !btree_node_write_in_flight(b)) { ++ unsigned log_u64s[] = { ++ ilog2(bset_u64s(&b->set[0])), ++ ilog2(bset_u64s(&b->set[1])), ++ ilog2(bset_u64s(&b->set[2])), ++ }; ++ ++ if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { ++ bch2_btree_node_write(c, b, SIX_LOCK_write, 0); ++ reinit_iter = true; ++ } ++ } ++ ++ if (b->nsets == MAX_BSETS && ++ btree_node_compact(c, b)) ++ reinit_iter = true; ++ ++ BUG_ON(b->nsets >= MAX_BSETS); ++ ++ bne = want_new_bset(c, b); ++ if (bne) ++ bch2_bset_init_next(c, b, bne); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ if (reinit_iter) ++ bch2_trans_node_reinit_iter(trans, b); ++} ++ ++static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, ++ struct btree *b) ++{ ++ prt_printf(out, "%s level %u/%u\n ", ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level, ++ c->btree_roots[b->c.btree_id].level); ++ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++} ++ ++static void btree_err_msg(struct printbuf *out, struct bch_fs *c, ++ struct bch_dev *ca, ++ struct btree *b, struct bset *i, ++ unsigned offset, int write) ++{ ++ prt_printf(out, "error validating btree node "); ++ if (write) ++ prt_printf(out, "before write "); ++ if (ca) ++ prt_printf(out, "on %s ", ca->name); ++ prt_printf(out, "at btree "); ++ btree_pos_to_text(out, c, b); ++ ++ prt_printf(out, "\n node offset %u", b->written); ++ if (i) ++ prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); ++} ++ ++enum btree_err_type { ++ BTREE_ERR_FIXABLE, ++ BTREE_ERR_WANT_RETRY, ++ BTREE_ERR_MUST_RETRY, ++ BTREE_ERR_FATAL, ++}; ++ ++enum btree_validate_ret { ++ BTREE_RETRY_READ = 64, ++}; ++ ++#define btree_err(type, c, ca, b, i, msg, ...) 
\ ++({ \ ++ __label__ out; \ ++ struct printbuf out = PRINTBUF; \ ++ \ ++ btree_err_msg(&out, c, ca, b, i, b->written, write); \ ++ prt_printf(&out, ": " msg, ##__VA_ARGS__); \ ++ \ ++ if (type == BTREE_ERR_FIXABLE && \ ++ write == READ && \ ++ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ ++ mustfix_fsck_err(c, "%s", out.buf); \ ++ goto out; \ ++ } \ ++ \ ++ switch (write) { \ ++ case READ: \ ++ bch_err(c, "%s", out.buf); \ ++ \ ++ switch (type) { \ ++ case BTREE_ERR_FIXABLE: \ ++ ret = -BCH_ERR_fsck_errors_not_fixed; \ ++ goto fsck_err; \ ++ case BTREE_ERR_WANT_RETRY: \ ++ if (have_retry) { \ ++ ret = BTREE_RETRY_READ; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ case BTREE_ERR_MUST_RETRY: \ ++ ret = BTREE_RETRY_READ; \ ++ goto fsck_err; \ ++ case BTREE_ERR_FATAL: \ ++ ret = -BCH_ERR_fsck_errors_not_fixed; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ case WRITE: \ ++ bch_err(c, "corrupt metadata before write: %s", out.buf);\ ++ \ ++ if (bch2_fs_inconsistent(c)) { \ ++ ret = -BCH_ERR_fsck_errors_not_fixed; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ } \ ++out: \ ++ printbuf_exit(&out); \ ++ true; \ ++}) ++ ++#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) ++ ++/* ++ * When btree topology repair changes the start or end of a node, that might ++ * mean we have to drop keys that are no longer inside the node: ++ */ ++void bch2_btree_node_drop_keys_outside_node(struct btree *b) ++{ ++ struct bset_tree *t; ++ struct bkey_s_c k; ++ struct bkey unpacked; ++ struct btree_node_iter iter; ++ ++ for_each_bset(b, t) { ++ struct bset *i = bset(b, t); ++ struct bkey_packed *k; ++ ++ for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) ++ if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0) ++ break; ++ ++ if (k != i->start) { ++ unsigned shift = (u64 *) k - (u64 *) i->start; ++ ++ memmove_u64s_down(i->start, k, ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift); ++ set_btree_bset_end(b, t); ++ bch2_bset_set_no_aux_tree(b, t); ++ } ++ ++ for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) ++ if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0) ++ break; ++ ++ if (k != vstruct_last(i)) { ++ i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start); ++ set_btree_bset_end(b, t); ++ bch2_bset_set_no_aux_tree(b, t); ++ } ++ } ++ ++ bch2_btree_build_aux_trees(b); ++ ++ for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { ++ BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); ++ BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); ++ } ++} ++ ++static int validate_bset(struct bch_fs *c, struct bch_dev *ca, ++ struct btree *b, struct bset *i, ++ unsigned offset, unsigned sectors, ++ int write, bool have_retry) ++{ ++ unsigned version = le16_to_cpu(i->version); ++ const char *err; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ int ret = 0; ++ ++ btree_err_on((version != BCH_BSET_VERSION_OLD && ++ version < bcachefs_metadata_version_min) || ++ version >= bcachefs_metadata_version_max, ++ BTREE_ERR_FATAL, c, ca, b, i, ++ "unsupported bset version"); ++ ++ if (btree_err_on(version < c->sb.version_min, ++ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ "bset version %u older than superblock version_min %u", ++ version, c->sb.version_min)) { ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->version_min = cpu_to_le16(version); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ if (btree_err_on(version > c->sb.version, ++ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ "bset version %u newer than superblock version %u", ++ 
version, c->sb.version)) { ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->version = cpu_to_le16(version); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ btree_err_on(BSET_SEPARATE_WHITEOUTS(i), ++ BTREE_ERR_FATAL, c, ca, b, i, ++ "BSET_SEPARATE_WHITEOUTS no longer supported"); ++ ++ if (btree_err_on(offset + sectors > btree_sectors(c), ++ BTREE_ERR_FIXABLE, c, ca, b, i, ++ "bset past end of btree node")) { ++ i->u64s = 0; ++ ret = 0; ++ goto out; ++ } ++ ++ btree_err_on(offset && !i->u64s, ++ BTREE_ERR_FIXABLE, c, ca, b, i, ++ "empty bset"); ++ ++ btree_err_on(BSET_OFFSET(i) && ++ BSET_OFFSET(i) != offset, ++ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ "bset at wrong sector offset"); ++ ++ if (!offset) { ++ struct btree_node *bn = ++ container_of(i, struct btree_node, keys); ++ /* These indicate that we read the wrong btree node: */ ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ /* XXX endianness */ ++ btree_err_on(bp->seq != bn->keys.seq, ++ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ "incorrect sequence number (wrong btree node)"); ++ } ++ ++ btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, ++ BTREE_ERR_MUST_RETRY, c, ca, b, i, ++ "incorrect btree id"); ++ ++ btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, ++ BTREE_ERR_MUST_RETRY, c, ca, b, i, ++ "incorrect level"); ++ ++ if (!write) ++ compat_btree_node(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, bn); ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ if (BTREE_PTR_RANGE_UPDATED(bp)) { ++ b->data->min_key = bp->min_key; ++ b->data->max_key = b->key.k.p; ++ } ++ ++ btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), ++ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ "incorrect min_key: got %s should be %s", ++ (printbuf_reset(&buf1), ++ bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), ++ (printbuf_reset(&buf2), ++ bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); ++ } ++ ++ btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), ++ BTREE_ERR_MUST_RETRY, c, ca, b, i, ++ "incorrect max key %s", ++ (printbuf_reset(&buf1), ++ bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); ++ ++ if (write) ++ compat_btree_node(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, bn); ++ ++ err = bch2_bkey_format_validate(&bn->format); ++ btree_err_on(err, ++ BTREE_ERR_FATAL, c, ca, b, i, ++ "invalid bkey format: %s", err); ++ ++ compat_bformat(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &bn->format); ++ } ++out: ++fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++static int bset_key_invalid(struct bch_fs *c, struct btree *b, ++ struct bkey_s_c k, ++ bool updated_range, int rw, ++ struct printbuf *err) ++{ ++ return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?: ++ (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?: ++ (rw == WRITE ? 
bch2_bkey_val_invalid(c, k, READ, err) : 0); ++} ++ ++static int validate_bset_keys(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned *whiteout_u64s, ++ int write, bool have_retry) ++{ ++ unsigned version = le16_to_cpu(i->version); ++ struct bkey_packed *k, *prev = NULL; ++ struct printbuf buf = PRINTBUF; ++ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && ++ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); ++ int ret = 0; ++ ++ for (k = i->start; ++ k != vstruct_last(i);) { ++ struct bkey_s u; ++ struct bkey tmp; ++ ++ if (btree_err_on(bkey_next(k) > vstruct_last(i), ++ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ "key extends past end of bset")) { ++ i->u64s = cpu_to_le16((u64 *) k - i->_data); ++ break; ++ } ++ ++ if (btree_err_on(k->format > KEY_FORMAT_CURRENT, ++ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ "invalid bkey format %u", k->format)) { ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ continue; ++ } ++ ++ /* XXX: validate k->u64s */ ++ if (!write) ++ bch2_bkey_compat(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &b->format, k); ++ ++ u = __bkey_disassemble(b, k, &tmp); ++ ++ printbuf_reset(&buf); ++ if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { ++ printbuf_reset(&buf); ++ prt_printf(&buf, "invalid bkey: "); ++ bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); ++ prt_printf(&buf, "\n "); ++ bch2_bkey_val_to_text(&buf, c, u.s_c); ++ ++ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); ++ ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ continue; ++ } ++ ++ if (write) ++ bch2_bkey_compat(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &b->format, k); ++ ++ if (prev && bkey_iter_cmp(b, prev, k) > 0) { ++ struct bkey up = bkey_unpack_key(b, prev); ++ ++ printbuf_reset(&buf); ++ prt_printf(&buf, "keys out of order: "); ++ bch2_bkey_to_text(&buf, &up); ++ prt_printf(&buf, " > "); ++ bch2_bkey_to_text(&buf, u.k); ++ ++ bch2_dump_bset(c, b, i, 0); ++ ++ if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) { ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ continue; ++ } ++ } ++ ++ prev = k; ++ k = bkey_next(k); ++ } ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ++ struct btree *b, bool have_retry) ++{ ++ struct btree_node_entry *bne; ++ struct sort_iter *iter; ++ struct btree_node *sorted; ++ struct bkey_packed *k; ++ struct bch_extent_ptr *ptr; ++ struct bset *i; ++ bool used_mempool, blacklisted; ++ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && ++ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); ++ unsigned u64s; ++ unsigned blacklisted_written, nonblacklisted_written = 0; ++ unsigned ptr_written = btree_ptr_sectors_written(&b->key); ++ struct printbuf buf = PRINTBUF; ++ int ret, retry_read = 0, write = READ; ++ ++ b->version_ondisk = U16_MAX; ++ /* We might get called multiple times on read retry: */ ++ b->written = 0; ++ ++ iter = mempool_alloc(&c->fill_iter, GFP_NOIO); ++ sort_iter_init(iter, b); ++ iter->size = (btree_blocks(c) + 1) * 2; ++ ++ if (bch2_meta_read_fault("btree")) ++ btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ "dynamic fault"); ++ ++ 
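++ /*
++ * Validate the node header first: bad magic, a zero sequence number,
++ * or a sequence number that doesn't match the pointer means we read
++ * the wrong node (or garbage) and must retry from another replica:
++ */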
btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), ++ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ "bad magic: want %llx, got %llx", ++ bset_magic(c), le64_to_cpu(b->data->magic)); ++ ++ btree_err_on(!b->data->keys.seq, ++ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ "bad btree header: seq 0"); ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ btree_err_on(b->data->keys.seq != bp->seq, ++ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ "got wrong btree node (seq %llx want %llx)", ++ b->data->keys.seq, bp->seq); ++ } ++ ++ while (b->written < (ptr_written ?: btree_sectors(c))) { ++ unsigned sectors, whiteout_u64s = 0; ++ struct nonce nonce; ++ struct bch_csum csum; ++ bool first = !b->written; ++ ++ if (!b->written) { ++ i = &b->data->keys; ++ ++ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), ++ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ "unknown checksum type %llu", ++ BSET_CSUM_TYPE(i)); ++ ++ nonce = btree_nonce(i, b->written << 9); ++ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); ++ ++ btree_err_on(bch2_crc_cmp(csum, b->data->csum), ++ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ "invalid checksum"); ++ ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error decrypting btree node: %i", ret)) ++ goto fsck_err; ++ ++ btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && ++ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), ++ BTREE_ERR_FATAL, c, NULL, b, NULL, ++ "btree node does not have NEW_EXTENT_OVERWRITE set"); ++ ++ sectors = vstruct_sectors(b->data, c->block_bits); ++ } else { ++ bne = write_block(b); ++ i = &bne->keys; ++ ++ if (i->seq != b->data->keys.seq) ++ break; ++ ++ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), ++ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ "unknown checksum type %llu", ++ BSET_CSUM_TYPE(i)); ++ ++ nonce = btree_nonce(i, b->written << 9); ++ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); ++ ++ btree_err_on(bch2_crc_cmp(csum, bne->csum), ++ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ "invalid checksum"); ++ ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error decrypting btree node: %i\n", ret)) ++ goto fsck_err; ++ ++ sectors = vstruct_sectors(bne, c->block_bits); ++ } ++ ++ b->version_ondisk = min(b->version_ondisk, ++ le16_to_cpu(i->version)); ++ ++ ret = validate_bset(c, ca, b, i, b->written, sectors, ++ READ, have_retry); ++ if (ret) ++ goto fsck_err; ++ ++ if (!b->written) ++ btree_node_set_format(b, b->data->format); ++ ++ ret = validate_bset_keys(c, b, i, &whiteout_u64s, ++ READ, have_retry); ++ if (ret) ++ goto fsck_err; ++ ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ blacklisted = bch2_journal_seq_is_blacklisted(c, ++ le64_to_cpu(i->journal_seq), ++ true); ++ ++ btree_err_on(blacklisted && first, ++ BTREE_ERR_FIXABLE, c, ca, b, i, ++ "first btree node bset has blacklisted journal seq (%llu)", ++ le64_to_cpu(i->journal_seq)); ++ ++ btree_err_on(blacklisted && ptr_written, ++ BTREE_ERR_FIXABLE, c, ca, b, i, ++ "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", ++ le64_to_cpu(i->journal_seq), ++ b->written, b->written + sectors, ptr_written); ++ ++ b->written += sectors; ++ ++ if (blacklisted && !first) ++ continue; ++ ++ sort_iter_add(iter, i->start, ++ vstruct_idx(i, whiteout_u64s)); ++ ++ sort_iter_add(iter, ++ vstruct_idx(i, whiteout_u64s), ++ vstruct_last(i)); ++ ++ nonblacklisted_written = b->written; ++ } ++ ++ if (ptr_written) { ++ 
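++ /* The pointer recorded how many sectors were written - finding fewer means the node data is incomplete: */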
btree_err_on(b->written < ptr_written, ++ BTREE_ERR_WANT_RETRY, c, ca, b, NULL, ++ "btree node data missing: expected %u sectors, found %u", ++ ptr_written, b->written); ++ } else { ++ for (bne = write_block(b); ++ bset_byte_offset(b, bne) < btree_bytes(c); ++ bne = (void *) bne + block_bytes(c)) ++ btree_err_on(bne->keys.seq == b->data->keys.seq && ++ !bch2_journal_seq_is_blacklisted(c, ++ le64_to_cpu(bne->keys.journal_seq), ++ true), ++ BTREE_ERR_WANT_RETRY, c, ca, b, NULL, ++ "found bset signature after last bset"); ++ ++ /* ++ * Blacklisted bsets are those that were written after the most recent ++ * (flush) journal write. Since there wasn't a flush, they may not have ++ * made it to all devices - which means we shouldn't write new bsets ++ * after them, as that could leave a gap and then reads from that device ++ * wouldn't find all the bsets in that btree node - which means it's ++ * important that we start writing new bsets after the most recent _non_ ++ * blacklisted bset: ++ */ ++ blacklisted_written = b->written; ++ b->written = nonblacklisted_written; ++ } ++ ++ sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); ++ sorted->keys.u64s = 0; ++ ++ set_btree_bset(b, b->set, &b->data->keys); ++ ++ b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); ++ ++ u64s = le16_to_cpu(sorted->keys.u64s); ++ *sorted = *b->data; ++ sorted->keys.u64s = cpu_to_le16(u64s); ++ swap(sorted, b->data); ++ set_btree_bset(b, b->set, &b->data->keys); ++ b->nsets = 1; ++ ++ BUG_ON(b->nr.live_u64s != u64s); ++ ++ btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); ++ ++ if (updated_range) ++ bch2_btree_node_drop_keys_outside_node(b); ++ ++ i = &b->data->keys; ++ for (k = i->start; k != vstruct_last(i);) { ++ struct bkey tmp; ++ struct bkey_s u = __bkey_disassemble(b, k, &tmp); ++ ++ printbuf_reset(&buf); ++ ++ if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) || ++ (bch2_inject_invalid_keys && ++ !bversion_cmp(u.k->version, MAX_VERSION))) { ++ printbuf_reset(&buf); ++ ++ prt_printf(&buf, "invalid bkey: "); ++ bch2_bkey_val_invalid(c, u.s_c, READ, &buf); ++ prt_printf(&buf, "\n "); ++ bch2_bkey_val_to_text(&buf, c, u.s_c); ++ ++ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); ++ ++ btree_keys_account_key_drop(&b->nr, 0, k); ++ ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ set_btree_bset_end(b, b->set); ++ continue; ++ } ++ ++ if (u.k->type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); ++ ++ bp.v->mem_ptr = 0; ++ } ++ ++ k = bkey_next(k); ++ } ++ ++ bch2_bset_build_aux_tree(b, b->set, false); ++ ++ set_needs_whiteout(btree_bset_first(b), true); ++ ++ btree_node_reset_sib_u64s(b); ++ ++ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (ca->mi.state != BCH_MEMBER_STATE_rw) ++ set_btree_node_need_rewrite(b); ++ } ++ ++ if (!ptr_written) ++ set_btree_node_need_rewrite(b); ++out: ++ mempool_free(iter, &c->fill_iter); ++ printbuf_exit(&buf); ++ return retry_read; ++fsck_err: ++ if (ret == BTREE_RETRY_READ) { ++ retry_read = 1; ++ } else { ++ bch2_inconsistent_error(c); ++ set_btree_node_read_error(b); ++ } ++ goto out; ++} ++ ++static void btree_node_read_work(struct work_struct *work) ++{ ++ struct btree_read_bio *rb = ++ container_of(work, struct btree_read_bio, work); ++ struct bch_fs *c = rb->c; ++ struct btree *b = rb->b; ++ struct bch_dev *ca = 
bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ struct bio *bio = &rb->bio; ++ struct bch_io_failures failed = { .nr = 0 }; ++ struct printbuf buf = PRINTBUF; ++ bool saw_error = false; ++ bool retry = false; ++ bool can_retry; ++ ++ goto start; ++ while (1) { ++ retry = true; ++ bch_info(c, "retrying read"); ++ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ rb->have_ioref = bch2_dev_get_ioref(ca, READ); ++ bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); ++ bio->bi_iter.bi_sector = rb->pick.ptr.offset; ++ bio->bi_iter.bi_size = btree_bytes(c); ++ ++ if (rb->have_ioref) { ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ submit_bio_wait(bio); ++ } else { ++ bio->bi_status = BLK_STS_REMOVED; ++ } ++start: ++ printbuf_reset(&buf); ++ btree_pos_to_text(&buf, c, b); ++ bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", ++ bch2_blk_status_to_str(bio->bi_status), buf.buf); ++ if (rb->have_ioref) ++ percpu_ref_put(&ca->io_ref); ++ rb->have_ioref = false; ++ ++ bch2_mark_io_failure(&failed, &rb->pick); ++ ++ can_retry = bch2_bkey_pick_read_device(c, ++ bkey_i_to_s_c(&b->key), ++ &failed, &rb->pick) > 0; ++ ++ if (!bio->bi_status && ++ !bch2_btree_node_read_done(c, ca, b, can_retry)) { ++ if (retry) ++ bch_info(c, "retry success"); ++ break; ++ } ++ ++ saw_error = true; ++ ++ if (!can_retry) { ++ set_btree_node_read_error(b); ++ break; ++ } ++ } ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], ++ rb->start_time); ++ bio_put(&rb->bio); ++ printbuf_exit(&buf); ++ ++ if (saw_error && !btree_node_read_error(b)) ++ bch2_btree_node_rewrite_async(c, b); ++ ++ clear_btree_node_read_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); ++} ++ ++static void btree_node_read_endio(struct bio *bio) ++{ ++ struct btree_read_bio *rb = ++ container_of(bio, struct btree_read_bio, bio); ++ struct bch_fs *c = rb->c; ++ ++ if (rb->have_ioref) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ bch2_latency_acct(ca, rb->start_time, READ); ++ } ++ ++ queue_work(c->io_complete_wq, &rb->work); ++} ++ ++struct btree_node_read_all { ++ struct closure cl; ++ struct bch_fs *c; ++ struct btree *b; ++ unsigned nr; ++ void *buf[BCH_REPLICAS_MAX]; ++ struct bio *bio[BCH_REPLICAS_MAX]; ++ int err[BCH_REPLICAS_MAX]; ++}; ++ ++static unsigned btree_node_sectors_written(struct bch_fs *c, void *data) ++{ ++ struct btree_node *bn = data; ++ struct btree_node_entry *bne; ++ unsigned offset = 0; ++ ++ if (le64_to_cpu(bn->magic) != bset_magic(c)) ++ return 0; ++ ++ while (offset < btree_sectors(c)) { ++ if (!offset) { ++ offset += vstruct_sectors(bn, c->block_bits); ++ } else { ++ bne = data + (offset << 9); ++ if (bne->keys.seq != bn->keys.seq) ++ break; ++ offset += vstruct_sectors(bne, c->block_bits); ++ } ++ } ++ ++ return offset; ++} ++ ++static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data) ++{ ++ struct btree_node *bn = data; ++ struct btree_node_entry *bne; ++ ++ if (!offset) ++ return false; ++ ++ while (offset < btree_sectors(c)) { ++ bne = data + (offset << 9); ++ if (bne->keys.seq == bn->keys.seq) ++ return true; ++ offset++; ++ } ++ ++ return false; ++ return offset; ++} ++ ++static void btree_node_read_all_replicas_done(struct closure *cl) ++{ ++ struct btree_node_read_all *ra = ++ container_of(cl, struct btree_node_read_all, cl); ++ struct bch_fs *c = ra->c; ++ struct btree *b = ra->b; ++ struct printbuf buf = PRINTBUF; ++ bool dump_bset_maps = false; ++ bool have_retry = false; ++ int ret = 0, best = -1, write = READ; ++ 
unsigned i, written = 0, written2 = 0; ++ __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 ++ ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; ++ ++ for (i = 0; i < ra->nr; i++) { ++ struct btree_node *bn = ra->buf[i]; ++ ++ if (ra->err[i]) ++ continue; ++ ++ if (le64_to_cpu(bn->magic) != bset_magic(c) || ++ (seq && seq != bn->keys.seq)) ++ continue; ++ ++ if (best < 0) { ++ best = i; ++ written = btree_node_sectors_written(c, bn); ++ continue; ++ } ++ ++ written2 = btree_node_sectors_written(c, ra->buf[i]); ++ if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL, ++ "btree node sectors written mismatch: %u != %u", ++ written, written2) || ++ btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), ++ BTREE_ERR_FIXABLE, c, NULL, b, NULL, ++ "found bset signature after last bset") || ++ btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), ++ BTREE_ERR_FIXABLE, c, NULL, b, NULL, ++ "btree node replicas content mismatch")) ++ dump_bset_maps = true; ++ ++ if (written2 > written) { ++ written = written2; ++ best = i; ++ } ++ } ++fsck_err: ++ if (dump_bset_maps) { ++ for (i = 0; i < ra->nr; i++) { ++ struct btree_node *bn = ra->buf[i]; ++ struct btree_node_entry *bne = NULL; ++ unsigned offset = 0, sectors; ++ bool gap = false; ++ ++ if (ra->err[i]) ++ continue; ++ ++ printbuf_reset(&buf); ++ ++ while (offset < btree_sectors(c)) { ++ if (!offset) { ++ sectors = vstruct_sectors(bn, c->block_bits); ++ } else { ++ bne = ra->buf[i] + (offset << 9); ++ if (bne->keys.seq != bn->keys.seq) ++ break; ++ sectors = vstruct_sectors(bne, c->block_bits); ++ } ++ ++ prt_printf(&buf, " %u-%u", offset, offset + sectors); ++ if (bne && bch2_journal_seq_is_blacklisted(c, ++ le64_to_cpu(bne->keys.journal_seq), false)) ++ prt_printf(&buf, "*"); ++ offset += sectors; ++ } ++ ++ while (offset < btree_sectors(c)) { ++ bne = ra->buf[i] + (offset << 9); ++ if (bne->keys.seq == bn->keys.seq) { ++ if (!gap) ++ prt_printf(&buf, " GAP"); ++ gap = true; ++ ++ sectors = vstruct_sectors(bne, c->block_bits); ++ prt_printf(&buf, " %u-%u", offset, offset + sectors); ++ if (bch2_journal_seq_is_blacklisted(c, ++ le64_to_cpu(bne->keys.journal_seq), false)) ++ prt_printf(&buf, "*"); ++ } ++ offset++; ++ } ++ ++ bch_err(c, "replica %u:%s", i, buf.buf); ++ } ++ } ++ ++ if (best >= 0) { ++ memcpy(b->data, ra->buf[best], btree_bytes(c)); ++ ret = bch2_btree_node_read_done(c, NULL, b, false); ++ } else { ++ ret = -1; ++ } ++ ++ if (ret) ++ set_btree_node_read_error(b); ++ ++ for (i = 0; i < ra->nr; i++) { ++ mempool_free(ra->buf[i], &c->btree_bounce_pool); ++ bio_put(ra->bio[i]); ++ } ++ ++ closure_debug_destroy(&ra->cl); ++ kfree(ra); ++ printbuf_exit(&buf); ++ ++ clear_btree_node_read_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); ++} ++ ++static void btree_node_read_all_replicas_endio(struct bio *bio) ++{ ++ struct btree_read_bio *rb = ++ container_of(bio, struct btree_read_bio, bio); ++ struct bch_fs *c = rb->c; ++ struct btree_node_read_all *ra = rb->ra; ++ ++ if (rb->have_ioref) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ bch2_latency_acct(ca, rb->start_time, READ); ++ } ++ ++ ra->err[rb->idx] = bio->bi_status; ++ closure_put(&ra->cl); ++} ++ ++/* ++ * XXX This allocates multiple times from the same mempools, and can deadlock ++ * under sufficient memory pressure (but is only a debug path) ++ */ ++static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync) ++{ ++ struct bkey_s_c k = bkey_i_to_s_c(&b->key); ++ struct 
bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded pick; ++ struct btree_node_read_all *ra; ++ unsigned i; ++ ++ ra = kzalloc(sizeof(*ra), GFP_NOFS); ++ if (!ra) ++ return -ENOMEM; ++ ++ closure_init(&ra->cl, NULL); ++ ra->c = c; ++ ra->b = b; ++ ra->nr = bch2_bkey_nr_ptrs(k); ++ ++ for (i = 0; i < ra->nr; i++) { ++ ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); ++ ra->bio[i] = bio_alloc_bioset(NULL, ++ buf_pages(ra->buf[i], btree_bytes(c)), ++ REQ_OP_READ|REQ_SYNC|REQ_META, ++ GFP_NOFS, ++ &c->btree_bio); ++ } ++ ++ i = 0; ++ bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ struct btree_read_bio *rb = ++ container_of(ra->bio[i], struct btree_read_bio, bio); ++ rb->c = c; ++ rb->b = b; ++ rb->ra = ra; ++ rb->start_time = local_clock(); ++ rb->have_ioref = bch2_dev_get_ioref(ca, READ); ++ rb->idx = i; ++ rb->pick = pick; ++ rb->bio.bi_iter.bi_sector = pick.ptr.offset; ++ rb->bio.bi_end_io = btree_node_read_all_replicas_endio; ++ bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c)); ++ ++ if (rb->have_ioref) { ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], ++ bio_sectors(&rb->bio)); ++ bio_set_dev(&rb->bio, ca->disk_sb.bdev); ++ ++ closure_get(&ra->cl); ++ submit_bio(&rb->bio); ++ } else { ++ ra->err[i] = BLK_STS_REMOVED; ++ } ++ ++ i++; ++ } ++ ++ if (sync) { ++ closure_sync(&ra->cl); ++ btree_node_read_all_replicas_done(&ra->cl); ++ } else { ++ continue_at(&ra->cl, btree_node_read_all_replicas_done, ++ c->io_complete_wq); ++ } ++ ++ return 0; ++} ++ ++void bch2_btree_node_read(struct bch_fs *c, struct btree *b, ++ bool sync) ++{ ++ struct extent_ptr_decoded pick; ++ struct btree_read_bio *rb; ++ struct bch_dev *ca; ++ struct bio *bio; ++ int ret; ++ ++ trace_btree_read(c, b); ++ ++ if (bch2_verify_all_btree_replicas && ++ !btree_node_read_all_replicas(c, b, sync)) ++ return; ++ ++ ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), ++ NULL, &pick); ++ ++ if (ret <= 0) { ++ struct printbuf buf = PRINTBUF; ++ ++ prt_str(&buf, "btree node read error: no device to read from\n at "); ++ btree_pos_to_text(&buf, c, b); ++ bch_err(c, "%s", buf.buf); ++ ++ if (test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) ++ bch2_fatal_error(c); ++ ++ set_btree_node_read_error(b); ++ clear_btree_node_read_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); ++ printbuf_exit(&buf); ++ return; ++ } ++ ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ++ bio = bio_alloc_bioset(NULL, ++ buf_pages(b->data, btree_bytes(c)), ++ REQ_OP_READ|REQ_SYNC|REQ_META, ++ GFP_NOIO, ++ &c->btree_bio); ++ rb = container_of(bio, struct btree_read_bio, bio); ++ rb->c = c; ++ rb->b = b; ++ rb->ra = NULL; ++ rb->start_time = local_clock(); ++ rb->have_ioref = bch2_dev_get_ioref(ca, READ); ++ rb->pick = pick; ++ INIT_WORK(&rb->work, btree_node_read_work); ++ bio->bi_iter.bi_sector = pick.ptr.offset; ++ bio->bi_end_io = btree_node_read_endio; ++ bch2_bio_map(bio, b->data, btree_bytes(c)); ++ ++ if (rb->have_ioref) { ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], ++ bio_sectors(bio)); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ ++ if (sync) { ++ submit_bio_wait(bio); ++ ++ btree_node_read_work(&rb->work); ++ } else { ++ submit_bio(bio); ++ } ++ } else { ++ bio->bi_status = BLK_STS_REMOVED; ++ ++ if (sync) ++ btree_node_read_work(&rb->work); ++ else ++ queue_work(c->io_complete_wq, &rb->work); ++ } ++} ++ ++int bch2_btree_root_read(struct bch_fs *c, enum 
btree_id id, ++ const struct bkey_i *k, unsigned level) ++{ ++ struct closure cl; ++ struct btree *b; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ ++ b = bch2_btree_node_mem_alloc(c, level != 0); ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ BUG_ON(IS_ERR(b)); ++ ++ bkey_copy(&b->key, k); ++ BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); ++ ++ set_btree_node_read_in_flight(b); ++ ++ bch2_btree_node_read(c, b, true); ++ ++ if (btree_node_read_error(b)) { ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&b->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ ret = -EIO; ++ goto err; ++ } ++ ++ bch2_btree_set_root_for_read(c, b); ++err: ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ ++ return ret; ++} ++ ++void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, ++ struct btree_write *w) ++{ ++ unsigned long old, new, v = READ_ONCE(b->will_make_reachable); ++ ++ do { ++ old = new = v; ++ if (!(old & 1)) ++ break; ++ ++ new &= ~1UL; ++ } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); ++ ++ if (old & 1) ++ closure_put(&((struct btree_update *) new)->cl); ++ ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++} ++ ++static void __btree_node_write_done(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_write *w = btree_prev_write(b); ++ unsigned long old, new, v; ++ ++ bch2_btree_complete_write(c, b, w); ++ ++ v = READ_ONCE(b->flags); ++ do { ++ old = new = v; ++ ++ if ((old & (1U << BTREE_NODE_dirty)) && ++ (old & (1U << BTREE_NODE_need_write)) && ++ !(old & (1U << BTREE_NODE_never_write)) && ++ !(old & (1U << BTREE_NODE_write_blocked)) && ++ !(old & (1U << BTREE_NODE_will_make_reachable))) { ++ new &= ~(1U << BTREE_NODE_dirty); ++ new &= ~(1U << BTREE_NODE_need_write); ++ new |= (1U << BTREE_NODE_write_in_flight); ++ new |= (1U << BTREE_NODE_write_in_flight_inner); ++ new |= (1U << BTREE_NODE_just_written); ++ new ^= (1U << BTREE_NODE_write_idx); ++ } else { ++ new &= ~(1U << BTREE_NODE_write_in_flight); ++ new &= ~(1U << BTREE_NODE_write_in_flight_inner); ++ } ++ } while ((v = cmpxchg(&b->flags, old, new)) != old); ++ ++ if (new & (1U << BTREE_NODE_write_in_flight)) ++ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED); ++ else ++ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); ++} ++ ++static void btree_node_write_done(struct bch_fs *c, struct btree *b) ++{ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ __btree_node_write_done(c, b); ++ six_unlock_read(&b->c.lock); ++} ++ ++static void btree_node_write_work(struct work_struct *work) ++{ ++ struct btree_write_bio *wbio = ++ container_of(work, struct btree_write_bio, work); ++ struct bch_fs *c = wbio->wbio.c; ++ struct btree *b = wbio->wbio.bio.bi_private; ++ struct bch_extent_ptr *ptr; ++ int ret; ++ ++ btree_bounce_free(c, ++ wbio->data_bytes, ++ wbio->wbio.used_mempool, ++ wbio->data); ++ ++ bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr, ++ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); ++ ++ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) ++ goto err; ++ ++ if (wbio->wbio.first_btree_write) { ++ if (wbio->wbio.failed.nr) { ++ ++ } ++ } else { ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key, ++ !wbio->wbio.failed.nr)); ++ if (ret) ++ goto err; ++ } ++out: ++ bio_put(&wbio->wbio.bio); ++ 
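/*
 * Editor's sketch, not part of the patch: __btree_node_write_done() above and
 * __bch2_btree_node_write() further down drive a node's write state machine
 * with lockless read-modify-write loops on b->flags.  A minimal userspace
 * analogue of "claiming" a write, using C11 atomics in place of the kernel's
 * cmpxchg(); the flag names are made up for illustration.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define NODE_DIRTY		(1u << 0)
#define NODE_WRITE_IN_FLIGHT	(1u << 1)

static bool claim_write(_Atomic unsigned *flags)
{
	unsigned old = atomic_load(flags);
	unsigned new;

	do {
		if (!(old & NODE_DIRTY) || (old & NODE_WRITE_IN_FLIGHT))
			return false;	/* clean, or a write already claimed it */
		new = (old & ~NODE_DIRTY) | NODE_WRITE_IN_FLIGHT;
	} while (!atomic_compare_exchange_weak(flags, &old, new));

	return true;	/* we own this write; others see the flags flip atomically */
}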
btree_node_write_done(c, b); ++ return; ++err: ++ set_btree_node_noevict(b); ++ bch2_fs_fatal_error(c, "fatal error writing btree node"); ++ goto out; ++} ++ ++static void btree_node_write_endio(struct bio *bio) ++{ ++ struct bch_write_bio *wbio = to_wbio(bio); ++ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; ++ struct bch_write_bio *orig = parent ?: wbio; ++ struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio); ++ struct bch_fs *c = wbio->c; ++ struct btree *b = wbio->bio.bi_private; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); ++ unsigned long flags; ++ ++ if (wbio->have_ioref) ++ bch2_latency_acct(ca, wbio->submit_time, WRITE); ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s", ++ bch2_blk_status_to_str(bio->bi_status)) || ++ bch2_meta_write_fault("btree")) { ++ spin_lock_irqsave(&c->btree_write_error_lock, flags); ++ bch2_dev_list_add_dev(&orig->failed, wbio->dev); ++ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); ++ } ++ ++ if (wbio->have_ioref) ++ percpu_ref_put(&ca->io_ref); ++ ++ if (parent) { ++ bio_put(bio); ++ bio_endio(&parent->bio); ++ return; ++ } ++ ++ clear_btree_node_write_in_flight_inner(b); ++ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); ++ INIT_WORK(&wb->work, btree_node_write_work); ++ queue_work(c->btree_io_complete_wq, &wb->work); ++} ++ ++static int validate_bset_for_write(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned sectors) ++{ ++ unsigned whiteout_u64s = 0; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), ++ BKEY_TYPE_btree, WRITE, &buf); ++ ++ if (ret) ++ bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf); ++ printbuf_exit(&buf); ++ if (ret) ++ return ret; ++ ++ ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?: ++ validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false); ++ if (ret) { ++ bch2_inconsistent_error(c); ++ dump_stack(); ++ } ++ ++ return ret; ++} ++ ++static void btree_write_submit(struct work_struct *work) ++{ ++ struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); ++ struct bch_extent_ptr *ptr; ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; ++ ++ bkey_copy(&tmp.k, &wbio->key); ++ ++ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr) ++ ptr->offset += wbio->sector_offset; ++ ++ bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k); ++} ++ ++void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) ++{ ++ struct btree_write_bio *wbio; ++ struct bset_tree *t; ++ struct bset *i; ++ struct btree_node *bn = NULL; ++ struct btree_node_entry *bne = NULL; ++ struct sort_iter sort_iter; ++ struct nonce nonce; ++ unsigned bytes_to_write, sectors_to_write, bytes, u64s; ++ u64 seq = 0; ++ bool used_mempool; ++ unsigned long old, new; ++ bool validate_before_checksum = false; ++ void *data; ++ int ret; ++ ++ if (flags & BTREE_WRITE_ALREADY_STARTED) ++ goto do_write; ++ ++ /* ++ * We may only have a read lock on the btree node - the dirty bit is our ++ * "lock" against racing with other threads that may be trying to start ++ * a write, we do a write iff we clear the dirty bit. 
Since setting the ++ * dirty bit requires a write lock, we can't race with other threads ++ * redirtying it: ++ */ ++ do { ++ old = new = READ_ONCE(b->flags); ++ ++ if (!(old & (1 << BTREE_NODE_dirty))) ++ return; ++ ++ if ((flags & BTREE_WRITE_ONLY_IF_NEED) && ++ !(old & (1 << BTREE_NODE_need_write))) ++ return; ++ ++ if (old & ++ ((1 << BTREE_NODE_never_write)| ++ (1 << BTREE_NODE_write_blocked))) ++ return; ++ ++ if (b->written && ++ (old & (1 << BTREE_NODE_will_make_reachable))) ++ return; ++ ++ if (old & (1 << BTREE_NODE_write_in_flight)) ++ return; ++ ++ new &= ~(1 << BTREE_NODE_dirty); ++ new &= ~(1 << BTREE_NODE_need_write); ++ new |= (1 << BTREE_NODE_write_in_flight); ++ new |= (1 << BTREE_NODE_write_in_flight_inner); ++ new |= (1 << BTREE_NODE_just_written); ++ new ^= (1 << BTREE_NODE_write_idx); ++ } while (cmpxchg_acquire(&b->flags, old, new) != old); ++ ++ if (new & (1U << BTREE_NODE_need_write)) ++ return; ++do_write: ++ atomic_dec(&c->btree_cache.dirty); ++ ++ BUG_ON(btree_node_fake(b)); ++ BUG_ON((b->will_make_reachable != 0) != !b->written); ++ ++ BUG_ON(b->written >= btree_sectors(c)); ++ BUG_ON(b->written & (block_sectors(c) - 1)); ++ BUG_ON(bset_written(b, btree_bset_last(b))); ++ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); ++ BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); ++ ++ bch2_sort_whiteouts(c, b); ++ ++ sort_iter_init(&sort_iter, b); ++ ++ bytes = !b->written ++ ? sizeof(struct btree_node) ++ : sizeof(struct btree_node_entry); ++ ++ bytes += b->whiteout_u64s * sizeof(u64); ++ ++ for_each_bset(b, t) { ++ i = bset(b, t); ++ ++ if (bset_written(b, i)) ++ continue; ++ ++ bytes += le16_to_cpu(i->u64s) * sizeof(u64); ++ sort_iter_add(&sort_iter, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ seq = max(seq, le64_to_cpu(i->journal_seq)); ++ } ++ ++ BUG_ON(b->written && !seq); ++ ++ /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ ++ bytes += 8; ++ ++ /* buffer must be a multiple of the block size */ ++ bytes = round_up(bytes, block_bytes(c)); ++ ++ data = btree_bounce_alloc(c, bytes, &used_mempool); ++ ++ if (!b->written) { ++ bn = data; ++ *bn = *b->data; ++ i = &bn->keys; ++ } else { ++ bne = data; ++ bne->keys = b->data->keys; ++ i = &bne->keys; ++ } ++ ++ i->journal_seq = cpu_to_le64(seq); ++ i->u64s = 0; ++ ++ sort_iter_add(&sort_iter, ++ unwritten_whiteouts_start(c, b), ++ unwritten_whiteouts_end(c, b)); ++ SET_BSET_SEPARATE_WHITEOUTS(i, false); ++ ++ b->whiteout_u64s = 0; ++ ++ u64s = bch2_sort_keys(i->start, &sort_iter, false); ++ le16_add_cpu(&i->u64s, u64s); ++ ++ set_needs_whiteout(i, false); ++ ++ /* do we have data to write? */ ++ if (b->written && !i->u64s) ++ goto nowrite; ++ ++ bytes_to_write = vstruct_end(i) - data; ++ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; ++ ++ memset(data + bytes_to_write, 0, ++ (sectors_to_write << 9) - bytes_to_write); ++ ++ BUG_ON(b->written + sectors_to_write > btree_sectors(c)); ++ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); ++ BUG_ON(i->seq != b->data->keys.seq); ++ ++ i->version = c->sb.version < bcachefs_metadata_version_bkey_renumber ++ ? 
cpu_to_le16(BCH_BSET_VERSION_OLD) ++ : cpu_to_le16(c->sb.version); ++ SET_BSET_OFFSET(i, b->written); ++ SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); ++ ++ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) ++ validate_before_checksum = true; ++ ++ /* validate_bset will be modifying: */ ++ if (le16_to_cpu(i->version) < bcachefs_metadata_version_current) ++ validate_before_checksum = true; ++ ++ /* if we're going to be encrypting, check metadata validity first: */ ++ if (validate_before_checksum && ++ validate_bset_for_write(c, b, i, sectors_to_write)) ++ goto err; ++ ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error encrypting btree node: %i\n", ret)) ++ goto err; ++ ++ nonce = btree_nonce(i, b->written << 9); ++ ++ if (bn) ++ bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); ++ else ++ bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); ++ ++ /* if we're not encrypting, check metadata after checksumming: */ ++ if (!validate_before_checksum && ++ validate_bset_for_write(c, b, i, sectors_to_write)) ++ goto err; ++ ++ /* ++ * We handle btree write errors by immediately halting the journal - ++ * after we've done that, we can't issue any subsequent btree writes ++ * because they might have pointers to new nodes that failed to write. ++ * ++ * Furthermore, there's no point in doing any more btree writes because ++ * with the journal stopped, we're never going to update the journal to ++ * reflect that those writes were done and the data flushed from the ++ * journal: ++ * ++ * Also on journal error, the pending write may have updates that were ++ * never journalled (interior nodes, see btree_update_nodes_written()) - ++ * it's critical that we don't do the write in that case otherwise we ++ * will have updates visible that weren't in the journal: ++ * ++ * Make sure to update b->written so bch2_btree_init_next() doesn't ++ * break: ++ */ ++ if (bch2_journal_error(&c->journal) || ++ c->opts.nochanges) ++ goto err; ++ ++ trace_btree_write(b, bytes_to_write, sectors_to_write); ++ ++ wbio = container_of(bio_alloc_bioset(NULL, ++ buf_pages(data, sectors_to_write << 9), ++ REQ_OP_WRITE|REQ_META, ++ GFP_NOIO, ++ &c->btree_bio), ++ struct btree_write_bio, wbio.bio); ++ wbio_init(&wbio->wbio.bio); ++ wbio->data = data; ++ wbio->data_bytes = bytes; ++ wbio->sector_offset = b->written; ++ wbio->wbio.c = c; ++ wbio->wbio.used_mempool = used_mempool; ++ wbio->wbio.first_btree_write = !b->written; ++ wbio->wbio.bio.bi_end_io = btree_node_write_endio; ++ wbio->wbio.bio.bi_private = b; ++ ++ bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); ++ ++ bkey_copy(&wbio->key, &b->key); ++ ++ b->written += sectors_to_write; ++ ++ if (wbio->wbio.first_btree_write && ++ b->key.k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = ++ cpu_to_le16(b->written); ++ ++ if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = ++ cpu_to_le16(b->written); ++ ++ atomic64_inc(&c->btree_writes_nr); ++ atomic64_add(sectors_to_write, &c->btree_writes_sectors); ++ ++ INIT_WORK(&wbio->work, btree_write_submit); ++ queue_work(c->io_complete_wq, &wbio->work); ++ return; ++err: ++ set_btree_node_noevict(b); ++ if (!b->written && ++ b->key.k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = ++ cpu_to_le16(sectors_to_write); ++ b->written += sectors_to_write; ++nowrite: ++ btree_bounce_free(c, bytes, used_mempool, data); ++ 
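/*
 * Editor's sketch, not part of the patch: __bch2_btree_node_write() above pads
 * the data it is about to write out to a whole number of blocks (zeroing the
 * tail) and converts that size to 512-byte sectors.  A standalone restatement
 * of that arithmetic with example numbers; the 4096-byte block size is only an
 * assumption for the example.
 */
#include <stdio.h>

static unsigned round_up_to(unsigned v, unsigned to)
{
	return (v + to - 1) / to * to;		/* 'to' must be non-zero */
}

int main(void)
{
	unsigned block_bytes	  = 4096;	/* assumed block size */
	unsigned bytes_to_write	  = 5000;	/* i.e. vstruct_end(i) - data */
	unsigned padded		  = round_up_to(bytes_to_write, block_bytes);
	unsigned sectors_to_write = padded >> 9;

	/* 5000 bytes -> 8192 bytes padded -> 16 sectors, 3192 tail bytes zeroed */
	printf("zero %u tail bytes, write %u sectors\n",
	       padded - bytes_to_write, sectors_to_write);
	return 0;
}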
__btree_node_write_done(c, b); ++} ++ ++/* ++ * Work that must be done with write lock held: ++ */ ++bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) ++{ ++ bool invalidated_iter = false; ++ struct btree_node_entry *bne; ++ struct bset_tree *t; ++ ++ if (!btree_node_just_written(b)) ++ return false; ++ ++ BUG_ON(b->whiteout_u64s); ++ ++ clear_btree_node_just_written(b); ++ ++ /* ++ * Note: immediately after write, bset_written() doesn't work - the ++ * amount of data we had to write after compaction might have been ++ * smaller than the offset of the last bset. ++ * ++ * However, we know that all bsets have been written here, as long as ++ * we're still holding the write lock: ++ */ ++ ++ /* ++ * XXX: decide if we really want to unconditionally sort down to a ++ * single bset: ++ */ ++ if (b->nsets > 1) { ++ btree_node_sort(c, b, 0, b->nsets, true); ++ invalidated_iter = true; ++ } else { ++ invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); ++ } ++ ++ for_each_bset(b, t) ++ set_needs_whiteout(bset(b, t), true); ++ ++ bch2_btree_verify(c, b); ++ ++ /* ++ * If later we don't unconditionally sort down to a single bset, we have ++ * to ensure this is still true: ++ */ ++ BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); ++ ++ bne = want_new_bset(c, b); ++ if (bne) ++ bch2_bset_init_next(c, b, bne); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ return invalidated_iter; ++} ++ ++/* ++ * Use this one if the node is intent locked: ++ */ ++void bch2_btree_node_write(struct bch_fs *c, struct btree *b, ++ enum six_lock_type lock_type_held, ++ unsigned flags) ++{ ++ if (lock_type_held == SIX_LOCK_intent || ++ (lock_type_held == SIX_LOCK_read && ++ six_lock_tryupgrade(&b->c.lock))) { ++ __bch2_btree_node_write(c, b, flags); ++ ++ /* don't cycle lock unnecessarily: */ ++ if (btree_node_just_written(b) && ++ six_trylock_write(&b->c.lock)) { ++ bch2_btree_post_write_cleanup(c, b); ++ six_unlock_write(&b->c.lock); ++ } ++ ++ if (lock_type_held == SIX_LOCK_read) ++ six_lock_downgrade(&b->c.lock); ++ } else { ++ __bch2_btree_node_write(c, b, flags); ++ if (lock_type_held == SIX_LOCK_write && ++ btree_node_just_written(b)) ++ bch2_btree_post_write_cleanup(c, b); ++ } ++} ++ ++static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) ++{ ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ unsigned i; ++ bool ret = false; ++restart: ++ rcu_read_lock(); ++ for_each_cached_btree(b, c, tbl, i, pos) ++ if (test_bit(flag, &b->flags)) { ++ rcu_read_unlock(); ++ wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); ++ ret = true; ++ goto restart; ++ } ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++bool bch2_btree_flush_all_reads(struct bch_fs *c) ++{ ++ return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); ++} ++ ++bool bch2_btree_flush_all_writes(struct bch_fs *c) ++{ ++ return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); ++} +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +new file mode 100644 +index 000000000000..8af853642123 +--- /dev/null ++++ b/fs/bcachefs/btree_io.h +@@ -0,0 +1,222 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_IO_H ++#define _BCACHEFS_BTREE_IO_H ++ ++#include "bkey_methods.h" ++#include "bset.h" ++#include "btree_locking.h" ++#include "checksum.h" ++#include "extents.h" ++#include "io_types.h" ++ ++struct bch_fs; ++struct btree_write; ++struct btree; ++struct btree_iter; ++struct btree_node_read_all; ++ ++static inline void 
set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) ++{ ++ if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) ++ atomic_inc(&c->btree_cache.dirty); ++} ++ ++static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) ++{ ++ if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) ++ atomic_dec(&c->btree_cache.dirty); ++} ++ ++static inline unsigned btree_ptr_sectors_written(struct bkey_i *k) ++{ ++ return k->k.type == KEY_TYPE_btree_ptr_v2 ++ ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written) ++ : 0; ++} ++ ++struct btree_read_bio { ++ struct bch_fs *c; ++ struct btree *b; ++ struct btree_node_read_all *ra; ++ u64 start_time; ++ unsigned have_ioref:1; ++ unsigned idx:7; ++ struct extent_ptr_decoded pick; ++ struct work_struct work; ++ struct bio bio; ++}; ++ ++struct btree_write_bio { ++ struct work_struct work; ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++ void *data; ++ unsigned data_bytes; ++ unsigned sector_offset; ++ struct bch_write_bio wbio; ++}; ++ ++void bch2_btree_node_io_unlock(struct btree *); ++void bch2_btree_node_io_lock(struct btree *); ++void __bch2_btree_node_wait_on_read(struct btree *); ++void __bch2_btree_node_wait_on_write(struct btree *); ++void bch2_btree_node_wait_on_read(struct btree *); ++void bch2_btree_node_wait_on_write(struct btree *); ++ ++enum compact_mode { ++ COMPACT_LAZY, ++ COMPACT_ALL, ++}; ++ ++bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, ++ enum compact_mode); ++ ++static inline bool should_compact_bset_lazy(struct btree *b, ++ struct bset_tree *t) ++{ ++ unsigned total_u64s = bset_u64s(t); ++ unsigned dead_u64s = bset_dead_u64s(b, t); ++ ++ return dead_u64s > 64 && dead_u64s * 3 > total_u64s; ++} ++ ++static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ if (should_compact_bset_lazy(b, t)) ++ return bch2_compact_whiteouts(c, b, COMPACT_LAZY); ++ ++ return false; ++} ++ ++static inline struct nonce btree_nonce(struct bset *i, unsigned offset) ++{ ++ return (struct nonce) {{ ++ [0] = cpu_to_le32(offset), ++ [1] = ((__le32 *) &i->seq)[0], ++ [2] = ((__le32 *) &i->seq)[1], ++ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, ++ }}; ++} ++ ++static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) ++{ ++ struct nonce nonce = btree_nonce(i, offset); ++ int ret; ++ ++ if (!offset) { ++ struct btree_node *bn = container_of(i, struct btree_node, keys); ++ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; ++ ++ ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, ++ &bn->flags, bytes); ++ if (ret) ++ return ret; ++ ++ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); ++ } ++ ++ return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, ++ vstruct_end(i) - (void *) i->_data); ++} ++ ++void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); ++ ++void bch2_btree_node_drop_keys_outside_node(struct btree *); ++ ++void bch2_btree_build_aux_trees(struct btree *); ++void bch2_btree_init_next(struct btree_trans *, struct btree *); ++ ++int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, ++ struct btree *, bool); ++void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); ++int bch2_btree_root_read(struct bch_fs *, enum btree_id, ++ const struct bkey_i *, unsigned); ++ ++void bch2_btree_complete_write(struct bch_fs *, struct btree *, ++ struct btree_write *); ++ ++bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); ++ 
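/*
 * Editor's sketch, not part of the patch: should_compact_bset_lazy() above
 * only triggers compaction once a bset has both a non-trivial amount of dead
 * space (more than 64 u64s) and more than a third of its u64s dead.  A
 * self-contained restatement with a few example inputs; the function name is
 * illustrative.
 */
#include <assert.h>
#include <stdbool.h>

static bool worth_compacting(unsigned total_u64s, unsigned dead_u64s)
{
	return dead_u64s > 64 && dead_u64s * 3 > total_u64s;
}

int main(void)
{
	assert(!worth_compacting(1000, 64));	/* too little dead space overall */
	assert(!worth_compacting(1000, 300));	/* dead space, but under a third */
	assert( worth_compacting(1000, 400));	/* over a third of the bset is dead */
	return 0;
}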
++#define BTREE_WRITE_ONLY_IF_NEED (1U << 0) ++#define BTREE_WRITE_ALREADY_STARTED (1U << 1) ++ ++void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); ++void bch2_btree_node_write(struct bch_fs *, struct btree *, ++ enum six_lock_type, unsigned); ++ ++static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, ++ enum six_lock_type lock_held) ++{ ++ bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); ++} ++ ++bool bch2_btree_flush_all_reads(struct bch_fs *); ++bool bch2_btree_flush_all_writes(struct bch_fs *); ++ ++static inline void compat_bformat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, struct bkey_format *f) ++{ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_inodes) { ++ swap(f->bits_per_field[BKEY_FIELD_INODE], ++ f->bits_per_field[BKEY_FIELD_OFFSET]); ++ swap(f->field_offset[BKEY_FIELD_INODE], ++ f->field_offset[BKEY_FIELD_OFFSET]); ++ } ++ ++ if (version < bcachefs_metadata_version_snapshot && ++ (level || btree_type_has_snapshots(btree_id))) { ++ u64 max_packed = ++ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); ++ ++ f->field_offset[BKEY_FIELD_SNAPSHOT] = write ++ ? 0 ++ : U32_MAX - max_packed; ++ } ++} ++ ++static inline void compat_bpos(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, struct bpos *p) ++{ ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bpos_swab(p); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_inodes) ++ swap(p->inode, p->offset); ++} ++ ++static inline void compat_btree_node(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct btree_node *bn) ++{ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bpos_cmp(bn->min_key, POS_MIN) && ++ write) ++ bn->min_key = bpos_nosnap_predecessor(bn->min_key); ++ ++ if (version < bcachefs_metadata_version_snapshot && ++ write) ++ bn->max_key.snapshot = 0; ++ ++ compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); ++ compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); ++ ++ if (version < bcachefs_metadata_version_snapshot && ++ !write) ++ bn->max_key.snapshot = U32_MAX; ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bpos_cmp(bn->min_key, POS_MIN) && ++ !write) ++ bn->min_key = bpos_nosnap_successor(bn->min_key); ++} ++ ++#endif /* _BCACHEFS_BTREE_IO_H */ +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +new file mode 100644 +index 000000000000..a90a45939aa3 +--- /dev/null ++++ b/fs/bcachefs/btree_iter.c +@@ -0,0 +1,3515 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "bkey_buf.h" ++#include "btree_cache.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "debug.h" ++#include "error.h" ++#include "extents.h" ++#include "journal.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "subvolume.h" ++ ++#include ++#include ++#include ++ ++static void btree_trans_verify_sorted(struct btree_trans *); ++inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); ++ ++static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); ++static inline void btree_path_list_add(struct 
btree_trans *, struct btree_path *, ++ struct btree_path *); ++ ++static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ return iter->ip_allocated; ++#else ++ return 0; ++#endif ++} ++ ++static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); ++ ++/* ++ * Unlocks before scheduling ++ * Note: does not revalidate iterator ++ */ ++static inline int bch2_trans_cond_resched(struct btree_trans *trans) ++{ ++ if (need_resched() || race_fault()) { ++ bch2_trans_unlock(trans); ++ schedule(); ++ return bch2_trans_relock(trans); ++ } else { ++ return 0; ++ } ++} ++ ++static inline int __btree_path_cmp(const struct btree_path *l, ++ enum btree_id r_btree_id, ++ bool r_cached, ++ struct bpos r_pos, ++ unsigned r_level) ++{ ++ /* ++ * Must match lock ordering as defined by __bch2_btree_node_lock: ++ */ ++ return cmp_int(l->btree_id, r_btree_id) ?: ++ cmp_int((int) l->cached, (int) r_cached) ?: ++ bpos_cmp(l->pos, r_pos) ?: ++ -cmp_int(l->level, r_level); ++} ++ ++static inline int btree_path_cmp(const struct btree_path *l, ++ const struct btree_path *r) ++{ ++ return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level); ++} ++ ++static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) ++{ ++ /* Are we iterating over keys in all snapshots? */ ++ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { ++ p = bpos_successor(p); ++ } else { ++ p = bpos_nosnap_successor(p); ++ p.snapshot = iter->snapshot; ++ } ++ ++ return p; ++} ++ ++static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) ++{ ++ /* Are we iterating over keys in all snapshots? */ ++ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { ++ p = bpos_predecessor(p); ++ } else { ++ p = bpos_nosnap_predecessor(p); ++ p.snapshot = iter->snapshot; ++ } ++ ++ return p; ++} ++ ++static inline bool is_btree_node(struct btree_path *path, unsigned l) ++{ ++ return l < BTREE_MAX_DEPTH && ++ (unsigned long) path->l[l].b >= 128; ++} ++ ++static inline struct bpos btree_iter_search_key(struct btree_iter *iter) ++{ ++ struct bpos pos = iter->pos; ++ ++ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && ++ bkey_cmp(pos, POS_MAX)) ++ pos = bkey_successor(iter, pos); ++ return pos; ++} ++ ++static inline bool btree_path_pos_before_node(struct btree_path *path, ++ struct btree *b) ++{ ++ return bpos_cmp(path->pos, b->data->min_key) < 0; ++} ++ ++static inline bool btree_path_pos_after_node(struct btree_path *path, ++ struct btree *b) ++{ ++ return bpos_cmp(b->key.k.p, path->pos) < 0; ++} ++ ++static inline bool btree_path_pos_in_node(struct btree_path *path, ++ struct btree *b) ++{ ++ return path->btree_id == b->c.btree_id && ++ !btree_path_pos_before_node(path, b) && ++ !btree_path_pos_after_node(path, b); ++} ++ ++/* Btree node locking: */ ++ ++void bch2_btree_node_unlock_write(struct btree_trans *trans, ++ struct btree_path *path, struct btree *b) ++{ ++ bch2_btree_node_unlock_write_inlined(trans, path, b); ++} ++ ++void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) ++{ ++ struct btree_path *linked; ++ unsigned readers = 0; ++ ++ trans_for_each_path(trans, linked) ++ if (linked->l[b->c.level].b == b && ++ btree_node_read_locked(linked, b->c.level)) ++ readers++; ++ ++ /* ++ * Must drop our read locks before calling six_lock_write() - ++ * six_unlock() won't do wakeups until the reader count ++ * goes to 0, and it's safe because we have the node intent ++ * locked: ++ */ ++ if (!b->c.lock.readers) ++ 
atomic64_sub(__SIX_VAL(read_lock, readers), ++ &b->c.lock.state.counter); ++ else ++ this_cpu_sub(*b->c.lock.readers, readers); ++ ++ six_lock_write(&b->c.lock, NULL, NULL); ++ ++ if (!b->c.lock.readers) ++ atomic64_add(__SIX_VAL(read_lock, readers), ++ &b->c.lock.state.counter); ++ else ++ this_cpu_add(*b->c.lock.readers, readers); ++} ++ ++bool __bch2_btree_node_relock(struct btree_trans *trans, ++ struct btree_path *path, unsigned level) ++{ ++ struct btree *b = btree_path_node(path, level); ++ int want = __btree_lock_want(path, level); ++ ++ if (!is_btree_node(path, level)) ++ goto fail; ++ ++ if (race_fault()) ++ goto fail; ++ ++ if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || ++ (btree_node_lock_seq_matches(path, b, level) && ++ btree_node_lock_increment(trans, b, level, want))) { ++ mark_btree_node_locked(trans, path, level, want); ++ return true; ++ } ++fail: ++ if (b != BTREE_ITER_NO_NODE_CACHED && ++ b != BTREE_ITER_NO_NODE_INIT) ++ trace_btree_node_relock_fail(trans->fn, _RET_IP_, ++ path->btree_id, ++ &path->pos, ++ (unsigned long) b, ++ path->l[level].lock_seq, ++ is_btree_node(path, level) ? b->c.lock.state.seq : 0); ++ return false; ++} ++ ++bool bch2_btree_node_upgrade(struct btree_trans *trans, ++ struct btree_path *path, unsigned level) ++{ ++ struct btree *b = path->l[level].b; ++ ++ if (!is_btree_node(path, level)) ++ return false; ++ ++ switch (btree_lock_want(path, level)) { ++ case BTREE_NODE_UNLOCKED: ++ BUG_ON(btree_node_locked(path, level)); ++ return true; ++ case BTREE_NODE_READ_LOCKED: ++ BUG_ON(btree_node_intent_locked(path, level)); ++ return bch2_btree_node_relock(trans, path, level); ++ case BTREE_NODE_INTENT_LOCKED: ++ break; ++ } ++ ++ if (btree_node_intent_locked(path, level)) ++ return true; ++ ++ if (race_fault()) ++ return false; ++ ++ if (btree_node_locked(path, level) ++ ? six_lock_tryupgrade(&b->c.lock) ++ : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) ++ goto success; ++ ++ if (btree_node_lock_seq_matches(path, b, level) && ++ btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { ++ btree_node_unlock(trans, path, level); ++ goto success; ++ } ++ ++ return false; ++success: ++ mark_btree_node_intent_locked(trans, path, level); ++ return true; ++} ++ ++static inline bool btree_path_get_locks(struct btree_trans *trans, ++ struct btree_path *path, ++ bool upgrade) ++{ ++ unsigned l = path->level; ++ int fail_idx = -1; ++ ++ do { ++ if (!btree_path_node(path, l)) ++ break; ++ ++ if (!(upgrade ++ ? bch2_btree_node_upgrade(trans, path, l) ++ : bch2_btree_node_relock(trans, path, l))) ++ fail_idx = l; ++ ++ l++; ++ } while (l < path->locks_want); ++ ++ /* ++ * When we fail to get a lock, we have to ensure that any child nodes ++ * can't be relocked so bch2_btree_path_traverse has to walk back up to ++ * the node that we failed to relock: ++ */ ++ if (fail_idx >= 0) { ++ __bch2_btree_path_unlock(trans, path); ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ ++ do { ++ path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ --fail_idx; ++ } while (fail_idx >= 0); ++ } ++ ++ if (path->uptodate == BTREE_ITER_NEED_RELOCK) ++ path->uptodate = BTREE_ITER_UPTODATE; ++ ++ bch2_trans_verify_locks(trans); ++ ++ return path->uptodate < BTREE_ITER_NEED_RELOCK; ++} ++ ++static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, ++ bool cached) ++{ ++ return !cached ++ ? 
container_of(_b, struct btree, c)->key.k.p ++ : container_of(_b, struct bkey_cached, c)->key.pos; ++} ++ ++/* Slowpath: */ ++int __bch2_btree_node_lock(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct bpos pos, unsigned level, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip) ++{ ++ struct btree_path *linked; ++ unsigned reason; ++ ++ /* Check if it's safe to block: */ ++ trans_for_each_path(trans, linked) { ++ if (!linked->nodes_locked) ++ continue; ++ ++ /* ++ * Can't block taking an intent lock if we have _any_ nodes read ++ * locked: ++ * ++ * - Our read lock blocks another thread with an intent lock on ++ * the same node from getting a write lock, and thus from ++ * dropping its intent lock ++ * ++ * - And the other thread may have multiple nodes intent locked: ++ * both the node we want to intent lock, and the node we ++ * already have read locked - deadlock: ++ */ ++ if (type == SIX_LOCK_intent && ++ linked->nodes_locked != linked->nodes_intent_locked) { ++ reason = 1; ++ goto deadlock; ++ } ++ ++ if (linked->btree_id != path->btree_id) { ++ if (linked->btree_id < path->btree_id) ++ continue; ++ ++ reason = 3; ++ goto deadlock; ++ } ++ ++ /* ++ * Within the same btree, non-cached paths come before cached ++ * paths: ++ */ ++ if (linked->cached != path->cached) { ++ if (!linked->cached) ++ continue; ++ ++ reason = 4; ++ goto deadlock; ++ } ++ ++ /* ++ * Interior nodes must be locked before their descendants: if ++ * another path has possible descendants locked of the node ++ * we're about to lock, it must have the ancestors locked too: ++ */ ++ if (level > __fls(linked->nodes_locked)) { ++ reason = 5; ++ goto deadlock; ++ } ++ ++ /* Must lock btree nodes in key order: */ ++ if (btree_node_locked(linked, level) && ++ bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, ++ linked->cached)) <= 0) { ++ reason = 7; ++ goto deadlock; ++ } ++ } ++ ++ return btree_node_lock_type(trans, path, b, pos, level, ++ type, should_sleep_fn, p); ++deadlock: ++ trace_trans_restart_would_deadlock(trans->fn, ip, ++ trans->in_traverse_all, reason, ++ linked->btree_id, ++ linked->cached, ++ &linked->pos, ++ path->btree_id, ++ path->cached, ++ &pos); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); ++} ++ ++/* Btree iterator locking: */ ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++static void bch2_btree_path_verify_locks(struct btree_path *path) ++{ ++ unsigned l; ++ ++ if (!path->nodes_locked) { ++ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && ++ btree_path_node(path, path->level)); ++ return; ++ } ++ ++ for (l = 0; btree_path_node(path, l); l++) ++ BUG_ON(btree_lock_want(path, l) != ++ btree_node_locked_type(path, l)); ++} ++ ++void bch2_trans_verify_locks(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ bch2_btree_path_verify_locks(path); ++} ++#else ++static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} ++#endif ++ ++/* Btree path locking: */ ++ ++/* ++ * Only for btree_cache.c - only relocks intent locks ++ */ ++int bch2_btree_path_relock_intent(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ unsigned l; ++ ++ for (l = path->level; ++ l < path->locks_want && btree_path_node(path, l); ++ l++) { ++ if (!bch2_btree_node_relock(trans, path, l)) { ++ __bch2_btree_path_unlock(trans, path); ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ trace_trans_restart_relock_path_intent(trans->fn, 
_RET_IP_, ++ path->btree_id, &path->pos); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); ++ } ++ } ++ ++ return 0; ++} ++ ++__flatten ++static int bch2_btree_path_relock(struct btree_trans *trans, ++ struct btree_path *path, unsigned long trace_ip) ++{ ++ bool ret = btree_path_get_locks(trans, path, false); ++ ++ if (!ret) { ++ trace_trans_restart_relock_path(trans->fn, trace_ip, ++ path->btree_id, &path->pos); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); ++ } ++ ++ return 0; ++} ++ ++bool __bch2_btree_path_upgrade(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned new_locks_want) ++{ ++ struct btree_path *linked; ++ ++ EBUG_ON(path->locks_want >= new_locks_want); ++ ++ path->locks_want = new_locks_want; ++ ++ if (btree_path_get_locks(trans, path, true)) ++ return true; ++ ++ /* ++ * XXX: this is ugly - we'd prefer to not be mucking with other ++ * iterators in the btree_trans here. ++ * ++ * On failure to upgrade the iterator, setting iter->locks_want and ++ * calling get_locks() is sufficient to make bch2_btree_path_traverse() ++ * get the locks we want on transaction restart. ++ * ++ * But if this iterator was a clone, on transaction restart what we did ++ * to this iterator isn't going to be preserved. ++ * ++ * Possibly we could add an iterator field for the parent iterator when ++ * an iterator is a copy - for now, we'll just upgrade any other ++ * iterators with the same btree id. ++ * ++ * The code below used to be needed to ensure ancestor nodes get locked ++ * before interior nodes - now that's handled by ++ * bch2_btree_path_traverse_all(). ++ */ ++ if (!path->cached && !trans->in_traverse_all) ++ trans_for_each_path(trans, linked) ++ if (linked != path && ++ linked->cached == path->cached && ++ linked->btree_id == path->btree_id && ++ linked->locks_want < new_locks_want) { ++ linked->locks_want = new_locks_want; ++ btree_path_get_locks(trans, linked, true); ++ } ++ ++ return false; ++} ++ ++void __bch2_btree_path_downgrade(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned new_locks_want) ++{ ++ unsigned l; ++ ++ EBUG_ON(path->locks_want < new_locks_want); ++ ++ path->locks_want = new_locks_want; ++ ++ while (path->nodes_locked && ++ (l = __fls(path->nodes_locked)) >= path->locks_want) { ++ if (l > path->level) { ++ btree_node_unlock(trans, path, l); ++ } else { ++ if (btree_node_intent_locked(path, l)) { ++ six_lock_downgrade(&path->l[l].b->c.lock); ++ path->nodes_intent_locked ^= 1 << l; ++ } ++ break; ++ } ++ } ++ ++ bch2_btree_path_verify_locks(path); ++} ++ ++void bch2_trans_downgrade(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ bch2_btree_path_downgrade(trans, path); ++} ++ ++/* Btree transaction locking: */ ++ ++int bch2_trans_relock(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ ++ if (unlikely(trans->restarted)) ++ return -BCH_ERR_transaction_restart_relock; ++ ++ trans_for_each_path(trans, path) ++ if (path->should_be_locked && ++ bch2_btree_path_relock(trans, path, _RET_IP_)) { ++ trace_trans_restart_relock(trans->fn, _RET_IP_, ++ path->btree_id, &path->pos); ++ BUG_ON(!trans->restarted); ++ return -BCH_ERR_transaction_restart_relock; ++ } ++ return 0; ++} ++ ++void bch2_trans_unlock(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ __bch2_btree_path_unlock(trans, path); ++ ++ /* ++ * bch2_gc_btree_init_recurse() doesn't use btree iterators for 
walking ++ * btree nodes, it implements its own walking: ++ */ ++ BUG_ON(!trans->is_initial_gc && ++ lock_class_is_held(&bch2_btree_node_lock_key)); ++} ++ ++/* Btree iterator: */ ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++static void bch2_btree_path_verify_cached(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ struct bkey_cached *ck; ++ bool locked = btree_node_locked(path, 0); ++ ++ if (!bch2_btree_node_relock(trans, path, 0)) ++ return; ++ ++ ck = (void *) path->l[0].b; ++ BUG_ON(ck->key.btree_id != path->btree_id || ++ bkey_cmp(ck->key.pos, path->pos)); ++ ++ if (!locked) ++ btree_node_unlock(trans, path, 0); ++} ++ ++static void bch2_btree_path_verify_level(struct btree_trans *trans, ++ struct btree_path *path, unsigned level) ++{ ++ struct btree_path_level *l; ++ struct btree_node_iter tmp; ++ bool locked; ++ struct bkey_packed *p, *k; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ struct printbuf buf3 = PRINTBUF; ++ const char *msg; ++ ++ if (!bch2_debug_check_iterators) ++ return; ++ ++ l = &path->l[level]; ++ tmp = l->iter; ++ locked = btree_node_locked(path, level); ++ ++ if (path->cached) { ++ if (!level) ++ bch2_btree_path_verify_cached(trans, path); ++ return; ++ } ++ ++ if (!btree_path_node(path, level)) ++ return; ++ ++ if (!bch2_btree_node_relock(trans, path, level)) ++ return; ++ ++ BUG_ON(!btree_path_pos_in_node(path, l->b)); ++ ++ bch2_btree_node_iter_verify(&l->iter, l->b); ++ ++ /* ++ * For interior nodes, the iterator will have skipped past deleted keys: ++ */ ++ p = level ++ ? bch2_btree_node_iter_prev(&tmp, l->b) ++ : bch2_btree_node_iter_prev_all(&tmp, l->b); ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ ++ if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) { ++ msg = "before"; ++ goto err; ++ } ++ ++ if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { ++ msg = "after"; ++ goto err; ++ } ++ ++ if (!locked) ++ btree_node_unlock(trans, path, level); ++ return; ++err: ++ bch2_bpos_to_text(&buf1, path->pos); ++ ++ if (p) { ++ struct bkey uk = bkey_unpack_key(l->b, p); ++ bch2_bkey_to_text(&buf2, &uk); ++ } else { ++ prt_printf(&buf2, "(none)"); ++ } ++ ++ if (k) { ++ struct bkey uk = bkey_unpack_key(l->b, k); ++ bch2_bkey_to_text(&buf3, &uk); ++ } else { ++ prt_printf(&buf3, "(none)"); ++ } ++ ++ panic("path should be %s key at level %u:\n" ++ "path pos %s\n" ++ "prev key %s\n" ++ "cur key %s\n", ++ msg, level, buf1.buf, buf2.buf, buf3.buf); ++} ++ ++static void bch2_btree_path_verify(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned i; ++ ++ EBUG_ON(path->btree_id >= BTREE_ID_NR); ++ ++ for (i = 0; i < (!path->cached ? 
BTREE_MAX_DEPTH : 1); i++) { ++ if (!path->l[i].b) { ++ BUG_ON(!path->cached && ++ c->btree_roots[path->btree_id].b->c.level > i); ++ break; ++ } ++ ++ bch2_btree_path_verify_level(trans, path, i); ++ } ++ ++ bch2_btree_path_verify_locks(path); ++} ++ ++void bch2_trans_verify_paths(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ bch2_btree_path_verify(trans, path); ++} ++ ++static void bch2_btree_iter_verify(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ ++ BUG_ON(iter->btree_id >= BTREE_ID_NR); ++ ++ BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); ++ ++ BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && ++ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); ++ ++ BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && ++ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && ++ !btree_type_has_snapshots(iter->btree_id)); ++ ++ if (iter->update_path) ++ bch2_btree_path_verify(trans, iter->update_path); ++ bch2_btree_path_verify(trans, iter->path); ++} ++ ++static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) ++{ ++ BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && ++ !iter->pos.snapshot); ++ ++ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && ++ iter->pos.snapshot != iter->snapshot); ++ ++ BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || ++ bkey_cmp(iter->pos, iter->k.p) > 0); ++} ++ ++static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree_iter copy; ++ struct bkey_s_c prev; ++ int ret = 0; ++ ++ if (!bch2_debug_check_iterators) ++ return 0; ++ ++ if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) ++ return 0; ++ ++ if (bkey_err(k) || !k.k) ++ return 0; ++ ++ BUG_ON(!bch2_snapshot_is_ancestor(trans->c, ++ iter->snapshot, ++ k.k->p.snapshot)); ++ ++ bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, ++ BTREE_ITER_NOPRESERVE| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ prev = bch2_btree_iter_prev(©); ++ if (!prev.k) ++ goto out; ++ ++ ret = bkey_err(prev); ++ if (ret) ++ goto out; ++ ++ if (!bkey_cmp(prev.k->p, k.k->p) && ++ bch2_snapshot_is_ancestor(trans->c, iter->snapshot, ++ prev.k->p.snapshot) > 0) { ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; ++ ++ bch2_bkey_to_text(&buf1, k.k); ++ bch2_bkey_to_text(&buf2, prev.k); ++ ++ panic("iter snap %u\n" ++ "k %s\n" ++ "prev %s\n", ++ iter->snapshot, ++ buf1.buf, buf2.buf); ++ } ++out: ++ bch2_trans_iter_exit(trans, ©); ++ return ret; ++} ++ ++void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, ++ struct bpos pos, bool key_cache) ++{ ++ struct btree_path *path; ++ unsigned idx; ++ struct printbuf buf = PRINTBUF; ++ ++ trans_for_each_path_inorder(trans, path, idx) { ++ int cmp = cmp_int(path->btree_id, id) ?: ++ cmp_int(path->cached, key_cache); ++ ++ if (cmp > 0) ++ break; ++ if (cmp < 0) ++ continue; ++ ++ if (!(path->nodes_locked & 1) || ++ !path->should_be_locked) ++ continue; ++ ++ if (!key_cache) { ++ if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 && ++ bkey_cmp(pos, path->l[0].b->key.k.p) <= 0) ++ return; ++ } else { ++ if (!bkey_cmp(pos, path->pos)) ++ return; ++ } ++ } ++ ++ bch2_dump_trans_paths_updates(trans); ++ bch2_bpos_to_text(&buf, pos); ++ ++ panic("not locked: %s %s%s\n", ++ bch2_btree_ids[id], buf.buf, ++ key_cache ? 
" cached" : ""); ++} ++ ++#else ++ ++static inline void bch2_btree_path_verify_level(struct btree_trans *trans, ++ struct btree_path *path, unsigned l) {} ++static inline void bch2_btree_path_verify(struct btree_trans *trans, ++ struct btree_path *path) {} ++static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} ++static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} ++static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; } ++ ++#endif ++ ++/* Btree path: fixups after btree updates */ ++ ++static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ struct btree_node_iter_set *set; ++ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == t->end_offset) { ++ set->k = __btree_node_key_to_offset(b, k); ++ bch2_btree_node_iter_sort(iter, b); ++ return; ++ } ++ ++ bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); ++} ++ ++static void __bch2_btree_path_fix_key_modified(struct btree_path *path, ++ struct btree *b, ++ struct bkey_packed *where) ++{ ++ struct btree_path_level *l = &path->l[b->c.level]; ++ ++ if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) ++ return; ++ ++ if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0) ++ bch2_btree_node_iter_advance(&l->iter, l->b); ++} ++ ++void bch2_btree_path_fix_key_modified(struct btree_trans *trans, ++ struct btree *b, ++ struct bkey_packed *where) ++{ ++ struct btree_path *path; ++ ++ trans_for_each_path_with_node(trans, b, path) { ++ __bch2_btree_path_fix_key_modified(path, b, where); ++ bch2_btree_path_verify_level(trans, path, b->c.level); ++ } ++} ++ ++static void __bch2_btree_node_iter_fix(struct btree_path *path, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bset_tree *t, ++ struct bkey_packed *where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ const struct bkey_packed *end = btree_bkey_last(b, t); ++ struct btree_node_iter_set *set; ++ unsigned offset = __btree_node_key_to_offset(b, where); ++ int shift = new_u64s - clobber_u64s; ++ unsigned old_end = t->end_offset - shift; ++ unsigned orig_iter_pos = node_iter->data[0].k; ++ bool iter_current_key_modified = ++ orig_iter_pos >= offset && ++ orig_iter_pos <= offset + clobber_u64s; ++ ++ btree_node_iter_for_each(node_iter, set) ++ if (set->end == old_end) ++ goto found; ++ ++ /* didn't find the bset in the iterator - might have to readd it: */ ++ if (new_u64s && ++ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { ++ bch2_btree_node_iter_push(node_iter, b, where, end); ++ goto fixup_done; ++ } else { ++ /* Iterator is after key that changed */ ++ return; ++ } ++found: ++ set->end = t->end_offset; ++ ++ /* Iterator hasn't gotten to the key that changed yet: */ ++ if (set->k < offset) ++ return; ++ ++ if (new_u64s && ++ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { ++ set->k = offset; ++ } else if (set->k < offset + clobber_u64s) { ++ set->k = offset + new_u64s; ++ if (set->k == set->end) ++ bch2_btree_node_iter_set_drop(node_iter, set); ++ } else { ++ /* Iterator is after key that changed */ ++ set->k = (int) set->k + shift; ++ return; ++ } ++ ++ bch2_btree_node_iter_sort(node_iter, b); ++fixup_done: ++ if (node_iter->data[0].k != orig_iter_pos) ++ iter_current_key_modified = true; ++ ++ /* ++ * When a new key is added, and the node iterator now points to that ++ * key, the iterator might have skipped past deleted keys that should ++ * come after the key the iterator 
now points to. We have to rewind to ++ * before those deleted keys - otherwise ++ * bch2_btree_node_iter_prev_all() breaks: ++ */ ++ if (!bch2_btree_node_iter_end(node_iter) && ++ iter_current_key_modified && ++ b->c.level) { ++ struct bset_tree *t; ++ struct bkey_packed *k, *k2, *p; ++ ++ k = bch2_btree_node_iter_peek_all(node_iter, b); ++ ++ for_each_bset(b, t) { ++ bool set_pos = false; ++ ++ if (node_iter->data[0].end == t->end_offset) ++ continue; ++ ++ k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); ++ ++ while ((p = bch2_bkey_prev_all(b, t, k2)) && ++ bkey_iter_cmp(b, k, p) < 0) { ++ k2 = p; ++ set_pos = true; ++ } ++ ++ if (set_pos) ++ btree_node_iter_set_set_pos(node_iter, ++ b, t, k2); ++ } ++ } ++} ++ ++void bch2_btree_node_iter_fix(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_packed *where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, where); ++ struct btree_path *linked; ++ ++ if (node_iter != &path->l[b->c.level].iter) { ++ __bch2_btree_node_iter_fix(path, b, node_iter, t, ++ where, clobber_u64s, new_u64s); ++ ++ if (bch2_debug_check_iterators) ++ bch2_btree_node_iter_verify(node_iter, b); ++ } ++ ++ trans_for_each_path_with_node(trans, b, linked) { ++ __bch2_btree_node_iter_fix(linked, b, ++ &linked->l[b->c.level].iter, t, ++ where, clobber_u64s, new_u64s); ++ bch2_btree_path_verify_level(trans, linked, b->c.level); ++ } ++} ++ ++/* Btree path level: pointer to a particular btree node and node iter */ ++ ++static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, ++ struct btree_path_level *l, ++ struct bkey *u, ++ struct bkey_packed *k) ++{ ++ if (unlikely(!k)) { ++ /* ++ * signal to bch2_btree_iter_peek_slot() that we're currently at ++ * a hole ++ */ ++ u->type = KEY_TYPE_deleted; ++ return bkey_s_c_null; ++ } ++ ++ return bkey_disassemble(l->b, k, u); ++} ++ ++static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, ++ struct btree_path_level *l, ++ struct bkey *u) ++{ ++ return __btree_iter_unpack(c, l, u, ++ bch2_btree_node_iter_peek_all(&l->iter, l->b)); ++} ++ ++static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree_path_level *l, ++ struct bkey *u) ++{ ++ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, ++ bch2_btree_node_iter_peek(&l->iter, l->b)); ++ ++ path->pos = k.k ? k.k->p : l->b->key.k.p; ++ bch2_btree_path_verify_level(trans, path, l - path->l); ++ return k; ++} ++ ++static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree_path_level *l, ++ struct bkey *u) ++{ ++ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, ++ bch2_btree_node_iter_prev(&l->iter, l->b)); ++ ++ path->pos = k.k ? 
k.k->p : l->b->data->min_key; ++ bch2_btree_path_verify_level(trans, path, l - path->l); ++ return k; ++} ++ ++static inline bool btree_path_advance_to_pos(struct btree_path *path, ++ struct btree_path_level *l, ++ int max_advance) ++{ ++ struct bkey_packed *k; ++ int nr_advanced = 0; ++ ++ while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && ++ bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { ++ if (max_advance > 0 && nr_advanced >= max_advance) ++ return false; ++ ++ bch2_btree_node_iter_advance(&l->iter, l->b); ++ nr_advanced++; ++ } ++ ++ return true; ++} ++ ++/* ++ * Verify that iterator for parent node points to child node: ++ */ ++static void btree_path_verify_new_node(struct btree_trans *trans, ++ struct btree_path *path, struct btree *b) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path_level *l; ++ unsigned plevel; ++ bool parent_locked; ++ struct bkey_packed *k; ++ ++ if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) ++ return; ++ ++ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) ++ return; ++ ++ plevel = b->c.level + 1; ++ if (!btree_path_node(path, plevel)) ++ return; ++ ++ parent_locked = btree_node_locked(path, plevel); ++ ++ if (!bch2_btree_node_relock(trans, path, plevel)) ++ return; ++ ++ l = &path->l[plevel]; ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ if (!k || ++ bkey_deleted(k) || ++ bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ struct printbuf buf3 = PRINTBUF; ++ struct printbuf buf4 = PRINTBUF; ++ struct bkey uk = bkey_unpack_key(b, k); ++ ++ bch2_dump_btree_node(c, l->b); ++ bch2_bpos_to_text(&buf1, path->pos); ++ bch2_bkey_to_text(&buf2, &uk); ++ bch2_bpos_to_text(&buf3, b->data->min_key); ++ bch2_bpos_to_text(&buf3, b->data->max_key); ++ panic("parent iter doesn't point to new node:\n" ++ "iter pos %s %s\n" ++ "iter key %s\n" ++ "new node %s-%s\n", ++ bch2_btree_ids[path->btree_id], ++ buf1.buf, buf2.buf, buf3.buf, buf4.buf); ++ } ++ ++ if (!parent_locked) ++ btree_node_unlock(trans, path, plevel); ++} ++ ++static inline void __btree_path_level_init(struct btree_path *path, ++ unsigned level) ++{ ++ struct btree_path_level *l = &path->l[level]; ++ ++ bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); ++ ++ /* ++ * Iterators to interior nodes should always be pointed at the first non ++ * whiteout: ++ */ ++ if (level) ++ bch2_btree_node_iter_peek(&l->iter, l->b); ++} ++ ++static inline void btree_path_level_init(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b) ++{ ++ BUG_ON(path->cached); ++ ++ btree_path_verify_new_node(trans, path, b); ++ ++ EBUG_ON(!btree_path_pos_in_node(path, b)); ++ EBUG_ON(b->c.lock.state.seq & 1); ++ ++ path->l[b->c.level].lock_seq = b->c.lock.state.seq; ++ path->l[b->c.level].b = b; ++ __btree_path_level_init(path, b->c.level); ++} ++ ++/* Btree path: fixups after btree node updates: */ ++ ++/* ++ * A btree node is being replaced - update the iterator to point to the new ++ * node: ++ */ ++void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) ++{ ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ if (!path->cached && ++ btree_path_pos_in_node(path, b)) { ++ enum btree_node_locked_type t = ++ btree_lock_want(path, b->c.level); ++ ++ if (path->nodes_locked && ++ t != BTREE_NODE_UNLOCKED) { ++ btree_node_unlock(trans, path, b->c.level); ++ six_lock_increment(&b->c.lock, t); ++ mark_btree_node_locked(trans, path, b->c.level, t); ++ } ++ ++ btree_path_level_init(trans, path, b); ++ } 
++} ++ ++/* ++ * A btree node has been modified in such a way as to invalidate iterators - fix ++ * them: ++ */ ++void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) ++{ ++ struct btree_path *path; ++ ++ trans_for_each_path_with_node(trans, b, path) ++ __btree_path_level_init(path, b->c.level); ++} ++ ++/* Btree path: traverse, set_pos: */ ++ ++static int lock_root_check_fn(struct six_lock *lock, void *p) ++{ ++ struct btree *b = container_of(lock, struct btree, c.lock); ++ struct btree **rootp = p; ++ ++ if (b != *rootp) ++ return BCH_ERR_lock_fail_root_changed; ++ return 0; ++} ++ ++static inline int btree_path_lock_root(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned depth_want, ++ unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b, **rootp = &c->btree_roots[path->btree_id].b; ++ enum six_lock_type lock_type; ++ unsigned i; ++ int ret; ++ ++ EBUG_ON(path->nodes_locked); ++ ++ while (1) { ++ b = READ_ONCE(*rootp); ++ path->level = READ_ONCE(b->c.level); ++ ++ if (unlikely(path->level < depth_want)) { ++ /* ++ * the root is at a lower depth than the depth we want: ++ * got to the end of the btree, or we're walking nodes ++ * greater than some depth and there are no nodes >= ++ * that depth ++ */ ++ path->level = depth_want; ++ for (i = path->level; i < BTREE_MAX_DEPTH; i++) ++ path->l[i].b = NULL; ++ return 1; ++ } ++ ++ lock_type = __btree_lock_want(path, path->level); ++ ret = btree_node_lock(trans, path, b, SPOS_MAX, ++ path->level, lock_type, ++ lock_root_check_fn, rootp, ++ trace_ip); ++ if (unlikely(ret)) { ++ if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) ++ continue; ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ return ret; ++ BUG(); ++ } ++ ++ if (likely(b == READ_ONCE(*rootp) && ++ b->c.level == path->level && ++ !race_fault())) { ++ for (i = 0; i < path->level; i++) ++ path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; ++ path->l[path->level].b = b; ++ for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) ++ path->l[i].b = NULL; ++ ++ mark_btree_node_locked(trans, path, path->level, lock_type); ++ btree_path_level_init(trans, path, b); ++ return 0; ++ } ++ ++ six_unlock_type(&b->c.lock, lock_type); ++ } ++} ++ ++noinline ++static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path_level *l = path_l(path); ++ struct btree_node_iter node_iter = l->iter; ++ struct bkey_packed *k; ++ struct bkey_buf tmp; ++ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) ++ ? (path->level > 1 ? 0 : 2) ++ : (path->level > 1 ? 1 : 16); ++ bool was_locked = btree_node_locked(path, path->level); ++ int ret = 0; ++ ++ bch2_bkey_buf_init(&tmp); ++ ++ while (nr && !ret) { ++ if (!bch2_btree_node_relock(trans, path, path->level)) ++ break; ++ ++ bch2_btree_node_iter_advance(&node_iter, l->b); ++ k = bch2_btree_node_iter_peek(&node_iter, l->b); ++ if (!k) ++ break; ++ ++ bch2_bkey_buf_unpack(&tmp, c, l->b, k); ++ ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, ++ path->level - 1); ++ } ++ ++ if (!was_locked) ++ btree_node_unlock(trans, path, path->level); ++ ++ bch2_bkey_buf_exit(&tmp, c); ++ return ret; ++} ++ ++static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path, ++ struct btree_and_journal_iter *jiter) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ struct bkey_buf tmp; ++ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) ++ ? (path->level > 1 ? 
0 : 2) ++ : (path->level > 1 ? 1 : 16); ++ bool was_locked = btree_node_locked(path, path->level); ++ int ret = 0; ++ ++ bch2_bkey_buf_init(&tmp); ++ ++ while (nr && !ret) { ++ if (!bch2_btree_node_relock(trans, path, path->level)) ++ break; ++ ++ bch2_btree_and_journal_iter_advance(jiter); ++ k = bch2_btree_and_journal_iter_peek(jiter); ++ if (!k.k) ++ break; ++ ++ bch2_bkey_buf_reassemble(&tmp, c, k); ++ ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, ++ path->level - 1); ++ } ++ ++ if (!was_locked) ++ btree_node_unlock(trans, path, path->level); ++ ++ bch2_bkey_buf_exit(&tmp, c); ++ return ret; ++} ++ ++static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned plevel, struct btree *b) ++{ ++ struct btree_path_level *l = &path->l[plevel]; ++ bool locked = btree_node_locked(path, plevel); ++ struct bkey_packed *k; ++ struct bch_btree_ptr_v2 *bp; ++ ++ if (!bch2_btree_node_relock(trans, path, plevel)) ++ return; ++ ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); ++ ++ bp = (void *) bkeyp_val(&l->b->format, k); ++ bp->mem_ptr = (unsigned long)b; ++ ++ if (!locked) ++ btree_node_unlock(trans, path, plevel); ++} ++ ++static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned flags, ++ struct bkey_buf *out) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path_level *l = path_l(path); ++ struct btree_and_journal_iter jiter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); ++ ++ k = bch2_btree_and_journal_iter_peek(&jiter); ++ ++ bch2_bkey_buf_reassemble(out, c, k); ++ ++ if (flags & BTREE_ITER_PREFETCH) ++ ret = btree_path_prefetch_j(trans, path, &jiter); ++ ++ bch2_btree_and_journal_iter_exit(&jiter); ++ return ret; ++} ++ ++static __always_inline int btree_path_down(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned flags, ++ unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path_level *l = path_l(path); ++ struct btree *b; ++ unsigned level = path->level - 1; ++ enum six_lock_type lock_type = __btree_lock_want(path, level); ++ bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); ++ struct bkey_buf tmp; ++ int ret; ++ ++ EBUG_ON(!btree_node_locked(path, path->level)); ++ ++ bch2_bkey_buf_init(&tmp); ++ ++ if (unlikely(!replay_done)) { ++ ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); ++ if (ret) ++ goto err; ++ } else { ++ bch2_bkey_buf_unpack(&tmp, c, l->b, ++ bch2_btree_node_iter_peek(&l->iter, l->b)); ++ ++ if (flags & BTREE_ITER_PREFETCH) { ++ ret = btree_path_prefetch(trans, path); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); ++ ret = PTR_ERR_OR_ZERO(b); ++ if (unlikely(ret)) ++ goto err; ++ ++ mark_btree_node_locked(trans, path, level, lock_type); ++ btree_path_level_init(trans, path, b); ++ ++ if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && ++ unlikely(b != btree_node_mem_ptr(tmp.k))) ++ btree_node_mem_ptr_set(trans, path, level + 1, b); ++ ++ if (btree_node_read_locked(path, level + 1)) ++ btree_node_unlock(trans, path, level + 1); ++ path->level = level; ++ ++ bch2_btree_path_verify_locks(path); ++err: ++ bch2_bkey_buf_exit(&tmp, c); ++ return ret; ++} ++ ++static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, ++ unsigned, unsigned 
long); ++ ++static int bch2_btree_path_traverse_all(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path *path; ++ unsigned long trace_ip = _RET_IP_; ++ int i, ret = 0; ++ ++ if (trans->in_traverse_all) ++ return -BCH_ERR_transaction_restart_in_traverse_all; ++ ++ trans->in_traverse_all = true; ++retry_all: ++ trans->restarted = 0; ++ trans->traverse_all_idx = U8_MAX; ++ ++ trans_for_each_path(trans, path) ++ path->should_be_locked = false; ++ ++ btree_trans_verify_sorted(trans); ++ ++ for (i = trans->nr_sorted - 2; i >= 0; --i) { ++ struct btree_path *path1 = trans->paths + trans->sorted[i]; ++ struct btree_path *path2 = trans->paths + trans->sorted[i + 1]; ++ ++ if (path1->btree_id == path2->btree_id && ++ path1->locks_want < path2->locks_want) ++ __bch2_btree_path_upgrade(trans, path1, path2->locks_want); ++ else if (!path1->locks_want && path2->locks_want) ++ __bch2_btree_path_upgrade(trans, path1, 1); ++ } ++ ++ bch2_trans_unlock(trans); ++ cond_resched(); ++ ++ if (unlikely(trans->memory_allocation_failure)) { ++ struct closure cl; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ } ++ ++ /* Now, redo traversals in correct order: */ ++ trans->traverse_all_idx = 0; ++ while (trans->traverse_all_idx < trans->nr_sorted) { ++ path = trans->paths + trans->sorted[trans->traverse_all_idx]; ++ ++ /* ++ * Traversing a path can cause another path to be added at about ++ * the same position: ++ */ ++ if (path->uptodate) { ++ ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || ++ ret == -ENOMEM) ++ goto retry_all; ++ if (ret) ++ goto err; ++ BUG_ON(path->uptodate); ++ } else { ++ trans->traverse_all_idx++; ++ } ++ } ++ ++ /* ++ * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock() ++ * and relock(), relock() won't relock since path->should_be_locked ++ * isn't set yet, which is all fine ++ */ ++ trans_for_each_path(trans, path) ++ BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); ++err: ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ trans->in_traverse_all = false; ++ ++ trace_trans_traverse_all(trans->fn, trace_ip); ++ return ret; ++} ++ ++static inline bool btree_path_good_node(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned l, int check_pos) ++{ ++ if (!is_btree_node(path, l) || ++ !bch2_btree_node_relock(trans, path, l)) ++ return false; ++ ++ if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) ++ return false; ++ if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) ++ return false; ++ return true; ++} ++ ++static void btree_path_set_level_up(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ btree_node_unlock(trans, path, path->level); ++ path->l[path->level].b = BTREE_ITER_NO_NODE_UP; ++ path->level++; ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++} ++ ++static void btree_path_set_level_down(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned new_level) ++{ ++ unsigned l; ++ ++ path->level = new_level; ++ ++ for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) ++ if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) ++ btree_node_unlock(trans, path, l); ++ ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ bch2_btree_path_verify(trans, path); ++} ++ ++static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, ++ struct btree_path *path, ++ int check_pos) ++{ ++ unsigned i, 
l = path->level; ++ ++ while (btree_path_node(path, l) && ++ !btree_path_good_node(trans, path, l, check_pos)) { ++ btree_node_unlock(trans, path, l); ++ path->l[l].b = BTREE_ITER_NO_NODE_UP; ++ l++; ++ } ++ ++ /* If we need intent locks, take them too: */ ++ for (i = l + 1; ++ i < path->locks_want && btree_path_node(path, i); ++ i++) ++ if (!bch2_btree_node_relock(trans, path, i)) ++ while (l <= i) { ++ btree_node_unlock(trans, path, l); ++ path->l[l].b = BTREE_ITER_NO_NODE_UP; ++ l++; ++ } ++ ++ return l; ++} ++ ++/* ++ * This is the main state machine for walking down the btree - walks down to a ++ * specified depth ++ * ++ * Returns 0 on success, -EIO on error (error reading in a btree node). ++ * ++ * On error, caller (peek_node()/peek_key()) must return NULL; the error is ++ * stashed in the iterator and returned from bch2_trans_exit(). ++ */ ++static int btree_path_traverse_one(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned flags, ++ unsigned long trace_ip) ++{ ++ unsigned depth_want = path->level; ++ int ret = trans->restarted; ++ ++ if (unlikely(ret)) ++ goto out; ++ ++ /* ++ * Ensure we obey path->should_be_locked: if it's set, we can't unlock ++ * and re-traverse the path without a transaction restart: ++ */ ++ if (path->should_be_locked) { ++ ret = bch2_btree_path_relock(trans, path, trace_ip); ++ goto out; ++ } ++ ++ if (path->cached) { ++ ret = bch2_btree_path_traverse_cached(trans, path, flags); ++ goto out; ++ } ++ ++ if (unlikely(path->level >= BTREE_MAX_DEPTH)) ++ goto out; ++ ++ path->level = btree_path_up_until_good_node(trans, path, 0); ++ ++ /* ++ * Note: path->nodes[path->level] may be temporarily NULL here - that ++ * would indicate to other code that we got to the end of the btree, ++ * here it indicates that relocking the root failed - it's critical that ++ * btree_path_lock_root() comes next and that it can't fail ++ */ ++ while (path->level > depth_want) { ++ ret = btree_path_node(path, path->level) ++ ? 
btree_path_down(trans, path, flags, trace_ip) ++ : btree_path_lock_root(trans, path, depth_want, trace_ip); ++ if (unlikely(ret)) { ++ if (ret == 1) { ++ /* ++ * No nodes at this level - got to the end of ++ * the btree: ++ */ ++ ret = 0; ++ goto out; ++ } ++ ++ __bch2_btree_path_unlock(trans, path); ++ path->level = depth_want; ++ ++ if (ret == -EIO) ++ path->l[path->level].b = ++ BTREE_ITER_NO_NODE_ERROR; ++ else ++ path->l[path->level].b = ++ BTREE_ITER_NO_NODE_DOWN; ++ goto out; ++ } ++ } ++ ++ path->uptodate = BTREE_ITER_UPTODATE; ++out: ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); ++ bch2_btree_path_verify(trans, path); ++ return ret; ++} ++ ++int __must_check bch2_btree_path_traverse(struct btree_trans *trans, ++ struct btree_path *path, unsigned flags) ++{ ++ if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { ++ unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U); ++ u64 mask = ~(~0ULL << restart_probability_bits); ++ ++ if ((prandom_u32() & mask) == mask) { ++ trace_transaction_restart_injected(trans->fn, _RET_IP_); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); ++ } ++ } ++ ++ if (path->uptodate < BTREE_ITER_NEED_RELOCK) ++ return 0; ++ ++ return bch2_trans_cond_resched(trans) ?: ++ btree_path_traverse_one(trans, path, flags, _RET_IP_); ++} ++ ++static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, ++ struct btree_path *src) ++{ ++ unsigned i, offset = offsetof(struct btree_path, pos); ++ ++ memcpy((void *) dst + offset, ++ (void *) src + offset, ++ sizeof(struct btree_path) - offset); ++ ++ for (i = 0; i < BTREE_MAX_DEPTH; i++) ++ if (btree_node_locked(dst, i)) ++ six_lock_increment(&dst->l[i].b->c.lock, ++ __btree_lock_want(dst, i)); ++ ++ bch2_btree_path_check_sort(trans, dst, 0); ++} ++ ++static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, ++ bool intent) ++{ ++ struct btree_path *new = btree_path_alloc(trans, src); ++ ++ btree_path_copy(trans, new, src); ++ __btree_path_get(new, intent); ++ return new; ++} ++ ++inline struct btree_path * __must_check ++bch2_btree_path_make_mut(struct btree_trans *trans, ++ struct btree_path *path, bool intent, ++ unsigned long ip) ++{ ++ if (path->ref > 1 || path->preserve) { ++ __btree_path_put(path, intent); ++ path = btree_path_clone(trans, path, intent); ++ path->preserve = false; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ path->ip_allocated = ip; ++#endif ++ btree_trans_verify_sorted(trans); ++ } ++ ++ path->should_be_locked = false; ++ return path; ++} ++ ++struct btree_path * __must_check ++bch2_btree_path_set_pos(struct btree_trans *trans, ++ struct btree_path *path, struct bpos new_pos, ++ bool intent, unsigned long ip) ++{ ++ int cmp = bpos_cmp(new_pos, path->pos); ++ unsigned l = path->level; ++ ++ EBUG_ON(trans->restarted); ++ EBUG_ON(!path->ref); ++ ++ if (!cmp) ++ return path; ++ ++ path = bch2_btree_path_make_mut(trans, path, intent, ip); ++ ++ path->pos = new_pos; ++ ++ bch2_btree_path_check_sort(trans, path, cmp); ++ ++ if (unlikely(path->cached)) { ++ btree_node_unlock(trans, path, 0); ++ path->l[0].b = BTREE_ITER_NO_NODE_CACHED; ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ goto out; ++ } ++ ++ l = btree_path_up_until_good_node(trans, path, cmp); ++ ++ if (btree_path_node(path, l)) { ++ BUG_ON(!btree_node_locked(path, l)); ++ /* ++ * We might have to skip over many keys, or just a few: try ++ * advancing the node iterator, and if we have to skip over too ++ * many 
keys just reinit it (or if we're rewinding, since that ++ * is expensive). ++ */ ++ if (cmp < 0 || ++ !btree_path_advance_to_pos(path, &path->l[l], 8)) ++ __btree_path_level_init(path, l); ++ } ++ ++ if (l != path->level) { ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ __bch2_btree_path_unlock(trans, path); ++ } ++out: ++ bch2_btree_path_verify(trans, path); ++ return path; ++} ++ ++/* Btree path: main interface: */ ++ ++static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) ++{ ++ struct btree_path *next; ++ ++ next = prev_btree_path(trans, path); ++ if (next && !btree_path_cmp(next, path)) ++ return next; ++ ++ next = next_btree_path(trans, path); ++ if (next && !btree_path_cmp(next, path)) ++ return next; ++ ++ return NULL; ++} ++ ++static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) ++{ ++ struct btree_path *next; ++ ++ next = prev_btree_path(trans, path); ++ if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) ++ return next; ++ ++ next = next_btree_path(trans, path); ++ if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) ++ return next; ++ ++ return NULL; ++} ++ ++static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) ++{ ++ __bch2_btree_path_unlock(trans, path); ++ btree_path_list_remove(trans, path); ++ trans->paths_allocated &= ~(1ULL << path->idx); ++} ++ ++void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) ++{ ++ struct btree_path *dup; ++ ++ EBUG_ON(trans->paths + path->idx != path); ++ EBUG_ON(!path->ref); ++ ++ if (!__btree_path_put(path, intent)) ++ return; ++ ++ /* ++ * Perhaps instead we should check for duplicate paths in traverse_all: ++ */ ++ if (path->preserve && ++ (dup = have_path_at_pos(trans, path))) { ++ dup->preserve = true; ++ path->preserve = false; ++ goto free; ++ } ++ ++ if (!path->preserve && ++ (dup = have_node_at_pos(trans, path))) ++ goto free; ++ return; ++free: ++ if (path->should_be_locked && ++ !btree_node_locked(dup, path->level)) ++ return; ++ ++ dup->should_be_locked |= path->should_be_locked; ++ __bch2_path_free(trans, path); ++} ++ ++void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ ++ prt_printf(buf, "transaction updates for %s journal seq %llu", ++ trans->fn, trans->journal_res.seq); ++ prt_newline(buf); ++ printbuf_indent_add(buf, 2); ++ ++ trans_for_each_update(trans, i) { ++ struct bkey_s_c old = { &i->old_k, i->old_v }; ++ ++ prt_printf(buf, "update: btree=%s cached=%u %pS", ++ bch2_btree_ids[i->btree_id], ++ i->cached, ++ (void *) i->ip_allocated); ++ prt_newline(buf); ++ ++ prt_printf(buf, " old "); ++ bch2_bkey_val_to_text(buf, trans->c, old); ++ prt_newline(buf); ++ ++ prt_printf(buf, " new "); ++ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); ++ prt_newline(buf); ++ } ++ ++ printbuf_indent_sub(buf, 2); ++} ++ ++noinline __cold ++void bch2_dump_trans_updates(struct btree_trans *trans) ++{ ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_trans_updates_to_text(&buf, trans); ++ bch_err(trans->c, "%s", buf.buf); ++ printbuf_exit(&buf); ++} ++ ++noinline __cold ++void bch2_dump_trans_paths_updates(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ struct printbuf buf = PRINTBUF; ++ unsigned idx; ++ ++ trans_for_each_path_inorder(trans, path, idx) { ++ printbuf_reset(&buf); ++ ++ bch2_bpos_to_text(&buf, path->pos); ++ ++ printk(KERN_ERR 
"path: idx %u ref %u:%u%s%s btree=%s l=%u pos %s locks %u %pS\n", ++ path->idx, path->ref, path->intent_ref, ++ path->should_be_locked ? " S" : "", ++ path->preserve ? " P" : "", ++ bch2_btree_ids[path->btree_id], ++ path->level, ++ buf.buf, ++ path->nodes_locked, ++#ifdef CONFIG_BCACHEFS_DEBUG ++ (void *) path->ip_allocated ++#else ++ NULL ++#endif ++ ); ++ } ++ ++ printbuf_exit(&buf); ++ ++ bch2_dump_trans_updates(trans); ++} ++ ++static struct btree_path *btree_path_alloc(struct btree_trans *trans, ++ struct btree_path *pos) ++{ ++ struct btree_path *path; ++ unsigned idx; ++ ++ if (unlikely(trans->paths_allocated == ++ ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) { ++ bch2_dump_trans_paths_updates(trans); ++ panic("trans path oveflow\n"); ++ } ++ ++ idx = __ffs64(~trans->paths_allocated); ++ trans->paths_allocated |= 1ULL << idx; ++ ++ path = &trans->paths[idx]; ++ ++ path->idx = idx; ++ path->ref = 0; ++ path->intent_ref = 0; ++ path->nodes_locked = 0; ++ path->nodes_intent_locked = 0; ++ ++ btree_path_list_add(trans, pos, path); ++ return path; ++} ++ ++struct btree_path *bch2_path_get(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos, ++ unsigned locks_want, unsigned level, ++ unsigned flags, unsigned long ip) ++{ ++ struct btree_path *path, *path_pos = NULL; ++ bool cached = flags & BTREE_ITER_CACHED; ++ bool intent = flags & BTREE_ITER_INTENT; ++ int i; ++ ++ BUG_ON(trans->restarted); ++ btree_trans_verify_sorted(trans); ++ bch2_trans_verify_locks(trans); ++ ++ trans_for_each_path_inorder(trans, path, i) { ++ if (__btree_path_cmp(path, ++ btree_id, ++ cached, ++ pos, ++ level) > 0) ++ break; ++ ++ path_pos = path; ++ } ++ ++ if (path_pos && ++ path_pos->cached == cached && ++ path_pos->btree_id == btree_id && ++ path_pos->level == level) { ++ __btree_path_get(path_pos, intent); ++ path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); ++ } else { ++ path = btree_path_alloc(trans, path_pos); ++ path_pos = NULL; ++ ++ __btree_path_get(path, intent); ++ path->pos = pos; ++ path->btree_id = btree_id; ++ path->cached = cached; ++ path->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ path->should_be_locked = false; ++ path->level = level; ++ path->locks_want = locks_want; ++ path->nodes_locked = 0; ++ path->nodes_intent_locked = 0; ++ for (i = 0; i < ARRAY_SIZE(path->l); i++) ++ path->l[i].b = BTREE_ITER_NO_NODE_INIT; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ path->ip_allocated = ip; ++#endif ++ btree_trans_verify_sorted(trans); ++ } ++ ++ if (!(flags & BTREE_ITER_NOPRESERVE)) ++ path->preserve = true; ++ ++ if (path->intent_ref) ++ locks_want = max(locks_want, level + 1); ++ ++ /* ++ * If the path has locks_want greater than requested, we don't downgrade ++ * it here - on transaction restart because btree node split needs to ++ * upgrade locks, we might be putting/getting the iterator again. ++ * Downgrading iterators only happens via bch2_trans_downgrade(), after ++ * a successful transaction commit. ++ */ ++ ++ locks_want = min(locks_want, BTREE_MAX_DEPTH); ++ if (locks_want > path->locks_want) { ++ path->locks_want = locks_want; ++ btree_path_get_locks(trans, path, true); ++ } ++ ++ return path; ++} ++ ++inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) ++{ ++ ++ struct bkey_s_c k; ++ ++ if (!path->cached) { ++ struct btree_path_level *l = path_l(path); ++ struct bkey_packed *_k; ++ ++ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); ++ ++ _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ k = _k ? 
bkey_disassemble(l->b, _k, u) : bkey_s_c_null; ++ ++ EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0); ++ ++ if (!k.k || bpos_cmp(path->pos, k.k->p)) ++ goto hole; ++ } else { ++ struct bkey_cached *ck = (void *) path->l[0].b; ++ ++ EBUG_ON(ck && ++ (path->btree_id != ck->key.btree_id || ++ bkey_cmp(path->pos, ck->key.pos))); ++ ++ /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? */ ++ if (unlikely(!ck || !ck->valid)) ++ return bkey_s_c_null; ++ ++ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); ++ ++ *u = ck->k->k; ++ k = bkey_i_to_s_c(ck->k); ++ } ++ ++ return k; ++hole: ++ bkey_init(u); ++ u->p = path->pos; ++ return (struct bkey_s_c) { u, NULL }; ++} ++ ++/* Btree iterators: */ ++ ++int __must_check ++__bch2_btree_iter_traverse(struct btree_iter *iter) ++{ ++ return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); ++} ++ ++int __must_check ++bch2_btree_iter_traverse(struct btree_iter *iter) ++{ ++ int ret; ++ ++ iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, ++ btree_iter_search_key(iter), ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); ++ if (ret) ++ return ret; ++ ++ iter->path->should_be_locked = true; ++ return 0; ++} ++ ++/* Iterate across nodes (leaf and interior nodes) */ ++ ++struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree *b = NULL; ++ int ret; ++ ++ EBUG_ON(iter->path->cached); ++ bch2_btree_iter_verify(iter); ++ ++ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); ++ if (ret) ++ goto err; ++ ++ b = btree_path_node(iter->path, iter->path->level); ++ if (!b) ++ goto out; ++ ++ BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = b->key.k.p; ++ ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ iter->path->should_be_locked = true; ++ BUG_ON(iter->path->uptodate); ++out: ++ bch2_btree_iter_verify_entry_exit(iter); ++ bch2_btree_iter_verify(iter); ++ ++ return b; ++err: ++ b = ERR_PTR(ret); ++ goto out; ++} ++ ++struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree_path *path = iter->path; ++ struct btree *b = NULL; ++ int ret; ++ ++ BUG_ON(trans->restarted); ++ EBUG_ON(iter->path->cached); ++ bch2_btree_iter_verify(iter); ++ ++ /* already at end? */ ++ if (!btree_path_node(path, path->level)) ++ return NULL; ++ ++ /* got to end? 
*/ ++ if (!btree_path_node(path, path->level + 1)) { ++ btree_path_set_level_up(trans, path); ++ return NULL; ++ } ++ ++ if (!bch2_btree_node_relock(trans, path, path->level + 1)) { ++ __bch2_btree_path_unlock(trans, path); ++ path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, ++ path->btree_id, &path->pos); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); ++ goto err; ++ } ++ ++ b = btree_path_node(path, path->level + 1); ++ ++ if (!bpos_cmp(iter->pos, b->key.k.p)) { ++ btree_node_unlock(trans, path, path->level); ++ path->l[path->level].b = BTREE_ITER_NO_NODE_UP; ++ path->level++; ++ } else { ++ /* ++ * Haven't gotten to the end of the parent node: go back down to ++ * the next child node ++ */ ++ path = iter->path = ++ bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ btree_path_set_level_down(trans, path, iter->min_depth); ++ ++ ret = bch2_btree_path_traverse(trans, path, iter->flags); ++ if (ret) ++ goto err; ++ ++ b = path->l[path->level].b; ++ } ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = b->key.k.p; ++ ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ iter->path->should_be_locked = true; ++ BUG_ON(iter->path->uptodate); ++out: ++ bch2_btree_iter_verify_entry_exit(iter); ++ bch2_btree_iter_verify(iter); ++ ++ return b; ++err: ++ b = ERR_PTR(ret); ++ goto out; ++} ++ ++/* Iterate across keys (in leaf nodes only) */ ++ ++inline bool bch2_btree_iter_advance(struct btree_iter *iter) ++{ ++ if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { ++ struct bpos pos = iter->k.p; ++ bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS ++ ? bpos_cmp(pos, SPOS_MAX) ++ : bkey_cmp(pos, SPOS_MAX)) != 0; ++ ++ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ pos = bkey_successor(iter, pos); ++ bch2_btree_iter_set_pos(iter, pos); ++ return ret; ++ } else { ++ if (!btree_path_node(iter->path, iter->path->level)) ++ return true; ++ ++ iter->advanced = true; ++ return false; ++ } ++} ++ ++inline bool bch2_btree_iter_rewind(struct btree_iter *iter) ++{ ++ struct bpos pos = bkey_start_pos(&iter->k); ++ bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS ++ ? 
bpos_cmp(pos, POS_MIN) ++ : bkey_cmp(pos, POS_MIN)) != 0; ++ ++ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ pos = bkey_predecessor(iter, pos); ++ bch2_btree_iter_set_pos(iter, pos); ++ return ret; ++} ++ ++static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos pos) ++{ ++ struct btree_insert_entry *i; ++ struct bkey_i *ret = NULL; ++ ++ trans_for_each_update(trans, i) { ++ if (i->btree_id < btree_id) ++ continue; ++ if (i->btree_id > btree_id) ++ break; ++ if (bpos_cmp(i->k->k.p, pos) < 0) ++ continue; ++ if (i->key_cache_already_flushed) ++ continue; ++ if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0) ++ ret = i->k; ++ } ++ ++ return ret; ++} ++ ++struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos start_pos, ++ struct bpos end_pos) ++{ ++ struct bkey_i *k; ++ ++ if (bpos_cmp(start_pos, iter->journal_pos) < 0) ++ iter->journal_idx = 0; ++ ++ k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, 0, ++ start_pos, end_pos, ++ &iter->journal_idx); ++ ++ iter->journal_pos = k ? k->k.p : end_pos; ++ return k; ++} ++ ++struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos pos) ++{ ++ return bch2_btree_journal_peek(trans, iter, pos, pos); ++} ++ ++static noinline ++struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_i *next_journal = ++ bch2_btree_journal_peek(trans, iter, iter->path->pos, ++ k.k ? k.k->p : iter->path->l[0].b->key.k.p); ++ ++ if (next_journal) { ++ iter->k = next_journal->k; ++ k = bkey_i_to_s_c(next_journal); ++ } ++ ++ return k; ++} ++ ++/* ++ * Checks btree key cache for key at iter->pos and returns it if present, or ++ * bkey_s_c_null: ++ */ ++static noinline ++struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bch_fs *c = trans->c; ++ struct bkey u; ++ int ret; ++ ++ if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) ++ return bkey_s_c_null; ++ ++ if (!iter->key_cache_path) ++ iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, ++ iter->flags & BTREE_ITER_INTENT, 0, ++ iter->flags|BTREE_ITER_CACHED, ++ _THIS_IP_); ++ ++ iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ iter->key_cache_path->should_be_locked = true; ++ ++ return bch2_btree_path_peek_slot(iter->key_cache_path, &u); ++} ++ ++static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bkey_i *next_update; ++ struct bkey_s_c k, k2; ++ int ret; ++ ++ EBUG_ON(iter->path->cached || iter->path->level); ++ bch2_btree_iter_verify(iter); ++ ++ while (1) { ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); ++ if (unlikely(ret)) { ++ /* ensure that iter->k is consistent with iter->pos: */ ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ k = bkey_s_c_err(ret); ++ goto out; ++ } ++ ++ iter->path->should_be_locked = true; ++ ++ k = 
btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); ++ ++ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && ++ k.k && ++ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { ++ ret = bkey_err(k2); ++ if (ret) { ++ k = k2; ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ goto out; ++ } ++ ++ k = k2; ++ iter->k = *k.k; ++ } ++ ++ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) ++ k = btree_trans_peek_journal(trans, iter, k); ++ ++ next_update = iter->flags & BTREE_ITER_WITH_UPDATES ++ ? btree_trans_peek_updates(trans, iter->btree_id, search_key) ++ : NULL; ++ if (next_update && ++ bpos_cmp(next_update->k.p, ++ k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { ++ iter->k = next_update->k; ++ k = bkey_i_to_s_c(next_update); ++ } ++ ++ if (k.k && bkey_deleted(k.k)) { ++ /* ++ * If we've got a whiteout, and it's after the search ++ * key, advance the search key to the whiteout instead ++ * of just after the whiteout - it might be a btree ++ * whiteout, with a real key at the same position, since ++ * in the btree deleted keys sort before non deleted. ++ */ ++ search_key = bpos_cmp(search_key, k.k->p) ++ ? k.k->p ++ : bpos_successor(k.k->p); ++ continue; ++ } ++ ++ if (likely(k.k)) { ++ break; ++ } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { ++ /* Advance to next leaf node: */ ++ search_key = bpos_successor(iter->path->l[0].b->key.k.p); ++ } else { ++ /* End of btree: */ ++ bch2_btree_iter_set_pos(iter, SPOS_MAX); ++ k = bkey_s_c_null; ++ goto out; ++ } ++ } ++out: ++ bch2_btree_iter_verify(iter); ++ ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's ++ * current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bpos search_key = btree_iter_search_key(iter); ++ struct bkey_s_c k; ++ struct bpos iter_pos; ++ int ret; ++ ++ EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); ++ ++ if (iter->update_path) { ++ bch2_path_put(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = NULL; ++ } ++ ++ bch2_btree_iter_verify_entry_exit(iter); ++ ++ while (1) { ++ k = __bch2_btree_iter_peek(iter, search_key); ++ if (!k.k || bkey_err(k)) ++ goto out; ++ ++ /* ++ * iter->pos should be mononotically increasing, and always be ++ * equal to the key we just returned - except extents can ++ * straddle iter->pos: ++ */ ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ iter_pos = k.k->p; ++ else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ iter_pos = bkey_start_pos(k.k); ++ else ++ iter_pos = iter->pos; ++ ++ if (bkey_cmp(iter_pos, end) > 0) { ++ bch2_btree_iter_set_pos(iter, end); ++ k = bkey_s_c_null; ++ goto out; ++ } ++ ++ if (iter->update_path && ++ bkey_cmp(iter->update_path->pos, k.k->p)) { ++ bch2_path_put(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = NULL; ++ } ++ ++ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && ++ (iter->flags & BTREE_ITER_INTENT) && ++ !(iter->flags & BTREE_ITER_IS_EXTENTS) && ++ !iter->update_path) { ++ struct bpos pos = k.k->p; ++ ++ if (pos.snapshot < iter->snapshot) { ++ search_key = bpos_successor(k.k->p); ++ continue; ++ } ++ ++ pos.snapshot = iter->snapshot; ++ ++ /* ++ * advance, same as on exit for iter->path, but only up ++ * to snapshot ++ */ ++ __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = iter->path; ++ ++ iter->update_path = 
bch2_btree_path_set_pos(trans, ++ iter->update_path, pos, ++ iter->flags & BTREE_ITER_INTENT, ++ _THIS_IP_); ++ } ++ ++ /* ++ * We can never have a key in a leaf node at POS_MAX, so ++ * we don't have to check these successor() calls: ++ */ ++ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && ++ !bch2_snapshot_is_ancestor(trans->c, ++ iter->snapshot, ++ k.k->p.snapshot)) { ++ search_key = bpos_successor(k.k->p); ++ continue; ++ } ++ ++ if (bkey_whiteout(k.k) && ++ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { ++ search_key = bkey_successor(iter, k.k->p); ++ continue; ++ } ++ ++ break; ++ } ++ ++ iter->pos = iter_pos; ++ ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ BUG_ON(!iter->path->nodes_locked); ++out: ++ if (iter->update_path) { ++ if (iter->update_path->uptodate && ++ (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_))) { ++ k = bkey_s_c_err(ret); ++ } else { ++ BUG_ON(!(iter->update_path->nodes_locked & 1)); ++ iter->update_path->should_be_locked = true; ++ } ++ } ++ iter->path->should_be_locked = true; ++ ++ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) ++ iter->pos.snapshot = iter->snapshot; ++ ++ ret = bch2_btree_iter_verify_ret(iter, k); ++ if (unlikely(ret)) { ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ k = bkey_s_c_err(ret); ++ } ++ ++ bch2_btree_iter_verify_entry_exit(iter); ++ ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal ++ * to iterator's current position, returning keys from every level of the btree. ++ * For keys at different levels of the btree that compare equal, the key from ++ * the lower level (leaf) is returned first. ++ */ ++struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bkey_s_c k; ++ int ret; ++ ++ EBUG_ON(iter->path->cached); ++ bch2_btree_iter_verify(iter); ++ BUG_ON(iter->path->level < iter->min_depth); ++ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); ++ EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS)); ++ ++ while (1) { ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); ++ if (unlikely(ret)) { ++ /* ensure that iter->k is consistent with iter->pos: */ ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ k = bkey_s_c_err(ret); ++ goto out; ++ } ++ ++ /* Already at end? 
*/ ++ if (!btree_path_node(iter->path, iter->path->level)) { ++ k = bkey_s_c_null; ++ goto out; ++ } ++ ++ k = btree_path_level_peek_all(trans->c, ++ &iter->path->l[iter->path->level], &iter->k); ++ ++ /* Check if we should go up to the parent node: */ ++ if (!k.k || ++ (iter->advanced && ++ !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) { ++ iter->pos = path_l(iter->path)->b->key.k.p; ++ btree_path_set_level_up(trans, iter->path); ++ iter->advanced = false; ++ continue; ++ } ++ ++ /* ++ * Check if we should go back down to a leaf: ++ * If we're not in a leaf node, we only return the current key ++ * if it exactly matches iter->pos - otherwise we first have to ++ * go back to the leaf: ++ */ ++ if (iter->path->level != iter->min_depth && ++ (iter->advanced || ++ !k.k || ++ bpos_cmp(iter->pos, k.k->p))) { ++ btree_path_set_level_down(trans, iter->path, iter->min_depth); ++ iter->pos = bpos_successor(iter->pos); ++ iter->advanced = false; ++ continue; ++ } ++ ++ /* Check if we should go to the next key: */ ++ if (iter->path->level == iter->min_depth && ++ iter->advanced && ++ k.k && ++ !bpos_cmp(iter->pos, k.k->p)) { ++ iter->pos = bpos_successor(iter->pos); ++ iter->advanced = false; ++ continue; ++ } ++ ++ if (iter->advanced && ++ iter->path->level == iter->min_depth && ++ bpos_cmp(k.k->p, iter->pos)) ++ iter->advanced = false; ++ ++ BUG_ON(iter->advanced); ++ BUG_ON(!k.k); ++ break; ++ } ++ ++ iter->pos = k.k->p; ++out: ++ iter->path->should_be_locked = true; ++ bch2_btree_iter_verify(iter); ++ ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_next: returns first key greater than iterator's current ++ * position ++ */ ++struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) ++{ ++ if (!bch2_btree_iter_advance(iter)) ++ return bkey_s_c_null; ++ ++ return bch2_btree_iter_peek(iter); ++} ++ ++/** ++ * bch2_btree_iter_peek_prev: returns first key less than or equal to ++ * iterator's current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bpos search_key = iter->pos; ++ struct btree_path *saved_path = NULL; ++ struct bkey_s_c k; ++ struct bkey saved_k; ++ const struct bch_val *saved_v; ++ int ret; ++ ++ EBUG_ON(iter->path->cached || iter->path->level); ++ EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); ++ ++ if (iter->flags & BTREE_ITER_WITH_JOURNAL) ++ return bkey_s_c_err(-EIO); ++ ++ bch2_btree_iter_verify(iter); ++ bch2_btree_iter_verify_entry_exit(iter); ++ ++ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) ++ search_key.snapshot = U32_MAX; ++ ++ while (1) { ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); ++ if (unlikely(ret)) { ++ /* ensure that iter->k is consistent with iter->pos: */ ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ k = bkey_s_c_err(ret); ++ goto out; ++ } ++ ++ k = btree_path_level_peek(trans, iter->path, ++ &iter->path->l[0], &iter->k); ++ if (!k.k || ++ ((iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? 
bpos_cmp(bkey_start_pos(k.k), search_key) >= 0 ++ : bpos_cmp(k.k->p, search_key) > 0)) ++ k = btree_path_level_prev(trans, iter->path, ++ &iter->path->l[0], &iter->k); ++ ++ bch2_btree_path_check_sort(trans, iter->path, 0); ++ ++ if (likely(k.k)) { ++ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { ++ if (k.k->p.snapshot == iter->snapshot) ++ goto got_key; ++ ++ /* ++ * If we have a saved candidate, and we're no ++ * longer at the same _key_ (not pos), return ++ * that candidate ++ */ ++ if (saved_path && bkey_cmp(k.k->p, saved_k.p)) { ++ bch2_path_put(trans, iter->path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->path = saved_path; ++ saved_path = NULL; ++ iter->k = saved_k; ++ k.v = saved_v; ++ goto got_key; ++ } ++ ++ if (bch2_snapshot_is_ancestor(iter->trans->c, ++ iter->snapshot, ++ k.k->p.snapshot)) { ++ if (saved_path) ++ bch2_path_put(trans, saved_path, ++ iter->flags & BTREE_ITER_INTENT); ++ saved_path = btree_path_clone(trans, iter->path, ++ iter->flags & BTREE_ITER_INTENT); ++ saved_k = *k.k; ++ saved_v = k.v; ++ } ++ ++ search_key = bpos_predecessor(k.k->p); ++ continue; ++ } ++got_key: ++ if (bkey_whiteout(k.k) && ++ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { ++ search_key = bkey_predecessor(iter, k.k->p); ++ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) ++ search_key.snapshot = U32_MAX; ++ continue; ++ } ++ ++ break; ++ } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) { ++ /* Advance to previous leaf node: */ ++ search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); ++ } else { ++ /* Start of btree: */ ++ bch2_btree_iter_set_pos(iter, POS_MIN); ++ k = bkey_s_c_null; ++ goto out; ++ } ++ } ++ ++ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); ++ ++ /* Extents can straddle iter->pos: */ ++ if (bkey_cmp(k.k->p, iter->pos) < 0) ++ iter->pos = k.k->p; ++ ++ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) ++ iter->pos.snapshot = iter->snapshot; ++out: ++ if (saved_path) ++ bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT); ++ iter->path->should_be_locked = true; ++ ++ bch2_btree_iter_verify_entry_exit(iter); ++ bch2_btree_iter_verify(iter); ++ ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_prev: returns first key less than iterator's current ++ * position ++ */ ++struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) ++{ ++ if (!bch2_btree_iter_rewind(iter)) ++ return bkey_s_c_null; ++ ++ return bch2_btree_iter_peek_prev(iter); ++} ++ ++struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bpos search_key; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_btree_iter_verify(iter); ++ bch2_btree_iter_verify_entry_exit(iter); ++ EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); ++ EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); ++ ++ /* extents can't span inode numbers: */ ++ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && ++ unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { ++ if (iter->pos.inode == KEY_INODE_MAX) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); ++ } ++ ++ search_key = btree_iter_search_key(iter); ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ if ((iter->flags & BTREE_ITER_CACHED) || ++ !(iter->flags & 
(BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { ++ struct bkey_i *next_update; ++ ++ if ((iter->flags & BTREE_ITER_WITH_UPDATES) && ++ (next_update = btree_trans_peek_updates(trans, ++ iter->btree_id, search_key)) && ++ !bpos_cmp(next_update->k.p, iter->pos)) { ++ iter->k = next_update->k; ++ k = bkey_i_to_s_c(next_update); ++ goto out; ++ } ++ ++ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && ++ (next_update = bch2_btree_journal_peek_slot(trans, ++ iter, iter->pos))) { ++ iter->k = next_update->k; ++ k = bkey_i_to_s_c(next_update); ++ goto out; ++ } ++ ++ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && ++ (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { ++ if (!bkey_err(k)) ++ iter->k = *k.k; ++ goto out; ++ } ++ ++ k = bch2_btree_path_peek_slot(iter->path, &iter->k); ++ } else { ++ struct bpos next; ++ ++ EBUG_ON(iter->path->level); ++ ++ if (iter->flags & BTREE_ITER_INTENT) { ++ struct btree_iter iter2; ++ struct bpos end = iter->pos; ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ end.offset = U64_MAX; ++ ++ bch2_trans_copy_iter(&iter2, iter); ++ k = bch2_btree_iter_peek_upto(&iter2, end); ++ ++ if (k.k && !bkey_err(k)) { ++ iter->k = iter2.k; ++ k.k = &iter->k; ++ } ++ bch2_trans_iter_exit(trans, &iter2); ++ } else { ++ struct bpos pos = iter->pos; ++ ++ k = bch2_btree_iter_peek(iter); ++ iter->pos = pos; ++ } ++ ++ if (unlikely(bkey_err(k))) ++ return k; ++ ++ next = k.k ? bkey_start_pos(k.k) : POS_MAX; ++ ++ if (bkey_cmp(iter->pos, next) < 0) { ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos; ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) { ++ bch2_key_resize(&iter->k, ++ min_t(u64, KEY_SIZE_MAX, ++ (next.inode == iter->pos.inode ++ ? next.offset ++ : KEY_OFFSET_MAX) - ++ iter->pos.offset)); ++ EBUG_ON(!iter->k.size); ++ } ++ ++ k = (struct bkey_s_c) { &iter->k, NULL }; ++ } ++ } ++out: ++ iter->path->should_be_locked = true; ++ ++ bch2_btree_iter_verify_entry_exit(iter); ++ bch2_btree_iter_verify(iter); ++ ret = bch2_btree_iter_verify_ret(iter, k); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ return k; ++} ++ ++struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) ++{ ++ if (!bch2_btree_iter_advance(iter)) ++ return bkey_s_c_null; ++ ++ return bch2_btree_iter_peek_slot(iter); ++} ++ ++struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) ++{ ++ if (!bch2_btree_iter_rewind(iter)) ++ return bkey_s_c_null; ++ ++ return bch2_btree_iter_peek_slot(iter); ++} ++ ++/* new transactional stuff: */ ++ ++static inline void btree_path_verify_sorted_ref(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ EBUG_ON(path->sorted_idx >= trans->nr_sorted); ++ EBUG_ON(trans->sorted[path->sorted_idx] != path->idx); ++ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); ++} ++ ++static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned i; ++ ++ for (i = 0; i < trans->nr_sorted; i++) ++ btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]); ++#endif ++} ++ ++static void btree_trans_verify_sorted(struct btree_trans *trans) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct btree_path *path, *prev = NULL; ++ unsigned i; ++ ++ if (!bch2_debug_check_iterators) ++ return; ++ ++ trans_for_each_path_inorder(trans, path, i) { ++ if (prev && btree_path_cmp(prev, path) > 0) { ++ bch2_dump_trans_paths_updates(trans); ++ panic("trans paths out of order!\n"); ++ } ++ prev = path; ++ } ++#endif ++} ++ ++static inline void btree_path_swap(struct btree_trans 
*trans, ++ struct btree_path *l, struct btree_path *r) ++{ ++ swap(l->sorted_idx, r->sorted_idx); ++ swap(trans->sorted[l->sorted_idx], ++ trans->sorted[r->sorted_idx]); ++ ++ btree_path_verify_sorted_ref(trans, l); ++ btree_path_verify_sorted_ref(trans, r); ++} ++ ++inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, ++ int cmp) ++{ ++ struct btree_path *n; ++ ++ if (cmp <= 0) { ++ n = prev_btree_path(trans, path); ++ if (n && btree_path_cmp(n, path) > 0) { ++ do { ++ btree_path_swap(trans, n, path); ++ n = prev_btree_path(trans, path); ++ } while (n && btree_path_cmp(n, path) > 0); ++ ++ goto out; ++ } ++ } ++ ++ if (cmp >= 0) { ++ n = next_btree_path(trans, path); ++ if (n && btree_path_cmp(path, n) > 0) { ++ do { ++ btree_path_swap(trans, path, n); ++ n = next_btree_path(trans, path); ++ } while (n && btree_path_cmp(path, n) > 0); ++ } ++ } ++out: ++ btree_trans_verify_sorted(trans); ++} ++ ++static inline void btree_path_list_remove(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ unsigned i; ++ ++ EBUG_ON(path->sorted_idx >= trans->nr_sorted); ++ ++ array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); ++ ++ for (i = path->sorted_idx; i < trans->nr_sorted; i++) ++ trans->paths[trans->sorted[i]].sorted_idx = i; ++ ++ path->sorted_idx = U8_MAX; ++ ++ btree_trans_verify_sorted_refs(trans); ++} ++ ++static inline void btree_path_list_add(struct btree_trans *trans, ++ struct btree_path *pos, ++ struct btree_path *path) ++{ ++ unsigned i; ++ ++ btree_trans_verify_sorted_refs(trans); ++ ++ path->sorted_idx = pos ? pos->sorted_idx + 1 : 0; ++ ++ if (trans->in_traverse_all && ++ trans->traverse_all_idx != U8_MAX && ++ trans->traverse_all_idx >= path->sorted_idx) ++ trans->traverse_all_idx++; ++ ++ array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); ++ ++ for (i = path->sorted_idx; i < trans->nr_sorted; i++) ++ trans->paths[trans->sorted[i]].sorted_idx = i; ++ ++ btree_trans_verify_sorted_refs(trans); ++} ++ ++void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ if (iter->path) ++ bch2_path_put(trans, iter->path, ++ iter->flags & BTREE_ITER_INTENT); ++ if (iter->update_path) ++ bch2_path_put(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ if (iter->key_cache_path) ++ bch2_path_put(trans, iter->key_cache_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->path = NULL; ++ iter->update_path = NULL; ++ iter->key_cache_path = NULL; ++} ++ ++static void __bch2_trans_iter_init(struct btree_trans *trans, ++ struct btree_iter *iter, ++ unsigned btree_id, struct bpos pos, ++ unsigned locks_want, ++ unsigned depth, ++ unsigned flags, ++ unsigned long ip) ++{ ++ EBUG_ON(trans->restarted); ++ ++ if (flags & BTREE_ITER_ALL_LEVELS) ++ flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; ++ ++ if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && ++ btree_node_type_is_extents(btree_id)) ++ flags |= BTREE_ITER_IS_EXTENTS; ++ ++ if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && ++ !btree_type_has_snapshots(btree_id)) ++ flags &= ~BTREE_ITER_ALL_SNAPSHOTS; ++ ++ if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && ++ btree_type_has_snapshots(btree_id)) ++ flags |= BTREE_ITER_FILTER_SNAPSHOTS; ++ ++ if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags)) ++ flags |= BTREE_ITER_WITH_JOURNAL; ++ ++ iter->trans = trans; ++ iter->path = NULL; ++ iter->update_path = NULL; ++ iter->key_cache_path = NULL; ++ iter->btree_id = btree_id; ++ iter->min_depth = depth; 
++ iter->flags = flags; ++ iter->snapshot = pos.snapshot; ++ iter->pos = pos; ++ iter->k.type = KEY_TYPE_deleted; ++ iter->k.p = pos; ++ iter->k.size = 0; ++ iter->journal_idx = 0; ++ iter->journal_pos = POS_MIN; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ iter->ip_allocated = ip; ++#endif ++ ++ iter->path = bch2_path_get(trans, btree_id, iter->pos, ++ locks_want, depth, flags, ip); ++} ++ ++void bch2_trans_iter_init(struct btree_trans *trans, ++ struct btree_iter *iter, ++ unsigned btree_id, struct bpos pos, ++ unsigned flags) ++{ ++ if (!btree_id_cached(trans->c, btree_id)) { ++ flags &= ~BTREE_ITER_CACHED; ++ flags &= ~BTREE_ITER_WITH_KEY_CACHE; ++ } else if (!(flags & BTREE_ITER_CACHED)) ++ flags |= BTREE_ITER_WITH_KEY_CACHE; ++ ++ __bch2_trans_iter_init(trans, iter, btree_id, pos, ++ 0, 0, flags, _RET_IP_); ++} ++ ++void bch2_trans_node_iter_init(struct btree_trans *trans, ++ struct btree_iter *iter, ++ enum btree_id btree_id, ++ struct bpos pos, ++ unsigned locks_want, ++ unsigned depth, ++ unsigned flags) ++{ ++ __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth, ++ BTREE_ITER_NOT_EXTENTS| ++ __BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_ALL_SNAPSHOTS| ++ flags, _RET_IP_); ++ BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); ++ BUG_ON(iter->path->level != depth); ++ BUG_ON(iter->min_depth != depth); ++} ++ ++void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) ++{ ++ *dst = *src; ++ if (src->path) ++ __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); ++ if (src->update_path) ++ __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); ++ dst->key_cache_path = NULL; ++} ++ ++void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) ++{ ++ size_t new_top = trans->mem_top + size; ++ void *p; ++ ++ if (new_top > trans->mem_bytes) { ++ size_t old_bytes = trans->mem_bytes; ++ size_t new_bytes = roundup_pow_of_two(new_top); ++ void *new_mem; ++ ++ WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); ++ ++ new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); ++ if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { ++ new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); ++ new_bytes = BTREE_TRANS_MEM_MAX; ++ kfree(trans->mem); ++ } ++ ++ if (!new_mem) ++ return ERR_PTR(-ENOMEM); ++ ++ trans->mem = new_mem; ++ trans->mem_bytes = new_bytes; ++ ++ if (old_bytes) { ++ trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes); ++ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); ++ } ++ } ++ ++ p = trans->mem + trans->mem_top; ++ trans->mem_top += size; ++ memset(p, 0, size); ++ return p; ++} ++ ++/** ++ * bch2_trans_begin() - reset a transaction after a interrupted attempt ++ * @trans: transaction to reset ++ * ++ * While iterating over nodes or updating nodes a attempt to lock a btree node ++ * may return BCH_ERR_transaction_restart when the trylock fails. When this ++ * occurs bch2_trans_begin() should be called and the transaction retried. 
++ */ ++u32 bch2_trans_begin(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ ++ bch2_trans_reset_updates(trans); ++ ++ trans->mem_top = 0; ++ ++ if (trans->fs_usage_deltas) { ++ trans->fs_usage_deltas->used = 0; ++ memset((void *) trans->fs_usage_deltas + ++ offsetof(struct replicas_delta_list, memset_start), 0, ++ (void *) &trans->fs_usage_deltas->memset_end - ++ (void *) &trans->fs_usage_deltas->memset_start); ++ } ++ ++ trans_for_each_path(trans, path) { ++ path->should_be_locked = false; ++ ++ /* ++ * If the transaction wasn't restarted, we're presuming to be ++ * doing something new: dont keep iterators excpt the ones that ++ * are in use - except for the subvolumes btree: ++ */ ++ if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes) ++ path->preserve = false; ++ ++ /* ++ * XXX: we probably shouldn't be doing this if the transaction ++ * was restarted, but currently we still overflow transaction ++ * iterators if we do that ++ */ ++ if (!path->ref && !path->preserve) ++ __bch2_path_free(trans, path); ++ else ++ path->preserve = false; ++ } ++ ++ if (!trans->restarted && ++ (need_resched() || ++ ktime_get_ns() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { ++ bch2_trans_unlock(trans); ++ cond_resched(); ++ bch2_trans_relock(trans); ++ } ++ ++ trans->last_restarted_ip = _RET_IP_; ++ if (trans->restarted) ++ bch2_btree_path_traverse_all(trans); ++ ++ trans->last_begin_time = ktime_get_ns(); ++ return trans->restart_count; ++} ++ ++void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count) ++{ ++ bch2_trans_inconsistent_on(trans_was_restarted(trans, restart_count), trans, ++ "trans->restart_count %u, should be %u, last restarted by %ps\n", ++ trans->restart_count, restart_count, ++ (void *) trans->last_restarted_ip); ++} ++ ++static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) ++{ ++ size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX; ++ size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; ++ void *p = NULL; ++ ++ BUG_ON(trans->used_mempool); ++ ++#ifdef __KERNEL__ ++ p = this_cpu_xchg(c->btree_paths_bufs->path , NULL); ++#endif ++ if (!p) ++ p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); ++ ++ trans->paths = p; p += paths_bytes; ++ trans->updates = p; p += updates_bytes; ++} ++ ++void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, ++ unsigned expected_nr_iters, ++ size_t expected_mem_bytes, ++ const char *fn) ++ __acquires(&c->btree_trans_barrier) ++{ ++ struct btree_trans *pos; ++ ++ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ ++ memset(trans, 0, sizeof(*trans)); ++ trans->c = c; ++ trans->fn = fn; ++ trans->last_begin_time = ktime_get_ns(); ++ trans->task = current; ++ ++ while (c->lock_held_stats.names[trans->lock_name_idx] != fn ++ && c->lock_held_stats.names[trans->lock_name_idx] != 0) ++ trans->lock_name_idx++; ++ ++ if (trans->lock_name_idx >= BCH_LOCK_TIME_NR) ++ pr_warn_once("lock_times array not big enough!"); ++ else ++ c->lock_held_stats.names[trans->lock_name_idx] = fn; ++ ++ bch2_trans_alloc_paths(trans, c); ++ ++ if (expected_mem_bytes) { ++ trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes); ++ trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL); ++ ++ if (!unlikely(trans->mem)) { ++ trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); ++ trans->mem_bytes = BTREE_TRANS_MEM_MAX; ++ } ++ } ++ ++ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); ++ ++ 
mutex_lock(&c->btree_trans_lock); ++ list_for_each_entry(pos, &c->btree_trans_list, list) { ++ if (trans->task->pid < pos->task->pid) { ++ list_add_tail(&trans->list, &pos->list); ++ goto list_add_done; ++ } ++ } ++ list_add_tail(&trans->list, &c->btree_trans_list); ++list_add_done: ++ mutex_unlock(&c->btree_trans_lock); ++} ++ ++static void check_btree_paths_leaked(struct btree_trans *trans) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bch_fs *c = trans->c; ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ if (path->ref) ++ goto leaked; ++ return; ++leaked: ++ bch_err(c, "btree paths leaked from %s!", trans->fn); ++ trans_for_each_path(trans, path) ++ if (path->ref) ++ printk(KERN_ERR " btree %s %pS\n", ++ bch2_btree_ids[path->btree_id], ++ (void *) path->ip_allocated); ++ /* Be noisy about this: */ ++ bch2_fatal_error(c); ++#endif ++} ++ ++void bch2_trans_exit(struct btree_trans *trans) ++ __releases(&c->btree_trans_barrier) ++{ ++ struct btree_insert_entry *i; ++ struct bch_fs *c = trans->c; ++ ++ bch2_trans_unlock(trans); ++ ++ trans_for_each_update(trans, i) ++ __btree_path_put(i->path, true); ++ trans->nr_updates = 0; ++ ++ check_btree_paths_leaked(trans); ++ ++ mutex_lock(&c->btree_trans_lock); ++ list_del(&trans->list); ++ mutex_unlock(&c->btree_trans_lock); ++ ++ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); ++ ++ bch2_journal_preres_put(&c->journal, &trans->journal_preres); ++ ++ kfree(trans->extra_journal_entries.data); ++ ++ if (trans->fs_usage_deltas) { ++ if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == ++ REPLICAS_DELTA_LIST_MAX) ++ mempool_free(trans->fs_usage_deltas, ++ &c->replicas_delta_pool); ++ else ++ kfree(trans->fs_usage_deltas); ++ } ++ ++ if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) ++ mempool_free(trans->mem, &c->btree_trans_mem_pool); ++ else ++ kfree(trans->mem); ++ ++#ifdef __KERNEL__ ++ /* ++ * Userspace doesn't have a real percpu implementation: ++ */ ++ trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths); ++#endif ++ ++ if (trans->paths) ++ mempool_free(trans->paths, &c->btree_paths_pool); ++ ++ trans->mem = (void *) 0x1; ++ trans->paths = (void *) 0x1; ++} ++ ++static void __maybe_unused ++bch2_btree_path_node_to_text(struct printbuf *out, ++ struct btree_bkey_cached_common *_b, ++ bool cached) ++{ ++ prt_printf(out, " l=%u %s:", ++ _b->level, bch2_btree_ids[_b->btree_id]); ++ bch2_bpos_to_text(out, btree_node_pos(_b, cached)); ++} ++ ++void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ struct btree *b; ++ static char lock_types[] = { 'r', 'i', 'w' }; ++ unsigned l; ++ ++ prt_printf(out, "%i %s\n", trans->task->pid, trans->fn); ++ ++ trans_for_each_path(trans, path) { ++ if (!path->nodes_locked) ++ continue; ++ ++ prt_printf(out, " path %u %c l=%u %s:", ++ path->idx, ++ path->cached ? 'c' : 'b', ++ path->level, ++ bch2_btree_ids[path->btree_id]); ++ bch2_bpos_to_text(out, path->pos); ++ prt_printf(out, "\n"); ++ ++ for (l = 0; l < BTREE_MAX_DEPTH; l++) { ++ if (btree_node_locked(path, l)) { ++ prt_printf(out, " %s l=%u ", ++ btree_node_intent_locked(path, l) ? "i" : "r", l); ++ bch2_btree_path_node_to_text(out, ++ (void *) path->l[l].b, ++ path->cached); ++ prt_printf(out, "\n"); ++ } ++ } ++ } ++ ++ b = READ_ONCE(trans->locking); ++ if (b) { ++ path = &trans->paths[trans->locking_path_idx]; ++ prt_printf(out, " locking path %u %c l=%u %c %s:", ++ trans->locking_path_idx, ++ path->cached ? 
'c' : 'b', ++ trans->locking_level, ++ lock_types[trans->locking_lock_type], ++ bch2_btree_ids[trans->locking_btree_id]); ++ bch2_bpos_to_text(out, trans->locking_pos); ++ ++ prt_printf(out, " node "); ++ bch2_btree_path_node_to_text(out, ++ (void *) b, path->cached); ++ prt_printf(out, "\n"); ++ } ++} ++ ++void bch2_fs_btree_iter_exit(struct bch_fs *c) ++{ ++ if (c->btree_trans_barrier_initialized) ++ cleanup_srcu_struct(&c->btree_trans_barrier); ++ mempool_exit(&c->btree_trans_mem_pool); ++ mempool_exit(&c->btree_paths_pool); ++} ++ ++int bch2_fs_btree_iter_init(struct bch_fs *c) ++{ ++ unsigned nr = BTREE_ITER_MAX; ++ int ret; ++ ++ INIT_LIST_HEAD(&c->btree_trans_list); ++ mutex_init(&c->btree_trans_lock); ++ ++ ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, ++ sizeof(struct btree_path) * nr + ++ sizeof(struct btree_insert_entry) * nr) ?: ++ mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, ++ BTREE_TRANS_MEM_MAX) ?: ++ init_srcu_struct(&c->btree_trans_barrier); ++ if (!ret) ++ c->btree_trans_barrier_initialized = true; ++ return ret; ++} +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +new file mode 100644 +index 000000000000..1b02f75d4cab +--- /dev/null ++++ b/fs/bcachefs/btree_iter.h +@@ -0,0 +1,556 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_ITER_H ++#define _BCACHEFS_BTREE_ITER_H ++ ++#include "bset.h" ++#include "btree_types.h" ++ ++#include ++ ++static inline void __btree_path_get(struct btree_path *path, bool intent) ++{ ++ path->ref++; ++ path->intent_ref += intent; ++} ++ ++static inline bool __btree_path_put(struct btree_path *path, bool intent) ++{ ++ EBUG_ON(!path->ref); ++ EBUG_ON(!path->intent_ref && intent); ++ path->intent_ref -= intent; ++ return --path->ref == 0; ++} ++ ++static inline void btree_path_set_dirty(struct btree_path *path, ++ enum btree_path_uptodate u) ++{ ++ path->uptodate = max_t(unsigned, path->uptodate, u); ++} ++ ++static inline struct btree *btree_path_node(struct btree_path *path, ++ unsigned level) ++{ ++ return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL; ++} ++ ++static inline bool btree_node_lock_seq_matches(const struct btree_path *path, ++ const struct btree *b, unsigned level) ++{ ++ /* ++ * We don't compare the low bits of the lock sequence numbers because ++ * @path might have taken a write lock on @b, and we don't want to skip ++ * the linked path if the sequence numbers were equal before taking that ++ * write lock. 
The lock sequence number is incremented by taking and ++ * releasing write locks and is even when unlocked: ++ */ ++ return path->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; ++} ++ ++static inline struct btree *btree_node_parent(struct btree_path *path, ++ struct btree *b) ++{ ++ return btree_path_node(path, b->c.level + 1); ++} ++ ++/* Iterate over paths within a transaction: */ ++ ++static inline struct btree_path * ++__trans_next_path(struct btree_trans *trans, unsigned idx) ++{ ++ u64 l; ++ ++ if (idx == BTREE_ITER_MAX) ++ return NULL; ++ ++ l = trans->paths_allocated >> idx; ++ if (!l) ++ return NULL; ++ ++ idx += __ffs64(l); ++ EBUG_ON(idx >= BTREE_ITER_MAX); ++ EBUG_ON(trans->paths[idx].idx != idx); ++ return &trans->paths[idx]; ++} ++ ++void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); ++ ++#define trans_for_each_path(_trans, _path) \ ++ for (_path = __trans_next_path((_trans), 0); \ ++ (_path); \ ++ _path = __trans_next_path((_trans), (_path)->idx + 1)) ++ ++static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) ++{ ++ unsigned idx = path ? path->sorted_idx + 1 : 0; ++ ++ EBUG_ON(idx > trans->nr_sorted); ++ ++ return idx < trans->nr_sorted ++ ? trans->paths + trans->sorted[idx] ++ : NULL; ++} ++ ++static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path) ++{ ++ EBUG_ON(path->sorted_idx >= trans->nr_sorted); ++ return path->sorted_idx ++ ? trans->paths + trans->sorted[path->sorted_idx - 1] ++ : NULL; ++} ++ ++#define trans_for_each_path_inorder(_trans, _path, _i) \ ++ for (_i = 0; \ ++ ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ ++ _i++) ++ ++static inline bool __path_has_node(const struct btree_path *path, ++ const struct btree *b) ++{ ++ return path->l[b->c.level].b == b && ++ btree_node_lock_seq_matches(path, b, b->c.level); ++} ++ ++static inline struct btree_path * ++__trans_next_path_with_node(struct btree_trans *trans, struct btree *b, ++ unsigned idx) ++{ ++ struct btree_path *path = __trans_next_path(trans, idx); ++ ++ while (path && !__path_has_node(path, b)) ++ path = __trans_next_path(trans, path->idx + 1); ++ ++ return path; ++} ++ ++#define trans_for_each_path_with_node(_trans, _b, _path) \ ++ for (_path = __trans_next_path_with_node((_trans), (_b), 0); \ ++ (_path); \ ++ _path = __trans_next_path_with_node((_trans), (_b), \ ++ (_path)->idx + 1)) ++ ++struct btree_path * __must_check ++bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, ++ bool, unsigned long); ++struct btree_path * __must_check ++bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, ++ struct bpos, bool, unsigned long); ++int __must_check bch2_btree_path_traverse(struct btree_trans *, ++ struct btree_path *, unsigned); ++struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, ++ unsigned, unsigned, unsigned, unsigned long); ++inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); ++ ++struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, ++ struct btree_iter *, struct bpos); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_trans_verify_paths(struct btree_trans *); ++void bch2_trans_verify_locks(struct btree_trans *); ++void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, ++ struct bpos, bool); ++#else ++static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} ++static inline void bch2_trans_verify_locks(struct 
btree_trans *trans) {} ++static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, ++ struct bpos pos, bool key_cache) {} ++#endif ++ ++void bch2_btree_path_fix_key_modified(struct btree_trans *trans, ++ struct btree *, struct bkey_packed *); ++void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *, ++ struct btree *, struct btree_node_iter *, ++ struct bkey_packed *, unsigned, unsigned); ++ ++int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); ++ ++void bch2_path_put(struct btree_trans *, struct btree_path *, bool); ++ ++int bch2_trans_relock(struct btree_trans *); ++void bch2_trans_unlock(struct btree_trans *); ++ ++static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count) ++{ ++ return restart_count != trans->restart_count; ++} ++ ++void bch2_trans_verify_not_restarted(struct btree_trans *, u32); ++ ++__always_inline ++static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int err) ++{ ++ BUG_ON(err <= 0); ++ BUG_ON(!bch2_err_matches(err, BCH_ERR_transaction_restart)); ++ ++ trans->restarted = err; ++ trans->restart_count++; ++ return -err; ++} ++ ++__always_inline ++static inline int btree_trans_restart(struct btree_trans *trans, int err) ++{ ++ btree_trans_restart_nounlock(trans, err); ++ return -err; ++} ++ ++bool bch2_btree_node_upgrade(struct btree_trans *, ++ struct btree_path *, unsigned); ++ ++bool __bch2_btree_path_upgrade(struct btree_trans *, ++ struct btree_path *, unsigned); ++ ++static inline bool bch2_btree_path_upgrade(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned new_locks_want) ++{ ++ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); ++ ++ return path->locks_want < new_locks_want ++ ? 
__bch2_btree_path_upgrade(trans, path, new_locks_want) ++ : path->uptodate == BTREE_ITER_UPTODATE; ++} ++ ++void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned); ++ ++static inline void bch2_btree_path_downgrade(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ unsigned new_locks_want = path->level + !!path->intent_ref; ++ ++ if (path->locks_want > new_locks_want) ++ __bch2_btree_path_downgrade(trans, path, new_locks_want); ++} ++ ++void bch2_trans_downgrade(struct btree_trans *); ++ ++void bch2_trans_node_add(struct btree_trans *trans, struct btree *); ++void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); ++ ++int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); ++int __must_check bch2_btree_iter_traverse(struct btree_iter *); ++ ++struct btree *bch2_btree_iter_peek_node(struct btree_iter *); ++struct btree *bch2_btree_iter_next_node(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); ++struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *); ++ ++static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) ++{ ++ return bch2_btree_iter_peek_upto(iter, SPOS_MAX); ++} ++ ++struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); ++ ++bool bch2_btree_iter_advance(struct btree_iter *); ++bool bch2_btree_iter_rewind(struct btree_iter *); ++ ++static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) ++{ ++ iter->k.type = KEY_TYPE_deleted; ++ iter->k.p.inode = iter->pos.inode = new_pos.inode; ++ iter->k.p.offset = iter->pos.offset = new_pos.offset; ++ iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; ++ iter->k.size = 0; ++} ++ ++static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) ++{ ++ if (unlikely(iter->update_path)) ++ bch2_path_put(iter->trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = NULL; ++ ++ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) ++ new_pos.snapshot = iter->snapshot; ++ ++ __bch2_btree_iter_set_pos(iter, new_pos); ++} ++ ++static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) ++{ ++ BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); ++ iter->pos = bkey_start_pos(&iter->k); ++} ++ ++static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot) ++{ ++ struct bpos pos = iter->pos; ++ ++ iter->snapshot = snapshot; ++ pos.snapshot = snapshot; ++ bch2_btree_iter_set_pos(iter, pos); ++} ++ ++void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); ++void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *, ++ unsigned, struct bpos, unsigned); ++void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, ++ enum btree_id, struct bpos, ++ unsigned, unsigned, unsigned); ++void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); ++ ++static inline void set_btree_iter_dontneed(struct btree_iter *iter) ++{ ++ iter->path->preserve = false; ++} ++ ++void *bch2_trans_kmalloc(struct btree_trans *, size_t); ++u32 bch2_trans_begin(struct btree_trans *); ++ ++static inline struct btree * 
++__btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ struct btree *b; ++ ++ while (b = bch2_btree_iter_peek_node(iter), ++ bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) ++ bch2_trans_begin(trans); ++ ++ return b; ++} ++ ++#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ _locks_want, _depth, _flags, _b, _ret) \ ++ for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ ++ _start, _locks_want, _depth, _flags); \ ++ (_b) = __btree_iter_peek_node_and_restart((_trans), &(_iter)),\ ++ !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ ++ (_b) = bch2_btree_iter_next_node(&(_iter))) ++ ++#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ _flags, _b, _ret) \ ++ __for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ 0, 0, _flags, _b, _ret) ++ ++static inline int bkey_err(struct bkey_s_c k) ++{ ++ return PTR_ERR_OR_ZERO(k.k); ++} ++ ++static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, ++ unsigned flags) ++{ ++ BUG_ON(flags & BTREE_ITER_ALL_LEVELS); ++ ++ return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : ++ bch2_btree_iter_peek_prev(iter); ++} ++ ++static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, ++ unsigned flags) ++{ ++ return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) : ++ flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : ++ bch2_btree_iter_peek(iter); ++} ++ ++static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, ++ struct bpos end, ++ unsigned flags) ++{ ++ if (!(flags & BTREE_ITER_SLOTS)) ++ return bch2_btree_iter_peek_upto(iter, end); ++ ++ if (bkey_cmp(iter->pos, end) > 0) ++ return bkey_s_c_null; ++ ++ return bch2_btree_iter_peek_slot(iter); ++} ++ ++static inline int btree_trans_too_many_iters(struct btree_trans *trans) ++{ ++ if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX) { ++ trace_trans_restart_too_many_iters(trans->fn, _THIS_IP_); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); ++ } ++ ++ return 0; ++} ++ ++static inline struct bkey_s_c ++__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned flags) ++{ ++ struct bkey_s_c k; ++ ++ while (btree_trans_too_many_iters(trans) || ++ (k = bch2_btree_iter_peek_type(iter, flags), ++ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) ++ bch2_trans_begin(trans); ++ ++ return k; ++} ++ ++#define lockrestart_do(_trans, _do) \ ++({ \ ++ u32 _restart_count; \ ++ int _ret; \ ++ \ ++ do { \ ++ _restart_count = bch2_trans_begin(_trans); \ ++ _ret = (_do); \ ++ } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart)); \ ++ \ ++ if (!_ret) \ ++ bch2_trans_verify_not_restarted(_trans, _restart_count);\ ++ \ ++ _ret; \ ++}) ++ ++/* ++ * nested_lockrestart_do(), nested_commit_do(): ++ * ++ * These are like lockrestart_do() and commit_do(), with two differences: ++ * ++ * - We don't call bch2_trans_begin() unless we had a transaction restart ++ * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a ++ * transaction restart ++ */ ++#define nested_lockrestart_do(_trans, _do) \ ++({ \ ++ u32 _restart_count, _orig_restart_count; \ ++ int _ret; \ ++ \ ++ _restart_count = _orig_restart_count = (_trans)->restart_count; \ ++ \ ++ while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\ ++ _restart_count = bch2_trans_begin(_trans); \ ++ \ ++ if (!_ret) \ ++ 
bch2_trans_verify_not_restarted(_trans, _restart_count);\ ++ \ ++ if (!_ret && trans_was_restarted(_trans, _orig_restart_count)) \ ++ _ret = -BCH_ERR_transaction_restart_nested; \ ++ \ ++ _ret; \ ++}) ++ ++#define for_each_btree_key2(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _do) \ ++({ \ ++ int _ret = 0; \ ++ \ ++ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ \ ++ while (1) { \ ++ u32 _restart_count = bch2_trans_begin(_trans); \ ++ (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ ++ if (!(_k).k) { \ ++ _ret = 0; \ ++ break; \ ++ } \ ++ \ ++ _ret = bkey_err(_k) ?: (_do); \ ++ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ ++ continue; \ ++ if (_ret) \ ++ break; \ ++ bch2_trans_verify_not_restarted(_trans, _restart_count);\ ++ if (!bch2_btree_iter_advance(&(_iter))) \ ++ break; \ ++ } \ ++ \ ++ bch2_trans_iter_exit((_trans), &(_iter)); \ ++ _ret; \ ++}) ++ ++#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _do) \ ++({ \ ++ int _ret = 0; \ ++ \ ++ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ \ ++ while (1) { \ ++ u32 _restart_count = bch2_trans_begin(_trans); \ ++ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ ++ if (!(_k).k) { \ ++ _ret = 0; \ ++ break; \ ++ } \ ++ \ ++ _ret = bkey_err(_k) ?: (_do); \ ++ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ ++ continue; \ ++ if (_ret) \ ++ break; \ ++ bch2_trans_verify_not_restarted(_trans, _restart_count);\ ++ if (!bch2_btree_iter_rewind(&(_iter))) \ ++ break; \ ++ } \ ++ \ ++ bch2_trans_iter_exit((_trans), &(_iter)); \ ++ _ret; \ ++}) ++ ++#define for_each_btree_key_commit(_trans, _iter, _btree_id, \ ++ _start, _iter_flags, _k, \ ++ _disk_res, _journal_seq, _commit_flags,\ ++ _do) \ ++ for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ ++ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ ++ (_journal_seq), (_commit_flags))) ++ ++#define for_each_btree_key(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _ret) \ ++ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) ++ ++#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _ret) \ ++ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) ++ ++#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ ++ _start, _end, _flags, _k, _ret) \ ++ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) ++ ++#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \ ++ for (; \ ++ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) ++ ++#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ ++ for (; \ ++ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) ++ ++/* new multiple iterator interface: */ ++ ++void bch2_trans_updates_to_text(struct 
printbuf *, struct btree_trans *); ++void bch2_dump_trans_updates(struct btree_trans *); ++void bch2_dump_trans_paths_updates(struct btree_trans *); ++void __bch2_trans_init(struct btree_trans *, struct bch_fs *, ++ unsigned, size_t, const char *); ++void bch2_trans_exit(struct btree_trans *); ++ ++#define bch2_trans_init(...) __bch2_trans_init(__VA_ARGS__, __func__) ++ ++void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); ++ ++void bch2_fs_btree_iter_exit(struct bch_fs *); ++int bch2_fs_btree_iter_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_BTREE_ITER_H */ +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +new file mode 100644 +index 000000000000..661006e427f2 +--- /dev/null ++++ b/fs/bcachefs/btree_key_cache.c +@@ -0,0 +1,855 @@ ++ ++#include "bcachefs.h" ++#include "btree_cache.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "errcode.h" ++#include "error.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++ ++#include ++#include ++ ++static struct kmem_cache *bch2_key_cache; ++ ++static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, ++ const void *obj) ++{ ++ const struct bkey_cached *ck = obj; ++ const struct bkey_cached_key *key = arg->key; ++ ++ return cmp_int(ck->key.btree_id, key->btree_id) ?: ++ bpos_cmp(ck->key.pos, key->pos); ++} ++ ++static const struct rhashtable_params bch2_btree_key_cache_params = { ++ .head_offset = offsetof(struct bkey_cached, hash), ++ .key_offset = offsetof(struct bkey_cached, key), ++ .key_len = sizeof(struct bkey_cached_key), ++ .obj_cmpfn = bch2_btree_key_cache_cmp_fn, ++}; ++ ++__flatten ++inline struct bkey_cached * ++bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) ++{ ++ struct bkey_cached_key key = { ++ .btree_id = btree_id, ++ .pos = pos, ++ }; ++ ++ return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, ++ bch2_btree_key_cache_params); ++} ++ ++static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) ++{ ++ if (!six_trylock_intent(&ck->c.lock)) ++ return false; ++ ++ if (!six_trylock_write(&ck->c.lock)) { ++ six_unlock_intent(&ck->c.lock); ++ return false; ++ } ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++ return false; ++ } ++ ++ return true; ++} ++ ++static void bkey_cached_evict(struct btree_key_cache *c, ++ struct bkey_cached *ck) ++{ ++ BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, ++ bch2_btree_key_cache_params)); ++ memset(&ck->key, ~0, sizeof(ck->key)); ++ ++ atomic_long_dec(&c->nr_keys); ++} ++ ++static void bkey_cached_free(struct btree_key_cache *bc, ++ struct bkey_cached *ck) ++{ ++ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); ++ ++ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); ++ ++ ck->btree_trans_barrier_seq = ++ start_poll_synchronize_srcu(&c->btree_trans_barrier); ++ ++ list_move_tail(&ck->list, &bc->freed); ++ atomic_long_inc(&bc->nr_freed); ++ ++ kfree(ck->k); ++ ck->k = NULL; ++ ck->u64s = 0; ++ ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++} ++ ++static void bkey_cached_free_fast(struct btree_key_cache *bc, ++ struct bkey_cached *ck) ++{ ++ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); ++ struct btree_key_cache_freelist *f; ++ bool freed = false; ++ ++ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); ++ ++ ck->btree_trans_barrier_seq = ++ 
start_poll_synchronize_srcu(&c->btree_trans_barrier); ++ ++ list_del_init(&ck->list); ++ atomic_long_inc(&bc->nr_freed); ++ ++ kfree(ck->k); ++ ck->k = NULL; ++ ck->u64s = 0; ++ ++ preempt_disable(); ++ f = this_cpu_ptr(bc->pcpu_freed); ++ ++ if (f->nr < ARRAY_SIZE(f->objs)) { ++ f->objs[f->nr++] = ck; ++ freed = true; ++ } ++ preempt_enable(); ++ ++ if (!freed) { ++ mutex_lock(&bc->lock); ++ preempt_disable(); ++ f = this_cpu_ptr(bc->pcpu_freed); ++ ++ while (f->nr > ARRAY_SIZE(f->objs) / 2) { ++ struct bkey_cached *ck2 = f->objs[--f->nr]; ++ ++ list_move_tail(&ck2->list, &bc->freed); ++ } ++ preempt_enable(); ++ ++ list_move_tail(&ck->list, &bc->freed); ++ mutex_unlock(&bc->lock); ++ } ++ ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++} ++ ++static struct bkey_cached * ++bkey_cached_alloc(struct btree_key_cache *c) ++{ ++ struct bkey_cached *ck = NULL; ++ struct btree_key_cache_freelist *f; ++ ++ preempt_disable(); ++ f = this_cpu_ptr(c->pcpu_freed); ++ if (f->nr) ++ ck = f->objs[--f->nr]; ++ preempt_enable(); ++ ++ if (!ck) { ++ mutex_lock(&c->lock); ++ preempt_disable(); ++ f = this_cpu_ptr(c->pcpu_freed); ++ ++ while (!list_empty(&c->freed) && ++ f->nr < ARRAY_SIZE(f->objs) / 2) { ++ ck = list_last_entry(&c->freed, struct bkey_cached, list); ++ list_del_init(&ck->list); ++ f->objs[f->nr++] = ck; ++ } ++ ++ ck = f->nr ? f->objs[--f->nr] : NULL; ++ preempt_enable(); ++ mutex_unlock(&c->lock); ++ } ++ ++ if (ck) { ++ six_lock_intent(&ck->c.lock, NULL, NULL); ++ six_lock_write(&ck->c.lock, NULL, NULL); ++ return ck; ++ } ++ ++ ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); ++ if (likely(ck)) { ++ INIT_LIST_HEAD(&ck->list); ++ six_lock_init(&ck->c.lock); ++ BUG_ON(!six_trylock_intent(&ck->c.lock)); ++ BUG_ON(!six_trylock_write(&ck->c.lock)); ++ return ck; ++ } ++ ++ return NULL; ++} ++ ++static struct bkey_cached * ++bkey_cached_reuse(struct btree_key_cache *c) ++{ ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct bkey_cached *ck; ++ unsigned i; ++ ++ rcu_read_lock(); ++ tbl = rht_dereference_rcu(c->table.tbl, &c->table); ++ for (i = 0; i < tbl->size; i++) ++ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && ++ bkey_cached_lock_for_evict(ck)) { ++ bkey_cached_evict(c, ck); ++ rcu_read_unlock(); ++ return ck; ++ } ++ } ++ rcu_read_unlock(); ++ ++ return NULL; ++} ++ ++static struct bkey_cached * ++btree_key_cache_create(struct bch_fs *c, ++ enum btree_id btree_id, ++ struct bpos pos) ++{ ++ struct btree_key_cache *bc = &c->btree_key_cache; ++ struct bkey_cached *ck; ++ bool was_new = true; ++ ++ ck = bkey_cached_alloc(bc); ++ ++ if (unlikely(!ck)) { ++ ck = bkey_cached_reuse(bc); ++ if (unlikely(!ck)) { ++ bch_err(c, "error allocating memory for key cache item, btree %s", ++ bch2_btree_ids[btree_id]); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ was_new = false; ++ } else { ++ if (btree_id == BTREE_ID_subvolumes) ++ six_lock_pcpu_alloc(&ck->c.lock); ++ else ++ six_lock_pcpu_free(&ck->c.lock); ++ } ++ ++ ck->c.level = 0; ++ ck->c.btree_id = btree_id; ++ ck->key.btree_id = btree_id; ++ ck->key.pos = pos; ++ ck->valid = false; ++ ck->flags = 1U << BKEY_CACHED_ACCESSED; ++ ++ if (unlikely(rhashtable_lookup_insert_fast(&bc->table, ++ &ck->hash, ++ bch2_btree_key_cache_params))) { ++ /* We raced with another fill: */ ++ ++ if (likely(was_new)) { ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++ kfree(ck); ++ } else { ++ bkey_cached_free_fast(bc, ck); ++ } ++ ++ return NULL; ++ } 
++ ++ atomic_long_inc(&bc->nr_keys); ++ ++ six_unlock_write(&ck->c.lock); ++ ++ return ck; ++} ++ ++static int btree_key_cache_fill(struct btree_trans *trans, ++ struct btree_path *ck_path, ++ struct bkey_cached *ck) ++{ ++ struct btree_path *path; ++ struct bkey_s_c k; ++ unsigned new_u64s = 0; ++ struct bkey_i *new_k = NULL; ++ struct bkey u; ++ int ret; ++ ++ path = bch2_path_get(trans, ck->key.btree_id, ++ ck->key.pos, 0, 0, 0, _THIS_IP_); ++ ret = bch2_btree_path_traverse(trans, path, 0); ++ if (ret) ++ goto err; ++ ++ k = bch2_btree_path_peek_slot(path, &u); ++ ++ if (!bch2_btree_node_relock(trans, ck_path, 0)) { ++ trace_trans_restart_relock_key_cache_fill(trans->fn, ++ _THIS_IP_, ck_path->btree_id, &ck_path->pos); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); ++ goto err; ++ } ++ ++ /* ++ * bch2_varint_decode can read past the end of the buffer by at ++ * most 7 bytes (it won't be used): ++ */ ++ new_u64s = k.k->u64s + 1; ++ ++ /* ++ * Allocate some extra space so that the transaction commit path is less ++ * likely to have to reallocate, since that requires a transaction ++ * restart: ++ */ ++ new_u64s = min(256U, (new_u64s * 3) / 2); ++ ++ if (new_u64s > ck->u64s) { ++ new_u64s = roundup_pow_of_two(new_u64s); ++ new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); ++ if (!new_k) { ++ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", ++ bch2_btree_ids[ck->key.btree_id], new_u64s); ++ ret = -ENOMEM; ++ goto err; ++ } ++ } ++ ++ /* ++ * XXX: not allowed to be holding read locks when we take a write lock, ++ * currently ++ */ ++ bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b); ++ if (new_k) { ++ kfree(ck->k); ++ ck->u64s = new_u64s; ++ ck->k = new_k; ++ } ++ ++ bkey_reassemble(ck->k, k); ++ ck->valid = true; ++ bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); ++ ++ /* We're not likely to need this iterator again: */ ++ path->preserve = false; ++err: ++ bch2_path_put(trans, path, 0); ++ return ret; ++} ++ ++static int bkey_cached_check_fn(struct six_lock *lock, void *p) ++{ ++ struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); ++ const struct btree_path *path = p; ++ ++ if (ck->key.btree_id != path->btree_id && ++ bpos_cmp(ck->key.pos, path->pos)) ++ return BCH_ERR_lock_fail_node_reused; ++ return 0; ++} ++ ++__flatten ++int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached *ck; ++ int ret = 0; ++ ++ BUG_ON(path->level); ++ ++ path->l[1].b = NULL; ++ ++ if (bch2_btree_node_relock(trans, path, 0)) { ++ ck = (void *) path->l[0].b; ++ goto fill; ++ } ++retry: ++ ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); ++ if (!ck) { ++ if (flags & BTREE_ITER_CACHED_NOCREATE) { ++ path->l[0].b = NULL; ++ return 0; ++ } ++ ++ ck = btree_key_cache_create(c, path->btree_id, path->pos); ++ ret = PTR_ERR_OR_ZERO(ck); ++ if (ret) ++ goto err; ++ if (!ck) ++ goto retry; ++ ++ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); ++ path->locks_want = 1; ++ } else { ++ enum six_lock_type lock_want = __btree_lock_want(path, 0); ++ ++ ret = btree_node_lock(trans, path, (void *) ck, path->pos, 0, ++ lock_want, ++ bkey_cached_check_fn, path, _THIS_IP_); ++ if (ret) { ++ if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) ++ goto retry; ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto err; ++ BUG(); ++ } ++ ++ if (ck->key.btree_id != path->btree_id || 
++ bpos_cmp(ck->key.pos, path->pos)) { ++ six_unlock_type(&ck->c.lock, lock_want); ++ goto retry; ++ } ++ ++ mark_btree_node_locked(trans, path, 0, lock_want); ++ } ++ ++ path->l[0].lock_seq = ck->c.lock.state.seq; ++ path->l[0].b = (void *) ck; ++fill: ++ if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { ++ if (!path->locks_want && ++ !__bch2_btree_path_upgrade(trans, path, 1)) { ++ trace_transaction_restart_ip(trans->fn, _THIS_IP_); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); ++ goto err; ++ } ++ ++ ret = btree_key_cache_fill(trans, path, ck); ++ if (ret) ++ goto err; ++ } ++ ++ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) ++ set_bit(BKEY_CACHED_ACCESSED, &ck->flags); ++ ++ path->uptodate = BTREE_ITER_UPTODATE; ++ BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); ++ ++ return ret; ++err: ++ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ++ btree_node_unlock(trans, path, 0); ++ path->l[0].b = BTREE_ITER_NO_NODE_ERROR; ++ } ++ return ret; ++} ++ ++static int btree_key_cache_flush_pos(struct btree_trans *trans, ++ struct bkey_cached_key key, ++ u64 journal_seq, ++ unsigned commit_flags, ++ bool evict) ++{ ++ struct bch_fs *c = trans->c; ++ struct journal *j = &c->journal; ++ struct btree_iter c_iter, b_iter; ++ struct bkey_cached *ck = NULL; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_INTENT| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_CACHED_NOCREATE| ++ BTREE_ITER_INTENT); ++ b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; ++ ++ ret = bch2_btree_iter_traverse(&c_iter); ++ if (ret) ++ goto out; ++ ++ ck = (void *) c_iter.path->l[0].b; ++ if (!ck) ++ goto out; ++ ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ if (evict) ++ goto evict; ++ goto out; ++ } ++ ++ BUG_ON(!ck->valid); ++ ++ if (journal_seq && ck->journal.seq != journal_seq) ++ goto out; ++ ++ /* ++ * Since journal reclaim depends on us making progress here, and the ++ * allocator/copygc depend on journal reclaim making progress, we need ++ * to be using alloc reserves: ++ * */ ++ ret = bch2_btree_iter_traverse(&b_iter) ?: ++ bch2_trans_update(trans, &b_iter, ck->k, ++ BTREE_UPDATE_KEY_CACHE_RECLAIM| ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ BTREE_TRIGGER_NORUN) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ (ck->journal.seq == journal_last_seq(j) ++ ? 
JOURNAL_WATERMARK_reserved ++ : 0)| ++ commit_flags); ++ ++ bch2_fs_fatal_err_on(ret && ++ !bch2_err_matches(ret, BCH_ERR_transaction_restart) && ++ !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && ++ !bch2_journal_error(j), c, ++ "error flushing key cache: %s", bch2_err_str(ret)); ++ if (ret) ++ goto out; ++ ++ bch2_journal_pin_drop(j, &ck->journal); ++ bch2_journal_preres_put(j, &ck->res); ++ ++ BUG_ON(!btree_node_locked(c_iter.path, 0)); ++ ++ if (!evict) { ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ atomic_long_dec(&c->btree_key_cache.nr_dirty); ++ } ++ } else { ++evict: ++ BUG_ON(!btree_node_intent_locked(c_iter.path, 0)); ++ ++ mark_btree_node_unlocked(c_iter.path, 0); ++ c_iter.path->l[0].b = NULL; ++ ++ six_lock_write(&ck->c.lock, NULL, NULL); ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ atomic_long_dec(&c->btree_key_cache.nr_dirty); ++ } ++ ++ bkey_cached_evict(&c->btree_key_cache, ck); ++ ++ bkey_cached_free_fast(&c->btree_key_cache, ck); ++ } ++out: ++ bch2_trans_iter_exit(trans, &b_iter); ++ bch2_trans_iter_exit(trans, &c_iter); ++ return ret; ++} ++ ++int bch2_btree_key_cache_journal_flush(struct journal *j, ++ struct journal_entry_pin *pin, u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bkey_cached *ck = ++ container_of(pin, struct bkey_cached, journal); ++ struct bkey_cached_key key; ++ int ret = 0; ++ ++ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); ++ ++ six_lock_read(&ck->c.lock, NULL, NULL); ++ key = ck->key; ++ ++ if (ck->journal.seq != seq || ++ !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ six_unlock_read(&ck->c.lock); ++ goto unlock; ++ } ++ six_unlock_read(&ck->c.lock); ++ ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ btree_key_cache_flush_pos(&trans, key, seq, ++ BTREE_INSERT_JOURNAL_RECLAIM, false)); ++unlock: ++ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); ++ ++ return ret; ++} ++ ++/* ++ * Flush and evict a key from the key cache: ++ */ ++int bch2_btree_key_cache_flush(struct btree_trans *trans, ++ enum btree_id id, struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached_key key = { id, pos }; ++ ++ /* Fastpath - assume it won't be found: */ ++ if (!bch2_btree_key_cache_find(c, id, pos)) ++ return 0; ++ ++ return btree_key_cache_flush_pos(trans, key, 0, 0, true); ++} ++ ++bool bch2_btree_insert_key_cached(struct btree_trans *trans, ++ struct btree_path *path, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached *ck = (void *) path->l[0].b; ++ bool kick_reclaim = false; ++ ++ BUG_ON(insert->u64s > ck->u64s); ++ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ int difference; ++ ++ BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s); ++ ++ difference = jset_u64s(insert->u64s) - ck->res.u64s; ++ if (difference > 0) { ++ trans->journal_preres.u64s -= difference; ++ ck->res.u64s += difference; ++ } ++ } ++ ++ bkey_copy(ck->k, insert); ++ ck->valid = true; ++ ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ set_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ atomic_long_inc(&c->btree_key_cache.nr_dirty); ++ ++ if (bch2_nr_btree_keys_need_flush(c)) ++ kick_reclaim = true; ++ } ++ ++ bch2_journal_pin_update(&c->journal, trans->journal_res.seq, ++ &ck->journal, bch2_btree_key_cache_journal_flush); ++ ++ if (kick_reclaim) ++ journal_reclaim_kick(&c->journal); ++ return true; ++} ++ ++void bch2_btree_key_cache_drop(struct 
btree_trans *trans, ++ struct btree_path *path) ++{ ++ struct bkey_cached *ck = (void *) path->l[0].b; ++ ++ ck->valid = false; ++ ++ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); ++} ++ ++static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_key_cache.shrink); ++ struct btree_key_cache *bc = &c->btree_key_cache; ++ struct bucket_table *tbl; ++ struct bkey_cached *ck, *t; ++ size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; ++ unsigned start, flags; ++ int srcu_idx; ++ ++ /* Return -1 if we can't do anything right now */ ++ if (sc->gfp_mask & __GFP_FS) ++ mutex_lock(&bc->lock); ++ else if (!mutex_trylock(&bc->lock)) ++ return -1; ++ ++ srcu_idx = srcu_read_lock(&c->btree_trans_barrier); ++ flags = memalloc_nofs_save(); ++ ++ /* ++ * Newest freed entries are at the end of the list - once we hit one ++ * that's too new to be freed, we can bail out: ++ */ ++ list_for_each_entry_safe(ck, t, &bc->freed, list) { ++ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ++ ck->btree_trans_barrier_seq)) ++ break; ++ ++ list_del(&ck->list); ++ kmem_cache_free(bch2_key_cache, ck); ++ atomic_long_dec(&bc->nr_freed); ++ scanned++; ++ freed++; ++ } ++ ++ if (scanned >= nr) ++ goto out; ++ ++ rcu_read_lock(); ++ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); ++ if (bc->shrink_iter >= tbl->size) ++ bc->shrink_iter = 0; ++ start = bc->shrink_iter; ++ ++ do { ++ struct rhash_head *pos, *next; ++ ++ pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); ++ ++ while (!rht_is_a_nulls(pos)) { ++ next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); ++ ck = container_of(pos, struct bkey_cached, hash); ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) ++ goto next; ++ ++ if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) ++ clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); ++ else if (bkey_cached_lock_for_evict(ck)) { ++ bkey_cached_evict(bc, ck); ++ bkey_cached_free(bc, ck); ++ } ++ ++ scanned++; ++ if (scanned >= nr) ++ break; ++next: ++ pos = next; ++ } ++ ++ bc->shrink_iter++; ++ if (bc->shrink_iter >= tbl->size) ++ bc->shrink_iter = 0; ++ } while (scanned < nr && bc->shrink_iter != start); ++ ++ rcu_read_unlock(); ++out: ++ memalloc_nofs_restore(flags); ++ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); ++ mutex_unlock(&bc->lock); ++ ++ return freed; ++} ++ ++static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_key_cache.shrink); ++ struct btree_key_cache *bc = &c->btree_key_cache; ++ long nr = atomic_long_read(&bc->nr_keys) - ++ atomic_long_read(&bc->nr_dirty); ++ ++ return max(0L, nr); ++} ++ ++void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) ++{ ++ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); ++ struct bucket_table *tbl; ++ struct bkey_cached *ck, *n; ++ struct rhash_head *pos; ++ unsigned i; ++ int cpu; ++ ++ if (bc->shrink.list.next) ++ unregister_shrinker(&bc->shrink); ++ ++ mutex_lock(&bc->lock); ++ ++ rcu_read_lock(); ++ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); ++ if (tbl) ++ for (i = 0; i < tbl->size; i++) ++ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { ++ bkey_cached_evict(bc, ck); ++ list_add(&ck->list, &bc->freed); ++ } ++ rcu_read_unlock(); ++ ++ for_each_possible_cpu(cpu) { ++ struct btree_key_cache_freelist *f = ++ per_cpu_ptr(bc->pcpu_freed, cpu); ++ ++ for (i = 0; i < f->nr; 
i++) { ++ ck = f->objs[i]; ++ list_add(&ck->list, &bc->freed); ++ } ++ } ++ ++ list_for_each_entry_safe(ck, n, &bc->freed, list) { ++ cond_resched(); ++ ++ bch2_journal_pin_drop(&c->journal, &ck->journal); ++ bch2_journal_preres_put(&c->journal, &ck->res); ++ ++ list_del(&ck->list); ++ kfree(ck->k); ++ kmem_cache_free(bch2_key_cache, ck); ++ } ++ ++ BUG_ON(atomic_long_read(&bc->nr_dirty) && ++ !bch2_journal_error(&c->journal) && ++ test_bit(BCH_FS_WAS_RW, &c->flags)); ++ BUG_ON(atomic_long_read(&bc->nr_keys)); ++ ++ mutex_unlock(&bc->lock); ++ ++ if (bc->table_init_done) ++ rhashtable_destroy(&bc->table); ++ ++ free_percpu(bc->pcpu_freed); ++} ++ ++void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) ++{ ++ mutex_init(&c->lock); ++ INIT_LIST_HEAD(&c->freed); ++} ++ ++static void bch2_btree_key_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink) ++{ ++ struct btree_key_cache *bc = ++ container_of(shrink, struct btree_key_cache, shrink); ++ ++ bch2_btree_key_cache_to_text(out, bc); ++} ++ ++int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) ++{ ++ int ret; ++ ++ c->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); ++ if (!c->pcpu_freed) ++ return -ENOMEM; ++ ++ ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params); ++ if (ret) ++ return ret; ++ ++ c->table_init_done = true; ++ ++ c->shrink.seeks = 1; ++ c->shrink.count_objects = bch2_btree_key_cache_count; ++ c->shrink.scan_objects = bch2_btree_key_cache_scan; ++ c->shrink.to_text = bch2_btree_key_cache_shrinker_to_text; ++ return register_shrinker(&c->shrink); ++} ++ ++void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) ++{ ++ prt_printf(out, "nr_freed:\t%zu\n", atomic_long_read(&c->nr_freed)); ++ prt_printf(out, "nr_keys:\t%lu\n", atomic_long_read(&c->nr_keys)); ++ prt_printf(out, "nr_dirty:\t%lu\n", atomic_long_read(&c->nr_dirty)); ++} ++ ++void bch2_btree_key_cache_exit(void) ++{ ++ if (bch2_key_cache) ++ kmem_cache_destroy(bch2_key_cache); ++} ++ ++int __init bch2_btree_key_cache_init(void) ++{ ++ bch2_key_cache = KMEM_CACHE(bkey_cached, 0); ++ if (!bch2_key_cache) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +new file mode 100644 +index 000000000000..670746e72dab +--- /dev/null ++++ b/fs/bcachefs/btree_key_cache.h +@@ -0,0 +1,47 @@ ++#ifndef _BCACHEFS_BTREE_KEY_CACHE_H ++#define _BCACHEFS_BTREE_KEY_CACHE_H ++ ++static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) ++{ ++ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); ++ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); ++ size_t max_dirty = 1024 + nr_keys / 2; ++ ++ return max_t(ssize_t, 0, nr_dirty - max_dirty); ++} ++ ++static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) ++{ ++ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); ++ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); ++ size_t max_dirty = 4096 + (nr_keys * 3) / 4; ++ ++ return nr_dirty > max_dirty; ++} ++ ++int bch2_btree_key_cache_journal_flush(struct journal *, ++ struct journal_entry_pin *, u64); ++ ++struct bkey_cached * ++bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); ++ ++int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, ++ unsigned); ++ ++bool bch2_btree_insert_key_cached(struct btree_trans *, ++ struct btree_path *, struct bkey_i *); ++int bch2_btree_key_cache_flush(struct btree_trans *, ++ enum btree_id, 
struct bpos); ++void bch2_btree_key_cache_drop(struct btree_trans *, ++ struct btree_path *); ++ ++void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); ++void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); ++int bch2_fs_btree_key_cache_init(struct btree_key_cache *); ++ ++void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); ++ ++void bch2_btree_key_cache_exit(void); ++int __init bch2_btree_key_cache_init(void); ++ ++#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +new file mode 100644 +index 000000000000..49eef650e436 +--- /dev/null ++++ b/fs/bcachefs/btree_locking.h +@@ -0,0 +1,289 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_LOCKING_H ++#define _BCACHEFS_BTREE_LOCKING_H ++ ++/* ++ * Only for internal btree use: ++ * ++ * The btree iterator tracks what locks it wants to take, and what locks it ++ * currently has - here we have wrappers for locking/unlocking btree nodes and ++ * updating the iterator state ++ */ ++ ++#include ++ ++#include "btree_iter.h" ++ ++/* matches six lock types */ ++enum btree_node_locked_type { ++ BTREE_NODE_UNLOCKED = -1, ++ BTREE_NODE_READ_LOCKED = SIX_LOCK_read, ++ BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, ++}; ++ ++static inline int btree_node_locked_type(struct btree_path *path, ++ unsigned level) ++{ ++ /* ++ * We're relying on the fact that if nodes_intent_locked is set ++ * nodes_locked must be set as well, so that we can compute without ++ * branches: ++ */ ++ return BTREE_NODE_UNLOCKED + ++ ((path->nodes_locked >> level) & 1) + ++ ((path->nodes_intent_locked >> level) & 1); ++} ++ ++static inline bool btree_node_intent_locked(struct btree_path *path, ++ unsigned level) ++{ ++ return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED; ++} ++ ++static inline bool btree_node_read_locked(struct btree_path *path, ++ unsigned level) ++{ ++ return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED; ++} ++ ++static inline bool btree_node_locked(struct btree_path *path, unsigned level) ++{ ++ return path->nodes_locked & (1 << level); ++} ++ ++static inline void mark_btree_node_unlocked(struct btree_path *path, ++ unsigned level) ++{ ++ path->nodes_locked &= ~(1 << level); ++ path->nodes_intent_locked &= ~(1 << level); ++} ++ ++static inline void mark_btree_node_locked_noreset(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned level, ++ enum six_lock_type type) ++{ ++ /* relying on this to avoid a branch */ ++ BUILD_BUG_ON(SIX_LOCK_read != 0); ++ BUILD_BUG_ON(SIX_LOCK_intent != 1); ++ ++ BUG_ON(trans->in_traverse_all && path->sorted_idx > trans->traverse_all_idx); ++ ++ path->nodes_locked |= 1 << level; ++ path->nodes_intent_locked |= type << level; ++} ++ ++static inline void mark_btree_node_locked(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned level, ++ enum six_lock_type type) ++{ ++ mark_btree_node_locked_noreset(trans, path, level, type); ++#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS ++ path->l[level].lock_taken_time = ktime_get_ns(); ++#endif ++} ++ ++static inline void mark_btree_node_intent_locked(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned level) ++{ ++ mark_btree_node_locked_noreset(trans, path, level, SIX_LOCK_intent); ++} ++ ++static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) ++{ ++ return level < path->locks_want ++ ? 
SIX_LOCK_intent ++ : SIX_LOCK_read; ++} ++ ++static inline enum btree_node_locked_type ++btree_lock_want(struct btree_path *path, int level) ++{ ++ if (level < path->level) ++ return BTREE_NODE_UNLOCKED; ++ if (level < path->locks_want) ++ return BTREE_NODE_INTENT_LOCKED; ++ if (level == path->level) ++ return BTREE_NODE_READ_LOCKED; ++ return BTREE_NODE_UNLOCKED; ++} ++ ++static inline void btree_node_unlock(struct btree_trans *trans, ++ struct btree_path *path, unsigned level) ++{ ++ int lock_type = btree_node_locked_type(path, level); ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ if (lock_type != BTREE_NODE_UNLOCKED) { ++ six_unlock_type(&path->l[level].b->c.lock, lock_type); ++#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS ++ if (trans->lock_name_idx < BCH_LOCK_TIME_NR) { ++ struct bch_fs *c = trans->c; ++ ++ __bch2_time_stats_update(&c->lock_held_stats.times[trans->lock_name_idx], ++ path->l[level].lock_taken_time, ++ ktime_get_ns()); ++ } ++#endif ++ } ++ mark_btree_node_unlocked(path, level); ++} ++ ++static inline void __bch2_btree_path_unlock(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); ++ ++ while (path->nodes_locked) ++ btree_node_unlock(trans, path, __ffs(path->nodes_locked)); ++} ++ ++static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) ++{ ++ switch (type) { ++ case SIX_LOCK_read: ++ return BCH_TIME_btree_lock_contended_read; ++ case SIX_LOCK_intent: ++ return BCH_TIME_btree_lock_contended_intent; ++ case SIX_LOCK_write: ++ return BCH_TIME_btree_lock_contended_write; ++ default: ++ BUG(); ++ } ++} ++ ++static inline int btree_node_lock_type(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct bpos pos, unsigned level, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ struct bch_fs *c = trans->c; ++ u64 start_time; ++ int ret; ++ ++ if (six_trylock_type(&b->c.lock, type)) ++ return 0; ++ ++ start_time = local_clock(); ++ ++ trans->locking_path_idx = path->idx; ++ trans->locking_pos = pos; ++ trans->locking_btree_id = path->btree_id; ++ trans->locking_level = level; ++ trans->locking_lock_type = type; ++ trans->locking = b; ++ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p); ++ trans->locking = NULL; ++ ++ if (ret) ++ return ret; ++ ++ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); ++ return 0; ++} ++ ++/* ++ * Lock a btree node if we already have it locked on one of our linked ++ * iterators: ++ */ ++static inline bool btree_node_lock_increment(struct btree_trans *trans, ++ struct btree *b, unsigned level, ++ enum btree_node_locked_type want) ++{ ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ if (path->l[level].b == b && ++ btree_node_locked_type(path, level) >= want) { ++ six_lock_increment(&b->c.lock, want); ++ return true; ++ } ++ ++ return false; ++} ++ ++int __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, ++ struct btree *, struct bpos, unsigned, ++ enum six_lock_type, ++ six_lock_should_sleep_fn, void *, ++ unsigned long); ++ ++static inline int btree_node_lock(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, struct bpos pos, unsigned level, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip) ++{ ++ int ret = 0; ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); ++ ++ if (likely(six_trylock_type(&b->c.lock, type)) || 
++ btree_node_lock_increment(trans, b, level, type) || ++ !(ret = __bch2_btree_node_lock(trans, path, b, pos, level, type, ++ should_sleep_fn, p, ip))) { ++#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS ++ path->l[b->c.level].lock_taken_time = ktime_get_ns(); ++#endif ++ } ++ ++ return ret; ++} ++ ++bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned); ++ ++static inline bool bch2_btree_node_relock(struct btree_trans *trans, ++ struct btree_path *path, unsigned level) ++{ ++ EBUG_ON(btree_node_locked(path, level) && ++ btree_node_locked_type(path, level) != ++ __btree_lock_want(path, level)); ++ ++ return likely(btree_node_locked(path, level)) || ++ __bch2_btree_node_relock(trans, path, level); ++} ++ ++/* ++ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will ++ * succeed: ++ */ ++static inline void ++bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, ++ struct btree *b) ++{ ++ struct btree_path *linked; ++ ++ EBUG_ON(path->l[b->c.level].b != b); ++ EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); ++ ++ trans_for_each_path_with_node(trans, b, linked) ++ linked->l[b->c.level].lock_seq += 2; ++ ++ six_unlock_write(&b->c.lock); ++} ++ ++void bch2_btree_node_unlock_write(struct btree_trans *, ++ struct btree_path *, struct btree *); ++ ++void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *); ++ ++static inline void bch2_btree_node_lock_write(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b) ++{ ++ EBUG_ON(path->l[b->c.level].b != b); ++ EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq); ++ EBUG_ON(!btree_node_intent_locked(path, b->c.level)); ++ ++ if (unlikely(!six_trylock_write(&b->c.lock))) ++ __bch2_btree_node_lock_write(trans, b); ++} ++ ++#endif /* _BCACHEFS_BTREE_LOCKING_H */ +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +new file mode 100644 +index 000000000000..a2826dfe13cb +--- /dev/null ++++ b/fs/bcachefs/btree_types.h +@@ -0,0 +1,697 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_TYPES_H ++#define _BCACHEFS_BTREE_TYPES_H ++ ++#include ++#include ++#include ++ ++#include "bkey_methods.h" ++#include "buckets_types.h" ++#include "darray.h" ++#include "journal_types.h" ++ ++struct open_bucket; ++struct btree_update; ++struct btree_trans; ++ ++#define MAX_BSETS 3U ++ ++struct btree_nr_keys { ++ ++ /* ++ * Amount of live metadata (i.e. 
size of node after a compaction) in ++ * units of u64s ++ */ ++ u16 live_u64s; ++ u16 bset_u64s[MAX_BSETS]; ++ ++ /* live keys only: */ ++ u16 packed_keys; ++ u16 unpacked_keys; ++}; ++ ++struct bset_tree { ++ /* ++ * We construct a binary tree in an array as if the array ++ * started at 1, so that things line up on the same cachelines ++ * better: see comments in bset.c at cacheline_to_bkey() for ++ * details ++ */ ++ ++ /* size of the binary tree and prev array */ ++ u16 size; ++ ++ /* function of size - precalculated for to_inorder() */ ++ u16 extra; ++ ++ u16 data_offset; ++ u16 aux_data_offset; ++ u16 end_offset; ++}; ++ ++struct btree_write { ++ struct journal_entry_pin journal; ++}; ++ ++struct btree_alloc { ++ struct open_buckets ob; ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); ++}; ++ ++struct btree_bkey_cached_common { ++ struct six_lock lock; ++ u8 level; ++ u8 btree_id; ++}; ++ ++struct btree { ++ struct btree_bkey_cached_common c; ++ ++ struct rhash_head hash; ++ u64 hash_val; ++ ++ unsigned long flags; ++ u16 written; ++ u8 nsets; ++ u8 nr_key_bits; ++ u16 version_ondisk; ++ ++ struct bkey_format format; ++ ++ struct btree_node *data; ++ void *aux_data; ++ ++ /* ++ * Sets of sorted keys - the real btree node - plus a binary search tree ++ * ++ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point ++ * to the memory we have allocated for this btree node. Additionally, ++ * set[0]->data points to the entire btree node as it exists on disk. ++ */ ++ struct bset_tree set[MAX_BSETS]; ++ ++ struct btree_nr_keys nr; ++ u16 sib_u64s[2]; ++ u16 whiteout_u64s; ++ u8 byte_order; ++ u8 unpack_fn_len; ++ ++ struct btree_write writes[2]; ++ ++ /* Key/pointer for this btree node */ ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++ ++ /* ++ * XXX: add a delete sequence number, so when bch2_btree_node_relock() ++ * fails because the lock sequence number has changed - i.e. the ++ * contents were modified - we can still relock the node if it's still ++ * the one we want, without redoing the traversal ++ */ ++ ++ /* ++ * For asynchronous splits/interior node updates: ++ * When we do a split, we allocate new child nodes and update the parent ++ * node to point to them: we update the parent in memory immediately, ++ * but then we must wait until the children have been written out before ++ * the update to the parent can be written - this is a list of the ++ * btree_updates that are blocking this node from being ++ * written: ++ */ ++ struct list_head write_blocked; ++ ++ /* ++ * Also for asynchronous splits/interior node updates: ++ * If a btree node isn't reachable yet, we don't want to kick off ++ * another write - because that write also won't yet be reachable and ++ * marking it as completed before it's reachable would be incorrect: ++ */ ++ unsigned long will_make_reachable; ++ ++ struct open_buckets ob; ++ ++ /* lru list */ ++ struct list_head list; ++}; ++ ++struct btree_cache { ++ struct rhashtable table; ++ bool table_init_done; ++ /* ++ * We never free a struct btree, except on shutdown - we just put it on ++ * the btree_cache_freed list and reuse it later. This simplifies the ++ * code, and it doesn't cost us much memory as the memory usage is ++ * dominated by buffers that hold the actual btree node data and those ++ * can be freed - and the number of struct btrees allocated is ++ * effectively bounded. 
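/*
 * Editorial sketch, not part of the patch: the bset_tree comment above
 * describes building a binary tree in an array "as if the array started at
 * 1" for better cacheline locality (see cacheline_to_bkey() in bset.c).
 * This standalone program only demonstrates that 1-based implicit-tree
 * indexing idea with plain ints; eytzinger_build()/eytzinger_search(), N and
 * the layout details are illustrative and differ from the real aux tree,
 * which stores packed-key offsets grouped per cacheline.
 */
#include <stdio.h>

#define N 15

static void eytzinger_build(const int *sorted, int *tree, int n, int *src, int i)
{
	if (i > n)
		return;
	eytzinger_build(sorted, tree, n, src, 2 * i);     /* left subtree  */
	tree[i] = sorted[(*src)++];                       /* in-order fill */
	eytzinger_build(sorted, tree, n, src, 2 * i + 1); /* right subtree */
}

static int eytzinger_search(const int *tree, int n, int key)
{
	int i = 1;

	/* children of node i live at 2i and 2i + 1, so nodes near the root
	 * cluster at the front of the array and share cachelines: */
	while (i <= n)
		i = 2 * i + (tree[i] < key);

	/* shifting off the trailing 1-bits plus one 0-bit recovers the last
	 * ancestor where the search went left, i.e. the lower bound: */
	i >>= __builtin_ffs(~i);

	return i && tree[i] == key ? i : 0;
}

int main(void)
{
	int sorted[N], tree[N + 1], src = 0, i;

	for (i = 0; i < N; i++)
		sorted[i] = 10 * (i + 1);

	eytzinger_build(sorted, tree, N, &src, 1);
	printf("70 found at tree index %d\n", eytzinger_search(tree, N, 70));
	return 0;
}
/* end of editorial sketch */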
++ * ++ * btree_cache_freeable effectively is a small cache - we use it because ++ * high order page allocations can be rather expensive, and it's quite ++ * common to delete and allocate btree nodes in quick succession. It ++ * should never grow past ~2-3 nodes in practice. ++ */ ++ struct mutex lock; ++ struct list_head live; ++ struct list_head freeable; ++ struct list_head freed_pcpu; ++ struct list_head freed_nonpcpu; ++ ++ /* Number of elements in live + freeable lists */ ++ unsigned used; ++ unsigned reserve; ++ atomic_t dirty; ++ struct shrinker shrink; ++ ++ /* ++ * If we need to allocate memory for a new btree node and that ++ * allocation fails, we can cannibalize another node in the btree cache ++ * to satisfy the allocation - lock to guarantee only one thread does ++ * this at a time: ++ */ ++ struct task_struct *alloc_lock; ++ struct closure_waitlist alloc_wait; ++}; ++ ++struct btree_node_iter { ++ struct btree_node_iter_set { ++ u16 k, end; ++ } data[MAX_BSETS]; ++}; ++ ++/* ++ * Iterate over all possible positions, synthesizing deleted keys for holes: ++ */ ++#define BTREE_ITER_SLOTS (1 << 0) ++#define BTREE_ITER_ALL_LEVELS (1 << 1) ++/* ++ * Indicates that intent locks should be taken on leaf nodes, because we expect ++ * to be doing updates: ++ */ ++#define BTREE_ITER_INTENT (1 << 2) ++/* ++ * Causes the btree iterator code to prefetch additional btree nodes from disk: ++ */ ++#define BTREE_ITER_PREFETCH (1 << 3) ++/* ++ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for ++ * @pos or the first key strictly greater than @pos ++ */ ++#define BTREE_ITER_IS_EXTENTS (1 << 4) ++#define BTREE_ITER_NOT_EXTENTS (1 << 5) ++#define BTREE_ITER_CACHED (1 << 6) ++#define BTREE_ITER_CACHED_NOFILL (1 << 7) ++#define BTREE_ITER_CACHED_NOCREATE (1 << 8) ++#define BTREE_ITER_WITH_KEY_CACHE (1 << 9) ++#define BTREE_ITER_WITH_UPDATES (1 << 10) ++#define BTREE_ITER_WITH_JOURNAL (1 << 11) ++#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12) ++#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) ++#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14) ++#define BTREE_ITER_NOPRESERVE (1 << 15) ++ ++enum btree_path_uptodate { ++ BTREE_ITER_UPTODATE = 0, ++ BTREE_ITER_NEED_RELOCK = 1, ++ BTREE_ITER_NEED_TRAVERSE = 2, ++}; ++ ++#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) ++#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) ++#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) ++#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) ++#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) ++#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) ++#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) ++#define BTREE_ITER_NO_NODE_CACHED ((struct btree *) 8) ++ ++struct btree_path { ++ u8 idx; ++ u8 sorted_idx; ++ u8 ref; ++ u8 intent_ref; ++ ++ /* btree_iter_copy starts here: */ ++ struct bpos pos; ++ ++ enum btree_id btree_id:4; ++ bool cached:1; ++ bool preserve:1; ++ enum btree_path_uptodate uptodate:2; ++ /* ++ * When true, failing to relock this path will cause the transaction to ++ * restart: ++ */ ++ bool should_be_locked:1; ++ unsigned level:3, ++ locks_want:4, ++ nodes_locked:4, ++ nodes_intent_locked:4; ++ ++ struct btree_path_level { ++ struct btree *b; ++ struct btree_node_iter iter; ++ u32 lock_seq; ++#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS ++ u64 lock_taken_time; ++#endif ++ } l[BTREE_MAX_DEPTH]; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned long ip_allocated; ++#endif ++}; ++ ++static inline struct btree_path_level *path_l(struct btree_path *path) ++{ ++ return path->l + 
path->level; ++} ++ ++/* ++ * @pos - iterator's current position ++ * @level - current btree depth ++ * @locks_want - btree level below which we start taking intent locks ++ * @nodes_locked - bitmask indicating which nodes in @nodes are locked ++ * @nodes_intent_locked - bitmask indicating which locks are intent locks ++ */ ++struct btree_iter { ++ struct btree_trans *trans; ++ struct btree_path *path; ++ struct btree_path *update_path; ++ struct btree_path *key_cache_path; ++ ++ enum btree_id btree_id:4; ++ unsigned min_depth:3; ++ unsigned advanced:1; ++ ++ /* btree_iter_copy starts here: */ ++ u16 flags; ++ ++ /* When we're filtering by snapshot, the snapshot ID we're looking for: */ ++ unsigned snapshot; ++ ++ struct bpos pos; ++ struct bpos pos_after_commit; ++ /* ++ * Current unpacked key - so that bch2_btree_iter_next()/ ++ * bch2_btree_iter_next_slot() can correctly advance pos. ++ */ ++ struct bkey k; ++ ++ /* BTREE_ITER_WITH_JOURNAL: */ ++ size_t journal_idx; ++ struct bpos journal_pos; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned long ip_allocated; ++#endif ++}; ++ ++struct btree_key_cache_freelist { ++ struct bkey_cached *objs[16]; ++ unsigned nr; ++}; ++ ++struct btree_key_cache { ++ struct mutex lock; ++ struct rhashtable table; ++ bool table_init_done; ++ struct list_head freed; ++ struct shrinker shrink; ++ unsigned shrink_iter; ++ struct btree_key_cache_freelist __percpu *pcpu_freed; ++ ++ atomic_long_t nr_freed; ++ atomic_long_t nr_keys; ++ atomic_long_t nr_dirty; ++}; ++ ++struct bkey_cached_key { ++ u32 btree_id; ++ struct bpos pos; ++} __attribute__((packed, aligned(4))); ++ ++#define BKEY_CACHED_ACCESSED 0 ++#define BKEY_CACHED_DIRTY 1 ++ ++struct bkey_cached { ++ struct btree_bkey_cached_common c; ++ ++ unsigned long flags; ++ u16 u64s; ++ bool valid; ++ u32 btree_trans_barrier_seq; ++ struct bkey_cached_key key; ++ ++ struct rhash_head hash; ++ struct list_head list; ++ ++ struct journal_preres res; ++ struct journal_entry_pin journal; ++ ++ struct bkey_i *k; ++}; ++ ++struct btree_insert_entry { ++ unsigned flags; ++ u8 bkey_type; ++ enum btree_id btree_id:8; ++ u8 level:4; ++ bool cached:1; ++ bool insert_trigger_run:1; ++ bool overwrite_trigger_run:1; ++ bool key_cache_already_flushed:1; ++ /* ++ * @old_k may be a key from the journal; @old_btree_u64s always refers ++ * to the size of the key being overwritten in the btree: ++ */ ++ u8 old_btree_u64s; ++ struct bkey_i *k; ++ struct btree_path *path; ++ /* key being overwritten: */ ++ struct bkey old_k; ++ const struct bch_val *old_v; ++ unsigned long ip_allocated; ++}; ++ ++#ifndef CONFIG_LOCKDEP ++#define BTREE_ITER_MAX 64 ++#else ++#define BTREE_ITER_MAX 32 ++#endif ++ ++struct btree_trans_commit_hook; ++typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); ++ ++struct btree_trans_commit_hook { ++ btree_trans_commit_hook_fn *fn; ++ struct btree_trans_commit_hook *next; ++}; ++ ++#define BTREE_TRANS_MEM_MAX (1U << 16) ++ ++#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000 ++ ++struct btree_trans { ++ struct bch_fs *c; ++ const char *fn; ++ struct list_head list; ++ u64 last_begin_time; ++ struct btree *locking; ++ unsigned locking_path_idx; ++ struct bpos locking_pos; ++ u8 locking_btree_id; ++ u8 locking_level; ++ u8 locking_lock_type; ++ struct task_struct *task; ++ int srcu_idx; ++ ++ u8 nr_sorted; ++ u8 nr_updates; ++ u8 traverse_all_idx; ++ bool used_mempool:1; ++ bool in_traverse_all:1; ++ bool memory_allocation_failure:1; ++ bool is_initial_gc:1; ++ enum bch_errcode 
restarted:16; ++ u32 restart_count; ++ unsigned long last_restarted_ip; ++ ++ /* ++ * For when bch2_trans_update notices we'll be splitting a compressed ++ * extent: ++ */ ++ unsigned extra_journal_res; ++ ++ u64 paths_allocated; ++ ++ unsigned mem_top; ++ unsigned mem_bytes; ++ void *mem; ++ ++ u8 sorted[BTREE_ITER_MAX]; ++ struct btree_path *paths; ++ struct btree_insert_entry *updates; ++ ++ /* update path: */ ++ struct btree_trans_commit_hook *hooks; ++ DARRAY(u64) extra_journal_entries; ++ struct journal_entry_pin *journal_pin; ++ ++ struct journal_res journal_res; ++ struct journal_preres journal_preres; ++ u64 *journal_seq; ++ struct disk_reservation *disk_res; ++ unsigned flags; ++ unsigned journal_u64s; ++ unsigned journal_preres_u64s; ++ struct replicas_delta_list *fs_usage_deltas; ++ int lock_name_idx; ++}; ++ ++#define BTREE_FLAGS() \ ++ x(read_in_flight) \ ++ x(read_error) \ ++ x(dirty) \ ++ x(need_write) \ ++ x(write_blocked) \ ++ x(will_make_reachable) \ ++ x(noevict) \ ++ x(write_idx) \ ++ x(accessed) \ ++ x(write_in_flight) \ ++ x(write_in_flight_inner) \ ++ x(just_written) \ ++ x(dying) \ ++ x(fake) \ ++ x(need_rewrite) \ ++ x(never_write) ++ ++enum btree_flags { ++#define x(flag) BTREE_NODE_##flag, ++ BTREE_FLAGS() ++#undef x ++}; ++ ++#define x(flag) \ ++static inline bool btree_node_ ## flag(struct btree *b) \ ++{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ ++ \ ++static inline void set_btree_node_ ## flag(struct btree *b) \ ++{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ ++ \ ++static inline void clear_btree_node_ ## flag(struct btree *b) \ ++{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } ++ ++BTREE_FLAGS() ++#undef x ++ ++static inline struct btree_write *btree_current_write(struct btree *b) ++{ ++ return b->writes + btree_node_write_idx(b); ++} ++ ++static inline struct btree_write *btree_prev_write(struct btree *b) ++{ ++ return b->writes + (btree_node_write_idx(b) ^ 1); ++} ++ ++static inline struct bset_tree *bset_tree_last(struct btree *b) ++{ ++ EBUG_ON(!b->nsets); ++ return b->set + b->nsets - 1; ++} ++ ++static inline void * ++__btree_node_offset_to_ptr(const struct btree *b, u16 offset) ++{ ++ return (void *) ((u64 *) b->data + 1 + offset); ++} ++ ++static inline u16 ++__btree_node_ptr_to_offset(const struct btree *b, const void *p) ++{ ++ u16 ret = (u64 *) p - 1 - (u64 *) b->data; ++ ++ EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); ++ return ret; ++} ++ ++static inline struct bset *bset(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return __btree_node_offset_to_ptr(b, t->data_offset); ++} ++ ++static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) ++{ ++ t->end_offset = ++ __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); ++} ++ ++static inline void set_btree_bset(struct btree *b, struct bset_tree *t, ++ const struct bset *i) ++{ ++ t->data_offset = __btree_node_ptr_to_offset(b, i); ++ set_btree_bset_end(b, t); ++} ++ ++static inline struct bset *btree_bset_first(struct btree *b) ++{ ++ return bset(b, b->set); ++} ++ ++static inline struct bset *btree_bset_last(struct btree *b) ++{ ++ return bset(b, bset_tree_last(b)); ++} ++ ++static inline u16 ++__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) ++{ ++ return __btree_node_ptr_to_offset(b, k); ++} ++ ++static inline struct bkey_packed * ++__btree_node_offset_to_key(const struct btree *b, u16 k) ++{ ++ return __btree_node_offset_to_ptr(b, k); ++} ++ ++static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) 
++{ ++ return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); ++} ++ ++#define btree_bkey_first(_b, _t) \ ++({ \ ++ EBUG_ON(bset(_b, _t)->start != \ ++ __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ ++ \ ++ bset(_b, _t)->start; \ ++}) ++ ++#define btree_bkey_last(_b, _t) \ ++({ \ ++ EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ ++ vstruct_last(bset(_b, _t))); \ ++ \ ++ __btree_node_offset_to_key(_b, (_t)->end_offset); \ ++}) ++ ++static inline unsigned bset_u64s(struct bset_tree *t) ++{ ++ return t->end_offset - t->data_offset - ++ sizeof(struct bset) / sizeof(u64); ++} ++ ++static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) ++{ ++ return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; ++} ++ ++static inline unsigned bset_byte_offset(struct btree *b, void *i) ++{ ++ return i - (void *) b->data; ++} ++ ++enum btree_node_type { ++#define x(kwd, val) BKEY_TYPE_##kwd = val, ++ BCH_BTREE_IDS() ++#undef x ++ BKEY_TYPE_btree, ++}; ++ ++/* Type of a key in btree @id at level @level: */ ++static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) ++{ ++ return level ? BKEY_TYPE_btree : (enum btree_node_type) id; ++} ++ ++/* Type of keys @b contains: */ ++static inline enum btree_node_type btree_node_type(struct btree *b) ++{ ++ return __btree_node_type(b->c.level, b->c.btree_id); ++} ++ ++#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ ++ ((1U << BKEY_TYPE_extents)| \ ++ (1U << BKEY_TYPE_alloc)| \ ++ (1U << BKEY_TYPE_inodes)| \ ++ (1U << BKEY_TYPE_stripes)| \ ++ (1U << BKEY_TYPE_reflink)| \ ++ (1U << BKEY_TYPE_btree)) ++ ++#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ ++ ((1U << BKEY_TYPE_alloc)| \ ++ (1U << BKEY_TYPE_inodes)| \ ++ (1U << BKEY_TYPE_stripes)| \ ++ (1U << BKEY_TYPE_snapshots)) ++ ++#define BTREE_NODE_TYPE_HAS_TRIGGERS \ ++ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ ++ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) ++ ++#define BTREE_ID_IS_EXTENTS \ ++ ((1U << BTREE_ID_extents)| \ ++ (1U << BTREE_ID_reflink)| \ ++ (1U << BTREE_ID_freespace)) ++ ++static inline bool btree_node_type_is_extents(enum btree_node_type type) ++{ ++ return (1U << type) & BTREE_ID_IS_EXTENTS; ++} ++ ++#define BTREE_ID_HAS_SNAPSHOTS \ ++ ((1U << BTREE_ID_extents)| \ ++ (1U << BTREE_ID_inodes)| \ ++ (1U << BTREE_ID_dirents)| \ ++ (1U << BTREE_ID_xattrs)) ++ ++#define BTREE_ID_HAS_PTRS \ ++ ((1U << BTREE_ID_extents)| \ ++ (1U << BTREE_ID_reflink)) ++ ++static inline bool btree_type_has_snapshots(enum btree_id id) ++{ ++ return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; ++} ++ ++static inline bool btree_type_has_ptrs(enum btree_id id) ++{ ++ return (1 << id) & BTREE_ID_HAS_PTRS; ++} ++ ++static inline bool btree_node_type_needs_gc(enum btree_node_type type) ++{ ++ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); ++} ++ ++struct btree_root { ++ struct btree *b; ++ ++ /* On disk root - see async splits: */ ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++ u8 level; ++ u8 alive; ++ s8 error; ++}; ++ ++enum btree_insert_ret { ++ BTREE_INSERT_OK, ++ /* leaf node needs to be split */ ++ BTREE_INSERT_BTREE_NODE_FULL, ++ BTREE_INSERT_NEED_MARK_REPLICAS, ++ BTREE_INSERT_NEED_JOURNAL_RES, ++ BTREE_INSERT_NEED_JOURNAL_RECLAIM, ++}; ++ ++enum btree_gc_coalesce_fail_reason { ++ BTREE_GC_COALESCE_FAIL_RESERVE_GET, ++ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, ++ BTREE_GC_COALESCE_FAIL_FORMAT_FITS, ++}; ++ ++enum btree_node_sibling { ++ btree_prev_sib, ++ btree_next_sib, ++}; ++ ++#endif /* _BCACHEFS_BTREE_TYPES_H */ +diff --git a/fs/bcachefs/btree_update.h 
b/fs/bcachefs/btree_update.h +new file mode 100644 +index 000000000000..89941fb8caa0 +--- /dev/null ++++ b/fs/bcachefs/btree_update.h +@@ -0,0 +1,158 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_UPDATE_H ++#define _BCACHEFS_BTREE_UPDATE_H ++ ++#include "btree_iter.h" ++#include "journal.h" ++ ++struct bch_fs; ++struct btree; ++ ++void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *, ++ struct btree *); ++bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, ++ struct btree *, struct btree_node_iter *, ++ struct bkey_i *); ++void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); ++ ++enum btree_insert_flags { ++ /* First two bits for journal watermark: */ ++ __BTREE_INSERT_NOFAIL = 2, ++ __BTREE_INSERT_NOCHECK_RW, ++ __BTREE_INSERT_LAZY_RW, ++ __BTREE_INSERT_USE_RESERVE, ++ __BTREE_INSERT_JOURNAL_REPLAY, ++ __BTREE_INSERT_JOURNAL_RECLAIM, ++ __BTREE_INSERT_NOWAIT, ++ __BTREE_INSERT_GC_LOCK_HELD, ++ __BCH_HASH_SET_MUST_CREATE, ++ __BCH_HASH_SET_MUST_REPLACE, ++}; ++ ++/* Don't check for -ENOSPC: */ ++#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) ++ ++#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW) ++#define BTREE_INSERT_LAZY_RW (1 << __BTREE_INSERT_LAZY_RW) ++ ++/* for copygc, or when merging btree nodes */ ++#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) ++ ++/* Insert is for journal replay - don't get journal reservations: */ ++#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) ++ ++/* Insert is being called from journal reclaim path: */ ++#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) ++ ++/* Don't block on allocation failure (for new btree nodes: */ ++#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) ++#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) ++ ++#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) ++#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) ++ ++int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, ++ unsigned, unsigned); ++int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); ++ ++int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); ++int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, ++ struct disk_reservation *, u64 *, int flags); ++ ++int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, ++ struct bpos, struct bpos, unsigned, u64 *); ++int bch2_btree_delete_range(struct bch_fs *, enum btree_id, ++ struct bpos, struct bpos, unsigned, u64 *); ++ ++int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, ++ struct btree *, unsigned); ++void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); ++int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, ++ struct btree *, struct bkey_i *, bool); ++int bch2_btree_node_update_key_get_iter(struct btree_trans *, ++ struct btree *, struct bkey_i *, bool); ++ ++int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, enum btree_update_flags); ++ ++int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, enum btree_update_flags); ++ ++void bch2_trans_commit_hook(struct btree_trans *, ++ struct btree_trans_commit_hook *); ++int __bch2_trans_commit(struct btree_trans *); ++ ++int bch2_trans_log_msg(struct btree_trans *, const char *); ++ ++/** ++ * bch2_trans_commit 
- insert keys at given iterator positions ++ * ++ * This is main entry point for btree updates. ++ * ++ * Return values: ++ * -EROFS: filesystem read only ++ * -EIO: journal or btree node IO error ++ */ ++static inline int bch2_trans_commit(struct btree_trans *trans, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, ++ unsigned flags) ++{ ++ trans->disk_res = disk_res; ++ trans->journal_seq = journal_seq; ++ trans->flags = flags; ++ ++ return __bch2_trans_commit(trans); ++} ++ ++#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ ++ lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ ++ (_journal_seq), (_flags))) ++ ++#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ ++ nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ ++ (_journal_seq), (_flags))) ++ ++#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ ++({ \ ++ struct btree_trans trans; \ ++ int _ret; \ ++ \ ++ bch2_trans_init(&trans, (_c), 0, 0); \ ++ _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \ ++ bch2_trans_exit(&trans); \ ++ \ ++ _ret; \ ++}) ++ ++#define bch2_trans_run(_c, _do) \ ++({ \ ++ struct btree_trans trans; \ ++ int _ret; \ ++ \ ++ bch2_trans_init(&trans, (_c), 0, 0); \ ++ _ret = (_do); \ ++ bch2_trans_exit(&trans); \ ++ \ ++ _ret; \ ++}) ++ ++#define trans_for_each_update(_trans, _i) \ ++ for ((_i) = (_trans)->updates; \ ++ (_i) < (_trans)->updates + (_trans)->nr_updates; \ ++ (_i)++) ++ ++static inline void bch2_trans_reset_updates(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) ++ bch2_path_put(trans, i->path, true); ++ ++ trans->extra_journal_res = 0; ++ trans->nr_updates = 0; ++ trans->hooks = NULL; ++ trans->extra_journal_entries.nr = 0; ++} ++ ++#endif /* _BCACHEFS_BTREE_UPDATE_H */ +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +new file mode 100644 +index 000000000000..5525635ec04a +--- /dev/null ++++ b/fs/bcachefs/btree_update_interior.c +@@ -0,0 +1,2266 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_methods.h" ++#include "btree_cache.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "buckets.h" ++#include "error.h" ++#include "extents.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "keylist.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *, ++ struct btree_path *, struct btree *, ++ struct keylist *, unsigned); ++static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); ++ ++/* Debug code: */ ++ ++/* ++ * Verify that child nodes correctly span parent node's range: ++ */ ++static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bpos next_node = b->data->min_key; ++ struct btree_node_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_btree_ptr_v2 bp; ++ struct bkey unpacked; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; ++ ++ BUG_ON(!b->c.level); ++ ++ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) ++ return; ++ ++ bch2_btree_node_iter_init_from_start(&iter, b); ++ ++ while (1) { ++ k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); ++ if 
(k.k->type != KEY_TYPE_btree_ptr_v2) ++ break; ++ bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ if (bpos_cmp(next_node, bp.v->min_key)) { ++ bch2_dump_btree_node(c, b); ++ bch2_bpos_to_text(&buf1, next_node); ++ bch2_bpos_to_text(&buf2, bp.v->min_key); ++ panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); ++ } ++ ++ bch2_btree_node_iter_advance(&iter, b); ++ ++ if (bch2_btree_node_iter_end(&iter)) { ++ if (bpos_cmp(k.k->p, b->key.k.p)) { ++ bch2_dump_btree_node(c, b); ++ bch2_bpos_to_text(&buf1, b->key.k.p); ++ bch2_bpos_to_text(&buf2, k.k->p); ++ panic("expected end %s got %s\n", buf1.buf, buf2.buf); ++ } ++ break; ++ } ++ ++ next_node = bpos_successor(k.k->p); ++ } ++#endif ++} ++ ++/* Calculate ideal packed bkey format for new btree nodes: */ ++ ++void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) ++{ ++ struct bkey_packed *k; ++ struct bset_tree *t; ++ struct bkey uk; ++ ++ for_each_bset(b, t) ++ bset_tree_for_each_key(b, t, k) ++ if (!bkey_deleted(k)) { ++ uk = bkey_unpack_key(b, k); ++ bch2_bkey_format_add_key(s, &uk); ++ } ++} ++ ++static struct bkey_format bch2_btree_calc_format(struct btree *b) ++{ ++ struct bkey_format_state s; ++ ++ bch2_bkey_format_init(&s); ++ bch2_bkey_format_add_pos(&s, b->data->min_key); ++ bch2_bkey_format_add_pos(&s, b->data->max_key); ++ __bch2_btree_calc_format(&s, b); ++ ++ return bch2_bkey_format_done(&s); ++} ++ ++static size_t btree_node_u64s_with_format(struct btree *b, ++ struct bkey_format *new_f) ++{ ++ struct bkey_format *old_f = &b->format; ++ ++ /* stupid integer promotion rules */ ++ ssize_t delta = ++ (((int) new_f->key_u64s - old_f->key_u64s) * ++ (int) b->nr.packed_keys) + ++ (((int) new_f->key_u64s - BKEY_U64s) * ++ (int) b->nr.unpacked_keys); ++ ++ BUG_ON(delta + b->nr.live_u64s < 0); ++ ++ return b->nr.live_u64s + delta; ++} ++ ++/** ++ * btree_node_format_fits - check if we could rewrite node with a new format ++ * ++ * This assumes all keys can pack with the new format -- it just checks if ++ * the re-packed keys would fit inside the node itself. 
++ */ ++bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, ++ struct bkey_format *new_f) ++{ ++ size_t u64s = btree_node_u64s_with_format(b, new_f); ++ ++ return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); ++} ++ ++/* Btree node freeing/allocation: */ ++ ++static void __btree_node_free(struct bch_fs *c, struct btree *b) ++{ ++ trace_btree_node_free(c, b); ++ ++ BUG_ON(btree_node_dirty(b)); ++ BUG_ON(btree_node_need_write(b)); ++ BUG_ON(b == btree_node_root(c, b)); ++ BUG_ON(b->ob.nr); ++ BUG_ON(!list_empty(&b->write_blocked)); ++ BUG_ON(b->will_make_reachable); ++ ++ clear_btree_node_noevict(b); ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&b->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++} ++ ++static void bch2_btree_node_free_inmem(struct btree_trans *trans, ++ struct btree *b) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path *path; ++ ++ trans_for_each_path(trans, path) ++ BUG_ON(path->l[b->c.level].b == b && ++ path->l[b->c.level].lock_seq == b->c.lock.state.seq); ++ ++ six_lock_write(&b->c.lock, NULL, NULL); ++ ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ __btree_node_free(c, b); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++} ++ ++static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, ++ struct disk_reservation *res, ++ struct closure *cl, ++ bool interior_node, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct write_point *wp; ++ struct btree *b; ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; ++ struct open_buckets ob = { .nr = 0 }; ++ struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; ++ unsigned nr_reserve; ++ enum alloc_reserve alloc_reserve; ++ ++ if (flags & BTREE_INSERT_USE_RESERVE) { ++ nr_reserve = 0; ++ alloc_reserve = RESERVE_btree_movinggc; ++ } else { ++ nr_reserve = BTREE_NODE_RESERVE; ++ alloc_reserve = RESERVE_btree; ++ } ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ if (c->btree_reserve_cache_nr > nr_reserve) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; ++ ++ ob = a->ob; ++ bkey_copy(&tmp.k, &a->k); ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ goto mem_alloc; ++ } ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ ++retry: ++ wp = bch2_alloc_sectors_start_trans(trans, ++ c->opts.metadata_target ?: ++ c->opts.foreground_target, ++ 0, ++ writepoint_ptr(&c->btree_write_point), ++ &devs_have, ++ res->nr_replicas, ++ c->opts.metadata_replicas_required, ++ alloc_reserve, 0, cl); ++ if (IS_ERR(wp)) ++ return ERR_CAST(wp); ++ ++ if (wp->sectors_free < btree_sectors(c)) { ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ if (ob->sectors_free < btree_sectors(c)) ++ ob->sectors_free = 0; ++ ++ bch2_alloc_sectors_done(c, wp); ++ goto retry; ++ } ++ ++ bkey_btree_ptr_v2_init(&tmp.k); ++ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); ++ ++ bch2_open_bucket_get(c, wp, &ob); ++ bch2_alloc_sectors_done(c, wp); ++mem_alloc: ++ b = bch2_btree_node_mem_alloc(c, interior_node); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ ++ /* we hold cannibalize_lock: */ ++ BUG_ON(IS_ERR(b)); ++ BUG_ON(b->ob.nr); ++ ++ bkey_copy(&b->key, &tmp.k); ++ b->ob = ob; ++ ++ return b; ++} ++ ++static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *b; ++ struct prealloc_nodes *p = &as->prealloc_nodes[!!level]; ++ int ret; ++ ++ 
BUG_ON(level >= BTREE_MAX_DEPTH); ++ BUG_ON(!p->nr); ++ ++ b = p->b[--p->nr]; ++ ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); ++ ++ set_btree_node_accessed(b); ++ set_btree_node_dirty_acct(c, b); ++ set_btree_node_need_write(b); ++ ++ bch2_bset_init_first(b, &b->data->keys); ++ b->c.level = level; ++ b->c.btree_id = as->btree_id; ++ b->version_ondisk = c->sb.version; ++ ++ memset(&b->nr, 0, sizeof(b->nr)); ++ b->data->magic = cpu_to_le64(bset_magic(c)); ++ memset(&b->data->_ptr, 0, sizeof(b->data->_ptr)); ++ b->data->flags = 0; ++ SET_BTREE_NODE_ID(b->data, as->btree_id); ++ SET_BTREE_NODE_LEVEL(b->data, level); ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); ++ ++ bp->v.mem_ptr = 0; ++ bp->v.seq = b->data->keys.seq; ++ bp->v.sectors_written = 0; ++ } ++ ++ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); ++ BUG_ON(ret); ++ ++ trace_btree_node_alloc(c, b); ++ return b; ++} ++ ++static void btree_set_min(struct btree *b, struct bpos pos) ++{ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; ++ b->data->min_key = pos; ++} ++ ++static void btree_set_max(struct btree *b, struct bpos pos) ++{ ++ b->key.k.p = pos; ++ b->data->max_key = pos; ++} ++ ++struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, ++ struct btree *b, ++ struct bkey_format format) ++{ ++ struct btree *n; ++ ++ n = bch2_btree_node_alloc(as, b->c.level); ++ ++ SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); ++ ++ btree_set_min(n, b->data->min_key); ++ btree_set_max(n, b->data->max_key); ++ ++ n->data->format = format; ++ btree_node_set_format(n, format); ++ ++ bch2_btree_sort_into(as->c, n, b); ++ ++ btree_node_reset_sib_u64s(n); ++ ++ n->key.k.p = b->key.k.p; ++ return n; ++} ++ ++static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, ++ struct btree *b) ++{ ++ struct bkey_format new_f = bch2_btree_calc_format(b); ++ ++ /* ++ * The keys might expand with the new format - if they wouldn't fit in ++ * the btree node anymore, use the old format for now: ++ */ ++ if (!bch2_btree_node_format_fits(as->c, b, &new_f)) ++ new_f = b->format; ++ ++ return __bch2_btree_node_alloc_replacement(as, b, new_f); ++} ++ ++static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) ++{ ++ struct btree *b = bch2_btree_node_alloc(as, level); ++ ++ btree_set_min(b, POS_MIN); ++ btree_set_max(b, SPOS_MAX); ++ b->data->format = bch2_btree_calc_format(b); ++ ++ btree_node_set_format(b, b->data->format); ++ bch2_btree_build_aux_trees(b); ++ ++ bch2_btree_update_add_new_node(as, b); ++ six_unlock_write(&b->c.lock); ++ ++ return b; ++} ++ ++static void bch2_btree_reserve_put(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ struct prealloc_nodes *p; ++ ++ for (p = as->prealloc_nodes; ++ p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); ++ p++) { ++ while (p->nr) { ++ struct btree *b = p->b[--p->nr]; ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ ++ if (c->btree_reserve_cache_nr < ++ ARRAY_SIZE(c->btree_reserve_cache)) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; ++ ++ a->ob = b->ob; ++ b->ob.nr = 0; ++ bkey_copy(&a->k, &b->key); ++ } else { ++ bch2_open_buckets_put(c, &b->ob); ++ } ++ ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ 
++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); ++ __btree_node_free(c, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ } ++ } ++} ++ ++static int bch2_btree_reserve_get(struct btree_trans *trans, ++ struct btree_update *as, ++ unsigned nr_nodes[2], ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *b; ++ unsigned interior; ++ int ret = 0; ++ ++ BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); ++ ++ /* ++ * Protects reaping from the btree node cache and using the btree node ++ * open bucket reserve: ++ * ++ * BTREE_INSERT_NOWAIT only applies to btree node allocation, not ++ * blocking on this lock: ++ */ ++ ret = bch2_btree_cache_cannibalize_lock(c, cl); ++ if (ret) ++ return ret; ++ ++ for (interior = 0; interior < 2; interior++) { ++ struct prealloc_nodes *p = as->prealloc_nodes + interior; ++ ++ while (p->nr < nr_nodes[interior]) { ++ b = __bch2_btree_node_alloc(trans, &as->disk_res, ++ flags & BTREE_INSERT_NOWAIT ? NULL : cl, ++ interior, flags); ++ if (IS_ERR(b)) { ++ ret = PTR_ERR(b); ++ goto err; ++ } ++ ++ p->b[p->nr++] = b; ++ } ++ } ++err: ++ bch2_btree_cache_cannibalize_unlock(c); ++ return ret; ++} ++ ++/* Asynchronous interior node update machinery */ ++ ++static void bch2_btree_update_free(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ ++ if (as->took_gc_lock) ++ up_read(&c->gc_lock); ++ as->took_gc_lock = false; ++ ++ bch2_journal_preres_put(&c->journal, &as->journal_preres); ++ ++ bch2_journal_pin_drop(&c->journal, &as->journal); ++ bch2_journal_pin_flush(&c->journal, &as->journal); ++ bch2_disk_reservation_put(c, &as->disk_res); ++ bch2_btree_reserve_put(as); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], ++ as->start_time); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_del(&as->unwritten_list); ++ list_del(&as->list); ++ ++ closure_debug_destroy(&as->cl); ++ mempool_free(as, &c->btree_interior_update_pool); ++ ++ /* ++ * Have to do the wakeup with btree_interior_update_lock still held, ++ * since being on btree_interior_update_list is our ref on @c: ++ */ ++ closure_wake_up(&c->btree_interior_update_wait); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++static void btree_update_add_key(struct btree_update *as, ++ struct keylist *keys, struct btree *b) ++{ ++ struct bkey_i *k = &b->key; ++ ++ BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s > ++ ARRAY_SIZE(as->_old_keys)); ++ ++ bkey_copy(keys->top, k); ++ bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1; ++ ++ bch2_keylist_push(keys); ++} ++ ++/* ++ * The transactional part of an interior btree node update, where we journal the ++ * update we did to the interior node and update alloc info: ++ */ ++static int btree_update_nodes_written_trans(struct btree_trans *trans, ++ struct btree_update *as) ++{ ++ struct bkey_i *k; ++ int ret; ++ ++ ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s); ++ if (ret) ++ return ret; ++ ++ memcpy(&darray_top(trans->extra_journal_entries), ++ as->journal_entries, ++ as->journal_u64s * sizeof(u64)); ++ trans->extra_journal_entries.nr += as->journal_u64s; ++ ++ trans->journal_pin = &as->journal; ++ ++ for_each_keylist_key(&as->old_keys, k) { ++ unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; ++ ++ ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0); ++ if (ret) ++ return ret; ++ } ++ ++ for_each_keylist_key(&as->new_keys, k) { ++ unsigned level = 
bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; ++ ++ ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void btree_update_nodes_written(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *b = as->b; ++ struct btree_trans trans; ++ u64 journal_seq = 0; ++ unsigned i; ++ int ret; ++ ++ /* ++ * If we're already in an error state, it might be because a btree node ++ * was never written, and we might be trying to free that same btree ++ * node here, but it won't have been marked as allocated and we'll see ++ * spurious disk usage inconsistencies in the transactional part below ++ * if we don't skip it: ++ */ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ goto err; ++ ++ /* ++ * Wait for any in flight writes to finish before we free the old nodes ++ * on disk: ++ */ ++ for (i = 0; i < as->nr_old_nodes; i++) { ++ struct btree *old = as->old_nodes[i]; ++ __le64 seq; ++ ++ six_lock_read(&old->c.lock, NULL, NULL); ++ seq = old->data ? old->data->keys.seq : 0; ++ six_unlock_read(&old->c.lock); ++ ++ if (seq == as->old_nodes_seq[i]) ++ wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner, ++ TASK_UNINTERRUPTIBLE); ++ } ++ ++ /* ++ * We did an update to a parent node where the pointers we added pointed ++ * to child nodes that weren't written yet: now, the child nodes have ++ * been written so we can write out the update to the interior node. ++ */ ++ ++ /* ++ * We can't call into journal reclaim here: we'd block on the journal ++ * reclaim lock, but we may need to release the open buckets we have ++ * pinned in order for other btree updates to make forward progress, and ++ * journal reclaim does btree updates when flushing bkey_cached entries, ++ * which may require allocations as well. 
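/*
 * Editorial sketch, not part of the patch: the commit_do() call just below
 * boils down to "re-run the transaction body until it stops returning a
 * transaction-restart error".  This is a simplified standalone model of
 * that retry loop; MY_ERR_TRANSACTION_RESTART and the function names are
 * made up, and the real lockrestart_do()/commit_do() macros also re-begin
 * the transaction and distinguish the specific BCH_ERR restart codes.
 */
#include <stdio.h>

#define MY_ERR_TRANSACTION_RESTART 1	/* stand-in for a restart errcode */

static int attempts;

static int transaction_body(void)
{
	/* pretend the first two attempts race with another locker: */
	return ++attempts < 3 ? MY_ERR_TRANSACTION_RESTART : 0;
}

static int commit_do_model(int (*body)(void))
{
	int ret;

	do {
		/* a real implementation would reset the transaction here */
		ret = body();
	} while (ret == MY_ERR_TRANSACTION_RESTART);

	return ret;
}

int main(void)
{
	int ret = commit_do_model(transaction_body);

	printf("committed with ret=%d after %d attempts\n", ret, attempts);
	return 0;
}
/* end of editorial sketch */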
++ */ ++ bch2_trans_init(&trans, c, 0, 512); ++ ret = commit_do(&trans, &as->disk_res, &journal_seq, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_JOURNAL_RECLAIM| ++ JOURNAL_WATERMARK_reserved, ++ btree_update_nodes_written_trans(&trans, as)); ++ bch2_trans_exit(&trans); ++ ++ bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, ++ "error %i in btree_update_nodes_written()", ret); ++err: ++ if (b) { ++ /* ++ * @b is the node we did the final insert into: ++ * ++ * On failure to get a journal reservation, we still have to ++ * unblock the write and allow most of the write path to happen ++ * so that shutdown works, but the i->journal_seq mechanism ++ * won't work to prevent the btree write from being visible (we ++ * didn't get a journal sequence number) - instead ++ * __bch2_btree_node_write() doesn't do the actual write if ++ * we're in journal error state: ++ */ ++ ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); ++ mutex_lock(&c->btree_interior_update_lock); ++ ++ list_del(&as->write_blocked_list); ++ if (list_empty(&b->write_blocked)) ++ clear_btree_node_write_blocked(b); ++ ++ /* ++ * Node might have been freed, recheck under ++ * btree_interior_update_lock: ++ */ ++ if (as->b == b) { ++ struct bset *i = btree_bset_last(b); ++ ++ BUG_ON(!b->c.level); ++ BUG_ON(!btree_node_dirty(b)); ++ ++ if (!ret) { ++ i->journal_seq = cpu_to_le64( ++ max(journal_seq, ++ le64_to_cpu(i->journal_seq))); ++ ++ bch2_btree_add_journal_pin(c, b, journal_seq); ++ } else { ++ /* ++ * If we didn't get a journal sequence number we ++ * can't write this btree node, because recovery ++ * won't know to ignore this write: ++ */ ++ set_btree_node_never_write(b); ++ } ++ } ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ six_unlock_write(&b->c.lock); ++ ++ btree_node_write_if_need(c, b, SIX_LOCK_intent); ++ six_unlock_intent(&b->c.lock); ++ } ++ ++ bch2_journal_pin_drop(&c->journal, &as->journal); ++ ++ bch2_journal_preres_put(&c->journal, &as->journal_preres); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ for (i = 0; i < as->nr_new_nodes; i++) { ++ b = as->new_nodes[i]; ++ ++ BUG_ON(b->will_make_reachable != (unsigned long) as); ++ b->will_make_reachable = 0; ++ clear_btree_node_will_make_reachable(b); ++ } ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ for (i = 0; i < as->nr_new_nodes; i++) { ++ b = as->new_nodes[i]; ++ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ btree_node_write_if_need(c, b, SIX_LOCK_read); ++ six_unlock_read(&b->c.lock); ++ } ++ ++ for (i = 0; i < as->nr_open_buckets; i++) ++ bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); ++ ++ bch2_btree_update_free(as); ++} ++ ++static void btree_interior_update_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, btree_interior_update_work); ++ struct btree_update *as; ++ ++ while (1) { ++ mutex_lock(&c->btree_interior_update_lock); ++ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, ++ struct btree_update, unwritten_list); ++ if (as && !as->nodes_written) ++ as = NULL; ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ if (!as) ++ break; ++ ++ btree_update_nodes_written(as); ++ } ++} ++ ++static void btree_update_set_nodes_written(struct closure *cl) ++{ ++ struct btree_update *as = container_of(cl, struct btree_update, cl); ++ struct bch_fs *c = as->c; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ as->nodes_written = true; ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ 
queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); ++} ++ ++/* ++ * We're updating @b with pointers to nodes that haven't finished writing yet: ++ * block @b from being written until @as completes ++ */ ++static void btree_update_updated_node(struct btree_update *as, struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); ++ ++ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); ++ BUG_ON(!btree_node_dirty(b)); ++ ++ as->mode = BTREE_INTERIOR_UPDATING_NODE; ++ as->b = b; ++ ++ set_btree_node_write_blocked(b); ++ list_add(&as->write_blocked_list, &b->write_blocked); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++static void btree_update_reparent(struct btree_update *as, ++ struct btree_update *child) ++{ ++ struct bch_fs *c = as->c; ++ ++ lockdep_assert_held(&c->btree_interior_update_lock); ++ ++ child->b = NULL; ++ child->mode = BTREE_INTERIOR_UPDATING_AS; ++ ++ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); ++} ++ ++static void btree_update_updated_root(struct btree_update *as, struct btree *b) ++{ ++ struct bkey_i *insert = &b->key; ++ struct bch_fs *c = as->c; ++ ++ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); ++ ++ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ++ ARRAY_SIZE(as->journal_entries)); ++ ++ as->journal_u64s += ++ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], ++ BCH_JSET_ENTRY_btree_root, ++ b->c.btree_id, b->c.level, ++ insert, insert->k.u64s); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); ++ ++ as->mode = BTREE_INTERIOR_UPDATING_ROOT; ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++/* ++ * bch2_btree_update_add_new_node: ++ * ++ * This causes @as to wait on @b to be written, before it gets to ++ * bch2_btree_update_nodes_written ++ * ++ * Additionally, it sets b->will_make_reachable to prevent any additional writes ++ * to @b from happening besides the first until @b is reachable on disk ++ * ++ * And it adds @b to the list of @as's new nodes, so that we can update sector ++ * counts in bch2_btree_update_nodes_written: ++ */ ++static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ ++ closure_get(&as->cl); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); ++ BUG_ON(b->will_make_reachable); ++ ++ as->new_nodes[as->nr_new_nodes++] = b; ++ b->will_make_reachable = 1UL|(unsigned long) as; ++ set_btree_node_will_make_reachable(b); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ btree_update_add_key(as, &as->new_keys, b); ++} ++ ++/* ++ * returns true if @b was a new node ++ */ ++static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_update *as; ++ unsigned long v; ++ unsigned i; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ /* ++ * When b->will_make_reachable != 0, it owns a ref on as->cl that's ++ * dropped when it gets written by bch2_btree_complete_write - the ++ * xchg() is for synchronization with bch2_btree_complete_write: ++ */ ++ v = xchg(&b->will_make_reachable, 0); ++ clear_btree_node_will_make_reachable(b); ++ as = (struct btree_update *) (v & ~1UL); ++ ++ if (!as) { ++ mutex_unlock(&c->btree_interior_update_lock); ++ return; ++ } ++ ++ for (i = 0; i < as->nr_new_nodes; i++) ++ if 
(as->new_nodes[i] == b) ++ goto found; ++ ++ BUG(); ++found: ++ array_remove_item(as->new_nodes, as->nr_new_nodes, i); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ if (v & 1) ++ closure_put(&as->cl); ++} ++ ++static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) ++{ ++ while (b->ob.nr) ++ as->open_buckets[as->nr_open_buckets++] = ++ b->ob.v[--b->ob.nr]; ++} ++ ++/* ++ * @b is being split/rewritten: it may have pointers to not-yet-written btree ++ * nodes and thus outstanding btree_updates - redirect @b's ++ * btree_updates to point to this btree_update: ++ */ ++static void bch2_btree_interior_update_will_free_node(struct btree_update *as, ++ struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ struct btree_update *p, *n; ++ struct btree_write *w; ++ ++ set_btree_node_dying(b); ++ ++ if (btree_node_fake(b)) ++ return; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ ++ /* ++ * Does this node have any btree_update operations preventing ++ * it from being written? ++ * ++ * If so, redirect them to point to this btree_update: we can ++ * write out our new nodes, but we won't make them visible until those ++ * operations complete ++ */ ++ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { ++ list_del_init(&p->write_blocked_list); ++ btree_update_reparent(as, p); ++ ++ /* ++ * for flush_held_btree_writes() waiting on updates to flush or ++ * nodes to be writeable: ++ */ ++ closure_wake_up(&c->btree_interior_update_wait); ++ } ++ ++ clear_btree_node_dirty_acct(c, b); ++ clear_btree_node_need_write(b); ++ ++ /* ++ * Does this node have unwritten data that has a pin on the journal? ++ * ++ * If so, transfer that pin to the btree_update operation - ++ * note that if we're freeing multiple nodes, we only need to keep the ++ * oldest pin of any of the nodes we're freeing. We'll release the pin ++ * when the new nodes are persistent and reachable on disk: ++ */ ++ w = btree_current_write(b); ++ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++ ++ w = btree_prev_write(b); ++ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ /* ++ * Is this a node that isn't reachable on disk yet? 
++ * ++ * Nodes that aren't reachable yet have writes blocked until they're ++ * reachable - now that we've cancelled any pending writes and moved ++ * things waiting on that write to wait on this update, we can drop this ++ * node from the list of nodes that the other update is making ++ * reachable, prior to freeing it: ++ */ ++ btree_update_drop_new_node(c, b); ++ ++ btree_update_add_key(as, &as->old_keys, b); ++ ++ as->old_nodes[as->nr_old_nodes] = b; ++ as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; ++ as->nr_old_nodes++; ++} ++ ++static void bch2_btree_update_done(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ u64 start_time = as->start_time; ++ ++ BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); ++ ++ if (as->took_gc_lock) ++ up_read(&as->c->gc_lock); ++ as->took_gc_lock = false; ++ ++ bch2_btree_reserve_put(as); ++ ++ continue_at(&as->cl, btree_update_set_nodes_written, ++ as->c->btree_interior_update_worker); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], ++ start_time); ++} ++ ++static struct btree_update * ++bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, ++ unsigned level, bool split, unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_update *as; ++ u64 start_time = local_clock(); ++ int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ++ ? BCH_DISK_RESERVATION_NOFAIL : 0; ++ unsigned nr_nodes[2] = { 0, 0 }; ++ unsigned update_level = level; ++ int journal_flags = flags & JOURNAL_WATERMARK_MASK; ++ int ret = 0; ++ u32 restart_count = trans->restart_count; ++ ++ BUG_ON(!path->should_be_locked); ++ ++ if (flags & BTREE_INSERT_JOURNAL_RECLAIM) ++ journal_flags |= JOURNAL_RES_GET_NONBLOCK; ++ ++ while (1) { ++ nr_nodes[!!update_level] += 1 + split; ++ update_level++; ++ ++ if (!btree_path_node(path, update_level)) ++ break; ++ ++ /* ++ * XXX: figure out how far we might need to split, ++ * instead of locking/reserving all the way to the root: ++ */ ++ split = update_level + 1 < BTREE_MAX_DEPTH; ++ } ++ ++ /* Might have to allocate a new root: */ ++ if (update_level < BTREE_MAX_DEPTH) ++ nr_nodes[1] += 1; ++ ++ if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { ++ trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, ++ path->btree_id, &path->pos); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); ++ return ERR_PTR(ret); ++ } ++ ++ if (flags & BTREE_INSERT_GC_LOCK_HELD) ++ lockdep_assert_held(&c->gc_lock); ++ else if (!down_read_trylock(&c->gc_lock)) { ++ bch2_trans_unlock(trans); ++ down_read(&c->gc_lock); ++ ret = bch2_trans_relock(trans); ++ if (ret) { ++ up_read(&c->gc_lock); ++ return ERR_PTR(ret); ++ } ++ } ++ ++ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); ++ memset(as, 0, sizeof(*as)); ++ closure_init(&as->cl, NULL); ++ as->c = c; ++ as->start_time = start_time; ++ as->mode = BTREE_INTERIOR_NO_UPDATE; ++ as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); ++ as->btree_id = path->btree_id; ++ INIT_LIST_HEAD(&as->list); ++ INIT_LIST_HEAD(&as->unwritten_list); ++ INIT_LIST_HEAD(&as->write_blocked_list); ++ bch2_keylist_init(&as->old_keys, as->_old_keys); ++ bch2_keylist_init(&as->new_keys, as->_new_keys); ++ bch2_keylist_init(&as->parent_keys, as->inline_keys); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->list, &c->btree_interior_update_list); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ /* ++ * We don't want to allocate if we're in an error state, that can cause ++ * deadlock on emergency 
shutdown due to open buckets getting stuck in ++ * the btree_reserve_cache after allocator shutdown has cleared it out. ++ * This check needs to come after adding us to the btree_interior_update ++ * list but before calling bch2_btree_reserve_get, to synchronize with ++ * __bch2_fs_read_only(). ++ */ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ goto err; ++ ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, ++ BTREE_UPDATE_JOURNAL_RES, ++ journal_flags|JOURNAL_RES_GET_NONBLOCK); ++ if (ret) { ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, ++ BTREE_UPDATE_JOURNAL_RES, ++ journal_flags); ++ if (ret) { ++ trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); ++ goto err; ++ } ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_disk_reservation_get(c, &as->disk_res, ++ (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), ++ c->opts.metadata_replicas, ++ disk_res_flags); ++ if (ret) ++ goto err; ++ ++ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); ++ if (ret && ret != -EINTR) { ++ struct closure cl; ++ ++ closure_init_stack(&cl); ++ ++ bch2_trans_unlock(trans); ++ ++ do { ++ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); ++ closure_sync(&cl); ++ } while (ret == -EAGAIN); ++ ++ if (ret) { ++ trace_btree_reserve_get_fail(trans->fn, _RET_IP_, ++ nr_nodes[0] + nr_nodes[1]); ++ goto err; ++ } ++ } ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ goto err; ++ ++ bch2_trans_verify_not_restarted(trans, restart_count); ++ return as; ++err: ++ bch2_btree_update_free(as); ++ return ERR_PTR(ret); ++} ++ ++/* Btree root updates: */ ++ ++static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) ++{ ++ /* Root nodes cannot be reaped */ ++ mutex_lock(&c->btree_cache.lock); ++ list_del_init(&b->list); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ mutex_lock(&c->btree_root_lock); ++ BUG_ON(btree_node_root(c, b) && ++ (b->c.level < btree_node_root(c, b)->c.level || ++ !btree_node_dying(btree_node_root(c, b)))); ++ ++ btree_node_root(c, b) = b; ++ mutex_unlock(&c->btree_root_lock); ++ ++ bch2_recalc_btree_reserve(c); ++} ++ ++/** ++ * bch_btree_set_root - update the root in memory and on disk ++ * ++ * To ensure forward progress, the current task must not be holding any ++ * btree node write locks. However, you must hold an intent lock on the ++ * old root. ++ * ++ * Note: This allocates a journal entry but doesn't add any keys to ++ * it. All the btree roots are part of every journal write, so there ++ * is nothing new to be done. This just guarantees that there is a ++ * journal write. ++ */ ++static void bch2_btree_set_root(struct btree_update *as, ++ struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *old; ++ ++ trace_btree_set_root(c, b); ++ BUG_ON(!b->written); ++ ++ old = btree_node_root(c, b); ++ ++ /* ++ * Ensure no one is using the old root while we switch to the ++ * new root: ++ */ ++ bch2_btree_node_lock_write(trans, path, old); ++ ++ bch2_btree_set_root_inmem(c, b); ++ ++ btree_update_updated_root(as, b); ++ ++ /* ++ * Unlock old root after new root is visible: ++ * ++ * The new root isn't persistent, but that's ok: we still have ++ * an intent lock on the new root, and any updates that would ++ * depend on the new root would have to update the new root. 
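/*
 * Editorial sketch, not part of the patch: bch2_btree_set_root() below
 * write-locks the *old* root so nothing can still be traversing it while
 * the in-memory root pointer is switched, and only unlocks it once the new
 * root is visible.  A minimal standalone model of that discipline, using a
 * pthread rwlock in place of the kernel's six locks; all names are made up,
 * and the revalidation a real reader would need if it raced with the swap
 * (bcachefs uses lock sequence numbers for this) is deliberately omitted.
 */
#include <pthread.h>
#include <stdio.h>

struct toy_node {
	int level;
	pthread_rwlock_t lock;
};

static struct toy_node nodes[2] = {
	{ .level = 1, .lock = PTHREAD_RWLOCK_INITIALIZER },
	{ .level = 2, .lock = PTHREAD_RWLOCK_INITIALIZER },
};
static struct toy_node *root = &nodes[0];

static void *reader(void *arg)
{
	struct toy_node *b = root;

	/* readers hold a read lock on whichever root they observed */
	pthread_rwlock_rdlock(&b->lock);
	printf("reader sees root level %d\n", b->level);
	pthread_rwlock_unlock(&b->lock);
	return NULL;
}

static void set_root(struct toy_node *new_root)
{
	struct toy_node *old = root;

	/* blocks until no reader is still inside the old root */
	pthread_rwlock_wrlock(&old->lock);
	root = new_root;		/* new root visible before old is unlocked */
	pthread_rwlock_unlock(&old->lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, reader, NULL);
	pthread_join(t, NULL);
	set_root(&nodes[1]);
	printf("root is now level %d\n", root->level);
	return 0;		/* link with -lpthread */
}
/* end of editorial sketch */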
++ */ ++ bch2_btree_node_unlock_write(trans, path, old); ++} ++ ++/* Interior node updates: */ ++ ++static void bch2_insert_fixup_btree_ptr(struct btree_update *as, ++ struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = as->c; ++ struct bkey_packed *k; ++ struct printbuf buf = PRINTBUF; ++ ++ BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && ++ !btree_ptr_sectors_written(insert)); ++ ++ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) ++ bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); ++ ++ if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), ++ btree_node_type(b), WRITE, &buf) ?: ++ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) { ++ printbuf_reset(&buf); ++ prt_printf(&buf, "inserting invalid bkey\n "); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); ++ prt_printf(&buf, "\n "); ++ bch2_bkey_invalid(c, bkey_i_to_s_c(insert), ++ btree_node_type(b), WRITE, &buf); ++ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf); ++ ++ bch2_fs_inconsistent(c, "%s", buf.buf); ++ dump_stack(); ++ } ++ ++ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ++ ARRAY_SIZE(as->journal_entries)); ++ ++ as->journal_u64s += ++ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], ++ BCH_JSET_ENTRY_btree_keys, ++ b->c.btree_id, b->c.level, ++ insert, insert->k.u64s); ++ ++ while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && ++ bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) ++ bch2_btree_node_iter_advance(node_iter, b); ++ ++ bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); ++ set_btree_node_dirty_acct(c, b); ++ set_btree_node_need_write(b); ++ ++ printbuf_exit(&buf); ++} ++ ++static void ++__bch2_btree_insert_keys_interior(struct btree_update *as, ++ struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct btree_node_iter node_iter, ++ struct keylist *keys) ++{ ++ struct bkey_i *insert = bch2_keylist_front(keys); ++ struct bkey_packed *k; ++ ++ BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); ++ ++ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && ++ (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) ++ ; ++ ++ while (!bch2_keylist_empty(keys)) { ++ bch2_insert_fixup_btree_ptr(as, trans, path, b, ++ &node_iter, bch2_keylist_front(keys)); ++ bch2_keylist_pop_front(keys); ++ } ++} ++ ++/* ++ * Move keys from n1 (original replacement node, now lower node) to n2 (higher ++ * node) ++ */ ++static struct btree *__btree_split_node(struct btree_update *as, ++ struct btree *n1) ++{ ++ struct bkey_format_state s; ++ size_t nr_packed = 0, nr_unpacked = 0; ++ struct btree *n2; ++ struct bset *set1, *set2; ++ struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL; ++ struct bpos n1_pos; ++ ++ n2 = bch2_btree_node_alloc(as, n1->c.level); ++ ++ n2->data->max_key = n1->data->max_key; ++ n2->data->format = n1->format; ++ SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); ++ n2->key.k.p = n1->key.k.p; ++ ++ bch2_btree_update_add_new_node(as, n2); ++ ++ set1 = btree_bset_first(n1); ++ set2 = btree_bset_first(n2); ++ ++ /* ++ * Has to be a linear search because we don't have an auxiliary ++ * search tree yet ++ */ ++ k = set1->start; ++ while (1) { ++ struct bkey_packed *n = bkey_next(k); ++ ++ if (n == vstruct_last(set1)) ++ break; ++ if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) ++ break; ++ ++ if (bkey_packed(k)) ++ nr_packed++; ++ else ++ nr_unpacked++; ++ 
++ prev = k; ++ k = n; ++ } ++ ++ BUG_ON(!prev); ++ set2_start = k; ++ set2_end = vstruct_last(set1); ++ ++ set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data); ++ set_btree_bset_end(n1, n1->set); ++ ++ n1->nr.live_u64s = le16_to_cpu(set1->u64s); ++ n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); ++ n1->nr.packed_keys = nr_packed; ++ n1->nr.unpacked_keys = nr_unpacked; ++ ++ n1_pos = bkey_unpack_pos(n1, prev); ++ if (as->c->sb.version < bcachefs_metadata_version_snapshot) ++ n1_pos.snapshot = U32_MAX; ++ ++ btree_set_max(n1, n1_pos); ++ btree_set_min(n2, bpos_successor(n1->key.k.p)); ++ ++ bch2_bkey_format_init(&s); ++ bch2_bkey_format_add_pos(&s, n2->data->min_key); ++ bch2_bkey_format_add_pos(&s, n2->data->max_key); ++ ++ for (k = set2_start; k != set2_end; k = bkey_next(k)) { ++ struct bkey uk = bkey_unpack_key(n1, k); ++ bch2_bkey_format_add_key(&s, &uk); ++ } ++ ++ n2->data->format = bch2_bkey_format_done(&s); ++ btree_node_set_format(n2, n2->data->format); ++ ++ out = set2->start; ++ memset(&n2->nr, 0, sizeof(n2->nr)); ++ ++ for (k = set2_start; k != set2_end; k = bkey_next(k)) { ++ BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k) ++ ? &n1->format : &bch2_bkey_format_current, k)); ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ btree_keys_account_key_add(&n2->nr, 0, out); ++ out = bkey_next(out); ++ } ++ ++ set2->u64s = cpu_to_le16((u64 *) out - set2->_data); ++ set_btree_bset_end(n2, n2->set); ++ ++ BUG_ON(!set1->u64s); ++ BUG_ON(!set2->u64s); ++ ++ btree_node_reset_sib_u64s(n1); ++ btree_node_reset_sib_u64s(n2); ++ ++ bch2_verify_btree_nr_keys(n1); ++ bch2_verify_btree_nr_keys(n2); ++ ++ if (n1->c.level) { ++ btree_node_interior_verify(as->c, n1); ++ btree_node_interior_verify(as->c, n2); ++ } ++ ++ return n2; ++} ++ ++/* ++ * For updates to interior nodes, we've got to do the insert before we split ++ * because the stuff we're inserting has to be inserted atomically. Post split, ++ * the keys might have to go in different nodes and the split would no longer be ++ * atomic. 
++ * ++ * Worse, if the insert is from btree node coalescing, if we do the insert after ++ * we do the split (and pick the pivot) - the pivot we pick might be between ++ * nodes that were coalesced, and thus in the middle of a child node post ++ * coalescing: ++ */ ++static void btree_split_insert_keys(struct btree_update *as, ++ struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct keylist *keys) ++{ ++ struct btree_node_iter node_iter; ++ struct bkey_i *k = bch2_keylist_front(keys); ++ struct bkey_packed *src, *dst, *n; ++ struct bset *i; ++ ++ bch2_btree_node_iter_init(&node_iter, b, &k->k.p); ++ ++ __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); ++ ++ /* ++ * We can't tolerate whiteouts here - with whiteouts there can be ++ * duplicate keys, and it would be rather bad if we picked a duplicate ++ * for the pivot: ++ */ ++ i = btree_bset_first(b); ++ src = dst = i->start; ++ while (src != vstruct_last(i)) { ++ n = bkey_next(src); ++ if (!bkey_deleted(src)) { ++ memmove_u64s_down(dst, src, src->u64s); ++ dst = bkey_next(dst); ++ } ++ src = n; ++ } ++ ++ /* Also clear out the unwritten whiteouts area: */ ++ b->whiteout_u64s = 0; ++ ++ i->u64s = cpu_to_le16((u64 *) dst - i->_data); ++ set_btree_bset_end(b, b->set); ++ ++ BUG_ON(b->nsets != 1 || ++ b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); ++ ++ btree_node_interior_verify(as->c, b); ++} ++ ++static void btree_split(struct btree_update *as, struct btree_trans *trans, ++ struct btree_path *path, struct btree *b, ++ struct keylist *keys, unsigned flags) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *parent = btree_node_parent(path, b); ++ struct btree *n1, *n2 = NULL, *n3 = NULL; ++ u64 start_time = local_clock(); ++ ++ BUG_ON(!parent && (b != btree_node_root(c, b))); ++ BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ ++ n1 = bch2_btree_node_alloc_replacement(as, b); ++ ++ if (keys) ++ btree_split_insert_keys(as, trans, path, n1, keys); ++ ++ if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { ++ trace_btree_split(c, b); ++ ++ n2 = __btree_split_node(as, n1); ++ ++ bch2_btree_build_aux_trees(n2); ++ bch2_btree_build_aux_trees(n1); ++ six_unlock_write(&n2->c.lock); ++ six_unlock_write(&n1->c.lock); ++ ++ bch2_btree_update_add_new_node(as, n1); ++ ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); ++ bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); ++ ++ /* ++ * Note that on recursive parent_keys == keys, so we ++ * can't start adding new keys to parent_keys before emptying it ++ * out (which we did with btree_split_insert_keys() above) ++ */ ++ bch2_keylist_add(&as->parent_keys, &n1->key); ++ bch2_keylist_add(&as->parent_keys, &n2->key); ++ ++ if (!parent) { ++ /* Depth increases, make a new root */ ++ n3 = __btree_root_alloc(as, b->c.level + 1); ++ ++ n3->sib_u64s[0] = U16_MAX; ++ n3->sib_u64s[1] = U16_MAX; ++ ++ btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); ++ ++ bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); ++ } ++ } else { ++ trace_btree_compact(c, b); ++ ++ bch2_btree_build_aux_trees(n1); ++ six_unlock_write(&n1->c.lock); ++ ++ bch2_btree_update_add_new_node(as, n1); ++ ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); ++ ++ if (parent) ++ bch2_keylist_add(&as->parent_keys, &n1->key); ++ } ++ ++ /* New nodes all written, now make them visible: */ ++ ++ if (parent) { ++ /* Split a non root node */ ++ bch2_btree_insert_node(as, trans, path, parent, 
&as->parent_keys, flags); ++ } else if (n3) { ++ bch2_btree_set_root(as, trans, path, n3); ++ } else { ++ /* Root filled up but didn't need to be split */ ++ bch2_btree_set_root(as, trans, path, n1); ++ } ++ ++ bch2_btree_update_get_open_buckets(as, n1); ++ if (n2) ++ bch2_btree_update_get_open_buckets(as, n2); ++ if (n3) ++ bch2_btree_update_get_open_buckets(as, n3); ++ ++ /* Successful split, update the path to point to the new nodes: */ ++ ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); ++ if (n3) ++ bch2_trans_node_add(trans, n3); ++ if (n2) ++ bch2_trans_node_add(trans, n2); ++ bch2_trans_node_add(trans, n1); ++ ++ /* ++ * The old node must be freed (in memory) _before_ unlocking the new ++ * nodes - else another thread could re-acquire a read lock on the old ++ * node after another thread has locked and updated the new node, thus ++ * seeing stale data: ++ */ ++ bch2_btree_node_free_inmem(trans, b); ++ ++ if (n3) ++ six_unlock_intent(&n3->c.lock); ++ if (n2) ++ six_unlock_intent(&n2->c.lock); ++ six_unlock_intent(&n1->c.lock); ++ ++ bch2_trans_verify_locks(trans); ++ ++ bch2_time_stats_update(&c->times[n2 ++ ? BCH_TIME_btree_node_split ++ : BCH_TIME_btree_node_compact], ++ start_time); ++} ++ ++static void ++bch2_btree_insert_keys_interior(struct btree_update *as, ++ struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct keylist *keys) ++{ ++ struct btree_path *linked; ++ ++ __bch2_btree_insert_keys_interior(as, trans, path, b, ++ path->l[b->c.level].iter, keys); ++ ++ btree_update_updated_node(as, b); ++ ++ trans_for_each_path_with_node(trans, b, linked) ++ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); ++ ++ bch2_trans_verify_paths(trans); ++} ++ ++/** ++ * bch_btree_insert_node - insert bkeys into a given btree node ++ * ++ * @iter: btree iterator ++ * @keys: list of keys to insert ++ * @hook: insert callback ++ * @persistent: if not null, @persistent will wait on journal write ++ * ++ * Inserts as many keys as it can into a given btree node, splitting it if full. ++ * If a split occurred, this function will return early. This can only happen ++ * for leaf nodes -- inserts into interior nodes have to be atomic. 
++ */ ++static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, ++ struct btree_path *path, struct btree *b, ++ struct keylist *keys, unsigned flags) ++{ ++ struct bch_fs *c = as->c; ++ int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); ++ int old_live_u64s = b->nr.live_u64s; ++ int live_u64s_added, u64s_added; ++ ++ lockdep_assert_held(&c->gc_lock); ++ BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); ++ BUG_ON(!b->c.level); ++ BUG_ON(!as || as->b); ++ bch2_verify_keylist_sorted(keys); ++ ++ bch2_btree_node_lock_for_insert(trans, path, b); ++ ++ if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { ++ bch2_btree_node_unlock_write(trans, path, b); ++ goto split; ++ } ++ ++ btree_node_interior_verify(c, b); ++ ++ bch2_btree_insert_keys_interior(as, trans, path, b, keys); ++ ++ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; ++ u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; ++ ++ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); ++ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); ++ ++ if (u64s_added > live_u64s_added && ++ bch2_maybe_compact_whiteouts(c, b)) ++ bch2_trans_node_reinit_iter(trans, b); ++ ++ bch2_btree_node_unlock_write(trans, path, b); ++ ++ btree_node_interior_verify(c, b); ++ return; ++split: ++ btree_split(as, trans, path, b, keys, flags); ++} ++ ++int bch2_btree_split_leaf(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned flags) ++{ ++ struct btree *b = path_l(path)->b; ++ struct btree_update *as; ++ unsigned l; ++ int ret = 0; ++ ++ as = bch2_btree_update_start(trans, path, path->level, ++ true, flags); ++ if (IS_ERR(as)) ++ return PTR_ERR(as); ++ ++ btree_split(as, trans, path, b, NULL, flags); ++ bch2_btree_update_done(as); ++ ++ for (l = path->level + 1; btree_path_node(path, l) && !ret; l++) ++ ret = bch2_foreground_maybe_merge(trans, path, l, flags); ++ ++ return ret; ++} ++ ++int __bch2_foreground_maybe_merge(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned level, ++ unsigned flags, ++ enum btree_node_sibling sib) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path *sib_path = NULL; ++ struct btree_update *as; ++ struct bkey_format_state new_s; ++ struct bkey_format new_f; ++ struct bkey_i delete; ++ struct btree *b, *m, *n, *prev, *next, *parent; ++ struct bpos sib_pos; ++ size_t sib_u64s; ++ u64 start_time = local_clock(); ++ int ret = 0; ++ ++ BUG_ON(!path->should_be_locked); ++ BUG_ON(!btree_node_locked(path, level)); ++ ++ b = path->l[level].b; ++ ++ if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) || ++ (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) { ++ b->sib_u64s[sib] = U16_MAX; ++ return 0; ++ } ++ ++ sib_pos = sib == btree_prev_sib ++ ? 
bpos_predecessor(b->data->min_key) ++ : bpos_successor(b->data->max_key); ++ ++ sib_path = bch2_path_get(trans, path->btree_id, sib_pos, ++ U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); ++ ret = bch2_btree_path_traverse(trans, sib_path, false); ++ if (ret) ++ goto err; ++ ++ sib_path->should_be_locked = true; ++ ++ m = sib_path->l[level].b; ++ ++ if (btree_node_parent(path, b) != ++ btree_node_parent(sib_path, m)) { ++ b->sib_u64s[sib] = U16_MAX; ++ goto out; ++ } ++ ++ if (sib == btree_prev_sib) { ++ prev = m; ++ next = b; ++ } else { ++ prev = b; ++ next = m; ++ } ++ ++ if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) { ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; ++ ++ bch2_bpos_to_text(&buf1, prev->data->max_key); ++ bch2_bpos_to_text(&buf2, next->data->min_key); ++ bch_err(c, ++ "btree topology error in btree merge:\n" ++ " prev ends at %s\n" ++ " next starts at %s", ++ buf1.buf, buf2.buf); ++ printbuf_exit(&buf1); ++ printbuf_exit(&buf2); ++ bch2_topology_error(c); ++ ret = -EIO; ++ goto err; ++ } ++ ++ bch2_bkey_format_init(&new_s); ++ bch2_bkey_format_add_pos(&new_s, prev->data->min_key); ++ __bch2_btree_calc_format(&new_s, prev); ++ __bch2_btree_calc_format(&new_s, next); ++ bch2_bkey_format_add_pos(&new_s, next->data->max_key); ++ new_f = bch2_bkey_format_done(&new_s); ++ ++ sib_u64s = btree_node_u64s_with_format(b, &new_f) + ++ btree_node_u64s_with_format(m, &new_f); ++ ++ if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { ++ sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); ++ sib_u64s /= 2; ++ sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); ++ } ++ ++ sib_u64s = min(sib_u64s, btree_max_u64s(c)); ++ sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1); ++ b->sib_u64s[sib] = sib_u64s; ++ ++ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) ++ goto out; ++ ++ parent = btree_node_parent(path, b); ++ as = bch2_btree_update_start(trans, path, level, false, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ flags); ++ ret = PTR_ERR_OR_ZERO(as); ++ if (ret) ++ goto err; ++ ++ trace_btree_merge(c, b); ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ bch2_btree_interior_update_will_free_node(as, m); ++ ++ n = bch2_btree_node_alloc(as, b->c.level); ++ ++ SET_BTREE_NODE_SEQ(n->data, ++ max(BTREE_NODE_SEQ(b->data), ++ BTREE_NODE_SEQ(m->data)) + 1); ++ ++ btree_set_min(n, prev->data->min_key); ++ btree_set_max(n, next->data->max_key); ++ ++ bch2_btree_update_add_new_node(as, n); ++ ++ n->data->format = new_f; ++ btree_node_set_format(n, new_f); ++ ++ bch2_btree_sort_into(c, n, prev); ++ bch2_btree_sort_into(c, n, next); ++ ++ bch2_btree_build_aux_trees(n); ++ six_unlock_write(&n->c.lock); ++ ++ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); ++ ++ bkey_init(&delete.k); ++ delete.k.p = prev->key.k.p; ++ bch2_keylist_add(&as->parent_keys, &delete); ++ bch2_keylist_add(&as->parent_keys, &n->key); ++ ++ bch2_trans_verify_paths(trans); ++ ++ bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); ++ ++ bch2_trans_verify_paths(trans); ++ ++ bch2_btree_update_get_open_buckets(as, n); ++ ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); ++ six_lock_increment(&m->c.lock, SIX_LOCK_intent); ++ ++ bch2_trans_node_add(trans, n); ++ ++ bch2_trans_verify_paths(trans); ++ ++ bch2_btree_node_free_inmem(trans, b); ++ bch2_btree_node_free_inmem(trans, m); ++ ++ six_unlock_intent(&n->c.lock); ++ ++ bch2_btree_update_done(as); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); ++out: ++err: ++ 
bch2_path_put(trans, sib_path, true); ++ bch2_trans_verify_locks(trans); ++ return ret; ++} ++ ++/** ++ * bch_btree_node_rewrite - Rewrite/move a btree node ++ */ ++int bch2_btree_node_rewrite(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct btree *b, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *n, *parent; ++ struct btree_update *as; ++ int ret; ++ ++ flags |= BTREE_INSERT_NOFAIL; ++ ++ parent = btree_node_parent(iter->path, b); ++ as = bch2_btree_update_start(trans, iter->path, b->c.level, ++ false, flags); ++ ret = PTR_ERR_OR_ZERO(as); ++ if (ret) ++ goto out; ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ ++ n = bch2_btree_node_alloc_replacement(as, b); ++ bch2_btree_update_add_new_node(as, n); ++ ++ bch2_btree_build_aux_trees(n); ++ six_unlock_write(&n->c.lock); ++ ++ trace_btree_rewrite(c, b); ++ ++ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); ++ ++ if (parent) { ++ bch2_keylist_add(&as->parent_keys, &n->key); ++ bch2_btree_insert_node(as, trans, iter->path, parent, ++ &as->parent_keys, flags); ++ } else { ++ bch2_btree_set_root(as, trans, iter->path, n); ++ } ++ ++ bch2_btree_update_get_open_buckets(as, n); ++ ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); ++ bch2_trans_node_add(trans, n); ++ bch2_btree_node_free_inmem(trans, b); ++ six_unlock_intent(&n->c.lock); ++ ++ bch2_btree_update_done(as); ++out: ++ bch2_btree_path_downgrade(trans, iter->path); ++ return ret; ++} ++ ++struct async_btree_rewrite { ++ struct bch_fs *c; ++ struct work_struct work; ++ enum btree_id btree_id; ++ unsigned level; ++ struct bpos pos; ++ __le64 seq; ++}; ++ ++static int async_btree_node_rewrite_trans(struct btree_trans *trans, ++ struct async_btree_rewrite *a) ++{ ++ struct btree_iter iter; ++ struct btree *b; ++ int ret; ++ ++ bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos, ++ BTREE_MAX_DEPTH, a->level, 0); ++ b = bch2_btree_iter_peek_node(&iter); ++ ret = PTR_ERR_OR_ZERO(b); ++ if (ret) ++ goto out; ++ ++ if (!b || b->data->keys.seq != a->seq) ++ goto out; ++ ++ ret = bch2_btree_node_rewrite(trans, &iter, b, 0); ++out : ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++void async_btree_node_rewrite_work(struct work_struct *work) ++{ ++ struct async_btree_rewrite *a = ++ container_of(work, struct async_btree_rewrite, work); ++ struct bch_fs *c = a->c; ++ ++ bch2_trans_do(c, NULL, NULL, 0, ++ async_btree_node_rewrite_trans(&trans, a)); ++ percpu_ref_put(&c->writes); ++ kfree(a); ++} ++ ++void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) ++{ ++ struct async_btree_rewrite *a; ++ ++ if (!percpu_ref_tryget_live(&c->writes)) ++ return; ++ ++ a = kmalloc(sizeof(*a), GFP_NOFS); ++ if (!a) { ++ percpu_ref_put(&c->writes); ++ return; ++ } ++ ++ a->c = c; ++ a->btree_id = b->c.btree_id; ++ a->level = b->c.level; ++ a->pos = b->key.k.p; ++ a->seq = b->data->keys.seq; ++ ++ INIT_WORK(&a->work, async_btree_node_rewrite_work); ++ queue_work(c->btree_interior_update_worker, &a->work); ++} ++ ++static int __bch2_btree_node_update_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct btree *b, struct btree *new_hash, ++ struct bkey_i *new_key, ++ bool skip_triggers) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter2 = { NULL }; ++ struct btree *parent; ++ int ret; ++ ++ if (!skip_triggers) { ++ ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1, ++ bkey_i_to_s_c(&b->key), 0); ++ if (ret) ++ return ret; ++ ++ ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 
1, ++ new_key, 0); ++ if (ret) ++ return ret; ++ } ++ ++ if (new_hash) { ++ bkey_copy(&new_hash->key, new_key); ++ ret = bch2_btree_node_hash_insert(&c->btree_cache, ++ new_hash, b->c.level, b->c.btree_id); ++ BUG_ON(ret); ++ } ++ ++ parent = btree_node_parent(iter->path, b); ++ if (parent) { ++ bch2_trans_copy_iter(&iter2, iter); ++ ++ iter2.path = bch2_btree_path_make_mut(trans, iter2.path, ++ iter2.flags & BTREE_ITER_INTENT, ++ _THIS_IP_); ++ ++ BUG_ON(iter2.path->level != b->c.level); ++ BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); ++ ++ btree_node_unlock(trans, iter2.path, iter2.path->level); ++ path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; ++ iter2.path->level++; ++ btree_path_set_dirty(iter2.path, BTREE_ITER_NEED_TRAVERSE); ++ ++ bch2_btree_path_check_sort(trans, iter2.path, 0); ++ ++ ret = bch2_btree_iter_traverse(&iter2) ?: ++ bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); ++ if (ret) ++ goto err; ++ } else { ++ BUG_ON(btree_node_root(c, b) != b); ++ ++ ret = darray_make_room(&trans->extra_journal_entries, ++ jset_u64s(new_key->k.u64s)); ++ if (ret) ++ return ret; ++ ++ journal_entry_set((void *) &darray_top(trans->extra_journal_entries), ++ BCH_JSET_ENTRY_btree_root, ++ b->c.btree_id, b->c.level, ++ new_key, new_key->k.u64s); ++ trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); ++ } ++ ++ ret = bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_JOURNAL_RECLAIM| ++ JOURNAL_WATERMARK_reserved); ++ if (ret) ++ goto err; ++ ++ bch2_btree_node_lock_write(trans, iter->path, b); ++ ++ if (new_hash) { ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, new_hash); ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ bkey_copy(&b->key, new_key); ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ mutex_unlock(&c->btree_cache.lock); ++ } else { ++ bkey_copy(&b->key, new_key); ++ } ++ ++ bch2_btree_node_unlock_write(trans, iter->path, b); ++out: ++ bch2_trans_iter_exit(trans, &iter2); ++ return ret; ++err: ++ if (new_hash) { ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ mutex_unlock(&c->btree_cache.lock); ++ } ++ goto out; ++} ++ ++int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter, ++ struct btree *b, struct bkey_i *new_key, ++ bool skip_triggers) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *new_hash = NULL; ++ struct btree_path *path = iter->path; ++ struct closure cl; ++ int ret = 0; ++ ++ if (!btree_node_intent_locked(path, b->c.level) && ++ !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); ++ ++ closure_init_stack(&cl); ++ ++ /* ++ * check btree_ptr_hash_val() after @b is locked by ++ * btree_iter_traverse(): ++ */ ++ if (btree_ptr_hash_val(new_key) != b->hash_val) { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ if (ret) { ++ bch2_trans_unlock(trans); ++ closure_sync(&cl); ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ return ret; ++ } ++ ++ new_hash = bch2_btree_node_mem_alloc(c, false); ++ } ++ ++ path->intent_ref++; ++ ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, ++ new_key, skip_triggers); ++ --path->intent_ref; ++ ++ if (new_hash) { ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&new_hash->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ six_unlock_write(&new_hash->c.lock); ++ 
six_unlock_intent(&new_hash->c.lock); ++ } ++ closure_sync(&cl); ++ bch2_btree_cache_cannibalize_unlock(c); ++ return ret; ++} ++ ++int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, ++ struct btree *b, struct bkey_i *new_key, ++ bool skip_triggers) ++{ ++ struct btree_iter iter; ++ int ret; ++ ++ bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, ++ BTREE_MAX_DEPTH, b->c.level, ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&iter); ++ if (ret) ++ goto out; ++ ++ /* has node been freed? */ ++ if (iter.path->l[b->c.level].b != b) { ++ /* node has been freed: */ ++ BUG_ON(!btree_node_dying(b)); ++ goto out; ++ } ++ ++ BUG_ON(!btree_node_hashed(b)); ++ ++ ret = bch2_btree_node_update_key(trans, &iter, b, new_key, skip_triggers); ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/* Init code: */ ++ ++/* ++ * Only for filesystem bringup, when first reading the btree roots or allocating ++ * btree roots when initializing a new filesystem: ++ */ ++void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) ++{ ++ BUG_ON(btree_node_root(c, b)); ++ ++ bch2_btree_set_root_inmem(c, b); ++} ++ ++void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) ++{ ++ struct closure cl; ++ struct btree *b; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ ++ b = bch2_btree_node_mem_alloc(c, false); ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ set_btree_node_fake(b); ++ set_btree_node_need_rewrite(b); ++ b->c.level = 0; ++ b->c.btree_id = id; ++ ++ bkey_btree_ptr_init(&b->key); ++ b->key.k.p = SPOS_MAX; ++ *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; ++ ++ bch2_bset_init_first(b, &b->data->keys); ++ bch2_btree_build_aux_trees(b); ++ ++ b->data->flags = 0; ++ btree_set_min(b, POS_MIN); ++ btree_set_max(b, SPOS_MAX); ++ b->data->format = bch2_btree_calc_format(b); ++ btree_node_set_format(b, b->data->format); ++ ++ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, ++ b->c.level, b->c.btree_id); ++ BUG_ON(ret); ++ ++ bch2_btree_set_root_inmem(c, b); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++} ++ ++void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct btree_update *as; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_for_each_entry(as, &c->btree_interior_update_list, list) ++ prt_printf(out, "%p m %u w %u r %u j %llu\n", ++ as, ++ as->mode, ++ as->nodes_written, ++ atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, ++ as->journal.seq); ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++static bool bch2_btree_interior_updates_pending(struct bch_fs *c) ++{ ++ bool ret; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ ret = !list_empty(&c->btree_interior_update_list); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ return ret; ++} ++ ++bool bch2_btree_interior_updates_flush(struct bch_fs *c) ++{ ++ bool ret = bch2_btree_interior_updates_pending(c); ++ ++ if (ret) ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_pending(c)); ++ return ret; ++} ++ ++void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset) ++{ ++ struct btree_root *r; ++ struct jset_entry *entry; ++ ++ mutex_lock(&c->btree_root_lock); ++ ++ vstruct_for_each(jset, entry) ++ if (entry->type == BCH_JSET_ENTRY_btree_root) { ++ r = &c->btree_roots[entry->btree_id]; ++ r->level = entry->level; ++ r->alive = 
true; ++ bkey_copy(&r->key, &entry->start[0]); ++ } ++ ++ mutex_unlock(&c->btree_root_lock); ++} ++ ++struct jset_entry * ++bch2_btree_roots_to_journal_entries(struct bch_fs *c, ++ struct jset_entry *start, ++ struct jset_entry *end) ++{ ++ struct jset_entry *entry; ++ unsigned long have = 0; ++ unsigned i; ++ ++ for (entry = start; entry < end; entry = vstruct_next(entry)) ++ if (entry->type == BCH_JSET_ENTRY_btree_root) ++ __set_bit(entry->btree_id, &have); ++ ++ mutex_lock(&c->btree_root_lock); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].alive && !test_bit(i, &have)) { ++ journal_entry_set(end, ++ BCH_JSET_ENTRY_btree_root, ++ i, c->btree_roots[i].level, ++ &c->btree_roots[i].key, ++ c->btree_roots[i].key.u64s); ++ end = vstruct_next(end); ++ } ++ ++ mutex_unlock(&c->btree_root_lock); ++ ++ return end; ++} ++ ++void bch2_fs_btree_interior_update_exit(struct bch_fs *c) ++{ ++ if (c->btree_interior_update_worker) ++ destroy_workqueue(c->btree_interior_update_worker); ++ mempool_exit(&c->btree_interior_update_pool); ++} ++ ++int bch2_fs_btree_interior_update_init(struct bch_fs *c) ++{ ++ mutex_init(&c->btree_reserve_cache_lock); ++ INIT_LIST_HEAD(&c->btree_interior_update_list); ++ INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); ++ mutex_init(&c->btree_interior_update_lock); ++ INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); ++ ++ c->btree_interior_update_worker = ++ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); ++ if (!c->btree_interior_update_worker) ++ return -ENOMEM; ++ ++ return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, ++ sizeof(struct btree_update)); ++} +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +new file mode 100644 +index 000000000000..adfc6c24a7a4 +--- /dev/null ++++ b/fs/bcachefs/btree_update_interior.h +@@ -0,0 +1,321 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H ++#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H ++ ++#include "btree_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++ ++void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *); ++bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, ++ struct bkey_format *); ++ ++#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) ++ ++#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) ++ ++/* ++ * Tracks an in progress split/rewrite of a btree node and the update to the ++ * parent node: ++ * ++ * When we split/rewrite a node, we do all the updates in memory without ++ * waiting for any writes to complete - we allocate the new node(s) and update ++ * the parent node, possibly recursively up to the root. ++ * ++ * The end result is that we have one or more new nodes being written - ++ * possibly several, if there were multiple splits - and then a write (updating ++ * an interior node) which will make all these new nodes visible. ++ * ++ * Additionally, as we split/rewrite nodes we free the old nodes - but the old ++ * nodes can't be freed (their space on disk can't be reclaimed) until the ++ * update to the interior node that makes the new node visible completes - ++ * until then, the old nodes are still reachable on disk. ++ * ++ */ ++struct btree_update { ++ struct closure cl; ++ struct bch_fs *c; ++ u64 start_time; ++ ++ struct list_head list; ++ struct list_head unwritten_list; ++ ++ /* What kind of update are we doing? 
*/ ++ enum { ++ BTREE_INTERIOR_NO_UPDATE, ++ BTREE_INTERIOR_UPDATING_NODE, ++ BTREE_INTERIOR_UPDATING_ROOT, ++ BTREE_INTERIOR_UPDATING_AS, ++ } mode; ++ ++ unsigned nodes_written:1; ++ unsigned took_gc_lock:1; ++ ++ enum btree_id btree_id; ++ ++ struct disk_reservation disk_res; ++ struct journal_preres journal_preres; ++ ++ /* ++ * BTREE_INTERIOR_UPDATING_NODE: ++ * The update that made the new nodes visible was a regular update to an ++ * existing interior node - @b. We can't write out the update to @b ++ * until the new nodes we created are finished writing, so we block @b ++ * from writing by putting this btree_interior update on the ++ * @b->write_blocked list with @write_blocked_list: ++ */ ++ struct btree *b; ++ struct list_head write_blocked_list; ++ ++ /* ++ * We may be freeing nodes that were dirty, and thus had journal entries ++ * pinned: we need to transfer the oldest of those pins to the ++ * btree_update operation, and release it when the new node(s) ++ * are all persistent and reachable: ++ */ ++ struct journal_entry_pin journal; ++ ++ /* Preallocated nodes we reserve when we start the update: */ ++ struct prealloc_nodes { ++ struct btree *b[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr; ++ } prealloc_nodes[2]; ++ ++ /* Nodes being freed: */ ++ struct keylist old_keys; ++ u64 _old_keys[BTREE_UPDATE_NODES_MAX * ++ BKEY_BTREE_PTR_U64s_MAX]; ++ ++ /* Nodes being added: */ ++ struct keylist new_keys; ++ u64 _new_keys[BTREE_UPDATE_NODES_MAX * ++ BKEY_BTREE_PTR_U64s_MAX]; ++ ++ /* New nodes, that will be made reachable by this update: */ ++ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr_new_nodes; ++ ++ struct btree *old_nodes[BTREE_UPDATE_NODES_MAX]; ++ __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr_old_nodes; ++ ++ open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * ++ BCH_REPLICAS_MAX]; ++ open_bucket_idx_t nr_open_buckets; ++ ++ unsigned journal_u64s; ++ u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; ++ ++ /* Only here to reduce stack usage on recursive splits: */ ++ struct keylist parent_keys; ++ /* ++ * Enough room for btree_split's keys without realloc - btree node ++ * pointers never have crc/compression info, so we only need to acount ++ * for the pointers for three keys ++ */ ++ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; ++}; ++ ++struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, ++ struct btree *, ++ struct bkey_format); ++ ++int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned); ++ ++int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *, ++ unsigned, unsigned, enum btree_node_sibling); ++ ++static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned level, unsigned flags, ++ enum btree_node_sibling sib) ++{ ++ struct btree *b; ++ ++ EBUG_ON(!btree_node_locked(path, level)); ++ ++ b = path->l[level].b; ++ if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) ++ return 0; ++ ++ return __bch2_foreground_maybe_merge(trans, path, level, flags, sib); ++} ++ ++static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned level, ++ unsigned flags) ++{ ++ return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, ++ btree_prev_sib) ?: ++ bch2_foreground_maybe_merge_sibling(trans, path, level, flags, ++ btree_next_sib); ++} ++ ++void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); ++void bch2_btree_root_alloc(struct bch_fs *, 
enum btree_id); ++ ++static inline unsigned btree_update_reserve_required(struct bch_fs *c, ++ struct btree *b) ++{ ++ unsigned depth = btree_node_root(c, b)->c.level + 1; ++ ++ /* ++ * Number of nodes we might have to allocate in a worst case btree ++ * split operation - we split all the way up to the root, then allocate ++ * a new root, unless we're already at max depth: ++ */ ++ if (depth < BTREE_MAX_DEPTH) ++ return (depth - b->c.level) * 2 + 1; ++ else ++ return (depth - b->c.level) * 2 - 1; ++} ++ ++static inline void btree_node_reset_sib_u64s(struct btree *b) ++{ ++ b->sib_u64s[0] = b->nr.live_u64s; ++ b->sib_u64s[1] = b->nr.live_u64s; ++} ++ ++static inline void *btree_data_end(struct bch_fs *c, struct btree *b) ++{ ++ return (void *) b->data + btree_bytes(c); ++} ++ ++static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, ++ struct btree *b) ++{ ++ return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); ++} ++ ++static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, ++ struct btree *b) ++{ ++ return btree_data_end(c, b); ++} ++ ++static inline void *write_block(struct btree *b) ++{ ++ return (void *) b->data + (b->written << 9); ++} ++ ++static inline bool __btree_addr_written(struct btree *b, void *p) ++{ ++ return p < write_block(b); ++} ++ ++static inline bool bset_written(struct btree *b, struct bset *i) ++{ ++ return __btree_addr_written(b, i); ++} ++ ++static inline bool bkey_written(struct btree *b, struct bkey_packed *k) ++{ ++ return __btree_addr_written(b, k); ++} ++ ++static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, ++ struct btree *b, ++ void *end) ++{ ++ ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + ++ b->whiteout_u64s; ++ ssize_t total = c->opts.btree_node_size >> 3; ++ ++ /* Always leave one extra u64 for bch2_varint_decode: */ ++ used++; ++ ++ return total - used; ++} ++ ++static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, ++ struct btree *b) ++{ ++ ssize_t remaining = __bch_btree_u64s_remaining(c, b, ++ btree_bkey_last(b, bset_tree_last(b))); ++ ++ BUG_ON(remaining < 0); ++ ++ if (bset_written(b, btree_bset_last(b))) ++ return 0; ++ ++ return remaining; ++} ++ ++#define BTREE_WRITE_SET_U64s_BITS 9 ++ ++static inline unsigned btree_write_set_buffer(struct btree *b) ++{ ++ /* ++ * Could buffer up larger amounts of keys for btrees with larger keys, ++ * pending benchmarking: ++ */ ++ return 8 << BTREE_WRITE_SET_U64s_BITS; ++} ++ ++static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, ++ struct btree *b) ++{ ++ struct bset_tree *t = bset_tree_last(b); ++ struct btree_node_entry *bne = max(write_block(b), ++ (void *) btree_bkey_last(b, bset_tree_last(b))); ++ ssize_t remaining_space = ++ __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); ++ ++ if (unlikely(bset_written(b, bset(b, t)))) { ++ if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) ++ return bne; ++ } else { ++ if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && ++ remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3)) ++ return bne; ++ } ++ ++ return NULL; ++} ++ ++static inline void push_whiteout(struct bch_fs *c, struct btree *b, ++ struct bpos pos) ++{ ++ struct bkey_packed k; ++ ++ BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); ++ ++ if (!bkey_pack_pos(&k, pos, b)) { ++ struct bkey *u = (void *) &k; ++ ++ bkey_init(u); ++ u->p = pos; ++ } ++ ++ k.needs_whiteout = true; ++ ++ b->whiteout_u64s += k.u64s; ++ bkey_copy(unwritten_whiteouts_start(c, 
b), &k); ++} ++ ++/* ++ * write lock must be held on @b (else the dirty bset that we were going to ++ * insert into could be written out from under us) ++ */ ++static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, ++ struct btree *b, unsigned u64s) ++{ ++ if (unlikely(btree_node_need_rewrite(b))) ++ return false; ++ ++ return u64s <= bch_btree_keys_u64s_remaining(c, b); ++} ++ ++void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); ++ ++bool bch2_btree_interior_updates_flush(struct bch_fs *); ++ ++void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *); ++struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, ++ struct jset_entry *, struct jset_entry *); ++ ++void bch2_fs_btree_interior_update_exit(struct bch_fs *); ++int bch2_fs_btree_interior_update_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +new file mode 100644 +index 000000000000..e2ecbd3bca77 +--- /dev/null ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -0,0 +1,1800 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "buckets.h" ++#include "debug.h" ++#include "errcode.h" ++#include "error.h" ++#include "extent_update.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "keylist.h" ++#include "recovery.h" ++#include "subvolume.h" ++#include "replicas.h" ++ ++#include ++#include ++#include ++ ++static int __must_check ++bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, ++ struct bkey_i *, enum btree_update_flags); ++ ++static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, ++ const struct btree_insert_entry *r) ++{ ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ cmp_int(l->cached, r->cached) ?: ++ -cmp_int(l->level, r->level) ?: ++ bpos_cmp(l->k->k.p, r->k->k.p); ++} ++ ++static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) ++{ ++ return i->path->l + i->level; ++} ++ ++static inline bool same_leaf_as_prev(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ return i != trans->updates && ++ insert_l(&i[0])->b == insert_l(&i[-1])->b; ++} ++ ++static inline bool same_leaf_as_next(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ return i + 1 < trans->updates + trans->nr_updates && ++ insert_l(&i[0])->b == insert_l(&i[1])->b; ++} ++ ++static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b) ++{ ++ struct bch_fs *c = trans->c; ++ ++ if (path->cached) ++ return; ++ ++ if (unlikely(btree_node_just_written(b)) && ++ bch2_btree_post_write_cleanup(c, b)) ++ bch2_trans_node_reinit_iter(trans, b); ++ ++ /* ++ * If the last bset has been written, or if it's gotten too big - start ++ * a new bset to insert into: ++ */ ++ if (want_new_bset(c, b)) ++ bch2_btree_init_next(trans, b); ++} ++ ++void bch2_btree_node_lock_for_insert(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b) ++{ ++ bch2_btree_node_lock_write(trans, path, b); ++ bch2_btree_node_prep_for_write(trans, path, b); ++} ++ ++/* Inserting into a given leaf node (last stage of insert): */ ++ ++/* Handle overwrites and do insert, for non extents: */ ++bool bch2_btree_bset_insert_key(struct btree_trans 
*trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_i *insert) ++{ ++ struct bkey_packed *k; ++ unsigned clobber_u64s = 0, new_u64s = 0; ++ ++ EBUG_ON(btree_node_just_written(b)); ++ EBUG_ON(bset_written(b, btree_bset_last(b))); ++ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); ++ EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0); ++ EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0); ++ EBUG_ON(insert->k.u64s > ++ bch_btree_keys_u64s_remaining(trans->c, b)); ++ ++ k = bch2_btree_node_iter_peek_all(node_iter, b); ++ if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) ++ k = NULL; ++ ++ /* @k is the key being overwritten/deleted, if any: */ ++ EBUG_ON(k && bkey_deleted(k)); ++ ++ /* Deleting, but not found? nothing to do: */ ++ if (bkey_deleted(&insert->k) && !k) ++ return false; ++ ++ if (bkey_deleted(&insert->k)) { ++ /* Deleting: */ ++ btree_account_key_drop(b, k); ++ k->type = KEY_TYPE_deleted; ++ ++ if (k->needs_whiteout) ++ push_whiteout(trans->c, b, insert->k.p); ++ k->needs_whiteout = false; ++ ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; ++ bch2_bset_delete(b, k, clobber_u64s); ++ goto fix_iter; ++ } else { ++ bch2_btree_path_fix_key_modified(trans, b, k); ++ } ++ ++ return true; ++ } ++ ++ if (k) { ++ /* Overwriting: */ ++ btree_account_key_drop(b, k); ++ k->type = KEY_TYPE_deleted; ++ ++ insert->k.needs_whiteout = k->needs_whiteout; ++ k->needs_whiteout = false; ++ ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; ++ goto overwrite; ++ } else { ++ bch2_btree_path_fix_key_modified(trans, b, k); ++ } ++ } ++ ++ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); ++overwrite: ++ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); ++ new_u64s = k->u64s; ++fix_iter: ++ if (clobber_u64s != new_u64s) ++ bch2_btree_node_iter_fix(trans, path, b, node_iter, k, ++ clobber_u64s, new_u64s); ++ return true; ++} ++ ++static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, ++ unsigned i, u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct btree_write *w = container_of(pin, struct btree_write, journal); ++ struct btree *b = container_of(w, struct btree, writes[i]); ++ unsigned long old, new, v; ++ unsigned idx = w - b->writes; ++ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ v = READ_ONCE(b->flags); ++ ++ do { ++ old = new = v; ++ ++ if (!(old & (1 << BTREE_NODE_dirty)) || ++ !!(old & (1 << BTREE_NODE_write_idx)) != idx || ++ w->journal.seq != seq) ++ break; ++ ++ new |= 1 << BTREE_NODE_need_write; ++ } while ((v = cmpxchg(&b->flags, old, new)) != old); ++ ++ btree_node_write_if_need(c, b, SIX_LOCK_read); ++ six_unlock_read(&b->c.lock); ++ return 0; ++} ++ ++static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) ++{ ++ return __btree_node_flush(j, pin, 0, seq); ++} ++ ++static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) ++{ ++ return __btree_node_flush(j, pin, 1, seq); ++} ++ ++inline void bch2_btree_add_journal_pin(struct bch_fs *c, ++ struct btree *b, u64 seq) ++{ ++ struct btree_write *w = btree_current_write(b); ++ ++ bch2_journal_pin_add(&c->journal, seq, &w->journal, ++ btree_node_write_idx(b) == 0 ++ ? 
btree_node_flush0 ++ : btree_node_flush1); ++} ++ ++/** ++ * btree_insert_key - insert a key one key into a leaf node ++ */ ++static void btree_insert_key_leaf(struct btree_trans *trans, ++ struct btree_insert_entry *insert) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b = insert_l(insert)->b; ++ struct bset_tree *t = bset_tree_last(b); ++ struct bset *i = bset(b, t); ++ int old_u64s = bset_u64s(t); ++ int old_live_u64s = b->nr.live_u64s; ++ int live_u64s_added, u64s_added; ++ ++ if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, ++ &insert_l(insert)->iter, insert->k))) ++ return; ++ ++ i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, ++ le64_to_cpu(i->journal_seq))); ++ ++ bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); ++ ++ if (unlikely(!btree_node_dirty(b))) ++ set_btree_node_dirty_acct(c, b); ++ ++ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; ++ u64s_added = (int) bset_u64s(t) - old_u64s; ++ ++ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); ++ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); ++ ++ if (u64s_added > live_u64s_added && ++ bch2_maybe_compact_whiteouts(c, b)) ++ bch2_trans_node_reinit_iter(trans, b); ++} ++ ++/* Cached btree updates: */ ++ ++/* Normal update interface: */ ++ ++static inline void btree_insert_entry_checks(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ BUG_ON(bpos_cmp(i->k->k.p, i->path->pos)); ++ BUG_ON(i->cached != i->path->cached); ++ BUG_ON(i->level != i->path->level); ++ BUG_ON(i->btree_id != i->path->btree_id); ++ EBUG_ON(!i->level && ++ !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && ++ test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && ++ i->k->k.p.snapshot && ++ bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot)); ++} ++ ++static noinline int ++bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, ++ unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_journal_preres_get(&c->journal, ++ &trans->journal_preres, u64s, 0); ++ if (ret) ++ return ret; ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) { ++ trace_trans_restart_journal_preres_get(trans->fn, trace_ip); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static inline int bch2_trans_journal_res_get(struct btree_trans *trans, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ ret = bch2_journal_res_get(&c->journal, &trans->journal_res, ++ trans->journal_u64s, ++ flags| ++ (trans->flags & JOURNAL_WATERMARK_MASK)); ++ ++ return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; ++} ++ ++#define JSET_ENTRY_LOG_U64s 4 ++ ++static void journal_transaction_name(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct journal *j = &c->journal; ++ struct jset_entry *entry = ++ bch2_journal_add_entry(j, &trans->journal_res, ++ BCH_JSET_ENTRY_log, 0, 0, ++ JSET_ENTRY_LOG_U64s); ++ struct jset_entry_log *l = ++ container_of(entry, struct jset_entry_log, entry); ++ ++ strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); ++} ++ ++static inline enum btree_insert_ret ++btree_key_can_insert(struct btree_trans *trans, ++ struct btree *b, ++ unsigned u64s) ++{ ++ struct bch_fs *c = trans->c; ++ ++ if (!bch2_btree_node_insert_fits(c, b, u64s)) ++ return BTREE_INSERT_BTREE_NODE_FULL; ++ ++ return BTREE_INSERT_OK; ++} ++ ++static enum btree_insert_ret ++btree_key_can_insert_cached(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned u64s) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached *ck = (void *) path->l[0].b; ++ unsigned old_u64s = ck->u64s, new_u64s; ++ struct bkey_i *new_k; ++ ++ EBUG_ON(path->level); ++ ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && ++ bch2_btree_key_cache_must_wait(c) && ++ !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) ++ return BTREE_INSERT_NEED_JOURNAL_RECLAIM; ++ ++ /* ++ * bch2_varint_decode can read past the end of the buffer by at most 7 ++ * bytes (it won't be used): ++ */ ++ u64s += 1; ++ ++ if (u64s <= ck->u64s) ++ return BTREE_INSERT_OK; ++ ++ new_u64s = roundup_pow_of_two(u64s); ++ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); ++ if (!new_k) { ++ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", ++ bch2_btree_ids[path->btree_id], new_u64s); ++ return -ENOMEM; ++ } ++ ++ ck->u64s = new_u64s; ++ ck->k = new_k; ++ /* ++ * Keys returned by peek() are no longer valid pointers, so we need a ++ * transaction restart: ++ */ ++ trace_trans_restart_key_cache_key_realloced(trans->fn, _RET_IP_, ++ path->btree_id, &path->pos, ++ old_u64s, new_u64s); ++ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_key_cache_realloced); ++} ++ ++/* Triggers: */ ++ ++static int run_one_mem_trigger(struct btree_trans *trans, ++ struct btree_insert_entry *i, ++ unsigned flags) ++{ ++ struct bkey_s_c old = { &i->old_k, i->old_v }; ++ struct bkey_i *new = i->k; ++ int ret; ++ ++ if (unlikely(flags & BTREE_TRIGGER_NORUN)) ++ return 0; ++ ++ if (!btree_node_type_needs_gc(i->btree_id)) ++ return 0; ++ ++ if (bch2_bkey_ops[old.k->type].atomic_trigger == ++ bch2_bkey_ops[i->k->k.type].atomic_trigger && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), ++ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); ++ } else { ++ struct bkey _deleted = KEY(0, 0, 0); ++ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; ++ ++ _deleted.p = i->path->pos; ++ ++ ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), ++ BTREE_TRIGGER_INSERT|flags) ?: ++ bch2_mark_key(trans, old, deleted, ++ BTREE_TRIGGER_OVERWRITE|flags); ++ } ++ ++ return ret; ++} ++ ++static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, ++ bool overwrite) ++{ ++ /* ++ * Transactional triggers create new btree_insert_entries, so we can't ++ * pass them a pointer to a btree_insert_entry, that memory is going to ++ * move: ++ */ ++ struct bkey old_k = i->old_k; ++ struct bkey_s_c old = { &old_k, i->old_v }; ++ ++ if ((i->flags & BTREE_TRIGGER_NORUN) || ++ 
!(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) ++ return 0; ++ ++ if (!i->insert_trigger_run && ++ !i->overwrite_trigger_run && ++ bch2_bkey_ops[old.k->type].trans_trigger == ++ bch2_bkey_ops[i->k->k.type].trans_trigger && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ i->overwrite_trigger_run = true; ++ i->insert_trigger_run = true; ++ return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, ++ BTREE_TRIGGER_INSERT| ++ BTREE_TRIGGER_OVERWRITE| ++ i->flags) ?: 1; ++ } else if (overwrite && !i->overwrite_trigger_run) { ++ i->overwrite_trigger_run = true; ++ return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; ++ } else if (!overwrite && !i->insert_trigger_run) { ++ i->insert_trigger_run = true; ++ return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; ++ } else { ++ return 0; ++ } ++} ++ ++static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, ++ struct btree_insert_entry *btree_id_start) ++{ ++ struct btree_insert_entry *i; ++ bool trans_trigger_run; ++ int ret, overwrite; ++ ++ for (overwrite = 1; overwrite >= 0; --overwrite) { ++ ++ /* ++ * Running triggers will append more updates to the list of updates as ++ * we're walking it: ++ */ ++ do { ++ trans_trigger_run = false; ++ ++ for (i = btree_id_start; ++ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; ++ i++) { ++ if (i->btree_id != btree_id) ++ continue; ++ ++ ret = run_one_trans_trigger(trans, i, overwrite); ++ if (ret < 0) ++ return ret; ++ if (ret) ++ trans_trigger_run = true; ++ } ++ } while (trans_trigger_run); ++ } ++ ++ return 0; ++} ++ ++static int bch2_trans_commit_run_triggers(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; ++ unsigned btree_id = 0; ++ int ret = 0; ++ ++ /* ++ * ++ * For a given btree, this algorithm runs insert triggers before ++ * overwrite triggers: this is so that when extents are being moved ++ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before ++ * they are re-added. 
++ */ ++ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { ++ if (btree_id == BTREE_ID_alloc) ++ continue; ++ ++ while (btree_id_start < trans->updates + trans->nr_updates && ++ btree_id_start->btree_id < btree_id) ++ btree_id_start++; ++ ++ ret = run_btree_triggers(trans, btree_id, btree_id_start); ++ if (ret) ++ return ret; ++ } ++ ++ trans_for_each_update(trans, i) { ++ if (i->btree_id > BTREE_ID_alloc) ++ break; ++ if (i->btree_id == BTREE_ID_alloc) { ++ ret = run_btree_triggers(trans, BTREE_ID_alloc, i); ++ if (ret) ++ return ret; ++ break; ++ } ++ } ++ ++ trans_for_each_update(trans, i) ++ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && ++ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && ++ (!i->insert_trigger_run || !i->overwrite_trigger_run)); ++ ++ return 0; ++} ++ ++static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ int ret = 0; ++ ++ trans_for_each_update(trans, i) { ++ /* ++ * XXX: synchronization of cached update triggers with gc ++ * XXX: synchronization of interior node updates with gc ++ */ ++ BUG_ON(i->cached || i->level); ++ ++ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { ++ ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); ++ if (ret) ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static inline int ++bch2_trans_commit_write_locked(struct btree_trans *trans, ++ struct btree_insert_entry **stopped_at, ++ unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ struct btree_trans_commit_hook *h; ++ unsigned u64s = 0; ++ bool marking = false; ++ int ret; ++ ++ if (race_fault()) { ++ trace_trans_restart_fault_inject(trans->fn, trace_ip); ++ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); ++ } ++ ++ /* ++ * Check if the insert will fit in the leaf node with the write lock ++ * held, otherwise another thread could write the node changing the ++ * amount of space available: ++ */ ++ ++ prefetch(&trans->c->journal.flags); ++ ++ h = trans->hooks; ++ while (h) { ++ ret = h->fn(trans, h); ++ if (ret) ++ return ret; ++ h = h->next; ++ } ++ ++ trans_for_each_update(trans, i) { ++ /* Multiple inserts might go to same leaf: */ ++ if (!same_leaf_as_prev(trans, i)) ++ u64s = 0; ++ ++ u64s += i->k->k.u64s; ++ ret = !i->cached ++ ? btree_key_can_insert(trans, insert_l(i)->b, u64s) ++ : btree_key_can_insert_cached(trans, i->path, u64s); ++ if (ret) { ++ *stopped_at = i; ++ return ret; ++ } ++ ++ if (btree_node_type_needs_gc(i->bkey_type)) ++ marking = true; ++ ++ /* ++ * Revalidate before calling mem triggers - XXX, ugly: ++ * ++ * - successful btree node splits don't cause transaction ++ * restarts and will have invalidated the pointer to the bkey ++ * value ++ * - btree_node_lock_for_insert() -> btree_node_prep_for_write() ++ * when it has to resort ++ * - btree_key_can_insert_cached() when it has to reallocate ++ * ++ * Ugly because we currently have no way to tell if the ++ * pointer's been invalidated, which means it's debatabale ++ * whether we should be stashing the old key at all. 
++ */ ++ i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; ++ ++ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { ++ struct bkey_i *j_k = ++ bch2_journal_keys_peek_slot(c, i->btree_id, i->level, ++ i->k->k.p); ++ ++ if (j_k) { ++ i->old_k = j_k->k; ++ i->old_v = &j_k->v; ++ } ++ } ++ } ++ ++ /* ++ * Don't get journal reservation until after we know insert will ++ * succeed: ++ */ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ ret = bch2_trans_journal_res_get(trans, ++ JOURNAL_RES_GET_NONBLOCK); ++ if (ret) ++ return ret; ++ ++ journal_transaction_name(trans); ++ } else { ++ trans->journal_res.seq = c->journal.replay_journal_seq; ++ } ++ ++ if (unlikely(trans->extra_journal_entries.nr)) { ++ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), ++ trans->extra_journal_entries.data, ++ trans->extra_journal_entries.nr); ++ ++ trans->journal_res.offset += trans->extra_journal_entries.nr; ++ trans->journal_res.u64s -= trans->extra_journal_entries.nr; ++ } ++ ++ /* ++ * Not allowed to fail after we've gotten our journal reservation - we ++ * have to use it: ++ */ ++ ++ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { ++ if (bch2_journal_seq_verify) ++ trans_for_each_update(trans, i) ++ i->k->k.version.lo = trans->journal_res.seq; ++ else if (bch2_inject_invalid_keys) ++ trans_for_each_update(trans, i) ++ i->k->k.version = MAX_VERSION; ++ } ++ ++ if (trans->fs_usage_deltas && ++ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) ++ return BTREE_INSERT_NEED_MARK_REPLICAS; ++ ++ trans_for_each_update(trans, i) ++ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { ++ ret = run_one_mem_trigger(trans, i, i->flags); ++ if (ret) ++ return ret; ++ } ++ ++ if (unlikely(c->gc_pos.phase)) { ++ ret = bch2_trans_commit_run_gc_triggers(trans); ++ if (ret) ++ return ret; ++ } ++ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ trans_for_each_update(trans, i) { ++ struct journal *j = &c->journal; ++ struct jset_entry *entry; ++ ++ if (i->key_cache_already_flushed) ++ continue; ++ ++ entry = bch2_journal_add_entry(j, &trans->journal_res, ++ BCH_JSET_ENTRY_overwrite, ++ i->btree_id, i->level, ++ i->old_k.u64s); ++ bkey_reassemble(&entry->start[0], ++ (struct bkey_s_c) { &i->old_k, i->old_v }); ++ ++ entry = bch2_journal_add_entry(j, &trans->journal_res, ++ BCH_JSET_ENTRY_btree_keys, ++ i->btree_id, i->level, ++ i->k->k.u64s); ++ bkey_copy(&entry->start[0], i->k); ++ } ++ ++ if (trans->journal_seq) ++ *trans->journal_seq = trans->journal_res.seq; ++ } ++ ++ trans_for_each_update(trans, i) { ++ i->k->k.needs_whiteout = false; ++ ++ if (!i->cached) ++ btree_insert_key_leaf(trans, i); ++ else if (!i->key_cache_already_flushed) ++ bch2_btree_insert_key_cached(trans, i->path, i->k); ++ else ++ bch2_btree_key_cache_drop(trans, i->path); ++ } ++ ++ return ret; ++} ++ ++static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_path *path) ++{ ++ unsigned l; ++ ++ for (l = 0; l < BTREE_MAX_DEPTH; l++) ++ if (btree_node_read_locked(path, l)) ++ BUG_ON(!bch2_btree_node_upgrade(trans, path, l)); ++} ++ ++static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path) ++{ ++ struct btree *b = path_l(path)->b; ++ ++ do { ++ if (path->nodes_locked && ++ path->nodes_locked != path->nodes_intent_locked) ++ path_upgrade_readers(trans, path); ++ } while ((path = prev_btree_path(trans, path)) && ++ path_l(path)->b == b); ++} ++ ++/* ++ * Check for nodes that we have both read and intent 
locks on, and upgrade the ++ * readers to intent: ++ */ ++static inline void normalize_read_intent_locks(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ unsigned i, nr_read = 0, nr_intent = 0; ++ ++ trans_for_each_path_inorder(trans, path, i) { ++ struct btree_path *next = i + 1 < trans->nr_sorted ++ ? trans->paths + trans->sorted[i + 1] ++ : NULL; ++ ++ if (path->nodes_locked) { ++ if (path->nodes_intent_locked) ++ nr_intent++; ++ else ++ nr_read++; ++ } ++ ++ if (!next || path_l(path)->b != path_l(next)->b) { ++ if (nr_read && nr_intent) ++ upgrade_readers(trans, path); ++ ++ nr_read = nr_intent = 0; ++ } ++ } ++ ++ bch2_trans_verify_locks(trans); ++} ++ ++static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos) ++{ ++ struct btree_path *path; ++ unsigned i; ++ ++ trans_for_each_path_inorder(trans, path, i) { ++ //if (path == pos) ++ // break; ++ ++ if (path->nodes_locked != path->nodes_intent_locked && ++ !bch2_btree_path_upgrade(trans, path, path->level + 1)) ++ return true; ++ } ++ ++ return false; ++} ++ ++static inline int trans_lock_write(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ int ret; ++ ++ trans_for_each_update(trans, i) { ++ if (same_leaf_as_prev(trans, i)) ++ continue; ++ ++ if (!six_trylock_write(&insert_l(i)->b->c.lock)) { ++ if (have_conflicting_read_lock(trans, i->path)) ++ goto fail; ++ ++ ret = btree_node_lock_type(trans, i->path, ++ insert_l(i)->b, ++ i->path->pos, i->level, ++ SIX_LOCK_write, NULL, NULL); ++ BUG_ON(ret); ++ } ++ ++ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); ++ } ++ ++ return 0; ++fail: ++ while (--i >= trans->updates) { ++ if (same_leaf_as_prev(trans, i)) ++ continue; ++ ++ bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); ++ } ++ ++ trace_trans_restart_would_deadlock_write(trans->fn); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); ++} ++ ++static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) ++ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); ++} ++ ++/* ++ * Get journal reservation, take write locks, and attempt to do btree update(s): ++ */ ++static inline int do_bch2_trans_commit(struct btree_trans *trans, ++ struct btree_insert_entry **stopped_at, ++ unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ struct printbuf buf = PRINTBUF; ++ int ret, u64s_delta = 0; ++ int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; ++ ++ trans_for_each_update(trans, i) { ++ if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), ++ i->bkey_type, rw, &buf)) { ++ printbuf_reset(&buf); ++ prt_printf(&buf, "invalid bkey on insert from %s -> %ps", ++ trans->fn, (void *) i->ip_allocated); ++ prt_newline(&buf); ++ printbuf_indent_add(&buf, 2); ++ ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); ++ prt_newline(&buf); ++ ++ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), ++ i->bkey_type, rw, &buf); ++ ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ printbuf_exit(&buf); ++ return -EINVAL; ++ } ++ btree_insert_entry_checks(trans, i); ++ } ++ ++ printbuf_exit(&buf); ++ ++ trans_for_each_update(trans, i) { ++ if (i->cached) ++ continue; ++ ++ u64s_delta += !bkey_deleted(&i->k->k) ? 
i->k->k.u64s : 0; ++ u64s_delta -= i->old_btree_u64s; ++ ++ if (!same_leaf_as_next(trans, i)) { ++ if (u64s_delta <= 0) { ++ ret = bch2_foreground_maybe_merge(trans, i->path, ++ i->level, trans->flags); ++ if (unlikely(ret)) ++ return ret; ++ } ++ ++ u64s_delta = 0; ++ } ++ } ++ ++ ret = bch2_journal_preres_get(&c->journal, ++ &trans->journal_preres, trans->journal_preres_u64s, ++ JOURNAL_RES_GET_NONBLOCK| ++ (trans->flags & JOURNAL_WATERMARK_MASK)); ++ if (unlikely(ret == -EAGAIN)) ++ ret = bch2_trans_journal_preres_get_cold(trans, ++ trans->journal_preres_u64s, trace_ip); ++ if (unlikely(ret)) ++ return ret; ++ ++ normalize_read_intent_locks(trans); ++ ++ ret = trans_lock_write(trans); ++ if (unlikely(ret)) ++ return ret; ++ ++ ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); ++ ++ if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) ++ bch2_drop_overwrites_from_journal(trans); ++ ++ trans_for_each_update(trans, i) ++ if (!same_leaf_as_prev(trans, i)) ++ bch2_btree_node_unlock_write_inlined(trans, i->path, ++ insert_l(i)->b); ++ ++ if (!ret && trans->journal_pin) ++ bch2_journal_pin_add(&c->journal, trans->journal_res.seq, ++ trans->journal_pin, NULL); ++ ++ /* ++ * Drop journal reservation after dropping write locks, since dropping ++ * the journal reservation may kick off a journal write: ++ */ ++ bch2_journal_res_put(&c->journal, &trans->journal_res); ++ ++ if (unlikely(ret)) ++ return ret; ++ ++ bch2_trans_downgrade(trans); ++ ++ return 0; ++} ++ ++static int journal_reclaim_wait_done(struct bch_fs *c) ++{ ++ int ret = bch2_journal_error(&c->journal) ?: ++ !bch2_btree_key_cache_must_wait(c); ++ ++ if (!ret) ++ journal_reclaim_kick(&c->journal); ++ return ret; ++} ++ ++static noinline ++int bch2_trans_commit_error(struct btree_trans *trans, ++ struct btree_insert_entry *i, ++ int ret, unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ ++ switch (ret) { ++ case BTREE_INSERT_BTREE_NODE_FULL: ++ ret = bch2_btree_split_leaf(trans, i->path, trans->flags); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ trace_trans_restart_btree_node_split(trans->fn, trace_ip, ++ i->btree_id, &i->path->pos); ++ break; ++ case BTREE_INSERT_NEED_MARK_REPLICAS: ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas); ++ if (ret) ++ break; ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ trace_trans_restart_mark_replicas(trans->fn, trace_ip); ++ break; ++ case BTREE_INSERT_NEED_JOURNAL_RES: ++ bch2_trans_unlock(trans); ++ ++ if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && ++ !(trans->flags & JOURNAL_WATERMARK_reserved)) { ++ ret = -BCH_ERR_journal_reclaim_would_deadlock; ++ break; ++ } ++ ++ ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); ++ if (ret) ++ break; ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ trace_trans_restart_journal_res_get(trans->fn, trace_ip); ++ break; ++ case BTREE_INSERT_NEED_JOURNAL_RECLAIM: ++ bch2_trans_unlock(trans); ++ ++ trace_trans_blocked_journal_reclaim(trans->fn, trace_ip); ++ ++ wait_event_freezable(c->journal.reclaim_wait, ++ (ret = journal_reclaim_wait_done(c))); ++ if (ret < 0) ++ break; ++ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ trace_trans_restart_journal_reclaim(trans->fn, trace_ip); ++ break; ++ default: ++ BUG_ON(ret >= 0); ++ break; ++ } ++ ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); ++ BUG_ON(ret == -ENOSPC && ++ !(trans->flags & BTREE_INSERT_NOWAIT) && ++ (trans->flags & 
BTREE_INSERT_NOFAIL)); ++ ++ return ret; ++} ++ ++static noinline int ++bch2_trans_commit_get_rw_cold(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) || ++ test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EROFS; ++ ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_fs_read_write_early(c) ?: ++ bch2_trans_relock(trans); ++ if (ret) ++ return ret; ++ ++ percpu_ref_get(&c->writes); ++ return 0; ++} ++ ++/* ++ * This is for updates done in the early part of fsck - btree_gc - before we've ++ * gone RW. we only add the new key to the list of keys for journal replay to ++ * do. ++ */ ++static noinline int ++do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ int ret = 0; ++ ++ trans_for_each_update(trans, i) { ++ ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); ++ if (ret) ++ break; ++ } ++ ++ return ret; ++} ++ ++int __bch2_trans_commit(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i = NULL; ++ unsigned u64s; ++ int ret = 0; ++ ++ if (!trans->nr_updates && ++ !trans->extra_journal_entries.nr) ++ goto out_reset; ++ ++ if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) ++ lockdep_assert_held(&c->gc_lock); ++ ++ ret = bch2_trans_commit_run_triggers(trans); ++ if (ret) ++ goto out_reset; ++ ++ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { ++ ret = do_bch2_trans_commit_to_journal_replay(trans); ++ goto out_reset; ++ } ++ ++ if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && ++ unlikely(!percpu_ref_tryget_live(&c->writes))) { ++ ret = bch2_trans_commit_get_rw_cold(trans); ++ if (ret) ++ goto out_reset; ++ } ++ ++ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); ++ ++ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); ++ ++ trans->journal_u64s = trans->extra_journal_entries.nr; ++ trans->journal_preres_u64s = 0; ++ ++ /* For journalling transaction name: */ ++ trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); ++ ++ trans_for_each_update(trans, i) { ++ BUG_ON(!i->path->should_be_locked); ++ ++ if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { ++ trace_trans_restart_upgrade(trans->fn, _RET_IP_, ++ i->btree_id, &i->path->pos); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); ++ goto out; ++ } ++ ++ BUG_ON(!btree_node_intent_locked(i->path, i->level)); ++ ++ if (i->key_cache_already_flushed) ++ continue; ++ ++ /* we're going to journal the key being updated: */ ++ u64s = jset_u64s(i->k->k.u64s); ++ if (i->cached && ++ likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) ++ trans->journal_preres_u64s += u64s; ++ trans->journal_u64s += u64s; ++ ++ /* and we're also going to log the overwrite: */ ++ trans->journal_u64s += jset_u64s(i->old_k.u64s); ++ } ++ ++ if (trans->extra_journal_res) { ++ ret = bch2_disk_reservation_add(c, trans->disk_res, ++ trans->extra_journal_res, ++ (trans->flags & BTREE_INSERT_NOFAIL) ++ ? 
BCH_DISK_RESERVATION_NOFAIL : 0); ++ if (ret) ++ goto err; ++ } ++retry: ++ BUG_ON(trans->restarted); ++ memset(&trans->journal_res, 0, sizeof(trans->journal_res)); ++ ++ ret = do_bch2_trans_commit(trans, &i, _RET_IP_); ++ ++ /* make sure we didn't drop or screw up locks: */ ++ bch2_trans_verify_locks(trans); ++ ++ if (ret) ++ goto err; ++ ++ trace_transaction_commit(trans->fn, _RET_IP_); ++out: ++ bch2_journal_preres_put(&c->journal, &trans->journal_preres); ++ ++ if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) ++ percpu_ref_put(&c->writes); ++out_reset: ++ bch2_trans_reset_updates(trans); ++ ++ if (trans->fs_usage_deltas) { ++ trans->fs_usage_deltas->used = 0; ++ memset((void *) trans->fs_usage_deltas + ++ offsetof(struct replicas_delta_list, memset_start), 0, ++ (void *) &trans->fs_usage_deltas->memset_end - ++ (void *) &trans->fs_usage_deltas->memset_start); ++ } ++ ++ return ret; ++err: ++ ret = bch2_trans_commit_error(trans, i, ret, _RET_IP_); ++ if (ret) ++ goto out; ++ ++ goto retry; ++} ++ ++static int check_pos_snapshot_overwritten(struct btree_trans *trans, ++ enum btree_id id, ++ struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ if (!btree_type_has_snapshots(id)) ++ return 0; ++ ++ if (!snapshot_t(c, pos.snapshot)->children[0]) ++ return 0; ++ ++ bch2_trans_iter_init(trans, &iter, id, pos, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ while (1) { ++ k = bch2_btree_iter_prev(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ if (!k.k) ++ break; ++ ++ if (bkey_cmp(pos, k.k->p)) ++ break; ++ ++ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { ++ ret = 1; ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++int bch2_trans_update_extent(struct btree_trans *trans, ++ struct btree_iter *orig_iter, ++ struct bkey_i *insert, ++ enum btree_update_flags flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter, update_iter; ++ struct bpos start = bkey_start_pos(&insert->k); ++ struct bkey_i *update; ++ struct bkey_s_c k; ++ enum btree_id btree_id = orig_iter->btree_id; ++ int ret = 0, compressed_sectors; ++ ++ bch2_trans_iter_init(trans, &iter, btree_id, start, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES| ++ BTREE_ITER_NOT_EXTENTS); ++ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); ++ if ((ret = bkey_err(k))) ++ goto err; ++ if (!k.k) ++ goto out; ++ ++ if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { ++ /* ++ * We can't merge extents if they belong to interior snapshot ++ * tree nodes, and there's a snapshot in which one extent is ++ * visible and the other is not - i.e. if visibility is ++ * different. 
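++	 * (An "interior snapshot tree node" here is a snapshot ID that has
++	 * child snapshots - the same condition check_pos_snapshot_overwritten()
++	 * tests with the snapshot_t()->children[0] check above.)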
++ * ++ * Instead of checking if visibilitiy of the two extents is ++ * different, for now we just check if either has been ++ * overwritten: ++ */ ++ ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); ++ if (ret < 0) ++ goto err; ++ if (ret) ++ goto nomerge1; ++ ++ ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); ++ if (ret < 0) ++ goto err; ++ if (ret) ++ goto nomerge1; ++ ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ ++ if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(insert))) { ++ ret = bch2_btree_delete_at(trans, &iter, flags); ++ if (ret) ++ goto err; ++ ++ insert = update; ++ goto next; ++ } ++ } ++nomerge1: ++ ret = 0; ++ if (!bkey_cmp(k.k->p, start)) ++ goto next; ++ ++ while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) { ++ bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0; ++ bool back_split = bkey_cmp(k.k->p, insert->k.p) > 0; ++ ++ /* ++ * If we're going to be splitting a compressed extent, note it ++ * so that __bch2_trans_commit() can increase our disk ++ * reservation: ++ */ ++ if (((front_split && back_split) || ++ ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) && ++ (compressed_sectors = bch2_bkey_sectors_compressed(k))) ++ trans->extra_journal_res += compressed_sectors; ++ ++ if (front_split) { ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ ++ bch2_cut_back(start, update); ++ ++ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&update_iter) ?: ++ bch2_trans_update(trans, &update_iter, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ flags); ++ bch2_trans_iter_exit(trans, &update_iter); ++ ++ if (ret) ++ goto err; ++ } ++ ++ if (k.k->p.snapshot != insert->k.p.snapshot && ++ (front_split || back_split)) { ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ ++ bch2_cut_front(start, update); ++ bch2_cut_back(insert->k.p, update); ++ ++ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&update_iter) ?: ++ bch2_trans_update(trans, &update_iter, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ flags); ++ bch2_trans_iter_exit(trans, &update_iter); ++ if (ret) ++ goto err; ++ } ++ ++ if (bkey_cmp(k.k->p, insert->k.p) <= 0) { ++ update = bch2_trans_kmalloc(trans, sizeof(*update)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.p = k.k->p; ++ ++ if (insert->k.p.snapshot != k.k->p.snapshot) { ++ update->k.p.snapshot = insert->k.p.snapshot; ++ update->k.type = KEY_TYPE_whiteout; ++ } ++ ++ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&update_iter) ?: ++ bch2_trans_update(trans, &update_iter, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ flags); ++ bch2_trans_iter_exit(trans, &update_iter); ++ ++ if (ret) ++ goto err; ++ } ++ ++ if (back_split) { ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ 
bch2_cut_front(insert->k.p, update); ++ ++ ret = bch2_trans_update_by_path(trans, iter.path, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ flags); ++ if (ret) ++ goto err; ++ goto out; ++ } ++next: ++ bch2_btree_iter_advance(&iter); ++ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); ++ if ((ret = bkey_err(k))) ++ goto err; ++ if (!k.k) ++ goto out; ++ } ++ ++ if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { ++ ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); ++ if (ret < 0) ++ goto err; ++ if (ret) ++ goto nomerge2; ++ ++ ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); ++ if (ret < 0) ++ goto err; ++ if (ret) ++ goto nomerge2; ++ ++ bch2_bkey_merge(c, bkey_i_to_s(insert), k); ++ } ++nomerge2: ++ ret = 0; ++out: ++ if (!bkey_deleted(&insert->k)) { ++ /* ++ * Rewinding iterators is expensive: get a new one and the one ++ * that points to the start of insert will be cloned from: ++ */ ++ bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_init(trans, &iter, btree_id, insert->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, insert, flags); ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++/* ++ * When deleting, check if we need to emit a whiteout (because we're overwriting ++ * something in an ancestor snapshot) ++ */ ++static int need_whiteout_for_snapshot(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u32 snapshot = pos.snapshot; ++ int ret; ++ ++ if (!bch2_snapshot_parent(trans->c, pos.snapshot)) ++ return 0; ++ ++ pos.snapshot++; ++ ++ for_each_btree_key_norestart(trans, iter, btree_id, pos, ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_NOPRESERVE, k, ret) { ++ if (bkey_cmp(k.k->p, pos)) ++ break; ++ ++ if (bch2_snapshot_is_ancestor(trans->c, snapshot, ++ k.k->p.snapshot)) { ++ ret = !bkey_whiteout(k.k); ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++static int __must_check ++bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, ++ struct bkey_i *k, enum btree_update_flags flags, ++ unsigned long ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i, n; ++ int ret = 0; ++ ++ BUG_ON(!path->should_be_locked); ++ ++ BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); ++ BUG_ON(bpos_cmp(k->k.p, path->pos)); ++ ++ n = (struct btree_insert_entry) { ++ .flags = flags, ++ .bkey_type = __btree_node_type(path->level, path->btree_id), ++ .btree_id = path->btree_id, ++ .level = path->level, ++ .cached = path->cached, ++ .path = path, ++ .k = k, ++ .ip_allocated = ip, ++ }; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans_for_each_update(trans, i) ++ BUG_ON(i != trans->updates && ++ btree_insert_entry_cmp(i - 1, i) >= 0); ++#endif ++ ++ /* ++ * Pending updates are kept sorted: first, find position of new update, ++ * then delete/trim any updates the new update overwrites: ++ */ ++ trans_for_each_update(trans, i) ++ if (btree_insert_entry_cmp(&n, i) <= 0) ++ break; ++ ++ if (i < trans->updates + trans->nr_updates && ++ !btree_insert_entry_cmp(&n, i)) { ++ BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); ++ ++ bch2_path_put(trans, i->path, true); ++ i->flags = n.flags; ++ i->cached = n.cached; ++ i->k = n.k; ++ i->path = n.path; ++ i->ip_allocated = n.ip_allocated; ++ } else { ++ array_insert_item(trans->updates, trans->nr_updates, ++ i - trans->updates, n); ++ ++ 
i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v; ++ i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; ++ ++ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { ++ struct bkey_i *j_k = ++ bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); ++ ++ if (j_k) { ++ i->old_k = j_k->k; ++ i->old_v = &j_k->v; ++ } ++ } ++ } ++ ++ __btree_path_get(i->path, true); ++ ++ /* ++ * If a key is present in the key cache, it must also exist in the ++ * btree - this is necessary for cache coherency. When iterating over ++ * a btree that's cached in the key cache, the btree iter code checks ++ * the key cache - but the key has to exist in the btree for that to ++ * work: ++ */ ++ if (path->cached && ++ bkey_deleted(&i->old_k)) { ++ struct btree_path *btree_path; ++ ++ i->key_cache_already_flushed = true; ++ i->flags |= BTREE_TRIGGER_NORUN; ++ ++ btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, ++ BTREE_ITER_INTENT, _THIS_IP_); ++ ++ ret = bch2_btree_path_traverse(trans, btree_path, 0); ++ if (ret) ++ goto err; ++ ++ btree_path->should_be_locked = true; ++ ret = bch2_trans_update_by_path_trace(trans, btree_path, k, flags, ip); ++err: ++ bch2_path_put(trans, btree_path, true); ++ } ++ ++ return ret; ++} ++ ++static int __must_check ++bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, ++ struct bkey_i *k, enum btree_update_flags flags) ++{ ++ return bch2_trans_update_by_path_trace(trans, path, k, flags, _RET_IP_); ++} ++ ++int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_i *k, enum btree_update_flags flags) ++{ ++ struct btree_path *path = iter->update_path ?: iter->path; ++ struct bkey_cached *ck; ++ int ret; ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ return bch2_trans_update_extent(trans, iter, k, flags); ++ ++ if (bkey_deleted(&k->k) && ++ !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && ++ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { ++ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ if (ret) ++ k->k.type = KEY_TYPE_whiteout; ++ } ++ ++ /* ++ * Ensure that updates to cached btrees go to the key cache: ++ */ ++ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && ++ !path->cached && ++ !path->level && ++ btree_id_cached(trans->c, path->btree_id)) { ++ if (!iter->key_cache_path || ++ !iter->key_cache_path->should_be_locked || ++ bpos_cmp(iter->key_cache_path->pos, k->k.p)) { ++ if (!iter->key_cache_path) ++ iter->key_cache_path = ++ bch2_path_get(trans, path->btree_id, path->pos, 1, 0, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_CACHED, _THIS_IP_); ++ ++ iter->key_cache_path = ++ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, ++ iter->flags & BTREE_ITER_INTENT, ++ _THIS_IP_); ++ ++ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, ++ BTREE_ITER_CACHED); ++ if (unlikely(ret)) ++ return ret; ++ ++ ck = (void *) iter->key_cache_path->l[0].b; ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); ++ } ++ ++ iter->key_cache_path->should_be_locked = true; ++ } ++ ++ path = iter->key_cache_path; ++ } ++ ++ return bch2_trans_update_by_path(trans, path, k, flags); ++} ++ ++void bch2_trans_commit_hook(struct btree_trans *trans, ++ struct btree_trans_commit_hook *h) ++{ ++ h->next = trans->hooks; ++ trans->hooks = h; ++} ++ ++int __bch2_btree_insert(struct 
btree_trans *trans, ++ enum btree_id id, struct bkey_i *k) ++{ ++ struct btree_iter iter; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, k, 0); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/** ++ * bch2_btree_insert - insert keys into the extent btree ++ * @c: pointer to struct bch_fs ++ * @id: btree to insert into ++ * @insert_keys: list of keys to insert ++ * @hook: insert callback ++ */ ++int bch2_btree_insert(struct bch_fs *c, enum btree_id id, ++ struct bkey_i *k, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, int flags) ++{ ++ return bch2_trans_do(c, disk_res, journal_seq, flags, ++ __bch2_btree_insert(&trans, id, k)); ++} ++ ++int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, ++ unsigned len, unsigned update_flags) ++{ ++ struct bkey_i *k; ++ ++ k = bch2_trans_kmalloc(trans, sizeof(*k)); ++ if (IS_ERR(k)) ++ return PTR_ERR(k); ++ ++ bkey_init(&k->k); ++ k->k.p = iter->pos; ++ bch2_key_resize(&k->k, len); ++ return bch2_trans_update(trans, iter, k, update_flags); ++} ++ ++int bch2_btree_delete_at(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned update_flags) ++{ ++ return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); ++} ++ ++int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, ++ struct bpos start, struct bpos end, ++ unsigned update_flags, ++ u64 *journal_seq) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); ++retry: ++ while ((bch2_trans_begin(trans), ++ (k = bch2_btree_iter_peek(&iter)).k) && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(iter.pos, end) < 0) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(trans->c, 0); ++ struct bkey_i delete; ++ ++ bkey_init(&delete.k); ++ ++ /* ++ * This could probably be more efficient for extents: ++ */ ++ ++ /* ++ * For extents, iter.pos won't necessarily be the same as ++ * bkey_start_pos(k.k) (for non extents they always will be the ++ * same). It's important that we delete starting from iter.pos ++ * because the range we want to delete could start in the middle ++ * of k. ++ * ++ * (bch2_btree_iter_peek() does guarantee that iter.pos >= ++ * bkey_start_pos(k.k)). 
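++		 * For example, if k spans sectors [0, 16) but the caller asked
++		 * to delete starting at sector 8, iter.pos will be 8: the
++		 * delete key must start there, or we would also wipe out
++		 * [0, 8), which is outside the requested range.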
++ */ ++ delete.k.p = iter.pos; ++ ++ if (iter.flags & BTREE_ITER_IS_EXTENTS) { ++ unsigned max_sectors = ++ KEY_SIZE_MAX & (~0 << trans->c->block_bits); ++ ++ /* create the biggest key we can */ ++ bch2_key_resize(&delete.k, max_sectors); ++ bch2_cut_back(end, &delete); ++ ++ ret = bch2_extent_trim_atomic(trans, &iter, &delete); ++ if (ret) ++ break; ++ } ++ ++ ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: ++ bch2_trans_commit(trans, &disk_res, journal_seq, ++ BTREE_INSERT_NOFAIL); ++ bch2_disk_reservation_put(trans->c, &disk_res); ++ if (ret) ++ break; ++ } ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ++ ret = 0; ++ goto retry; ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/* ++ * bch_btree_delete_range - delete everything within a given range ++ * ++ * Range is a half open interval - [start, end) ++ */ ++int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, ++ struct bpos start, struct bpos end, ++ unsigned update_flags, ++ u64 *journal_seq) ++{ ++ return bch2_trans_do(c, NULL, journal_seq, 0, ++ bch2_btree_delete_range_trans(&trans, id, start, end, ++ update_flags, journal_seq)); ++} ++ ++int bch2_trans_log_msg(struct btree_trans *trans, const char *msg) ++{ ++ unsigned len = strlen(msg); ++ unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); ++ struct jset_entry_log *l; ++ int ret; ++ ++ ret = darray_make_room(&trans->extra_journal_entries, jset_u64s(u64s)); ++ if (ret) ++ return ret; ++ ++ l = (void *) &darray_top(trans->extra_journal_entries); ++ l->entry.u64s = cpu_to_le16(u64s); ++ l->entry.btree_id = 0; ++ l->entry.level = 1; ++ l->entry.type = BCH_JSET_ENTRY_log; ++ l->entry.pad[0] = 0; ++ l->entry.pad[1] = 0; ++ l->entry.pad[2] = 0; ++ memcpy(l->d, msg, len); ++ while (len & 7) ++ l->d[len++] = '\0'; ++ ++ trans->extra_journal_entries.nr += jset_u64s(u64s); ++ return 0; ++} +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +new file mode 100644 +index 000000000000..b4be2122c2d5 +--- /dev/null ++++ b/fs/bcachefs/buckets.c +@@ -0,0 +1,2113 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for manipulating bucket marks for garbage collection. ++ * ++ * Copyright 2014 Datera, Inc. 
++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "backpointers.h" ++#include "bset.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "buckets_waiting_for_journal.h" ++#include "ec.h" ++#include "error.h" ++#include "inode.h" ++#include "movinggc.h" ++#include "recovery.h" ++#include "reflink.h" ++#include "replicas.h" ++#include "subvolume.h" ++ ++#include ++#include ++ ++static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, ++ enum bch_data_type data_type, ++ s64 sectors) ++{ ++ switch (data_type) { ++ case BCH_DATA_btree: ++ fs_usage->btree += sectors; ++ break; ++ case BCH_DATA_user: ++ case BCH_DATA_parity: ++ fs_usage->data += sectors; ++ break; ++ case BCH_DATA_cached: ++ fs_usage->cached += sectors; ++ break; ++ default: ++ break; ++ } ++} ++ ++void bch2_fs_usage_initialize(struct bch_fs *c) ++{ ++ struct bch_fs_usage *usage; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ percpu_down_write(&c->mark_lock); ++ usage = c->usage_base; ++ ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ bch2_fs_usage_acc_to_base(c, i); ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ usage->reserved += usage->persistent_reserved[i]; ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); ++ } ++ ++ for_each_member_device(ca, c, i) { ++ struct bch_dev_usage dev = bch2_dev_usage_read(ca); ++ ++ usage->hidden += (dev.d[BCH_DATA_sb].buckets + ++ dev.d[BCH_DATA_journal].buckets) * ++ ca->mi.bucket_size; ++ } ++ ++ percpu_up_write(&c->mark_lock); ++} ++ ++static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, ++ unsigned journal_seq, ++ bool gc) ++{ ++ BUG_ON(!gc && !journal_seq); ++ ++ return this_cpu_ptr(gc ++ ? ca->usage_gc ++ : ca->usage[journal_seq & JOURNAL_BUF_MASK]); ++} ++ ++struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) ++{ ++ struct bch_fs *c = ca->fs; ++ struct bch_dev_usage ret; ++ unsigned seq, i, u64s = dev_usage_u64s(); ++ ++ do { ++ seq = read_seqcount_begin(&c->usage_lock); ++ memcpy(&ret, ca->usage_base, u64s * sizeof(u64)); ++ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) ++ acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s); ++ } while (read_seqcount_retry(&c->usage_lock, seq)); ++ ++ return ret; ++} ++ ++static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, ++ unsigned journal_seq, ++ bool gc) ++{ ++ percpu_rwsem_assert_held(&c->mark_lock); ++ BUG_ON(!gc && !journal_seq); ++ ++ return this_cpu_ptr(gc ++ ? 
c->usage_gc ++ : c->usage[journal_seq & JOURNAL_BUF_MASK]); ++} ++ ++u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) ++{ ++ ssize_t offset = v - (u64 *) c->usage_base; ++ unsigned i, seq; ++ u64 ret; ++ ++ BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ do { ++ seq = read_seqcount_begin(&c->usage_lock); ++ ret = *v; ++ ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset); ++ } while (read_seqcount_retry(&c->usage_lock, seq)); ++ ++ return ret; ++} ++ ++struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) ++{ ++ struct bch_fs_usage_online *ret; ++ unsigned seq, i, u64s; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ ret = kmalloc(sizeof(struct bch_fs_usage_online) + ++ sizeof(u64) * c->replicas.nr, GFP_NOFS); ++ if (unlikely(!ret)) { ++ percpu_up_read(&c->mark_lock); ++ return NULL; ++ } ++ ++ ret->online_reserved = percpu_u64_get(c->online_reserved); ++ ++ u64s = fs_usage_u64s(c); ++ do { ++ seq = read_seqcount_begin(&c->usage_lock); ++ memcpy(&ret->u, c->usage_base, u64s * sizeof(u64)); ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s); ++ } while (read_seqcount_retry(&c->usage_lock, seq)); ++ ++ return ret; ++} ++ ++void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) ++{ ++ struct bch_dev *ca; ++ unsigned i, u64s = fs_usage_u64s(c); ++ ++ BUG_ON(idx >= ARRAY_SIZE(c->usage)); ++ ++ preempt_disable(); ++ write_seqcount_begin(&c->usage_lock); ++ ++ acc_u64s_percpu((u64 *) c->usage_base, ++ (u64 __percpu *) c->usage[idx], u64s); ++ percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, NULL) { ++ u64s = dev_usage_u64s(); ++ ++ acc_u64s_percpu((u64 *) ca->usage_base, ++ (u64 __percpu *) ca->usage[idx], u64s); ++ percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64)); ++ } ++ rcu_read_unlock(); ++ ++ write_seqcount_end(&c->usage_lock); ++ preempt_enable(); ++} ++ ++void bch2_fs_usage_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_fs_usage_online *fs_usage) ++{ ++ unsigned i; ++ ++ prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); ++ ++ prt_printf(out, "hidden:\t\t\t\t%llu\n", ++ fs_usage->u.hidden); ++ prt_printf(out, "data:\t\t\t\t%llu\n", ++ fs_usage->u.data); ++ prt_printf(out, "cached:\t\t\t\t%llu\n", ++ fs_usage->u.cached); ++ prt_printf(out, "reserved:\t\t\t%llu\n", ++ fs_usage->u.reserved); ++ prt_printf(out, "nr_inodes:\t\t\t%llu\n", ++ fs_usage->u.nr_inodes); ++ prt_printf(out, "online reserved:\t\t%llu\n", ++ fs_usage->online_reserved); ++ ++ for (i = 0; ++ i < ARRAY_SIZE(fs_usage->u.persistent_reserved); ++ i++) { ++ prt_printf(out, "%u replicas:\n", i + 1); ++ prt_printf(out, "\treserved:\t\t%llu\n", ++ fs_usage->u.persistent_reserved[i]); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ prt_printf(out, "\t"); ++ bch2_replicas_entry_to_text(out, e); ++ prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]); ++ } ++} ++ ++static u64 reserve_factor(u64 r) ++{ ++ return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); ++} ++ ++u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) ++{ ++ return min(fs_usage->u.hidden + ++ fs_usage->u.btree + ++ fs_usage->u.data + ++ reserve_factor(fs_usage->u.reserved + ++ fs_usage->online_reserved), ++ c->capacity); ++} ++ ++static struct 
bch_fs_usage_short ++__bch2_fs_usage_read_short(struct bch_fs *c) ++{ ++ struct bch_fs_usage_short ret; ++ u64 data, reserved; ++ ++ ret.capacity = c->capacity - ++ bch2_fs_usage_read_one(c, &c->usage_base->hidden); ++ ++ data = bch2_fs_usage_read_one(c, &c->usage_base->data) + ++ bch2_fs_usage_read_one(c, &c->usage_base->btree); ++ reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + ++ percpu_u64_get(c->online_reserved); ++ ++ ret.used = min(ret.capacity, data + reserve_factor(reserved)); ++ ret.free = ret.capacity - ret.used; ++ ++ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); ++ ++ return ret; ++} ++ ++struct bch_fs_usage_short ++bch2_fs_usage_read_short(struct bch_fs *c) ++{ ++ struct bch_fs_usage_short ret; ++ ++ percpu_down_read(&c->mark_lock); ++ ret = __bch2_fs_usage_read_short(c); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++void bch2_dev_usage_init(struct bch_dev *ca) ++{ ++ ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket; ++} ++ ++static inline int bucket_sectors_fragmented(struct bch_dev *ca, ++ struct bch_alloc_v4 a) ++{ ++ return a.dirty_sectors ++ ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) ++ : 0; ++} ++ ++static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, ++ struct bch_alloc_v4 old, ++ struct bch_alloc_v4 new, ++ u64 journal_seq, bool gc) ++{ ++ struct bch_fs_usage *fs_usage; ++ struct bch_dev_usage *u; ++ ++ preempt_disable(); ++ fs_usage = fs_usage_ptr(c, journal_seq, gc); ++ ++ if (data_type_is_hidden(old.data_type)) ++ fs_usage->hidden -= ca->mi.bucket_size; ++ if (data_type_is_hidden(new.data_type)) ++ fs_usage->hidden += ca->mi.bucket_size; ++ ++ u = dev_usage_ptr(ca, journal_seq, gc); ++ ++ u->d[old.data_type].buckets--; ++ u->d[new.data_type].buckets++; ++ ++ u->buckets_ec -= (int) !!old.stripe; ++ u->buckets_ec += (int) !!new.stripe; ++ ++ u->d[old.data_type].sectors -= old.dirty_sectors; ++ u->d[new.data_type].sectors += new.dirty_sectors; ++ ++ u->d[BCH_DATA_cached].sectors += new.cached_sectors; ++ u->d[BCH_DATA_cached].sectors -= old.cached_sectors; ++ ++ u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); ++ u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); ++ ++ preempt_enable(); ++} ++ ++static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, ++ struct bucket old, struct bucket new, ++ u64 journal_seq, bool gc) ++{ ++ struct bch_alloc_v4 old_a = { ++ .gen = old.gen, ++ .data_type = old.data_type, ++ .dirty_sectors = old.dirty_sectors, ++ .cached_sectors = old.cached_sectors, ++ .stripe = old.stripe, ++ }; ++ struct bch_alloc_v4 new_a = { ++ .gen = new.gen, ++ .data_type = new.data_type, ++ .dirty_sectors = new.dirty_sectors, ++ .cached_sectors = new.cached_sectors, ++ .stripe = new.stripe, ++ }; ++ ++ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); ++} ++ ++static inline int __update_replicas(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ struct bch_replicas_entry *r, ++ s64 sectors) ++{ ++ int idx = bch2_replicas_entry_idx(c, r); ++ ++ if (idx < 0) ++ return -1; ++ ++ fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); ++ fs_usage->replicas[idx] += sectors; ++ return 0; ++} ++ ++static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_replicas_entry *r, s64 sectors, ++ unsigned journal_seq, bool gc) ++{ ++ struct bch_fs_usage __percpu *fs_usage; ++ int idx, ret = 0; ++ struct printbuf buf = PRINTBUF; ++ ++ 
percpu_down_read(&c->mark_lock); ++ buf.atomic++; ++ ++ idx = bch2_replicas_entry_idx(c, r); ++ if (idx < 0 && ++ fsck_err(c, "no replicas entry\n" ++ " while marking %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ percpu_up_read(&c->mark_lock); ++ ret = bch2_mark_replicas(c, r); ++ percpu_down_read(&c->mark_lock); ++ ++ if (ret) ++ goto err; ++ idx = bch2_replicas_entry_idx(c, r); ++ } ++ if (idx < 0) { ++ ret = -1; ++ goto err; ++ } ++ ++ preempt_disable(); ++ fs_usage = fs_usage_ptr(c, journal_seq, gc); ++ fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); ++ fs_usage->replicas[idx] += sectors; ++ preempt_enable(); ++err: ++fsck_err: ++ percpu_up_read(&c->mark_lock); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static inline int update_cached_sectors(struct bch_fs *c, ++ struct bkey_s_c k, ++ unsigned dev, s64 sectors, ++ unsigned journal_seq, bool gc) ++{ ++ struct bch_replicas_padded r; ++ ++ bch2_replicas_entry_cached(&r.e, dev); ++ ++ return update_replicas(c, k, &r.e, sectors, journal_seq, gc); ++} ++ ++static struct replicas_delta_list * ++replicas_deltas_realloc(struct btree_trans *trans, unsigned more) ++{ ++ struct replicas_delta_list *d = trans->fs_usage_deltas; ++ unsigned new_size = d ? (d->size + more) * 2 : 128; ++ unsigned alloc_size = sizeof(*d) + new_size; ++ ++ WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX); ++ ++ if (!d || d->used + more > d->size) { ++ d = krealloc(d, alloc_size, GFP_NOIO|__GFP_ZERO); ++ ++ BUG_ON(!d && alloc_size > REPLICAS_DELTA_LIST_MAX); ++ ++ if (!d) { ++ d = mempool_alloc(&trans->c->replicas_delta_pool, GFP_NOIO); ++ memset(d, 0, REPLICAS_DELTA_LIST_MAX); ++ ++ if (trans->fs_usage_deltas) ++ memcpy(d, trans->fs_usage_deltas, ++ trans->fs_usage_deltas->size + sizeof(*d)); ++ ++ new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d); ++ kfree(trans->fs_usage_deltas); ++ } ++ ++ d->size = new_size; ++ trans->fs_usage_deltas = d; ++ } ++ return d; ++} ++ ++static inline void update_replicas_list(struct btree_trans *trans, ++ struct bch_replicas_entry *r, ++ s64 sectors) ++{ ++ struct replicas_delta_list *d; ++ struct replicas_delta *n; ++ unsigned b; ++ ++ if (!sectors) ++ return; ++ ++ b = replicas_entry_bytes(r) + 8; ++ d = replicas_deltas_realloc(trans, b); ++ ++ n = (void *) d->d + d->used; ++ n->delta = sectors; ++ memcpy((void *) n + offsetof(struct replicas_delta, r), ++ r, replicas_entry_bytes(r)); ++ bch2_replicas_entry_sort(&n->r); ++ d->used += b; ++} ++ ++static inline void update_cached_sectors_list(struct btree_trans *trans, ++ unsigned dev, s64 sectors) ++{ ++ struct bch_replicas_padded r; ++ ++ bch2_replicas_entry_cached(&r.e, dev); ++ ++ update_replicas_list(trans, &r.e, sectors); ++} ++ ++int bch2_mark_alloc(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ u64 journal_seq = trans->journal_res.seq; ++ struct bch_fs *c = trans->c; ++ struct bch_alloc_v4 old_a, new_a; ++ struct bch_dev *ca; ++ int ret = 0; ++ ++ /* ++ * alloc btree is read in by bch2_alloc_read, not gc: ++ */ ++ if ((flags & BTREE_TRIGGER_GC) && ++ !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) ++ return 0; ++ ++ if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, ++ "alloc key for invalid device or bucket")) ++ return -EIO; ++ ++ ca = bch_dev_bkey_exists(c, new.k->p.inode); ++ ++ bch2_alloc_to_v4(old, &old_a); ++ bch2_alloc_to_v4(new, &new_a); ++ ++ if ((flags & BTREE_TRIGGER_INSERT) && ++ data_type_is_empty(old_a.data_type) != ++ 
data_type_is_empty(new_a.data_type) && ++ new.k->type == KEY_TYPE_alloc_v4) { ++ struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; ++ ++ BUG_ON(!journal_seq); ++ ++ /* ++ * If the btree updates referring to a bucket weren't flushed ++ * before the bucket became empty again, then the we don't have ++ * to wait on a journal flush before we can reuse the bucket: ++ */ ++ new_a.journal_seq = data_type_is_empty(new_a.data_type) && ++ (journal_seq == v->journal_seq || ++ bch2_journal_noflush_seq(&c->journal, v->journal_seq)) ++ ? 0 : journal_seq; ++ v->journal_seq = new_a.journal_seq; ++ } ++ ++ if (!data_type_is_empty(old_a.data_type) && ++ data_type_is_empty(new_a.data_type) && ++ new_a.journal_seq) { ++ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ++ new.k->p.inode, new.k->p.offset, ++ new_a.journal_seq); ++ if (ret) { ++ bch2_fs_fatal_error(c, ++ "error setting bucket_needs_journal_commit: %i", ret); ++ return ret; ++ } ++ } ++ ++ percpu_down_read(&c->mark_lock); ++ if (!gc && new_a.gen != old_a.gen) ++ *bucket_gen(ca, new.k->p.offset) = new_a.gen; ++ ++ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); ++ ++ if (gc) { ++ struct bucket *g = gc_bucket(ca, new.k->p.offset); ++ ++ bucket_lock(g); ++ ++ g->gen_valid = 1; ++ g->gen = new_a.gen; ++ g->data_type = new_a.data_type; ++ g->stripe = new_a.stripe; ++ g->stripe_redundancy = new_a.stripe_redundancy; ++ g->dirty_sectors = new_a.dirty_sectors; ++ g->cached_sectors = new_a.cached_sectors; ++ ++ bucket_unlock(g); ++ } ++ percpu_up_read(&c->mark_lock); ++ ++ /* ++ * need to know if we're getting called from the invalidate path or ++ * not: ++ */ ++ ++ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && ++ old_a.cached_sectors) { ++ ret = update_cached_sectors(c, new, ca->dev_idx, ++ -old_a.cached_sectors, ++ journal_seq, gc); ++ if (ret) { ++ bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); ++ return ret; ++ } ++ } ++ ++ if (new_a.data_type == BCH_DATA_free && ++ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) ++ closure_wake_up(&c->freelist_wait); ++ ++ if (new_a.data_type == BCH_DATA_need_discard && ++ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) ++ bch2_do_discards(c); ++ ++ if (old_a.data_type != BCH_DATA_cached && ++ new_a.data_type == BCH_DATA_cached && ++ should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) ++ bch2_do_invalidates(c); ++ ++ if (new_a.data_type == BCH_DATA_need_gc_gens) ++ bch2_do_gc_gens(c); ++ ++ return 0; ++} ++ ++int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, enum bch_data_type data_type, ++ unsigned sectors, struct gc_pos pos, ++ unsigned flags) ++{ ++ struct bucket old, new, *g; ++ int ret = 0; ++ ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ BUG_ON(data_type != BCH_DATA_sb && ++ data_type != BCH_DATA_journal); ++ ++ /* ++ * Backup superblock might be past the end of our normal usable space: ++ */ ++ if (b >= ca->mi.nbuckets) ++ return 0; ++ ++ percpu_down_read(&c->mark_lock); ++ g = gc_bucket(ca, b); ++ ++ bucket_lock(g); ++ old = *g; ++ ++ if (bch2_fs_inconsistent_on(g->data_type && ++ g->data_type != data_type, c, ++ "different types of data in same bucket: %s, %s", ++ bch2_data_types[g->data_type], ++ bch2_data_types[data_type])) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, ++ "bucket %u:%zu gen %u data type %s sector 
count overflow: %u + %u > bucket size", ++ ca->dev_idx, b, g->gen, ++ bch2_data_types[g->data_type ?: data_type], ++ g->dirty_sectors, sectors)) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ ++ g->data_type = data_type; ++ g->dirty_sectors += sectors; ++ new = *g; ++err: ++ bucket_unlock(g); ++ if (!ret) ++ bch2_dev_usage_update_m(c, ca, old, new, 0, true); ++ percpu_up_read(&c->mark_lock); ++ return ret; ++} ++ ++static int check_bucket_ref(struct bch_fs *c, ++ struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ s64 sectors, enum bch_data_type ptr_data_type, ++ u8 b_gen, u8 bucket_data_type, ++ u32 dirty_sectors, u32 cached_sectors) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); ++ u16 bucket_sectors = !ptr->cached ++ ? dirty_sectors ++ : cached_sectors; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ if (bucket_data_type == BCH_DATA_cached) ++ bucket_data_type = BCH_DATA_user; ++ ++ if (gen_after(ptr->gen, b_gen)) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" ++ "while marking %s", ++ ptr->dev, bucket_nr, b_gen, ++ bch2_data_types[bucket_data_type ?: ptr_data_type], ++ ptr->gen, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" ++ "while marking %s", ++ ptr->dev, bucket_nr, b_gen, ++ bch2_data_types[bucket_data_type ?: ptr_data_type], ++ ptr->gen, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (b_gen != ptr->gen && !ptr->cached) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" ++ "while marking %s", ++ ptr->dev, bucket_nr, b_gen, ++ *bucket_gen(ca, bucket_nr), ++ bch2_data_types[bucket_data_type ?: ptr_data_type], ++ ptr->gen, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (b_gen != ptr->gen) { ++ ret = 1; ++ goto err; ++ } ++ ++ if (!data_type_is_empty(bucket_data_type) && ++ ptr_data_type && ++ bucket_data_type != ptr_data_type) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" ++ "while marking %s", ++ ptr->dev, bucket_nr, b_gen, ++ bch2_data_types[bucket_data_type], ++ bch2_data_types[ptr_data_type], ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if ((unsigned) (bucket_sectors + sectors) > U32_MAX) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" ++ "while marking %s", ++ ptr->dev, bucket_nr, b_gen, ++ bch2_data_types[bucket_data_type ?: ptr_data_type], ++ bucket_sectors, sectors, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; ++ } ++err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int mark_stripe_bucket(struct btree_trans *trans, ++ struct bkey_s_c k, ++ unsigned ptr_idx, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ u64 journal_seq = trans->journal_res.seq; ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ unsigned nr_data = s->nr_blocks - s->nr_redundant; ++ bool parity = ptr_idx >= 
nr_data; ++ enum bch_data_type data_type = parity ? BCH_DATA_parity : 0; ++ s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; ++ const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket old, new, *g; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ ++ /* * XXX doesn't handle deletion */ ++ ++ percpu_down_read(&c->mark_lock); ++ buf.atomic++; ++ g = PTR_GC_BUCKET(ca, ptr); ++ ++ if (g->dirty_sectors || ++ (g->stripe && g->stripe != k.k->p.offset)) { ++ bch2_fs_inconsistent(c, ++ "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ bucket_lock(g); ++ old = *g; ++ ++ ret = check_bucket_ref(c, k, ptr, sectors, data_type, ++ g->gen, g->data_type, ++ g->dirty_sectors, g->cached_sectors); ++ if (ret) ++ goto err; ++ ++ if (data_type) ++ g->data_type = data_type; ++ g->dirty_sectors += sectors; ++ ++ g->stripe = k.k->p.offset; ++ g->stripe_redundancy = s->nr_redundant; ++ new = *g; ++err: ++ bucket_unlock(g); ++ if (!ret) ++ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); ++ percpu_up_read(&c->mark_lock); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int __mark_pointer(struct btree_trans *trans, ++ struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ s64 sectors, enum bch_data_type ptr_data_type, ++ u8 bucket_gen, u8 *bucket_data_type, ++ u32 *dirty_sectors, u32 *cached_sectors) ++{ ++ u32 *dst_sectors = !ptr->cached ++ ? dirty_sectors ++ : cached_sectors; ++ int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type, ++ bucket_gen, *bucket_data_type, ++ *dirty_sectors, *cached_sectors); ++ ++ if (ret) ++ return ret; ++ ++ *dst_sectors += sectors; ++ *bucket_data_type = *dirty_sectors || *cached_sectors ++ ? 
ptr_data_type : 0; ++ return 0; ++} ++ ++static int bch2_mark_pointer(struct btree_trans *trans, ++ struct bkey_s_c k, ++ struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type data_type, ++ unsigned flags) ++{ ++ u64 journal_seq = trans->journal_res.seq; ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bucket old, new, *g; ++ u8 bucket_data_type; ++ int ret = 0; ++ ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ ++ percpu_down_read(&c->mark_lock); ++ g = PTR_GC_BUCKET(ca, &p.ptr); ++ bucket_lock(g); ++ old = *g; ++ ++ bucket_data_type = g->data_type; ++ ret = __mark_pointer(trans, k, &p.ptr, sectors, ++ data_type, g->gen, ++ &bucket_data_type, ++ &g->dirty_sectors, ++ &g->cached_sectors); ++ if (!ret) ++ g->data_type = bucket_data_type; ++ ++ new = *g; ++ bucket_unlock(g); ++ if (!ret) ++ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++static int bch2_mark_stripe_ptr(struct btree_trans *trans, ++ struct bkey_s_c k, ++ struct bch_extent_stripe_ptr p, ++ enum bch_data_type data_type, ++ s64 sectors, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_replicas_padded r; ++ struct gc_stripe *m; ++ ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ ++ m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); ++ if (!m) { ++ bch_err(c, "error allocating memory for gc_stripes, idx %llu", ++ (u64) p.idx); ++ return -ENOMEM; ++ } ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ ++ if (!m || !m->alive) { ++ spin_unlock(&c->ec_stripes_heap_lock); ++ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", ++ (u64) p.idx); ++ bch2_inconsistent_error(c); ++ return -EIO; ++ } ++ ++ m->block_sectors[p.block] += sectors; ++ ++ r = m->r; ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ r.e.data_type = data_type; ++ update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); ++ ++ return 0; ++} ++ ++int bch2_mark_extent(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ u64 journal_seq = trans->journal_res.seq; ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_replicas_padded r; ++ enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ++ ? BCH_DATA_btree ++ : BCH_DATA_user; ++ s64 sectors = bkey_is_btree_ptr(k.k) ++ ? 
btree_sectors(c) ++ : k.k->size; ++ s64 dirty_sectors = 0; ++ bool stale; ++ int ret; ++ ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ ++ r.e.data_type = data_type; ++ r.e.nr_devs = 0; ++ r.e.nr_required = 1; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ s64 disk_sectors = ptr_disk_sectors(sectors, p); ++ ++ if (flags & BTREE_TRIGGER_OVERWRITE) ++ disk_sectors = -disk_sectors; ++ ++ ret = bch2_mark_pointer(trans, k, p, disk_sectors, ++ data_type, flags); ++ if (ret < 0) ++ return ret; ++ ++ stale = ret > 0; ++ ++ if (p.ptr.cached) { ++ if (!stale) { ++ ret = update_cached_sectors(c, k, p.ptr.dev, ++ disk_sectors, journal_seq, true); ++ if (ret) { ++ bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); ++ return ret; ++ } ++ } ++ } else if (!p.has_ec) { ++ dirty_sectors += disk_sectors; ++ r.e.devs[r.e.nr_devs++] = p.ptr.dev; ++ } else { ++ ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type, ++ disk_sectors, flags); ++ if (ret) ++ return ret; ++ ++ /* ++ * There may be other dirty pointers in this extent, but ++ * if so they're not required for mounting if we have an ++ * erasure coded pointer in this extent: ++ */ ++ r.e.nr_required = 0; ++ } ++ } ++ ++ if (r.e.nr_devs) { ++ ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); ++ if (ret) { ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_bkey_val_to_text(&buf, c, k); ++ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); ++ printbuf_exit(&buf); ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++int bch2_mark_stripe(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ u64 journal_seq = trans->journal_res.seq; ++ struct bch_fs *c = trans->c; ++ u64 idx = new.k->p.offset; ++ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe ++ ? bkey_s_c_to_stripe(old).v : NULL; ++ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe ++ ? 
bkey_s_c_to_stripe(new).v : NULL; ++ unsigned i; ++ int ret; ++ ++ BUG_ON(gc && old_s); ++ ++ if (!gc) { ++ struct stripe *m = genradix_ptr(&c->stripes, idx); ++ ++ if (!m || (old_s && !m->alive)) { ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ ++ bch2_bkey_val_to_text(&buf1, c, old); ++ bch2_bkey_val_to_text(&buf2, c, new); ++ bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" ++ "old %s\n" ++ "new %s", idx, buf1.buf, buf2.buf); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ bch2_inconsistent_error(c); ++ return -1; ++ } ++ ++ if (!new_s) { ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_del(c, m, idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ memset(m, 0, sizeof(*m)); ++ } else { ++ m->alive = true; ++ m->sectors = le16_to_cpu(new_s->sectors); ++ m->algorithm = new_s->algorithm; ++ m->nr_blocks = new_s->nr_blocks; ++ m->nr_redundant = new_s->nr_redundant; ++ m->blocks_nonempty = 0; ++ ++ for (i = 0; i < new_s->nr_blocks; i++) ++ m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_update(c, m, idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } ++ } else { ++ struct gc_stripe *m = ++ genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); ++ ++ if (!m) { ++ bch_err(c, "error allocating memory for gc_stripes, idx %llu", ++ idx); ++ return -ENOMEM; ++ } ++ /* ++ * This will be wrong when we bring back runtime gc: we should ++ * be unmarking the old key and then marking the new key ++ */ ++ m->alive = true; ++ m->sectors = le16_to_cpu(new_s->sectors); ++ m->nr_blocks = new_s->nr_blocks; ++ m->nr_redundant = new_s->nr_redundant; ++ ++ for (i = 0; i < new_s->nr_blocks; i++) ++ m->ptrs[i] = new_s->ptrs[i]; ++ ++ bch2_bkey_to_replicas(&m->r.e, new); ++ ++ /* ++ * gc recalculates this field from stripe ptr ++ * references: ++ */ ++ memset(m->block_sectors, 0, sizeof(m->block_sectors)); ++ ++ for (i = 0; i < new_s->nr_blocks; i++) { ++ ret = mark_stripe_bucket(trans, new, i, flags); ++ if (ret) ++ return ret; ++ } ++ ++ ret = update_replicas(c, new, &m->r.e, ++ ((s64) m->sectors * m->nr_redundant), ++ journal_seq, gc); ++ if (ret) { ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_bkey_val_to_text(&buf, c, new); ++ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); ++ printbuf_exit(&buf); ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++int bch2_mark_inode(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_fs_usage __percpu *fs_usage; ++ u64 journal_seq = trans->journal_res.seq; ++ ++ if (flags & BTREE_TRIGGER_INSERT) { ++ struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v; ++ ++ BUG_ON(!journal_seq); ++ BUG_ON(new.k->type != KEY_TYPE_inode_v2); ++ ++ v->bi_journal_seq = cpu_to_le64(journal_seq); ++ } ++ ++ if (flags & BTREE_TRIGGER_GC) { ++ percpu_down_read(&c->mark_lock); ++ preempt_disable(); ++ ++ fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); ++ fs_usage->nr_inodes += bkey_is_inode(new.k); ++ fs_usage->nr_inodes -= bkey_is_inode(old.k); ++ ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ } ++ return 0; ++} ++ ++int bch2_mark_reservation(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old: new; ++ struct bch_fs_usage __percpu *fs_usage; ++ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; ++ s64 sectors = (s64) k.k->size; ++ ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ ++ if (flags & BTREE_TRIGGER_OVERWRITE) ++ sectors = -sectors; ++ sectors *= replicas; ++ ++ percpu_down_read(&c->mark_lock); ++ preempt_disable(); ++ ++ fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); ++ replicas = clamp_t(unsigned, replicas, 1, ++ ARRAY_SIZE(fs_usage->persistent_reserved)); ++ ++ fs_usage->reserved += sectors; ++ fs_usage->persistent_reserved[replicas - 1] += sectors; ++ ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ ++ return 0; ++} ++ ++static s64 __bch2_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c_reflink_p p, ++ u64 start, u64 end, ++ u64 *idx, unsigned flags, size_t r_idx) ++{ ++ struct bch_fs *c = trans->c; ++ struct reflink_gc *r; ++ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; ++ u64 next_idx = end; ++ s64 ret = 0; ++ struct printbuf buf = PRINTBUF; ++ ++ if (r_idx >= c->reflink_gc_nr) ++ goto not_found; ++ ++ r = genradix_ptr(&c->reflink_gc_table, r_idx); ++ next_idx = min(next_idx, r->offset - r->size); ++ if (*idx < next_idx) ++ goto not_found; ++ ++ BUG_ON((s64) r->refcount + add < 0); ++ ++ r->refcount += add; ++ *idx = r->offset; ++ return 0; ++not_found: ++ if (fsck_err(c, "pointer to missing indirect extent\n" ++ " %s\n" ++ " missing range %llu-%llu", ++ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), ++ *idx, next_idx)) { ++ struct bkey_i_error new; ++ ++ bkey_init(&new.k); ++ new.k.type = KEY_TYPE_error; ++ new.k.p = bkey_start_pos(p.k); ++ new.k.p.offset += *idx - start; ++ bch2_key_resize(&new.k, next_idx - *idx); ++ ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i); ++ } ++ ++ *idx = next_idx; ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old: new; ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ struct reflink_gc *ref; ++ size_t l, r, m; ++ u64 idx = le64_to_cpu(p.v->idx), start = idx; ++ u64 end = le64_to_cpu(p.v->idx) + p.k->size; ++ int ret = 0; ++ ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ ++ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) { ++ idx -= le32_to_cpu(p.v->front_pad); ++ end += le32_to_cpu(p.v->back_pad); ++ } ++ ++ l = 0; ++ r = c->reflink_gc_nr; ++ while (l < r) { ++ m = l + (r - l) / 2; ++ ++ ref = genradix_ptr(&c->reflink_gc_table, m); ++ if (ref->offset <= idx) ++ l = m + 1; ++ else ++ r = m; ++ } ++ ++ while (idx < end && !ret) ++ ret = __bch2_mark_reflink_p(trans, p, start, end, ++ &idx, flags, l++); ++ ++ return ret; ++} ++ ++static noinline __cold ++void fs_usage_apply_warn(struct btree_trans *trans, ++ unsigned disk_res_sectors, ++ s64 should_not_have_added) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ struct printbuf buf = PRINTBUF; ++ ++ bch_err(c, "disk usage increased %lli more than %u sectors reserved", ++ should_not_have_added, disk_res_sectors); ++ ++ trans_for_each_update(trans, i) { ++ struct bkey_s_c old = { &i->old_k, i->old_v }; ++ ++ pr_err("while inserting"); ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); ++ pr_err(" %s", buf.buf); ++ pr_err("overlapping with"); ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, old); ++ pr_err(" %s", buf.buf); ++ } ++ ++ __WARN(); ++ printbuf_exit(&buf); ++} ++ ++int bch2_trans_fs_usage_apply(struct btree_trans *trans, ++ struct replicas_delta_list *deltas) ++{ ++ struct bch_fs *c = trans->c; ++ static int warned_disk_usage = 0; ++ bool warn = false; ++ unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; ++ struct replicas_delta *d = deltas->d, *d2; ++ struct replicas_delta *top = (void *) deltas->d + deltas->used; ++ struct bch_fs_usage *dst; ++ s64 added = 0, should_not_have_added; ++ unsigned i; ++ ++ percpu_down_read(&c->mark_lock); ++ preempt_disable(); ++ dst = fs_usage_ptr(c, trans->journal_res.seq, false); ++ ++ for (d = deltas->d; d != top; d = replicas_delta_next(d)) { ++ switch (d->r.data_type) { ++ case BCH_DATA_btree: ++ case BCH_DATA_user: ++ case BCH_DATA_parity: ++ added += d->delta; ++ } ++ ++ if (__update_replicas(c, dst, &d->r, d->delta)) ++ goto need_mark; ++ } ++ ++ dst->nr_inodes += deltas->nr_inodes; ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) { ++ added += deltas->persistent_reserved[i]; ++ dst->reserved += deltas->persistent_reserved[i]; ++ dst->persistent_reserved[i] += deltas->persistent_reserved[i]; ++ } ++ ++ /* ++ * Not allowed to reduce sectors_available except by getting a ++ * reservation: ++ */ ++ should_not_have_added = added - (s64) disk_res_sectors; ++ if (unlikely(should_not_have_added > 0)) { ++ u64 old, new, v = atomic64_read(&c->sectors_available); ++ ++ do { ++ old = v; ++ new = max_t(s64, 0, old - should_not_have_added); ++ } while ((v = atomic64_cmpxchg(&c->sectors_available, ++ old, new)) != old); ++ ++ added -= should_not_have_added; ++ warn = true; ++ } ++ ++ if (added > 0) { ++ trans->disk_res->sectors -= added; ++ this_cpu_sub(*c->online_reserved, added); ++ } ++ ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ ++ if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) ++ fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added); ++ return 0; ++need_mark: ++ /* revert changes: */ ++ for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2)) ++ 
BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); ++ ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ return -1; ++} ++ ++/* trans_mark: */ ++ ++static int bch2_trans_mark_pointer(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c k, struct extent_ptr_decoded p, ++ unsigned flags) ++{ ++ bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); ++ struct btree_iter iter; ++ struct bkey_i_alloc_v4 *a; ++ struct bpos bucket_pos; ++ struct bch_backpointer bp; ++ s64 sectors; ++ int ret; ++ ++ bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket_pos, &bp); ++ sectors = bp.bucket_len; ++ if (!insert) ++ sectors = -sectors; ++ ++ a = bch2_trans_start_alloc_update(trans, &iter, bucket_pos); ++ if (IS_ERR(a)) ++ return PTR_ERR(a); ++ ++ ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type, ++ a->v.gen, &a->v.data_type, ++ &a->v.dirty_sectors, &a->v.cached_sectors); ++ if (ret) ++ goto err; ++ ++ if (!p.ptr.cached) { ++ ret = insert ++ ? bch2_bucket_backpointer_add(trans, a, bp, k) ++ : bch2_bucket_backpointer_del(trans, a, bp, k); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, ++ struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type data_type) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_stripe *s; ++ struct bch_replicas_padded r; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_stripe) { ++ bch2_trans_inconsistent(trans, ++ "pointer to nonexistent stripe %llu", ++ (u64) p.ec.idx); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { ++ bch2_trans_inconsistent(trans, ++ "stripe pointer doesn't match stripe %llu", ++ (u64) p.ec.idx); ++ ret = -EIO; ++ goto err; ++ } ++ ++ s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(s); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&s->k_i, k); ++ stripe_blockcount_set(&s->v, p.ec.block, ++ stripe_blockcount_get(&s->v, p.ec.block) + ++ sectors); ++ ++ ret = bch2_trans_update(trans, &iter, &s->k_i, 0); ++ if (ret) ++ goto err; ++ ++ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); ++ r.e.data_type = data_type; ++ update_replicas_list(trans, &r.e, sectors); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_trans_mark_extent(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? old ++ : bkey_i_to_s_c(new); ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_replicas_padded r; ++ enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ++ ? BCH_DATA_btree ++ : BCH_DATA_user; ++ s64 sectors = bkey_is_btree_ptr(k.k) ++ ? 
btree_sectors(c) ++ : k.k->size; ++ s64 dirty_sectors = 0; ++ bool stale; ++ int ret; ++ ++ r.e.data_type = data_type; ++ r.e.nr_devs = 0; ++ r.e.nr_required = 1; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ s64 disk_sectors = ptr_disk_sectors(sectors, p); ++ ++ if (flags & BTREE_TRIGGER_OVERWRITE) ++ disk_sectors = -disk_sectors; ++ ++ ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags); ++ if (ret < 0) ++ return ret; ++ ++ stale = ret > 0; ++ ++ if (p.ptr.cached) { ++ if (!stale) ++ update_cached_sectors_list(trans, p.ptr.dev, ++ disk_sectors); ++ } else if (!p.has_ec) { ++ dirty_sectors += disk_sectors; ++ r.e.devs[r.e.nr_devs++] = p.ptr.dev; ++ } else { ++ ret = bch2_trans_mark_stripe_ptr(trans, p, ++ disk_sectors, data_type); ++ if (ret) ++ return ret; ++ ++ r.e.nr_required = 0; ++ } ++ } ++ ++ if (r.e.nr_devs) ++ update_replicas_list(trans, &r.e, dirty_sectors); ++ ++ return 0; ++} ++ ++static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, ++ struct bkey_s_c_stripe s, ++ unsigned idx, bool deleting) ++{ ++ struct bch_fs *c = trans->c; ++ const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; ++ struct btree_iter iter; ++ struct bkey_i_alloc_v4 *a; ++ enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant ++ ? BCH_DATA_parity : 0; ++ s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; ++ int ret = 0; ++ ++ if (deleting) ++ sectors = -sectors; ++ ++ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); ++ if (IS_ERR(a)) ++ return PTR_ERR(a); ++ ++ ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, ++ a->v.gen, a->v.data_type, ++ a->v.dirty_sectors, a->v.cached_sectors); ++ if (ret) ++ goto err; ++ ++ if (!deleting) { ++ if (bch2_trans_inconsistent_on(a->v.stripe || ++ a->v.stripe_redundancy, trans, ++ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ bch2_data_types[a->v.data_type], ++ a->v.dirty_sectors, ++ a->v.stripe, s.k->p.offset)) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, ++ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ bch2_data_types[a->v.data_type], ++ a->v.dirty_sectors, ++ s.k->p.offset)) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ a->v.stripe = s.k->p.offset; ++ a->v.stripe_redundancy = s.v->nr_redundant; ++ } else { ++ if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || ++ a->v.stripe_redundancy != s.v->nr_redundant, trans, ++ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ s.k->p.offset, a->v.stripe)) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ a->v.stripe = 0; ++ a->v.stripe_redundancy = 0; ++ } ++ ++ a->v.dirty_sectors += sectors; ++ if (data_type) ++ a->v.data_type = !deleting ? 
data_type : 0; ++ ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++ if (ret) ++ goto err; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_trans_mark_stripe(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ const struct bch_stripe *old_s = NULL; ++ struct bch_stripe *new_s = NULL; ++ struct bch_replicas_padded r; ++ unsigned i, nr_blocks; ++ int ret = 0; ++ ++ if (old.k->type == KEY_TYPE_stripe) ++ old_s = bkey_s_c_to_stripe(old).v; ++ if (new->k.type == KEY_TYPE_stripe) ++ new_s = &bkey_i_to_stripe(new)->v; ++ ++ /* ++ * If the pointers aren't changing, we don't need to do anything: ++ */ ++ if (new_s && old_s && ++ new_s->nr_blocks == old_s->nr_blocks && ++ new_s->nr_redundant == old_s->nr_redundant && ++ !memcmp(old_s->ptrs, new_s->ptrs, ++ new_s->nr_blocks * sizeof(struct bch_extent_ptr))) ++ return 0; ++ ++ BUG_ON(new_s && old_s && ++ (new_s->nr_blocks != old_s->nr_blocks || ++ new_s->nr_redundant != old_s->nr_redundant)); ++ ++ nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; ++ ++ if (new_s) { ++ s64 sectors = le16_to_cpu(new_s->sectors); ++ ++ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); ++ update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); ++ } ++ ++ if (old_s) { ++ s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); ++ ++ bch2_bkey_to_replicas(&r.e, old); ++ update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); ++ } ++ ++ for (i = 0; i < nr_blocks; i++) { ++ if (new_s && old_s && ++ !memcmp(&new_s->ptrs[i], ++ &old_s->ptrs[i], ++ sizeof(new_s->ptrs[i]))) ++ continue; ++ ++ if (new_s) { ++ ret = bch2_trans_mark_stripe_bucket(trans, ++ bkey_i_to_s_c_stripe(new), i, false); ++ if (ret) ++ break; ++ } ++ ++ if (old_s) { ++ ret = bch2_trans_mark_stripe_bucket(trans, ++ bkey_s_c_to_stripe(old), i, true); ++ if (ret) ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++int bch2_trans_mark_inode(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned flags) ++{ ++ int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); ++ ++ if (nr) { ++ struct replicas_delta_list *d = ++ replicas_deltas_realloc(trans, 0); ++ d->nr_inodes += nr; ++ } ++ ++ return 0; ++} ++ ++int bch2_trans_mark_reservation(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned flags) ++{ ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? old ++ : bkey_i_to_s_c(new); ++ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; ++ s64 sectors = (s64) k.k->size; ++ struct replicas_delta_list *d; ++ ++ if (flags & BTREE_TRIGGER_OVERWRITE) ++ sectors = -sectors; ++ sectors *= replicas; ++ ++ d = replicas_deltas_realloc(trans, 0); ++ ++ replicas = clamp_t(unsigned, replicas, 1, ++ ARRAY_SIZE(d->persistent_reserved)); ++ ++ d->persistent_reserved[replicas - 1] += sectors; ++ return 0; ++} ++ ++static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c_reflink_p p, ++ u64 *idx, unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i *n; ++ __le64 *refcount; ++ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(n, k); ++ ++ refcount = bkey_refcount(n); ++ if (!refcount) { ++ bch2_bkey_val_to_text(&buf, c, p.s_c); ++ bch2_trans_inconsistent(trans, ++ "nonexistent indirect extent at %llu while marking\n %s", ++ *idx, buf.buf); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { ++ bch2_bkey_val_to_text(&buf, c, p.s_c); ++ bch2_trans_inconsistent(trans, ++ "indirect extent refcount underflow at %llu while marking\n %s", ++ *idx, buf.buf); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (flags & BTREE_TRIGGER_INSERT) { ++ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; ++ u64 pad; ++ ++ pad = max_t(s64, le32_to_cpu(v->front_pad), ++ le64_to_cpu(v->idx) - bkey_start_offset(k.k)); ++ BUG_ON(pad > U32_MAX); ++ v->front_pad = cpu_to_le32(pad); ++ ++ pad = max_t(s64, le32_to_cpu(v->back_pad), ++ k.k->p.offset - p.k->size - le64_to_cpu(v->idx)); ++ BUG_ON(pad > U32_MAX); ++ v->back_pad = cpu_to_le32(pad); ++ } ++ ++ le64_add_cpu(refcount, add); ++ ++ bch2_btree_iter_set_pos_to_extent_start(&iter); ++ ret = bch2_trans_update(trans, &iter, n, 0); ++ if (ret) ++ goto err; ++ ++ *idx = k.k->p.offset; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_trans_mark_reflink_p(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned flags) ++{ ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? 
old ++ : bkey_i_to_s_c(new); ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ u64 idx, end_idx; ++ int ret = 0; ++ ++ if (flags & BTREE_TRIGGER_INSERT) { ++ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; ++ ++ v->front_pad = v->back_pad = 0; ++ } ++ ++ idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); ++ end_idx = le64_to_cpu(p.v->idx) + p.k->size + ++ le32_to_cpu(p.v->back_pad); ++ ++ while (idx < end_idx && !ret) ++ ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags); ++ ++ return ret; ++} ++ ++static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, ++ struct bch_dev *ca, size_t b, ++ enum bch_data_type type, ++ unsigned sectors) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_i_alloc_v4 *a; ++ int ret = 0; ++ ++ /* ++ * Backup superblock might be past the end of our normal usable space: ++ */ ++ if (b >= ca->mi.nbuckets) ++ return 0; ++ ++ a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b)); ++ if (IS_ERR(a)) ++ return PTR_ERR(a); ++ ++ if (a->v.data_type && a->v.data_type != type) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" ++ "while marking %s", ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ bch2_data_types[a->v.data_type], ++ bch2_data_types[type], ++ bch2_data_types[type]); ++ ret = -EIO; ++ goto out; ++ } ++ ++ a->v.data_type = type; ++ a->v.dirty_sectors = sectors; ++ ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++ if (ret) ++ goto out; ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, ++ struct bch_dev *ca, size_t b, ++ enum bch_data_type type, ++ unsigned sectors) ++{ ++ return commit_do(trans, NULL, NULL, 0, ++ __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); ++} ++ ++static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, ++ struct bch_dev *ca, ++ u64 start, u64 end, ++ enum bch_data_type type, ++ u64 *bucket, unsigned *bucket_sectors) ++{ ++ do { ++ u64 b = sector_to_bucket(ca, start); ++ unsigned sectors = ++ min_t(u64, bucket_to_sector(ca, b + 1), end) - start; ++ ++ if (b != *bucket && *bucket_sectors) { ++ int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, ++ type, *bucket_sectors); ++ if (ret) ++ return ret; ++ ++ *bucket_sectors = 0; ++ } ++ ++ *bucket = b; ++ *bucket_sectors += sectors; ++ start += sectors; ++ } while (start < end); ++ ++ return 0; ++} ++ ++static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, ++ struct bch_dev *ca) ++{ ++ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; ++ u64 bucket = 0; ++ unsigned i, bucket_sectors = 0; ++ int ret; ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ ++ if (offset == BCH_SB_SECTOR) { ++ ret = bch2_trans_mark_metadata_sectors(trans, ca, ++ 0, BCH_SB_SECTOR, ++ BCH_DATA_sb, &bucket, &bucket_sectors); ++ if (ret) ++ return ret; ++ } ++ ++ ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, ++ offset + (1 << layout->sb_max_size_bits), ++ BCH_DATA_sb, &bucket, &bucket_sectors); ++ if (ret) ++ return ret; ++ } ++ ++ if (bucket_sectors) { ++ ret = bch2_trans_mark_metadata_bucket(trans, ca, ++ bucket, BCH_DATA_sb, bucket_sectors); ++ if (ret) ++ return ret; ++ } ++ ++ for (i = 0; i < ca->journal.nr; i++) { ++ ret = bch2_trans_mark_metadata_bucket(trans, ca, ++ ca->journal.buckets[i], ++ BCH_DATA_journal, ca->mi.bucket_size); ++ if 
(ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) ++{ ++ return bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); ++} ++ ++/* Disk reservations: */ ++ ++#define SECTORS_CACHE 1024 ++ ++int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, ++ u64 sectors, int flags) ++{ ++ struct bch_fs_pcpu *pcpu; ++ u64 old, v, get; ++ s64 sectors_available; ++ int ret; ++ ++ percpu_down_read(&c->mark_lock); ++ preempt_disable(); ++ pcpu = this_cpu_ptr(c->pcpu); ++ ++ if (sectors <= pcpu->sectors_available) ++ goto out; ++ ++ v = atomic64_read(&c->sectors_available); ++ do { ++ old = v; ++ get = min((u64) sectors + SECTORS_CACHE, old); ++ ++ if (get < sectors) { ++ preempt_enable(); ++ goto recalculate; ++ } ++ } while ((v = atomic64_cmpxchg(&c->sectors_available, ++ old, old - get)) != old); ++ ++ pcpu->sectors_available += get; ++ ++out: ++ pcpu->sectors_available -= sectors; ++ this_cpu_add(*c->online_reserved, sectors); ++ res->sectors += sectors; ++ ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ return 0; ++ ++recalculate: ++ mutex_lock(&c->sectors_available_lock); ++ ++ percpu_u64_set(&c->pcpu->sectors_available, 0); ++ sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); ++ ++ if (sectors <= sectors_available || ++ (flags & BCH_DISK_RESERVATION_NOFAIL)) { ++ atomic64_set(&c->sectors_available, ++ max_t(s64, 0, sectors_available - sectors)); ++ this_cpu_add(*c->online_reserved, sectors); ++ res->sectors += sectors; ++ ret = 0; ++ } else { ++ atomic64_set(&c->sectors_available, sectors_available); ++ ret = -ENOSPC; ++ } ++ ++ mutex_unlock(&c->sectors_available_lock); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++/* Startup/shutdown: */ ++ ++static void bucket_gens_free_rcu(struct rcu_head *rcu) ++{ ++ struct bucket_gens *buckets = ++ container_of(rcu, struct bucket_gens, rcu); ++ ++ kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); ++} ++ ++int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ++{ ++ struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; ++ unsigned long *buckets_nouse = NULL; ++ bool resize = ca->bucket_gens != NULL; ++ int ret = -ENOMEM; ++ ++ if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, ++ GFP_KERNEL|__GFP_ZERO)) || ++ (c->opts.buckets_nouse && ++ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * ++ sizeof(unsigned long), ++ GFP_KERNEL|__GFP_ZERO)))) ++ goto err; ++ ++ bucket_gens->first_bucket = ca->mi.first_bucket; ++ bucket_gens->nbuckets = nbuckets; ++ ++ bch2_copygc_stop(c); ++ ++ if (resize) { ++ down_write(&c->gc_lock); ++ down_write(&ca->bucket_lock); ++ percpu_down_write(&c->mark_lock); ++ } ++ ++ old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); ++ ++ if (resize) { ++ size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); ++ ++ memcpy(bucket_gens->b, ++ old_bucket_gens->b, ++ n); ++ if (buckets_nouse) ++ memcpy(buckets_nouse, ++ ca->buckets_nouse, ++ BITS_TO_LONGS(n) * sizeof(unsigned long)); ++ } ++ ++ rcu_assign_pointer(ca->bucket_gens, bucket_gens); ++ bucket_gens = old_bucket_gens; ++ ++ swap(ca->buckets_nouse, buckets_nouse); ++ ++ nbuckets = ca->mi.nbuckets; ++ ++ if (resize) { ++ percpu_up_write(&c->mark_lock); ++ up_write(&ca->bucket_lock); ++ up_write(&c->gc_lock); ++ } ++ ++ ret = 0; ++err: ++ kvpfree(buckets_nouse, ++ BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); ++ if (bucket_gens) ++ call_rcu(&bucket_gens->rcu, 
bucket_gens_free_rcu); ++ ++ return ret; ++} ++ ++void bch2_dev_buckets_free(struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ kvpfree(ca->buckets_nouse, ++ BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); ++ kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), ++ sizeof(struct bucket_gens) + ca->mi.nbuckets); ++ ++ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) ++ free_percpu(ca->usage[i]); ++ kfree(ca->usage_base); ++} ++ ++int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); ++ if (!ca->usage_base) ++ return -ENOMEM; ++ ++ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { ++ ca->usage[i] = alloc_percpu(struct bch_dev_usage); ++ if (!ca->usage[i]) ++ return -ENOMEM; ++ } ++ ++ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; ++} +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +new file mode 100644 +index 000000000000..6881502d95f1 +--- /dev/null ++++ b/fs/bcachefs/buckets.h +@@ -0,0 +1,300 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Code for manipulating bucket marks for garbage collection. ++ * ++ * Copyright 2014 Datera, Inc. ++ */ ++ ++#ifndef _BUCKETS_H ++#define _BUCKETS_H ++ ++#include "buckets_types.h" ++#include "extents.h" ++#include "super.h" ++ ++#define for_each_bucket(_b, _buckets) \ ++ for (_b = (_buckets)->b + (_buckets)->first_bucket; \ ++ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) ++ ++static inline void bucket_unlock(struct bucket *b) ++{ ++ smp_store_release(&b->lock, 0); ++} ++ ++static inline void bucket_lock(struct bucket *b) ++{ ++ while (xchg(&b->lock, 1)) ++ cpu_relax(); ++} ++ ++static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) ++{ ++ return rcu_dereference_check(ca->buckets_gc, ++ !ca->fs || ++ percpu_rwsem_is_held(&ca->fs->mark_lock) || ++ lockdep_is_held(&ca->fs->gc_lock) || ++ lockdep_is_held(&ca->bucket_lock)); ++} ++ ++static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) ++{ ++ struct bucket_array *buckets = gc_bucket_array(ca); ++ ++ BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); ++ return buckets->b + b; ++} ++ ++static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) ++{ ++ return rcu_dereference_check(ca->bucket_gens, ++ !ca->fs || ++ percpu_rwsem_is_held(&ca->fs->mark_lock) || ++ lockdep_is_held(&ca->fs->gc_lock) || ++ lockdep_is_held(&ca->bucket_lock)); ++} ++ ++static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) ++{ ++ struct bucket_gens *gens = bucket_gens(ca); ++ ++ BUG_ON(b < gens->first_bucket || b >= gens->nbuckets); ++ return gens->b + b; ++} ++ ++static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ return sector_to_bucket(ca, ptr->offset); ++} ++ ++static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, ++ const struct bch_extent_ptr *ptr) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); ++} ++ ++static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c, ++ const struct bch_extent_ptr *ptr, ++ u32 *bucket_offset) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset)); ++} ++ ++static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr)); ++} ++ ++static inline enum bch_data_type ptr_data_type(const struct bkey *k, ++ 
const struct bch_extent_ptr *ptr) ++{ ++ if (bkey_is_btree_ptr(k)) ++ return BCH_DATA_btree; ++ ++ return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; ++} ++ ++static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) ++{ ++ EBUG_ON(sectors < 0); ++ ++ return crc_is_compressed(p.crc) ++ ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, ++ p.crc.uncompressed_size) ++ : sectors; ++} ++ ++static inline int gen_cmp(u8 a, u8 b) ++{ ++ return (s8) (a - b); ++} ++ ++static inline int gen_after(u8 a, u8 b) ++{ ++ int r = gen_cmp(a, b); ++ ++ return r > 0 ? r : 0; ++} ++ ++/** ++ * ptr_stale() - check if a pointer points into a bucket that has been ++ * invalidated. ++ */ ++static inline u8 ptr_stale(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ u8 ret; ++ ++ rcu_read_lock(); ++ ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++/* Device usage: */ ++ ++struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); ++void bch2_dev_usage_init(struct bch_dev *); ++ ++static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reserve reserve) ++{ ++ s64 reserved = 0; ++ ++ switch (reserve) { ++ case RESERVE_none: ++ reserved += ca->mi.nbuckets >> 6; ++ fallthrough; ++ case RESERVE_movinggc: ++ reserved += ca->nr_btree_reserve; ++ fallthrough; ++ case RESERVE_btree: ++ reserved += ca->nr_btree_reserve; ++ fallthrough; ++ case RESERVE_btree_movinggc: ++ break; ++ } ++ ++ return reserved; ++} ++ ++static inline u64 dev_buckets_free(struct bch_dev *ca, ++ struct bch_dev_usage usage, ++ enum alloc_reserve reserve) ++{ ++ return max_t(s64, 0, ++ usage.d[BCH_DATA_free].buckets - ++ ca->nr_open_buckets - ++ bch2_dev_buckets_reserved(ca, reserve)); ++} ++ ++static inline u64 __dev_buckets_available(struct bch_dev *ca, ++ struct bch_dev_usage usage, ++ enum alloc_reserve reserve) ++{ ++ return max_t(s64, 0, ++ usage.d[BCH_DATA_free].buckets ++ + usage.d[BCH_DATA_cached].buckets ++ + usage.d[BCH_DATA_need_gc_gens].buckets ++ + usage.d[BCH_DATA_need_discard].buckets ++ - ca->nr_open_buckets ++ - bch2_dev_buckets_reserved(ca, reserve)); ++} ++ ++static inline u64 dev_buckets_available(struct bch_dev *ca, ++ enum alloc_reserve reserve) ++{ ++ return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve); ++} ++ ++/* Filesystem usage: */ ++ ++static inline unsigned fs_usage_u64s(struct bch_fs *c) ++{ ++ return sizeof(struct bch_fs_usage) / sizeof(u64) + ++ READ_ONCE(c->replicas.nr); ++} ++ ++static inline unsigned dev_usage_u64s(void) ++{ ++ return sizeof(struct bch_dev_usage) / sizeof(u64); ++} ++ ++u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); ++ ++struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *); ++ ++void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); ++ ++void bch2_fs_usage_to_text(struct printbuf *, ++ struct bch_fs *, struct bch_fs_usage_online *); ++ ++u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *); ++ ++struct bch_fs_usage_short ++bch2_fs_usage_read_short(struct bch_fs *); ++ ++/* key/bucket marking: */ ++ ++void bch2_fs_usage_initialize(struct bch_fs *); ++ ++int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, ++ size_t, enum bch_data_type, unsigned, ++ struct gc_pos, unsigned); ++ ++int bch2_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_stripe(struct btree_trans *, struct 
bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++ ++int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); ++ ++int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++ ++int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); ++ ++int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, ++ size_t, enum bch_data_type, unsigned); ++int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); ++ ++/* disk reservations: */ ++ ++static inline void bch2_disk_reservation_put(struct bch_fs *c, ++ struct disk_reservation *res) ++{ ++ this_cpu_sub(*c->online_reserved, res->sectors); ++ res->sectors = 0; ++} ++ ++#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) ++ ++int bch2_disk_reservation_add(struct bch_fs *, ++ struct disk_reservation *, ++ u64, int); ++ ++static inline struct disk_reservation ++bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) ++{ ++ return (struct disk_reservation) { ++ .sectors = 0, ++#if 0 ++ /* not used yet: */ ++ .gen = c->capacity_gen, ++#endif ++ .nr_replicas = nr_replicas, ++ }; ++} ++ ++static inline int bch2_disk_reservation_get(struct bch_fs *c, ++ struct disk_reservation *res, ++ u64 sectors, unsigned nr_replicas, ++ int flags) ++{ ++ *res = bch2_disk_reservation_init(c, nr_replicas); ++ ++ return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); ++} ++ ++#define RESERVE_FACTOR 6 ++ ++static inline u64 avail_factor(u64 r) ++{ ++ return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); ++} ++ ++int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); ++void bch2_dev_buckets_free(struct bch_dev *); ++int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); ++ ++#endif /* _BUCKETS_H */ +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +new file mode 100644 +index 000000000000..1dbba7d906dd +--- /dev/null ++++ b/fs/bcachefs/buckets_types.h +@@ -0,0 +1,103 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BUCKETS_TYPES_H ++#define _BUCKETS_TYPES_H ++ ++#include "bcachefs_format.h" ++#include "util.h" ++ ++#define BUCKET_JOURNAL_SEQ_BITS 16 ++ ++struct bucket { ++ u8 lock; ++ u8 gen_valid:1; ++ u8 data_type:7; ++ u8 gen; ++ u8 stripe_redundancy; ++ u32 stripe; ++ u32 dirty_sectors; ++ u32 cached_sectors; ++}; ++ ++struct bucket_array { ++ struct rcu_head rcu; ++ u16 first_bucket; ++ size_t nbuckets; ++ struct bucket b[]; ++}; ++ ++struct bucket_gens { ++ struct rcu_head rcu; ++ u16 first_bucket; ++ size_t nbuckets; ++ u8 b[]; ++}; ++ ++struct bch_dev_usage { ++ u64 buckets_ec; ++ ++ struct { ++ u64 buckets; ++ u64 sectors; /* _compressed_ sectors: */ ++ /* ++ * XXX ++ * Why do we have this? 
Isn't it just buckets * bucket_size - ++ * sectors? ++ */ ++ u64 fragmented; ++ } d[BCH_DATA_NR]; ++}; ++ ++struct bch_fs_usage { ++ /* all fields are in units of 512 byte sectors: */ ++ u64 hidden; ++ u64 btree; ++ u64 data; ++ u64 cached; ++ u64 reserved; ++ u64 nr_inodes; ++ ++ /* XXX: add stats for compression ratio */ ++#if 0 ++ u64 uncompressed; ++ u64 compressed; ++#endif ++ ++ /* broken out: */ ++ ++ u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ u64 replicas[]; ++}; ++ ++struct bch_fs_usage_online { ++ u64 online_reserved; ++ struct bch_fs_usage u; ++}; ++ ++struct bch_fs_usage_short { ++ u64 capacity; ++ u64 used; ++ u64 free; ++ u64 nr_inodes; ++}; ++ ++/* ++ * A reservation for space on disk: ++ */ ++struct disk_reservation { ++ u64 sectors; ++ u32 gen; ++ unsigned nr_replicas; ++}; ++ ++struct copygc_heap_entry { ++ u8 dev; ++ u8 gen; ++ u8 replicas; ++ u32 fragmentation; ++ u32 sectors; ++ u64 bucket; ++}; ++ ++typedef HEAP(struct copygc_heap_entry) copygc_heap; ++ ++#endif /* _BUCKETS_TYPES_H */ +diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c +new file mode 100644 +index 000000000000..2e5b955080de +--- /dev/null ++++ b/fs/bcachefs/buckets_waiting_for_journal.c +@@ -0,0 +1,167 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "buckets_waiting_for_journal.h" ++#include ++ ++static inline struct bucket_hashed * ++bucket_hash(struct buckets_waiting_for_journal_table *t, ++ unsigned hash_seed_idx, u64 dev_bucket) ++{ ++ unsigned h = siphash_1u64(dev_bucket, &t->hash_seeds[hash_seed_idx]); ++ ++ BUG_ON(!is_power_of_2(t->size)); ++ ++ return t->d + (h & (t->size - 1)); ++} ++ ++static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t size) ++{ ++ unsigned i; ++ ++ t->size = size; ++ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) ++ get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i])); ++ memset(t->d, 0, sizeof(t->d[0]) * size); ++} ++ ++bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, ++ u64 flushed_seq, ++ unsigned dev, u64 bucket) ++{ ++ struct buckets_waiting_for_journal_table *t; ++ u64 dev_bucket = (u64) dev << 56 | bucket; ++ bool ret = false; ++ unsigned i; ++ ++ mutex_lock(&b->lock); ++ t = b->t; ++ ++ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { ++ struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); ++ ++ if (h->dev_bucket == dev_bucket) { ++ ret = h->journal_seq > flushed_seq; ++ break; ++ } ++ } ++ ++ mutex_unlock(&b->lock); ++ ++ return ret; ++} ++ ++static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t, ++ struct bucket_hashed *new, ++ u64 flushed_seq) ++{ ++ struct bucket_hashed *last_evicted = NULL; ++ unsigned tries, i; ++ ++ for (tries = 0; tries < 10; tries++) { ++ struct bucket_hashed *old, *victim = NULL; ++ ++ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { ++ old = bucket_hash(t, i, new->dev_bucket); ++ ++ if (old->dev_bucket == new->dev_bucket || ++ old->journal_seq <= flushed_seq) { ++ *old = *new; ++ return true; ++ } ++ ++ if (last_evicted != old) ++ victim = old; ++ } ++ ++ /* hashed to same slot 3 times: */ ++ if (!victim) ++ break; ++ ++ /* Failed to find an empty slot: */ ++ swap(*new, *victim); ++ last_evicted = victim; ++ } ++ ++ return false; ++} ++ ++int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, ++ u64 flushed_seq, ++ unsigned dev, u64 bucket, ++ u64 journal_seq) ++{ ++ struct buckets_waiting_for_journal_table *t, *n; ++ struct bucket_hashed 
tmp, new = { ++ .dev_bucket = (u64) dev << 56 | bucket, ++ .journal_seq = journal_seq, ++ }; ++ size_t i, new_size, nr_elements = 1, nr_rehashes = 0; ++ int ret = 0; ++ ++ mutex_lock(&b->lock); ++ ++ if (likely(bucket_table_insert(b->t, &new, flushed_seq))) ++ goto out; ++ ++ t = b->t; ++ for (i = 0; i < t->size; i++) ++ nr_elements += t->d[i].journal_seq > flushed_seq; ++ ++ new_size = nr_elements < t->size / 3 ? t->size : t->size * 2; ++ ++ n = kvmalloc(sizeof(*n) + sizeof(n->d[0]) * new_size, GFP_KERNEL); ++ if (!n) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++retry_rehash: ++ nr_rehashes++; ++ bucket_table_init(n, new_size); ++ ++ tmp = new; ++ BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq)); ++ ++ for (i = 0; i < t->size; i++) { ++ if (t->d[i].journal_seq <= flushed_seq) ++ continue; ++ ++ tmp = t->d[i]; ++ if (!bucket_table_insert(n, &tmp, flushed_seq)) ++ goto retry_rehash; ++ } ++ ++ b->t = n; ++ kvfree(t); ++ ++ pr_debug("took %zu rehashes, table at %zu/%zu elements", ++ nr_rehashes, nr_elements, b->t->size); ++out: ++ mutex_unlock(&b->lock); ++ ++ return ret; ++} ++ ++void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) ++{ ++ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; ++ ++ kvfree(b->t); ++} ++ ++#define INITIAL_TABLE_SIZE 8 ++ ++int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) ++{ ++ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; ++ ++ mutex_init(&b->lock); ++ ++ b->t = kvmalloc(sizeof(*b->t) + sizeof(b->t->d[0]) * INITIAL_TABLE_SIZE, GFP_KERNEL); ++ if (!b->t) ++ return -ENOMEM; ++ ++ bucket_table_init(b->t, INITIAL_TABLE_SIZE); ++ return 0; ++} +diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h +new file mode 100644 +index 000000000000..d2ae19cbe18c +--- /dev/null ++++ b/fs/bcachefs/buckets_waiting_for_journal.h +@@ -0,0 +1,15 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H ++#define _BUCKETS_WAITING_FOR_JOURNAL_H ++ ++#include "buckets_waiting_for_journal_types.h" ++ ++bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, ++ u64, unsigned, u64); ++int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, ++ u64, unsigned, u64, u64); ++ ++void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *); ++int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *); ++ ++#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */ +diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h +new file mode 100644 +index 000000000000..fea7f944d0ed +--- /dev/null ++++ b/fs/bcachefs/buckets_waiting_for_journal_types.h +@@ -0,0 +1,23 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H ++#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H ++ ++#include ++ ++struct bucket_hashed { ++ u64 dev_bucket; ++ u64 journal_seq; ++}; ++ ++struct buckets_waiting_for_journal_table { ++ size_t size; ++ siphash_key_t hash_seeds[3]; ++ struct bucket_hashed d[]; ++}; ++ ++struct buckets_waiting_for_journal { ++ struct mutex lock; ++ struct buckets_waiting_for_journal_table *t; ++}; ++ ++#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +new file mode 100644 +index 000000000000..dbb7e5e0b35b +--- /dev/null ++++ b/fs/bcachefs/chardev.c +@@ -0,0 +1,760 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_CHARDEV ++ ++#include "bcachefs.h" ++#include 
"bcachefs_ioctl.h" ++#include "buckets.h" ++#include "chardev.h" ++#include "journal.h" ++#include "move.h" ++#include "replicas.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* returns with ref on ca->ref */ ++static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, ++ unsigned flags) ++{ ++ struct bch_dev *ca; ++ ++ if (flags & BCH_BY_INDEX) { ++ if (dev >= c->sb.nr_devices) ++ return ERR_PTR(-EINVAL); ++ ++ rcu_read_lock(); ++ ca = rcu_dereference(c->devs[dev]); ++ if (ca) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ ++ if (!ca) ++ return ERR_PTR(-EINVAL); ++ } else { ++ char *path; ++ ++ path = strndup_user((const char __user *) ++ (unsigned long) dev, PATH_MAX); ++ if (IS_ERR(path)) ++ return ERR_CAST(path); ++ ++ ca = bch2_dev_lookup(c, path); ++ kfree(path); ++ } ++ ++ return ca; ++} ++ ++#if 0 ++static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) ++{ ++ struct bch_ioctl_assemble arg; ++ struct bch_fs *c; ++ u64 *user_devs = NULL; ++ char **devs = NULL; ++ unsigned i; ++ int ret = -EFAULT; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); ++ if (!user_devs) ++ return -ENOMEM; ++ ++ devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); ++ ++ if (copy_from_user(user_devs, user_arg->devs, ++ sizeof(u64) * arg.nr_devs)) ++ goto err; ++ ++ for (i = 0; i < arg.nr_devs; i++) { ++ devs[i] = strndup_user((const char __user *)(unsigned long) ++ user_devs[i], ++ PATH_MAX); ++ if (!devs[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ } ++ ++ c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); ++ ret = PTR_ERR_OR_ZERO(c); ++ if (!ret) ++ closure_put(&c->cl); ++err: ++ if (devs) ++ for (i = 0; i < arg.nr_devs; i++) ++ kfree(devs[i]); ++ kfree(devs); ++ return ret; ++} ++ ++static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) ++{ ++ struct bch_ioctl_incremental arg; ++ const char *err; ++ char *path; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ err = bch2_fs_open_incremental(path); ++ kfree(path); ++ ++ if (err) { ++ pr_err("Could not register bcachefs devices: %s", err); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++#endif ++ ++static long bch2_global_ioctl(unsigned cmd, void __user *arg) ++{ ++ switch (cmd) { ++#if 0 ++ case BCH_IOCTL_ASSEMBLE: ++ return bch2_ioctl_assemble(arg); ++ case BCH_IOCTL_INCREMENTAL: ++ return bch2_ioctl_incremental(arg); ++#endif ++ default: ++ return -ENOTTY; ++ } ++} ++ ++static long bch2_ioctl_query_uuid(struct bch_fs *c, ++ struct bch_ioctl_query_uuid __user *user_arg) ++{ ++ return copy_to_user(&user_arg->uuid, ++ &c->sb.user_uuid, ++ sizeof(c->sb.user_uuid)); ++} ++ ++#if 0 ++static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) ++{ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ return bch2_fs_start(c); ++} ++ ++static long bch2_ioctl_stop(struct bch_fs *c) ++{ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ bch2_fs_stop(c); ++ return 0; ++} ++#endif ++ ++static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk 
arg) ++{ ++ char *path; ++ int ret; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ ret = bch2_dev_add(c, path); ++ kfree(path); ++ ++ return ret; ++} ++ ++static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ struct bch_dev *ca; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ return bch2_dev_remove(c, ca, arg.flags); ++} ++ ++static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ char *path; ++ int ret; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ ret = bch2_dev_online(c, path); ++ kfree(path); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_offline(c, ca, arg.flags); ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_set_state(struct bch_fs *c, ++ struct bch_ioctl_disk_set_state arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad[0] || arg.pad[1] || arg.pad[2] || ++ arg.new_state >= BCH_MEMBER_STATE_NR) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); ++ ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++struct bch_data_ctx { ++ struct bch_fs *c; ++ struct bch_ioctl_data arg; ++ struct bch_move_stats stats; ++ ++ int ret; ++ ++ struct task_struct *thread; ++}; ++ ++static int bch2_data_thread(void *arg) ++{ ++ struct bch_data_ctx *ctx = arg; ++ ++ ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); ++ ++ ctx->stats.data_type = U8_MAX; ++ return 0; ++} ++ ++static int bch2_data_job_release(struct inode *inode, struct file *file) ++{ ++ struct bch_data_ctx *ctx = file->private_data; ++ ++ kthread_stop(ctx->thread); ++ put_task_struct(ctx->thread); ++ kfree(ctx); ++ return 0; ++} ++ ++static ssize_t bch2_data_job_read(struct file *file, char __user *buf, ++ size_t len, loff_t *ppos) ++{ ++ struct bch_data_ctx *ctx = file->private_data; ++ struct bch_fs *c = ctx->c; ++ struct bch_ioctl_data_event e = { ++ .type = BCH_DATA_EVENT_PROGRESS, ++ .p.data_type = ctx->stats.data_type, ++ .p.btree_id = ctx->stats.btree_id, ++ .p.pos = ctx->stats.pos, ++ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), ++ .p.sectors_total = bch2_fs_usage_read_short(c).used, ++ }; ++ ++ if (len < sizeof(e)) ++ return -EINVAL; ++ ++ return copy_to_user(buf, &e, sizeof(e)) ?: 
sizeof(e); ++} ++ ++static const struct file_operations bcachefs_data_ops = { ++ .release = bch2_data_job_release, ++ .read = bch2_data_job_read, ++ .llseek = no_llseek, ++}; ++ ++static long bch2_ioctl_data(struct bch_fs *c, ++ struct bch_ioctl_data arg) ++{ ++ struct bch_data_ctx *ctx = NULL; ++ struct file *file = NULL; ++ unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; ++ int ret, fd = -1; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (arg.op >= BCH_DATA_OP_NR || arg.flags) ++ return -EINVAL; ++ ++ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); ++ if (!ctx) ++ return -ENOMEM; ++ ++ ctx->c = c; ++ ctx->arg = arg; ++ ++ ctx->thread = kthread_create(bch2_data_thread, ctx, ++ "bch-data/%s", c->name); ++ if (IS_ERR(ctx->thread)) { ++ ret = PTR_ERR(ctx->thread); ++ goto err; ++ } ++ ++ ret = get_unused_fd_flags(flags); ++ if (ret < 0) ++ goto err; ++ fd = ret; ++ ++ file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); ++ if (IS_ERR(file)) { ++ ret = PTR_ERR(file); ++ goto err; ++ } ++ ++ fd_install(fd, file); ++ ++ get_task_struct(ctx->thread); ++ wake_up_process(ctx->thread); ++ ++ return fd; ++err: ++ if (fd >= 0) ++ put_unused_fd(fd); ++ if (!IS_ERR_OR_NULL(ctx->thread)) ++ kthread_stop(ctx->thread); ++ kfree(ctx); ++ return ret; ++} ++ ++static long bch2_ioctl_fs_usage(struct bch_fs *c, ++ struct bch_ioctl_fs_usage __user *user_arg) ++{ ++ struct bch_ioctl_fs_usage *arg = NULL; ++ struct bch_replicas_usage *dst_e, *dst_end; ++ struct bch_fs_usage_online *src; ++ u32 replica_entries_bytes; ++ unsigned i; ++ int ret = 0; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) ++ return -EFAULT; ++ ++ arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL); ++ if (!arg) ++ return -ENOMEM; ++ ++ src = bch2_fs_usage_read(c); ++ if (!src) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ arg->capacity = c->capacity; ++ arg->used = bch2_fs_sectors_used(c, src); ++ arg->online_reserved = src->online_reserved; ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ arg->persistent_reserved[i] = src->u.persistent_reserved[i]; ++ ++ dst_e = arg->replicas; ++ dst_end = (void *) arg->replicas + replica_entries_bytes; ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *src_e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ /* check that we have enough space for one replicas entry */ ++ if (dst_e + 1 > dst_end) { ++ ret = -ERANGE; ++ break; ++ } ++ ++ dst_e->sectors = src->u.replicas[i]; ++ dst_e->r = *src_e; ++ ++ /* recheck after setting nr_devs: */ ++ if (replicas_usage_next(dst_e) > dst_end) { ++ ret = -ERANGE; ++ break; ++ } ++ ++ memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); ++ ++ dst_e = replicas_usage_next(dst_e); ++ } ++ ++ arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; ++ ++ percpu_up_read(&c->mark_lock); ++ kfree(src); ++ ++ if (!ret) ++ ret = copy_to_user(user_arg, arg, ++ sizeof(*arg) + arg->replica_entries_bytes); ++err: ++ kfree(arg); ++ return ret; ++} ++ ++static long bch2_ioctl_dev_usage(struct bch_fs *c, ++ struct bch_ioctl_dev_usage __user *user_arg) ++{ ++ struct bch_ioctl_dev_usage arg; ++ struct bch_dev_usage src; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if ((arg.flags & ~BCH_BY_INDEX) || ++ arg.pad[0] || ++ arg.pad[1] || ++ arg.pad[2]) ++ return -EINVAL; ++ ++ ca = 
bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ src = bch2_dev_usage_read(ca); ++ ++ arg.state = ca->mi.state; ++ arg.bucket_size = ca->mi.bucket_size; ++ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; ++ arg.buckets_ec = src.buckets_ec; ++ ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ arg.d[i].buckets = src.d[i].buckets; ++ arg.d[i].sectors = src.d[i].sectors; ++ arg.d[i].fragmented = src.d[i].fragmented; ++ } ++ ++ percpu_ref_put(&ca->ref); ++ ++ return copy_to_user(user_arg, &arg, sizeof(arg)); ++} ++ ++static long bch2_ioctl_read_super(struct bch_fs *c, ++ struct bch_ioctl_read_super arg) ++{ ++ struct bch_dev *ca = NULL; ++ struct bch_sb *sb; ++ int ret = 0; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || ++ arg.pad) ++ return -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (arg.flags & BCH_READ_DEV) { ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ ++ if (IS_ERR(ca)) { ++ ret = PTR_ERR(ca); ++ goto err; ++ } ++ ++ sb = ca->disk_sb.sb; ++ } else { ++ sb = c->disk_sb.sb; ++ } ++ ++ if (vstruct_bytes(sb) > arg.size) { ++ ret = -ERANGE; ++ goto err; ++ } ++ ++ ret = copy_to_user((void __user *)(unsigned long)arg.sb, ++ sb, vstruct_bytes(sb)); ++err: ++ if (!IS_ERR_OR_NULL(ca)) ++ percpu_ref_put(&ca->ref); ++ mutex_unlock(&c->sb_lock); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_get_idx(struct bch_fs *c, ++ struct bch_ioctl_disk_get_idx arg) ++{ ++ dev_t dev = huge_decode_dev(arg.dev); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (!dev) ++ return -EINVAL; ++ ++ for_each_online_member(ca, c, i) ++ if (ca->dev == dev) { ++ percpu_ref_put(&ca->io_ref); ++ return i; ++ } ++ ++ return -ENOENT; ++} ++ ++static long bch2_ioctl_disk_resize(struct bch_fs *c, ++ struct bch_ioctl_disk_resize arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if ((arg.flags & ~BCH_BY_INDEX) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_resize(c, ca, arg.nbuckets); ++ ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, ++ struct bch_ioctl_disk_resize_journal arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if ((arg.flags & ~BCH_BY_INDEX) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); ++ ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++#define BCH_IOCTL(_name, _argtype) \ ++do { \ ++ _argtype i; \ ++ \ ++ if (copy_from_user(&i, arg, sizeof(i))) \ ++ return -EFAULT; \ ++ return bch2_ioctl_##_name(c, i); \ ++} while (0) ++ ++long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) ++{ ++ switch (cmd) { ++ case BCH_IOCTL_QUERY_UUID: ++ return bch2_ioctl_query_uuid(c, arg); ++ case BCH_IOCTL_FS_USAGE: ++ return bch2_ioctl_fs_usage(c, arg); ++ case BCH_IOCTL_DEV_USAGE: ++ return bch2_ioctl_dev_usage(c, arg); ++#if 0 ++ case BCH_IOCTL_START: ++ BCH_IOCTL(start, struct bch_ioctl_start); ++ case BCH_IOCTL_STOP: ++ return bch2_ioctl_stop(c); ++#endif ++ case BCH_IOCTL_READ_SUPER: ++ BCH_IOCTL(read_super, struct bch_ioctl_read_super); ++ case BCH_IOCTL_DISK_GET_IDX: ++ BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); ++ 
} ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ switch (cmd) { ++ case BCH_IOCTL_DISK_ADD: ++ BCH_IOCTL(disk_add, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_REMOVE: ++ BCH_IOCTL(disk_remove, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_ONLINE: ++ BCH_IOCTL(disk_online, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_OFFLINE: ++ BCH_IOCTL(disk_offline, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_SET_STATE: ++ BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); ++ case BCH_IOCTL_DATA: ++ BCH_IOCTL(data, struct bch_ioctl_data); ++ case BCH_IOCTL_DISK_RESIZE: ++ BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); ++ case BCH_IOCTL_DISK_RESIZE_JOURNAL: ++ BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); ++ ++ default: ++ return -ENOTTY; ++ } ++} ++ ++static DEFINE_IDR(bch_chardev_minor); ++ ++static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) ++{ ++ unsigned minor = iminor(file_inode(filp)); ++ struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL; ++ void __user *arg = (void __user *) v; ++ ++ return c ++ ? bch2_fs_ioctl(c, cmd, arg) ++ : bch2_global_ioctl(cmd, arg); ++} ++ ++static const struct file_operations bch_chardev_fops = { ++ .owner = THIS_MODULE, ++ .unlocked_ioctl = bch2_chardev_ioctl, ++ .open = nonseekable_open, ++}; ++ ++static int bch_chardev_major; ++static struct class *bch_chardev_class; ++static struct device *bch_chardev; ++ ++void bch2_fs_chardev_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->chardev)) ++ device_unregister(c->chardev); ++ if (c->minor >= 0) ++ idr_remove(&bch_chardev_minor, c->minor); ++} ++ ++int bch2_fs_chardev_init(struct bch_fs *c) ++{ ++ c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); ++ if (c->minor < 0) ++ return c->minor; ++ ++ c->chardev = device_create(bch_chardev_class, NULL, ++ MKDEV(bch_chardev_major, c->minor), c, ++ "bcachefs%u-ctl", c->minor); ++ if (IS_ERR(c->chardev)) ++ return PTR_ERR(c->chardev); ++ ++ return 0; ++} ++ ++void bch2_chardev_exit(void) ++{ ++ if (!IS_ERR_OR_NULL(bch_chardev_class)) ++ device_destroy(bch_chardev_class, ++ MKDEV(bch_chardev_major, U8_MAX)); ++ if (!IS_ERR_OR_NULL(bch_chardev_class)) ++ class_destroy(bch_chardev_class); ++ if (bch_chardev_major > 0) ++ unregister_chrdev(bch_chardev_major, "bcachefs"); ++} ++ ++int __init bch2_chardev_init(void) ++{ ++ bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); ++ if (bch_chardev_major < 0) ++ return bch_chardev_major; ++ ++ bch_chardev_class = class_create(THIS_MODULE, "bcachefs"); ++ if (IS_ERR(bch_chardev_class)) ++ return PTR_ERR(bch_chardev_class); ++ ++ bch_chardev = device_create(bch_chardev_class, NULL, ++ MKDEV(bch_chardev_major, U8_MAX), ++ NULL, "bcachefs-ctl"); ++ if (IS_ERR(bch_chardev)) ++ return PTR_ERR(bch_chardev); ++ ++ return 0; ++} ++ ++#endif /* NO_BCACHEFS_CHARDEV */ +diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h +new file mode 100644 +index 000000000000..3a4890d39ff9 +--- /dev/null ++++ b/fs/bcachefs/chardev.h +@@ -0,0 +1,31 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CHARDEV_H ++#define _BCACHEFS_CHARDEV_H ++ ++#ifndef NO_BCACHEFS_FS ++ ++long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); ++ ++void bch2_fs_chardev_exit(struct bch_fs *); ++int bch2_fs_chardev_init(struct bch_fs *); ++ ++void bch2_chardev_exit(void); ++int __init bch2_chardev_init(void); ++ ++#else ++ ++static inline long bch2_fs_ioctl(struct bch_fs *c, ++ unsigned cmd, void 
__user * arg) ++{ ++ return -ENOSYS; ++} ++ ++static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} ++static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } ++ ++static inline void bch2_chardev_exit(void) {} ++static inline int __init bch2_chardev_init(void) { return 0; } ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++#endif /* _BCACHEFS_CHARDEV_H */ +diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c +new file mode 100644 +index 000000000000..b5850a761b91 +--- /dev/null ++++ b/fs/bcachefs/checksum.c +@@ -0,0 +1,712 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "checksum.h" ++#include "errcode.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * bch2_checksum state is an abstraction of the checksum state calculated over different pages. ++ * it features page merging without having the checksum algorithm lose its state. ++ * for native checksum aglorithms (like crc), a default seed value will do. ++ * for hash-like algorithms, a state needs to be stored ++ */ ++ ++struct bch2_checksum_state { ++ union { ++ u64 seed; ++ struct xxh64_state h64state; ++ }; ++ unsigned int type; ++}; ++ ++static void bch2_checksum_init(struct bch2_checksum_state *state) ++{ ++ switch (state->type) { ++ case BCH_CSUM_none: ++ case BCH_CSUM_crc32c: ++ case BCH_CSUM_crc64: ++ state->seed = 0; ++ break; ++ case BCH_CSUM_crc32c_nonzero: ++ state->seed = U32_MAX; ++ break; ++ case BCH_CSUM_crc64_nonzero: ++ state->seed = U64_MAX; ++ break; ++ case BCH_CSUM_xxhash: ++ xxh64_reset(&state->h64state, 0); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static u64 bch2_checksum_final(const struct bch2_checksum_state *state) ++{ ++ switch (state->type) { ++ case BCH_CSUM_none: ++ case BCH_CSUM_crc32c: ++ case BCH_CSUM_crc64: ++ return state->seed; ++ case BCH_CSUM_crc32c_nonzero: ++ return state->seed ^ U32_MAX; ++ case BCH_CSUM_crc64_nonzero: ++ return state->seed ^ U64_MAX; ++ case BCH_CSUM_xxhash: ++ return xxh64_digest(&state->h64state); ++ default: ++ BUG(); ++ } ++} ++ ++static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len) ++{ ++ switch (state->type) { ++ case BCH_CSUM_none: ++ return; ++ case BCH_CSUM_crc32c_nonzero: ++ case BCH_CSUM_crc32c: ++ state->seed = crc32c(state->seed, data, len); ++ break; ++ case BCH_CSUM_crc64_nonzero: ++ case BCH_CSUM_crc64: ++ state->seed = crc64_be(state->seed, data, len); ++ break; ++ case BCH_CSUM_xxhash: ++ xxh64_update(&state->h64state, data, len); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, ++ struct nonce nonce, ++ struct scatterlist *sg, size_t len) ++{ ++ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); ++ int ret; ++ ++ skcipher_request_set_sync_tfm(req, tfm); ++ skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ++ ++ ret = crypto_skcipher_encrypt(req); ++ if (ret) ++ pr_err("got error %i from crypto_skcipher_encrypt()", ret); ++ ++ return ret; ++} ++ ++static inline int do_encrypt(struct crypto_sync_skcipher *tfm, ++ struct nonce nonce, ++ void *buf, size_t len) ++{ ++ if (!is_vmalloc_addr(buf)) { ++ struct scatterlist sg; ++ ++ sg_init_table(&sg, 1); ++ sg_set_page(&sg, ++ is_vmalloc_addr(buf) ++ ? 
vmalloc_to_page(buf) ++ : virt_to_page(buf), ++ len, offset_in_page(buf)); ++ return do_encrypt_sg(tfm, nonce, &sg, len); ++ } else { ++ unsigned pages = buf_pages(buf, len); ++ struct scatterlist *sg; ++ size_t orig_len = len; ++ int ret, i; ++ ++ sg = kmalloc_array(sizeof(*sg), pages, GFP_KERNEL); ++ if (!sg) ++ return -ENOMEM; ++ ++ sg_init_table(sg, pages); ++ ++ for (i = 0; i < pages; i++) { ++ unsigned offset = offset_in_page(buf); ++ unsigned pg_len = min(len, PAGE_SIZE - offset); ++ ++ sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset); ++ buf += pg_len; ++ len -= pg_len; ++ } ++ ++ ret = do_encrypt_sg(tfm, nonce, sg, orig_len); ++ kfree(sg); ++ return ret; ++ } ++} ++ ++int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, ++ void *buf, size_t len) ++{ ++ struct crypto_sync_skcipher *chacha20 = ++ crypto_alloc_sync_skcipher("chacha20", 0, 0); ++ int ret; ++ ++ if (!chacha20) { ++ pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20)); ++ return PTR_ERR(chacha20); ++ } ++ ++ ret = crypto_skcipher_setkey(&chacha20->base, ++ (void *) key, sizeof(*key)); ++ if (ret) { ++ pr_err("crypto_skcipher_setkey() error: %i", ret); ++ goto err; ++ } ++ ++ ret = do_encrypt(chacha20, nonce, buf, len); ++err: ++ crypto_free_sync_skcipher(chacha20); ++ return ret; ++} ++ ++static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc, ++ struct nonce nonce) ++{ ++ u8 key[POLY1305_KEY_SIZE]; ++ int ret; ++ ++ nonce.d[3] ^= BCH_NONCE_POLY; ++ ++ memset(key, 0, sizeof(key)); ++ ret = do_encrypt(c->chacha20, nonce, key, sizeof(key)); ++ if (ret) ++ return ret; ++ ++ desc->tfm = c->poly1305; ++ crypto_shash_init(desc); ++ crypto_shash_update(desc, key, sizeof(key)); ++ return 0; ++} ++ ++struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, ++ struct nonce nonce, const void *data, size_t len) ++{ ++ switch (type) { ++ case BCH_CSUM_none: ++ case BCH_CSUM_crc32c_nonzero: ++ case BCH_CSUM_crc64_nonzero: ++ case BCH_CSUM_crc32c: ++ case BCH_CSUM_xxhash: ++ case BCH_CSUM_crc64: { ++ struct bch2_checksum_state state; ++ ++ state.type = type; ++ ++ bch2_checksum_init(&state); ++ bch2_checksum_update(&state, data, len); ++ ++ return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; ++ } ++ ++ case BCH_CSUM_chacha20_poly1305_80: ++ case BCH_CSUM_chacha20_poly1305_128: { ++ SHASH_DESC_ON_STACK(desc, c->poly1305); ++ u8 digest[POLY1305_DIGEST_SIZE]; ++ struct bch_csum ret = { 0 }; ++ ++ gen_poly_key(c, desc, nonce); ++ ++ crypto_shash_update(desc, data, len); ++ crypto_shash_final(desc, digest); ++ ++ memcpy(&ret, digest, bch_crc_bytes[type]); ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++int bch2_encrypt(struct bch_fs *c, unsigned type, ++ struct nonce nonce, void *data, size_t len) ++{ ++ if (!bch2_csum_type_is_encryption(type)) ++ return 0; ++ ++ return do_encrypt(c->chacha20, nonce, data, len); ++} ++ ++static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio, ++ struct bvec_iter *iter) ++{ ++ struct bio_vec bv; ++ ++ switch (type) { ++ case BCH_CSUM_none: ++ return (struct bch_csum) { 0 }; ++ case BCH_CSUM_crc32c_nonzero: ++ case BCH_CSUM_crc64_nonzero: ++ case BCH_CSUM_crc32c: ++ case BCH_CSUM_xxhash: ++ case BCH_CSUM_crc64: { ++ struct bch2_checksum_state state; ++ ++ state.type = type; ++ bch2_checksum_init(&state); ++ ++#ifdef CONFIG_HIGHMEM ++ __bio_for_each_segment(bv, bio, *iter, *iter) { ++ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; ++ 
bch2_checksum_update(&state, p, bv.bv_len); ++ kunmap_atomic(p); ++ } ++#else ++ __bio_for_each_bvec(bv, bio, *iter, *iter) ++ bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset, ++ bv.bv_len); ++#endif ++ return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; ++ } ++ ++ case BCH_CSUM_chacha20_poly1305_80: ++ case BCH_CSUM_chacha20_poly1305_128: { ++ SHASH_DESC_ON_STACK(desc, c->poly1305); ++ u8 digest[POLY1305_DIGEST_SIZE]; ++ struct bch_csum ret = { 0 }; ++ ++ gen_poly_key(c, desc, nonce); ++ ++#ifdef CONFIG_HIGHMEM ++ __bio_for_each_segment(bv, bio, *iter, *iter) { ++ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; ++ ++ crypto_shash_update(desc, p, bv.bv_len); ++ kunmap_atomic(p); ++ } ++#else ++ __bio_for_each_bvec(bv, bio, *iter, *iter) ++ crypto_shash_update(desc, ++ page_address(bv.bv_page) + bv.bv_offset, ++ bv.bv_len); ++#endif ++ crypto_shash_final(desc, digest); ++ ++ memcpy(&ret, digest, bch_crc_bytes[type]); ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) ++{ ++ struct bvec_iter iter = bio->bi_iter; ++ ++ return __bch2_checksum_bio(c, type, nonce, bio, &iter); ++} ++ ++int bch2_encrypt_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ struct scatterlist sgl[16], *sg = sgl; ++ size_t bytes = 0; ++ int ret = 0; ++ ++ if (!bch2_csum_type_is_encryption(type)) ++ return 0; ++ ++ sg_init_table(sgl, ARRAY_SIZE(sgl)); ++ ++ bio_for_each_segment(bv, bio, iter) { ++ if (sg == sgl + ARRAY_SIZE(sgl)) { ++ sg_mark_end(sg - 1); ++ ++ ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++ if (ret) ++ return ret; ++ ++ nonce = nonce_add(nonce, bytes); ++ bytes = 0; ++ ++ sg_init_table(sgl, ARRAY_SIZE(sgl)); ++ sg = sgl; ++ } ++ ++ sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); ++ bytes += bv.bv_len; ++ } ++ ++ sg_mark_end(sg - 1); ++ return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++} ++ ++struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, ++ struct bch_csum b, size_t b_len) ++{ ++ struct bch2_checksum_state state; ++ ++ state.type = type; ++ bch2_checksum_init(&state); ++ state.seed = a.lo; ++ ++ BUG_ON(!bch2_checksum_mergeable(type)); ++ ++ while (b_len) { ++ unsigned b = min_t(unsigned, b_len, PAGE_SIZE); ++ ++ bch2_checksum_update(&state, ++ page_address(ZERO_PAGE(0)), b); ++ b_len -= b; ++ } ++ a.lo = bch2_checksum_final(&state); ++ a.lo ^= b.lo; ++ a.hi ^= b.hi; ++ return a; ++} ++ ++int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, ++ struct bversion version, ++ struct bch_extent_crc_unpacked crc_old, ++ struct bch_extent_crc_unpacked *crc_a, ++ struct bch_extent_crc_unpacked *crc_b, ++ unsigned len_a, unsigned len_b, ++ unsigned new_csum_type) ++{ ++ struct bvec_iter iter = bio->bi_iter; ++ struct nonce nonce = extent_nonce(version, crc_old); ++ struct bch_csum merged = { 0 }; ++ struct crc_split { ++ struct bch_extent_crc_unpacked *crc; ++ unsigned len; ++ unsigned csum_type; ++ struct bch_csum csum; ++ } splits[3] = { ++ { crc_a, len_a, new_csum_type }, ++ { crc_b, len_b, new_csum_type }, ++ { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, ++ }, *i; ++ bool mergeable = crc_old.csum_type == new_csum_type && ++ bch2_checksum_mergeable(new_csum_type); ++ unsigned crc_nonce = crc_old.nonce; ++ ++ BUG_ON(len_a + len_b > bio_sectors(bio)); ++ BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); ++ 
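Aside (not part of the patch): every non-cryptographic checksum type above reduces to the same init/update/final state machine, and bch2_checksum_merge() combines two adjacent checksums by extending the first over zeroes and xoring in the second. A minimal usage sketch, assuming only the interfaces this patch adds in checksum.h; example_concat_csum is a hypothetical helper, and only the mergeable types (none, crc32c, crc64) may be passed to the merge.

/* Hypothetical illustration, not patch content: */
static struct bch_csum example_concat_csum(struct bch_fs *c,
					   const void *a, size_t a_len,
					   const void *b, size_t b_len)
{
	struct bch_csum csum_a = bch2_checksum(c, BCH_CSUM_crc64, null_nonce(), a, a_len);
	struct bch_csum csum_b = bch2_checksum(c, BCH_CSUM_crc64, null_nonce(), b, b_len);

	/* Equals bch2_checksum() over the concatenation of a and b: */
	return bch2_checksum_merge(BCH_CSUM_crc64, csum_a, csum_b, b_len);
}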
BUG_ON(crc_is_compressed(crc_old)); ++ BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != ++ bch2_csum_type_is_encryption(new_csum_type)); ++ ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { ++ iter.bi_size = i->len << 9; ++ if (mergeable || i->crc) ++ i->csum = __bch2_checksum_bio(c, i->csum_type, ++ nonce, bio, &iter); ++ else ++ bio_advance_iter(bio, &iter, i->len << 9); ++ nonce = nonce_add(nonce, i->len << 9); ++ } ++ ++ if (mergeable) ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) ++ merged = bch2_checksum_merge(new_csum_type, merged, ++ i->csum, i->len << 9); ++ else ++ merged = bch2_checksum_bio(c, crc_old.csum_type, ++ extent_nonce(version, crc_old), bio); ++ ++ if (bch2_crc_cmp(merged, crc_old.csum)) { ++ bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n" ++ "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)", ++ crc_old.csum.hi, ++ crc_old.csum.lo, ++ merged.hi, ++ merged.lo, ++ bch2_csum_types[crc_old.csum_type], ++ bch2_csum_types[new_csum_type]); ++ return -EIO; ++ } ++ ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { ++ if (i->crc) ++ *i->crc = (struct bch_extent_crc_unpacked) { ++ .csum_type = i->csum_type, ++ .compression_type = crc_old.compression_type, ++ .compressed_size = i->len, ++ .uncompressed_size = i->len, ++ .offset = 0, ++ .live_size = i->len, ++ .nonce = crc_nonce, ++ .csum = i->csum, ++ }; ++ ++ if (bch2_csum_type_is_encryption(new_csum_type)) ++ crc_nonce += i->len; ++ } ++ ++ return 0; ++} ++ ++#ifdef __KERNEL__ ++static int __bch2_request_key(char *key_description, struct bch_key *key) ++{ ++ struct key *keyring_key; ++ const struct user_key_payload *ukp; ++ int ret; ++ ++ keyring_key = request_key(&key_type_user, key_description, NULL); ++ if (IS_ERR(keyring_key)) ++ return PTR_ERR(keyring_key); ++ ++ down_read(&keyring_key->sem); ++ ukp = dereference_key_locked(keyring_key); ++ if (ukp->datalen == sizeof(*key)) { ++ memcpy(key, ukp->data, ukp->datalen); ++ ret = 0; ++ } else { ++ ret = -EINVAL; ++ } ++ up_read(&keyring_key->sem); ++ key_put(keyring_key); ++ ++ return ret; ++} ++#else ++#include ++ ++static int __bch2_request_key(char *key_description, struct bch_key *key) ++{ ++ key_serial_t key_id; ++ ++ key_id = request_key("user", key_description, NULL, ++ KEY_SPEC_USER_KEYRING); ++ if (key_id < 0) ++ return -errno; ++ ++ if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) ++ return -1; ++ ++ return 0; ++} ++#endif ++ ++int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++{ ++ struct printbuf key_description = PRINTBUF; ++ int ret; ++ ++ prt_printf(&key_description, "bcachefs:"); ++ pr_uuid(&key_description, sb->user_uuid.b); ++ ++ ret = __bch2_request_key(key_description.buf, key); ++ printbuf_exit(&key_description); ++ return ret; ++} ++ ++int bch2_decrypt_sb_key(struct bch_fs *c, ++ struct bch_sb_field_crypt *crypt, ++ struct bch_key *key) ++{ ++ struct bch_encrypted_key sb_key = crypt->key; ++ struct bch_key user_key; ++ int ret = 0; ++ ++ /* is key encrypted? 
*/ ++ if (!bch2_key_is_encrypted(&sb_key)) ++ goto out; ++ ++ ret = bch2_request_key(c->disk_sb.sb, &user_key); ++ if (ret) { ++ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ /* decrypt real key: */ ++ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), ++ &sb_key, sizeof(sb_key)); ++ if (ret) ++ goto err; ++ ++ if (bch2_key_is_encrypted(&sb_key)) { ++ bch_err(c, "incorrect encryption key"); ++ ret = -EINVAL; ++ goto err; ++ } ++out: ++ *key = sb_key.key; ++err: ++ memzero_explicit(&sb_key, sizeof(sb_key)); ++ memzero_explicit(&user_key, sizeof(user_key)); ++ return ret; ++} ++ ++static int bch2_alloc_ciphers(struct bch_fs *c) ++{ ++ int ret; ++ ++ if (!c->chacha20) ++ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); ++ ret = PTR_ERR_OR_ZERO(c->chacha20); ++ ++ if (ret) { ++ bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ++ if (!c->poly1305) ++ c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); ++ ret = PTR_ERR_OR_ZERO(c->poly1305); ++ ++ if (ret) { ++ bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_disable_encryption(struct bch_fs *c) ++{ ++ struct bch_sb_field_crypt *crypt; ++ struct bch_key key; ++ int ret = -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ crypt = bch2_sb_get_crypt(c->disk_sb.sb); ++ if (!crypt) ++ goto out; ++ ++ /* is key encrypted? */ ++ ret = 0; ++ if (bch2_key_is_encrypted(&crypt->key)) ++ goto out; ++ ++ ret = bch2_decrypt_sb_key(c, crypt, &key); ++ if (ret) ++ goto out; ++ ++ crypt->key.magic = BCH_KEY_MAGIC; ++ crypt->key.key = key; ++ ++ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); ++ bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_enable_encryption(struct bch_fs *c, bool keyed) ++{ ++ struct bch_encrypted_key key; ++ struct bch_key user_key; ++ struct bch_sb_field_crypt *crypt; ++ int ret = -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ /* Do we already have an encryption key? 
*/ ++ if (bch2_sb_get_crypt(c->disk_sb.sb)) ++ goto err; ++ ++ ret = bch2_alloc_ciphers(c); ++ if (ret) ++ goto err; ++ ++ key.magic = BCH_KEY_MAGIC; ++ get_random_bytes(&key.key, sizeof(key.key)); ++ ++ if (keyed) { ++ ret = bch2_request_key(c->disk_sb.sb, &user_key); ++ if (ret) { ++ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), ++ &key, sizeof(key)); ++ if (ret) ++ goto err; ++ } ++ ++ ret = crypto_skcipher_setkey(&c->chacha20->base, ++ (void *) &key.key, sizeof(key.key)); ++ if (ret) ++ goto err; ++ ++ crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); ++ if (!crypt) { ++ ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ ++ goto err; ++ } ++ ++ crypt->key = key; ++ ++ /* write superblock */ ++ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); ++ bch2_write_super(c); ++err: ++ mutex_unlock(&c->sb_lock); ++ memzero_explicit(&user_key, sizeof(user_key)); ++ memzero_explicit(&key, sizeof(key)); ++ return ret; ++} ++ ++void bch2_fs_encryption_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->poly1305)) ++ crypto_free_shash(c->poly1305); ++ if (!IS_ERR_OR_NULL(c->chacha20)) ++ crypto_free_sync_skcipher(c->chacha20); ++ if (!IS_ERR_OR_NULL(c->sha256)) ++ crypto_free_shash(c->sha256); ++} ++ ++int bch2_fs_encryption_init(struct bch_fs *c) ++{ ++ struct bch_sb_field_crypt *crypt; ++ struct bch_key key; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ c->sha256 = crypto_alloc_shash("sha256", 0, 0); ++ ret = PTR_ERR_OR_ZERO(c->sha256); ++ if (ret) { ++ bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); ++ goto out; ++ } ++ ++ crypt = bch2_sb_get_crypt(c->disk_sb.sb); ++ if (!crypt) ++ goto out; ++ ++ ret = bch2_alloc_ciphers(c); ++ if (ret) ++ goto out; ++ ++ ret = bch2_decrypt_sb_key(c, crypt, &key); ++ if (ret) ++ goto out; ++ ++ ret = crypto_skcipher_setkey(&c->chacha20->base, ++ (void *) &key.key, sizeof(key.key)); ++ if (ret) ++ goto out; ++out: ++ memzero_explicit(&key, sizeof(key)); ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} +diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h +new file mode 100644 +index 000000000000..c86c3c05d620 +--- /dev/null ++++ b/fs/bcachefs/checksum.h +@@ -0,0 +1,204 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CHECKSUM_H ++#define _BCACHEFS_CHECKSUM_H ++ ++#include "bcachefs.h" ++#include "extents_types.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++static inline bool bch2_checksum_mergeable(unsigned type) ++{ ++ ++ switch (type) { ++ case BCH_CSUM_none: ++ case BCH_CSUM_crc32c: ++ case BCH_CSUM_crc64: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, ++ struct bch_csum, size_t); ++ ++#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) ++#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) ++#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) ++#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) ++#define BCH_NONCE_POLY cpu_to_le32(1 << 31) ++ ++struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, ++ const void *, size_t); ++ ++/* ++ * This is used for various on disk data structures - bch_sb, prio_set, bset, ++ * jset: The checksum is _always_ the first field of these structs ++ */ ++#define csum_vstruct(_c, _type, _nonce, _i) \ ++({ \ ++ const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ ++ const void *end = vstruct_end(_i); \ ++ \ ++ bch2_checksum(_c, _type, 
_nonce, start, end - start); \ ++}) ++ ++int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); ++int bch2_request_key(struct bch_sb *, struct bch_key *); ++ ++int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, ++ void *data, size_t); ++ ++struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); ++ ++int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, ++ struct bch_extent_crc_unpacked, ++ struct bch_extent_crc_unpacked *, ++ struct bch_extent_crc_unpacked *, ++ unsigned, unsigned, unsigned); ++ ++int bch2_encrypt_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); ++ ++int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, ++ struct bch_key *); ++ ++int bch2_disable_encryption(struct bch_fs *); ++int bch2_enable_encryption(struct bch_fs *, bool); ++ ++void bch2_fs_encryption_exit(struct bch_fs *); ++int bch2_fs_encryption_init(struct bch_fs *); ++ ++static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, ++ bool data) ++{ ++ switch (type) { ++ case BCH_CSUM_OPT_none: ++ return BCH_CSUM_none; ++ case BCH_CSUM_OPT_crc32c: ++ return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; ++ case BCH_CSUM_OPT_crc64: ++ return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; ++ case BCH_CSUM_OPT_xxhash: ++ return BCH_CSUM_xxhash; ++ default: ++ BUG(); ++ } ++} ++ ++static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, ++ unsigned opt) ++{ ++ if (c->sb.encryption_type) ++ return c->opts.wide_macs ++ ? BCH_CSUM_chacha20_poly1305_128 ++ : BCH_CSUM_chacha20_poly1305_80; ++ ++ return bch2_csum_opt_to_type(opt, true); ++} ++ ++static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) ++{ ++ if (c->sb.encryption_type) ++ return BCH_CSUM_chacha20_poly1305_128; ++ ++ return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); ++} ++ ++static const unsigned bch2_compression_opt_to_type[] = { ++#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, ++ BCH_COMPRESSION_OPTS() ++#undef x ++}; ++ ++static inline bool bch2_checksum_type_valid(const struct bch_fs *c, ++ unsigned type) ++{ ++ if (type >= BCH_CSUM_NR) ++ return false; ++ ++ if (bch2_csum_type_is_encryption(type) && !c->chacha20) ++ return false; ++ ++ return true; ++} ++ ++/* returns true if not equal */ ++static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) ++{ ++ /* ++ * XXX: need some way of preventing the compiler from optimizing this ++ * into a form that isn't constant time.. ++ */ ++ return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; ++} ++ ++/* for skipping ahead and encrypting/decrypting at an offset: */ ++static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) ++{ ++ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); ++ ++ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); ++ return nonce; ++} ++ ++static inline struct nonce null_nonce(void) ++{ ++ struct nonce ret; ++ ++ memset(&ret, 0, sizeof(ret)); ++ return ret; ++} ++ ++static inline struct nonce extent_nonce(struct bversion version, ++ struct bch_extent_crc_unpacked crc) ++{ ++ unsigned compression_type = crc_is_compressed(crc) ++ ? crc.compression_type ++ : 0; ++ unsigned size = compression_type ? 
crc.uncompressed_size : 0; ++ struct nonce nonce = (struct nonce) {{ ++ [0] = cpu_to_le32(size << 22), ++ [1] = cpu_to_le32(version.lo), ++ [2] = cpu_to_le32(version.lo >> 32), ++ [3] = cpu_to_le32(version.hi| ++ (compression_type << 24))^BCH_NONCE_EXTENT, ++ }}; ++ ++ return nonce_add(nonce, crc.nonce << 9); ++} ++ ++static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) ++{ ++ return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; ++} ++ ++static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) ++{ ++ __le64 magic = __bch2_sb_magic(sb); ++ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = 0, ++ [2] = ((__le32 *) &magic)[0], ++ [3] = ((__le32 *) &magic)[1], ++ }}; ++} ++ ++static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) ++{ ++ __le64 magic = bch2_sb_magic(c); ++ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = 0, ++ [2] = ((__le32 *) &magic)[0], ++ [3] = ((__le32 *) &magic)[1], ++ }}; ++} ++ ++#endif /* _BCACHEFS_CHECKSUM_H */ +diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c +new file mode 100644 +index 000000000000..f3ffdbc38485 +--- /dev/null ++++ b/fs/bcachefs/clock.c +@@ -0,0 +1,191 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "clock.h" ++ ++#include ++#include ++#include ++ ++static inline long io_timer_cmp(io_timer_heap *h, ++ struct io_timer *l, ++ struct io_timer *r) ++{ ++ return l->expire - r->expire; ++} ++ ++void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) ++{ ++ size_t i; ++ ++ spin_lock(&clock->timer_lock); ++ ++ if (time_after_eq((unsigned long) atomic64_read(&clock->now), ++ timer->expire)) { ++ spin_unlock(&clock->timer_lock); ++ timer->fn(timer); ++ return; ++ } ++ ++ for (i = 0; i < clock->timers.used; i++) ++ if (clock->timers.data[i] == timer) ++ goto out; ++ ++ BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); ++out: ++ spin_unlock(&clock->timer_lock); ++} ++ ++void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) ++{ ++ size_t i; ++ ++ spin_lock(&clock->timer_lock); ++ ++ for (i = 0; i < clock->timers.used; i++) ++ if (clock->timers.data[i] == timer) { ++ heap_del(&clock->timers, i, io_timer_cmp, NULL); ++ break; ++ } ++ ++ spin_unlock(&clock->timer_lock); ++} ++ ++struct io_clock_wait { ++ struct io_timer io_timer; ++ struct timer_list cpu_timer; ++ struct task_struct *task; ++ int expired; ++}; ++ ++static void io_clock_wait_fn(struct io_timer *timer) ++{ ++ struct io_clock_wait *wait = container_of(timer, ++ struct io_clock_wait, io_timer); ++ ++ wait->expired = 1; ++ wake_up_process(wait->task); ++} ++ ++static void io_clock_cpu_timeout(struct timer_list *timer) ++{ ++ struct io_clock_wait *wait = container_of(timer, ++ struct io_clock_wait, cpu_timer); ++ ++ wait->expired = 1; ++ wake_up_process(wait->task); ++} ++ ++void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) ++{ ++ struct io_clock_wait wait; ++ ++ /* XXX: calculate sleep time rigorously */ ++ wait.io_timer.expire = until; ++ wait.io_timer.fn = io_clock_wait_fn; ++ wait.task = current; ++ wait.expired = 0; ++ bch2_io_timer_add(clock, &wait.io_timer); ++ ++ schedule(); ++ ++ bch2_io_timer_del(clock, &wait.io_timer); ++} ++ ++void bch2_kthread_io_clock_wait(struct io_clock *clock, ++ unsigned long io_until, ++ unsigned long cpu_timeout) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct io_clock_wait wait; ++ ++ wait.io_timer.expire = io_until; ++ wait.io_timer.fn = io_clock_wait_fn; ++ wait.task = current; ++ wait.expired = 0; ++ 
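Aside (not part of the patch): the sleeps above wait on an io_clock, a clock that advances by sectors of completed IO rather than wall time. A minimal sketch of arming an io_timer directly, assuming the clock.h/clock_types.h interfaces this patch adds; example_timer_fn and the 1024-sector delay are invented for illustration.

/* Hypothetical illustration, not patch content: */
static void example_timer_fn(struct io_timer *t)
{
	pr_info("io clock reached %lu sectors\n", t->expire);
}

static void example_arm_io_timer(struct bch_fs *c, int rw)
{
	struct io_clock *clock = &c->io_clock[rw];
	static struct io_timer t;	/* must stay live until it fires or is deleted */

	t.fn	 = example_timer_fn;
	t.expire = (unsigned long) atomic64_read(&clock->now) + 1024;
	bch2_io_timer_add(clock, &t);
}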
bch2_io_timer_add(clock, &wait.io_timer); ++ ++ timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); ++ ++ if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) ++ mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (kthread && kthread_should_stop()) ++ break; ++ ++ if (wait.expired) ++ break; ++ ++ schedule(); ++ try_to_freeze(); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++ del_singleshot_timer_sync(&wait.cpu_timer); ++ destroy_timer_on_stack(&wait.cpu_timer); ++ bch2_io_timer_del(clock, &wait.io_timer); ++} ++ ++static struct io_timer *get_expired_timer(struct io_clock *clock, ++ unsigned long now) ++{ ++ struct io_timer *ret = NULL; ++ ++ spin_lock(&clock->timer_lock); ++ ++ if (clock->timers.used && ++ time_after_eq(now, clock->timers.data[0]->expire)) ++ heap_pop(&clock->timers, ret, io_timer_cmp, NULL); ++ ++ spin_unlock(&clock->timer_lock); ++ ++ return ret; ++} ++ ++void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) ++{ ++ struct io_timer *timer; ++ unsigned long now = atomic64_add_return(sectors, &clock->now); ++ ++ while ((timer = get_expired_timer(clock, now))) ++ timer->fn(timer); ++} ++ ++void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) ++{ ++ unsigned long now; ++ unsigned i; ++ ++ spin_lock(&clock->timer_lock); ++ now = atomic64_read(&clock->now); ++ ++ for (i = 0; i < clock->timers.used; i++) ++ prt_printf(out, "%ps:\t%li\n", ++ clock->timers.data[i]->fn, ++ clock->timers.data[i]->expire - now); ++ spin_unlock(&clock->timer_lock); ++} ++ ++void bch2_io_clock_exit(struct io_clock *clock) ++{ ++ free_heap(&clock->timers); ++ free_percpu(clock->pcpu_buf); ++} ++ ++int bch2_io_clock_init(struct io_clock *clock) ++{ ++ atomic64_set(&clock->now, 0); ++ spin_lock_init(&clock->timer_lock); ++ ++ clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); ++ ++ clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); ++ if (!clock->pcpu_buf) ++ return -ENOMEM; ++ ++ if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h +new file mode 100644 +index 000000000000..70a0f7436c84 +--- /dev/null ++++ b/fs/bcachefs/clock.h +@@ -0,0 +1,38 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CLOCK_H ++#define _BCACHEFS_CLOCK_H ++ ++void bch2_io_timer_add(struct io_clock *, struct io_timer *); ++void bch2_io_timer_del(struct io_clock *, struct io_timer *); ++void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, ++ unsigned long); ++ ++void __bch2_increment_clock(struct io_clock *, unsigned); ++ ++static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, ++ int rw) ++{ ++ struct io_clock *clock = &c->io_clock[rw]; ++ ++ if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= ++ IO_CLOCK_PCPU_SECTORS)) ++ __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); ++} ++ ++void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); ++ ++#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ ++({ \ ++ long __ret = timeout; \ ++ might_sleep(); \ ++ if (!___wait_cond_timeout(condition)) \ ++ __ret = __wait_event_timeout(wq, condition, timeout); \ ++ __ret; \ ++}) ++ ++void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); ++ ++void bch2_io_clock_exit(struct io_clock *); ++int bch2_io_clock_init(struct io_clock *); ++ ++#endif /* _BCACHEFS_CLOCK_H */ +diff --git a/fs/bcachefs/clock_types.h 
b/fs/bcachefs/clock_types.h +new file mode 100644 +index 000000000000..5fae0012d808 +--- /dev/null ++++ b/fs/bcachefs/clock_types.h +@@ -0,0 +1,37 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CLOCK_TYPES_H ++#define _BCACHEFS_CLOCK_TYPES_H ++ ++#include "util.h" ++ ++#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) ++ ++/* ++ * Clocks/timers in units of sectors of IO: ++ * ++ * Note - they use percpu batching, so they're only approximate. ++ */ ++ ++struct io_timer; ++typedef void (*io_timer_fn)(struct io_timer *); ++ ++struct io_timer { ++ io_timer_fn fn; ++ unsigned long expire; ++}; ++ ++/* Amount to buffer up on a percpu counter */ ++#define IO_CLOCK_PCPU_SECTORS 128 ++ ++typedef HEAP(struct io_timer *) io_timer_heap; ++ ++struct io_clock { ++ atomic64_t now; ++ u16 __percpu *pcpu_buf; ++ unsigned max_slop; ++ ++ spinlock_t timer_lock; ++ io_timer_heap timers; ++}; ++ ++#endif /* _BCACHEFS_CLOCK_TYPES_H */ +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +new file mode 100644 +index 000000000000..f692f35a6a98 +--- /dev/null ++++ b/fs/bcachefs/compress.c +@@ -0,0 +1,639 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "checksum.h" ++#include "compress.h" ++#include "extents.h" ++#include "io.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++ ++/* Bounce buffer: */ ++struct bbuf { ++ void *b; ++ enum { ++ BB_NONE, ++ BB_VMAP, ++ BB_KMALLOC, ++ BB_MEMPOOL, ++ } type; ++ int rw; ++}; ++ ++static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) ++{ ++ void *b; ++ ++ BUG_ON(size > c->opts.encoded_extent_max); ++ ++ b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); ++ if (b) ++ return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; ++ ++ b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO); ++ if (b) ++ return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; ++ ++ BUG(); ++} ++ ++static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ void *expected_start = NULL; ++ ++ __bio_for_each_bvec(bv, bio, iter, start) { ++ if (expected_start && ++ expected_start != page_address(bv.bv_page) + bv.bv_offset) ++ return false; ++ ++ expected_start = page_address(bv.bv_page) + ++ bv.bv_offset + bv.bv_len; ++ } ++ ++ return true; ++} ++ ++static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, ++ struct bvec_iter start, int rw) ++{ ++ struct bbuf ret; ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ unsigned nr_pages = 0; ++ struct page *stack_pages[16]; ++ struct page **pages = NULL; ++ void *data; ++ ++ BUG_ON(start.bi_size > c->opts.encoded_extent_max); ++ ++ if (!PageHighMem(bio_iter_page(bio, start)) && ++ bio_phys_contig(bio, start)) ++ return (struct bbuf) { ++ .b = page_address(bio_iter_page(bio, start)) + ++ bio_iter_offset(bio, start), ++ .type = BB_NONE, .rw = rw ++ }; ++ ++ /* check if we can map the pages contiguously: */ ++ __bio_for_each_segment(bv, bio, iter, start) { ++ if (iter.bi_size != start.bi_size && ++ bv.bv_offset) ++ goto bounce; ++ ++ if (bv.bv_len < iter.bi_size && ++ bv.bv_offset + bv.bv_len < PAGE_SIZE) ++ goto bounce; ++ ++ nr_pages++; ++ } ++ ++ BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); ++ ++ pages = nr_pages > ARRAY_SIZE(stack_pages) ++ ? 
kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO) ++ : stack_pages; ++ if (!pages) ++ goto bounce; ++ ++ nr_pages = 0; ++ __bio_for_each_segment(bv, bio, iter, start) ++ pages[nr_pages++] = bv.bv_page; ++ ++ data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); ++ if (pages != stack_pages) ++ kfree(pages); ++ ++ if (data) ++ return (struct bbuf) { ++ .b = data + bio_iter_offset(bio, start), ++ .type = BB_VMAP, .rw = rw ++ }; ++bounce: ++ ret = __bounce_alloc(c, start.bi_size, rw); ++ ++ if (rw == READ) ++ memcpy_from_bio(ret.b, bio, start); ++ ++ return ret; ++} ++ ++static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) ++{ ++ return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); ++} ++ ++static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) ++{ ++ switch (buf.type) { ++ case BB_NONE: ++ break; ++ case BB_VMAP: ++ vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); ++ break; ++ case BB_KMALLOC: ++ kfree(buf.b); ++ break; ++ case BB_MEMPOOL: ++ mempool_free(buf.b, &c->compression_bounce[buf.rw]); ++ break; ++ } ++} ++ ++static inline void zlib_set_workspace(z_stream *strm, void *workspace) ++{ ++#ifdef __KERNEL__ ++ strm->workspace = workspace; ++#endif ++} ++ ++static int __bio_uncompress(struct bch_fs *c, struct bio *src, ++ void *dst_data, struct bch_extent_crc_unpacked crc) ++{ ++ struct bbuf src_data = { NULL }; ++ size_t src_len = src->bi_iter.bi_size; ++ size_t dst_len = crc.uncompressed_size << 9; ++ void *workspace; ++ int ret; ++ ++ src_data = bio_map_or_bounce(c, src, READ); ++ ++ switch (crc.compression_type) { ++ case BCH_COMPRESSION_TYPE_lz4_old: ++ case BCH_COMPRESSION_TYPE_lz4: ++ ret = LZ4_decompress_safe_partial(src_data.b, dst_data, ++ src_len, dst_len, dst_len); ++ if (ret != dst_len) ++ goto err; ++ break; ++ case BCH_COMPRESSION_TYPE_gzip: { ++ z_stream strm = { ++ .next_in = src_data.b, ++ .avail_in = src_len, ++ .next_out = dst_data, ++ .avail_out = dst_len, ++ }; ++ ++ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); ++ ++ zlib_set_workspace(&strm, workspace); ++ zlib_inflateInit2(&strm, -MAX_WBITS); ++ ret = zlib_inflate(&strm, Z_FINISH); ++ ++ mempool_free(workspace, &c->decompress_workspace); ++ ++ if (ret != Z_STREAM_END) ++ goto err; ++ break; ++ } ++ case BCH_COMPRESSION_TYPE_zstd: { ++ ZSTD_DCtx *ctx; ++ size_t real_src_len = le32_to_cpup(src_data.b); ++ ++ if (real_src_len > src_len - 4) ++ goto err; ++ ++ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); ++ ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); ++ ++ ret = zstd_decompress_dctx(ctx, ++ dst_data, dst_len, ++ src_data.b + 4, real_src_len); ++ ++ mempool_free(workspace, &c->decompress_workspace); ++ ++ if (ret != dst_len) ++ goto err; ++ break; ++ } ++ default: ++ BUG(); ++ } ++ ret = 0; ++out: ++ bio_unmap_or_unbounce(c, src_data); ++ return ret; ++err: ++ ret = -EIO; ++ goto out; ++} ++ ++int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, ++ struct bch_extent_crc_unpacked *crc) ++{ ++ struct bbuf data = { NULL }; ++ size_t dst_len = crc->uncompressed_size << 9; ++ ++ /* bio must own its pages: */ ++ BUG_ON(!bio->bi_vcnt); ++ BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); ++ ++ if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || ++ crc->compressed_size << 9 > c->opts.encoded_extent_max) { ++ bch_err(c, "error rewriting existing data: extent too big"); ++ return -EIO; ++ } ++ ++ data = __bounce_alloc(c, dst_len, WRITE); ++ ++ if (__bio_uncompress(c, bio, data.b, 
*crc)) { ++ bch_err(c, "error rewriting existing data: decompression error"); ++ bio_unmap_or_unbounce(c, data); ++ return -EIO; ++ } ++ ++ /* ++ * XXX: don't have a good way to assert that the bio was allocated with ++ * enough space, we depend on bch2_move_extent doing the right thing ++ */ ++ bio->bi_iter.bi_size = crc->live_size << 9; ++ ++ memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); ++ ++ crc->csum_type = 0; ++ crc->compression_type = 0; ++ crc->compressed_size = crc->live_size; ++ crc->uncompressed_size = crc->live_size; ++ crc->offset = 0; ++ crc->csum = (struct bch_csum) { 0, 0 }; ++ ++ bio_unmap_or_unbounce(c, data); ++ return 0; ++} ++ ++int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, ++ struct bio *dst, struct bvec_iter dst_iter, ++ struct bch_extent_crc_unpacked crc) ++{ ++ struct bbuf dst_data = { NULL }; ++ size_t dst_len = crc.uncompressed_size << 9; ++ int ret = -ENOMEM; ++ ++ if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || ++ crc.compressed_size << 9 > c->opts.encoded_extent_max) ++ return -EIO; ++ ++ dst_data = dst_len == dst_iter.bi_size ++ ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) ++ : __bounce_alloc(c, dst_len, WRITE); ++ ++ ret = __bio_uncompress(c, src, dst_data.b, crc); ++ if (ret) ++ goto err; ++ ++ if (dst_data.type != BB_NONE && ++ dst_data.type != BB_VMAP) ++ memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); ++err: ++ bio_unmap_or_unbounce(c, dst_data); ++ return ret; ++} ++ ++static int attempt_compress(struct bch_fs *c, ++ void *workspace, ++ void *dst, size_t dst_len, ++ void *src, size_t src_len, ++ enum bch_compression_type compression_type) ++{ ++ switch (compression_type) { ++ case BCH_COMPRESSION_TYPE_lz4: { ++ int len = src_len; ++ int ret = LZ4_compress_destSize( ++ src, dst, ++ &len, dst_len, ++ workspace); ++ ++ if (len < src_len) ++ return -len; ++ ++ return ret; ++ } ++ case BCH_COMPRESSION_TYPE_gzip: { ++ z_stream strm = { ++ .next_in = src, ++ .avail_in = src_len, ++ .next_out = dst, ++ .avail_out = dst_len, ++ }; ++ ++ zlib_set_workspace(&strm, workspace); ++ zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, ++ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, ++ Z_DEFAULT_STRATEGY); ++ ++ if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) ++ return 0; ++ ++ if (zlib_deflateEnd(&strm) != Z_OK) ++ return 0; ++ ++ return strm.total_out; ++ } ++ case BCH_COMPRESSION_TYPE_zstd: { ++ ZSTD_CCtx *ctx = zstd_init_cctx(workspace, ++ zstd_cctx_workspace_bound(&c->zstd_params.cParams)); ++ ++ /* ++ * ZSTD requires that when we decompress we pass in the exact ++ * compressed size - rounding it up to the nearest sector ++ * doesn't work, so we use the first 4 bytes of the buffer for ++ * that. ++ * ++ * Additionally, the ZSTD code seems to have a bug where it will ++ * write just past the end of the buffer - so subtract a fudge ++ * factor (7 bytes) from the dst buffer size to account for ++ * that. 
++ */ ++ size_t len = zstd_compress_cctx(ctx, ++ dst + 4, dst_len - 4 - 7, ++ src, src_len, ++ &c->zstd_params); ++ if (zstd_is_error(len)) ++ return 0; ++ ++ *((__le32 *) dst) = cpu_to_le32(len); ++ return len + 4; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++static unsigned __bio_compress(struct bch_fs *c, ++ struct bio *dst, size_t *dst_len, ++ struct bio *src, size_t *src_len, ++ enum bch_compression_type compression_type) ++{ ++ struct bbuf src_data = { NULL }, dst_data = { NULL }; ++ void *workspace; ++ unsigned pad; ++ int ret = 0; ++ ++ BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); ++ BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); ++ ++ /* If it's only one block, don't bother trying to compress: */ ++ if (src->bi_iter.bi_size <= c->opts.block_size) ++ return 0; ++ ++ dst_data = bio_map_or_bounce(c, dst, WRITE); ++ src_data = bio_map_or_bounce(c, src, READ); ++ ++ workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO); ++ ++ *src_len = src->bi_iter.bi_size; ++ *dst_len = dst->bi_iter.bi_size; ++ ++ /* ++ * XXX: this algorithm sucks when the compression code doesn't tell us ++ * how much would fit, like LZ4 does: ++ */ ++ while (1) { ++ if (*src_len <= block_bytes(c)) { ++ ret = -1; ++ break; ++ } ++ ++ ret = attempt_compress(c, workspace, ++ dst_data.b, *dst_len, ++ src_data.b, *src_len, ++ compression_type); ++ if (ret > 0) { ++ *dst_len = ret; ++ ret = 0; ++ break; ++ } ++ ++ /* Didn't fit: should we retry with a smaller amount? */ ++ if (*src_len <= *dst_len) { ++ ret = -1; ++ break; ++ } ++ ++ /* ++ * If ret is negative, it's a hint as to how much data would fit ++ */ ++ BUG_ON(-ret >= *src_len); ++ ++ if (ret < 0) ++ *src_len = -ret; ++ else ++ *src_len -= (*src_len - *dst_len) / 2; ++ *src_len = round_down(*src_len, block_bytes(c)); ++ } ++ ++ mempool_free(workspace, &c->compress_workspace[compression_type]); ++ ++ if (ret) ++ goto err; ++ ++ /* Didn't get smaller: */ ++ if (round_up(*dst_len, block_bytes(c)) >= *src_len) ++ goto err; ++ ++ pad = round_up(*dst_len, block_bytes(c)) - *dst_len; ++ ++ memset(dst_data.b + *dst_len, 0, pad); ++ *dst_len += pad; ++ ++ if (dst_data.type != BB_NONE && ++ dst_data.type != BB_VMAP) ++ memcpy_to_bio(dst, dst->bi_iter, dst_data.b); ++ ++ BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); ++ BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); ++ BUG_ON(*dst_len & (block_bytes(c) - 1)); ++ BUG_ON(*src_len & (block_bytes(c) - 1)); ++out: ++ bio_unmap_or_unbounce(c, src_data); ++ bio_unmap_or_unbounce(c, dst_data); ++ return compression_type; ++err: ++ compression_type = BCH_COMPRESSION_TYPE_incompressible; ++ goto out; ++} ++ ++unsigned bch2_bio_compress(struct bch_fs *c, ++ struct bio *dst, size_t *dst_len, ++ struct bio *src, size_t *src_len, ++ unsigned compression_type) ++{ ++ unsigned orig_dst = dst->bi_iter.bi_size; ++ unsigned orig_src = src->bi_iter.bi_size; ++ ++ /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ ++ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, ++ c->opts.encoded_extent_max); ++ /* Don't generate a bigger output than input: */ ++ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); ++ ++ if (compression_type == BCH_COMPRESSION_TYPE_lz4_old) ++ compression_type = BCH_COMPRESSION_TYPE_lz4; ++ ++ compression_type = ++ __bio_compress(c, dst, dst_len, src, src_len, compression_type); ++ ++ dst->bi_iter.bi_size = orig_dst; ++ src->bi_iter.bi_size = orig_src; ++ return compression_type; ++} ++ ++static int 
__bch2_fs_compress_init(struct bch_fs *, u64); ++ ++#define BCH_FEATURE_none 0 ++ ++static const unsigned bch2_compression_opt_to_feature[] = { ++#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, ++ BCH_COMPRESSION_OPTS() ++#undef x ++}; ++ ++#undef BCH_FEATURE_none ++ ++static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) ++{ ++ int ret = 0; ++ ++ if ((c->sb.features & f) == f) ++ return 0; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if ((c->sb.features & f) == f) { ++ mutex_unlock(&c->sb_lock); ++ return 0; ++ } ++ ++ ret = __bch2_fs_compress_init(c, c->sb.features|f); ++ if (ret) { ++ mutex_unlock(&c->sb_lock); ++ return ret; ++ } ++ ++ c->disk_sb.sb->features[0] |= cpu_to_le64(f); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++int bch2_check_set_has_compressed_data(struct bch_fs *c, ++ unsigned compression_type) ++{ ++ BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); ++ ++ return compression_type ++ ? __bch2_check_set_has_compressed_data(c, ++ 1ULL << bch2_compression_opt_to_feature[compression_type]) ++ : 0; ++} ++ ++void bch2_fs_compress_exit(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ mempool_exit(&c->decompress_workspace); ++ for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) ++ mempool_exit(&c->compress_workspace[i]); ++ mempool_exit(&c->compression_bounce[WRITE]); ++ mempool_exit(&c->compression_bounce[READ]); ++} ++ ++static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) ++{ ++ size_t decompress_workspace_size = 0; ++ bool decompress_workspace_needed; ++ ZSTD_parameters params = zstd_get_params(0, c->opts.encoded_extent_max); ++ struct { ++ unsigned feature; ++ unsigned type; ++ size_t compress_workspace; ++ size_t decompress_workspace; ++ } compression_types[] = { ++ { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, LZ4_MEM_COMPRESS, 0 }, ++ { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, ++ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), ++ zlib_inflate_workspacesize(), }, ++ { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, ++ zstd_cctx_workspace_bound(¶ms.cParams), ++ zstd_dctx_workspace_bound() }, ++ }, *i; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ c->zstd_params = params; ++ ++ for (i = compression_types; ++ i < compression_types + ARRAY_SIZE(compression_types); ++ i++) ++ if (features & (1 << i->feature)) ++ goto have_compressed; ++ ++ goto out; ++have_compressed: ++ ++ if (!mempool_initialized(&c->compression_bounce[READ])) { ++ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], ++ 1, c->opts.encoded_extent_max); ++ if (ret) ++ goto out; ++ } ++ ++ if (!mempool_initialized(&c->compression_bounce[WRITE])) { ++ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], ++ 1, c->opts.encoded_extent_max); ++ if (ret) ++ goto out; ++ } ++ ++ for (i = compression_types; ++ i < compression_types + ARRAY_SIZE(compression_types); ++ i++) { ++ decompress_workspace_size = ++ max(decompress_workspace_size, i->decompress_workspace); ++ ++ if (!(features & (1 << i->feature))) ++ continue; ++ ++ if (i->decompress_workspace) ++ decompress_workspace_needed = true; ++ ++ if (mempool_initialized(&c->compress_workspace[i->type])) ++ continue; ++ ++ ret = mempool_init_kvpmalloc_pool( ++ &c->compress_workspace[i->type], ++ 1, i->compress_workspace); ++ if (ret) ++ goto out; ++ } ++ ++ if (!mempool_initialized(&c->decompress_workspace)) { ++ ret = mempool_init_kvpmalloc_pool( ++ &c->decompress_workspace, ++ 1, decompress_workspace_size); ++ if (ret) ++ goto out; ++ } 
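Aside (not part of the patch): as the comment in attempt_compress() explains, the zstd path stores the exact compressed length in the first four bytes of the encoded buffer, because zstd_decompress_dctx() needs the precise frame size rather than a sector-rounded one. A minimal sketch of how __bio_uncompress() recovers it; example_zstd_frame is a hypothetical helper.

/* Hypothetical illustration, not patch content: */
static const void *example_zstd_frame(const void *buf, size_t buf_len, size_t *frame_len)
{
	/* attempt_compress() wrote the real compressed size as a little endian u32: */
	*frame_len = le32_to_cpup((const __le32 *) buf);
	if (*frame_len > buf_len - 4)
		return NULL;	/* claimed size doesn't fit in what was stored */

	return buf + 4;		/* this is what gets handed to zstd_decompress_dctx() */
}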
++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++int bch2_fs_compress_init(struct bch_fs *c) ++{ ++ u64 f = c->sb.features; ++ ++ if (c->opts.compression) ++ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression]; ++ ++ if (c->opts.background_compression) ++ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression]; ++ ++ return __bch2_fs_compress_init(c, f); ++ ++} +diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h +new file mode 100644 +index 000000000000..4bab1f61b3b5 +--- /dev/null ++++ b/fs/bcachefs/compress.h +@@ -0,0 +1,18 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_COMPRESS_H ++#define _BCACHEFS_COMPRESS_H ++ ++#include "extents_types.h" ++ ++int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, ++ struct bch_extent_crc_unpacked *); ++int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, ++ struct bvec_iter, struct bch_extent_crc_unpacked); ++unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, ++ struct bio *, size_t *, unsigned); ++ ++int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); ++void bch2_fs_compress_exit(struct bch_fs *); ++int bch2_fs_compress_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_COMPRESS_H */ +diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c +new file mode 100644 +index 000000000000..745f856e6d3e +--- /dev/null ++++ b/fs/bcachefs/counters.c +@@ -0,0 +1,107 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "super-io.h" ++#include "counters.h" ++ ++/* BCH_SB_FIELD_counters */ ++ ++const char * const bch2_counter_names[] = { ++#define x(t, n, ...) (#t), ++ BCH_PERSISTENT_COUNTERS() ++#undef x ++ NULL ++}; ++ ++static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) ++{ ++ if (!ctrs) ++ return 0; ++ ++ return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; ++}; ++ ++static int bch2_sb_counters_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ return 0; ++}; ++ ++void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_counters *ctrs = field_to_type(f, counters); ++ unsigned int i; ++ unsigned int nr = bch2_sb_counter_nr_entries(ctrs); ++ ++ for (i = 0; i < nr; i++) { ++ if (i < BCH_COUNTER_NR) ++ prt_printf(out, "%s", bch2_counter_names[i]); ++ else ++ prt_printf(out, "(unknown)"); ++ ++ prt_tab(out); ++ prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i])); ++ prt_newline(out); ++ }; ++}; ++ ++int bch2_sb_counters_to_cpu(struct bch_fs *c) ++{ ++ struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); ++ unsigned int i; ++ unsigned int nr = bch2_sb_counter_nr_entries(ctrs); ++ u64 val = 0; ++ ++ for (i = 0; i < BCH_COUNTER_NR; i++) ++ c->counters_on_mount[i] = 0; ++ ++ for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) { ++ val = le64_to_cpu(ctrs->d[i]); ++ percpu_u64_set(&c->counters[i], val); ++ c->counters_on_mount[i] = val; ++ } ++ return 0; ++}; ++ ++int bch2_sb_counters_from_cpu(struct bch_fs *c) ++{ ++ struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); ++ struct bch_sb_field_counters *ret; ++ unsigned int i; ++ unsigned int nr = bch2_sb_counter_nr_entries(ctrs); ++ ++ if (nr < BCH_COUNTER_NR) { ++ ret = bch2_sb_resize_counters(&c->disk_sb, ++ sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); ++ ++ if (ret) { ++ ctrs = ret; ++ nr = bch2_sb_counter_nr_entries(ctrs); ++ } ++ } ++ ++ ++ for (i = 0; i < 
min_t(unsigned int, nr, BCH_COUNTER_NR); i++) ++ ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i])); ++ return 0; ++} ++ ++void bch2_fs_counters_exit(struct bch_fs *c) ++{ ++ free_percpu(c->counters); ++} ++ ++int bch2_fs_counters_init(struct bch_fs *c) ++{ ++ c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64)); ++ if (!c->counters) ++ return -ENOMEM; ++ ++ return bch2_sb_counters_to_cpu(c); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_counters = { ++ .validate = bch2_sb_counters_validate, ++ .to_text = bch2_sb_counters_to_text, ++}; +diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/counters.h +new file mode 100644 +index 000000000000..4778aa19bf34 +--- /dev/null ++++ b/fs/bcachefs/counters.h +@@ -0,0 +1,17 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_COUNTERS_H ++#define _BCACHEFS_COUNTERS_H ++ ++#include "bcachefs.h" ++#include "super-io.h" ++ ++ ++int bch2_sb_counters_to_cpu(struct bch_fs *); ++int bch2_sb_counters_from_cpu(struct bch_fs *); ++ ++void bch2_fs_counters_exit(struct bch_fs *); ++int bch2_fs_counters_init(struct bch_fs *); ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_counters; ++ ++#endif // _BCACHEFS_COUNTERS_H +diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h +new file mode 100644 +index 000000000000..519ab9b96e67 +--- /dev/null ++++ b/fs/bcachefs/darray.h +@@ -0,0 +1,77 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DARRAY_H ++#define _BCACHEFS_DARRAY_H ++ ++/* ++ * Dynamic arrays: ++ * ++ * Inspired by CCAN's darray ++ */ ++ ++#include "util.h" ++#include ++ ++#define DARRAY(type) \ ++struct { \ ++ size_t nr, size; \ ++ type *data; \ ++} ++ ++typedef DARRAY(void) darray_void; ++ ++static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) ++{ ++ if (d->nr + more > d->size) { ++ size_t new_size = roundup_pow_of_two(d->nr + more); ++ void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL); ++ ++ if (!data) ++ return -ENOMEM; ++ ++ d->data = data; ++ d->size = new_size; ++ } ++ ++ return 0; ++} ++ ++#define darray_make_room(_d, _more) \ ++ __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more)) ++ ++#define darray_top(_d) ((_d).data[(_d).nr]) ++ ++#define darray_push(_d, _item) \ ++({ \ ++ int _ret = darray_make_room((_d), 1); \ ++ \ ++ if (!_ret) \ ++ (_d)->data[(_d)->nr++] = (_item); \ ++ _ret; \ ++}) ++ ++#define darray_insert_item(_d, _pos, _item) \ ++({ \ ++ size_t pos = (_pos); \ ++ int _ret = darray_make_room((_d), 1); \ ++ \ ++ if (!_ret) \ ++ array_insert_item((_d)->data, (_d)->nr, pos, (_item)); \ ++ _ret; \ ++}) ++ ++#define darray_for_each(_d, _i) \ ++ for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) ++ ++#define darray_init(_d) \ ++do { \ ++ (_d)->data = NULL; \ ++ (_d)->nr = (_d)->size = 0; \ ++} while (0) ++ ++#define darray_exit(_d) \ ++do { \ ++ kfree((_d)->data); \ ++ darray_init(_d); \ ++} while (0) ++ ++#endif /* _BCACHEFS_DARRAY_H */ +diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c +new file mode 100644 +index 000000000000..3b442b01ca86 +--- /dev/null ++++ b/fs/bcachefs/data_update.c +@@ -0,0 +1,376 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_buf.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "data_update.h" ++#include "ec.h" ++#include "extents.h" ++#include "io.h" ++#include "keylist.h" ++#include "move.h" ++#include "subvolume.h" ++ ++#include ++ ++static int insert_snapshot_whiteouts(struct 
btree_trans *trans, ++ enum btree_id id, ++ struct bpos old_pos, ++ struct bpos new_pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter, update_iter; ++ struct bkey_s_c k; ++ snapshot_id_list s; ++ int ret; ++ ++ if (!btree_type_has_snapshots(id)) ++ return 0; ++ ++ darray_init(&s); ++ ++ if (!bkey_cmp(old_pos, new_pos)) ++ return 0; ++ ++ if (!snapshot_t(c, old_pos.snapshot)->children[0]) ++ return 0; ++ ++ bch2_trans_iter_init(trans, &iter, id, old_pos, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ while (1) { ++ k = bch2_btree_iter_prev(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ if (bkey_cmp(old_pos, k.k->p)) ++ break; ++ ++ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { ++ struct bkey_i *update; ++ ++ if (snapshot_list_has_ancestor(c, &s, k.k->p.snapshot)) ++ continue; ++ ++ update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ++ ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ break; ++ ++ bkey_init(&update->k); ++ update->k.p = new_pos; ++ update->k.p.snapshot = k.k->p.snapshot; ++ ++ bch2_trans_iter_init(trans, &update_iter, id, update->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&update_iter) ?: ++ bch2_trans_update(trans, &update_iter, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ bch2_trans_iter_exit(trans, &update_iter); ++ if (ret) ++ break; ++ ++ ret = snapshot_list_add(c, &s, k.k->p.snapshot); ++ if (ret) ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ darray_exit(&s); ++ ++ return ret; ++} ++ ++static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ if (ptr->dev == dev) ++ ptr->cached = true; ++} ++ ++static int bch2_data_update_index_update(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct data_update *m = ++ container_of(op, struct data_update, op); ++ struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); ++ struct keylist *keys = &op->insert_keys; ++ struct bkey_buf _new, _insert; ++ int ret = 0; ++ ++ bch2_bkey_buf_init(&_new); ++ bch2_bkey_buf_init(&_insert); ++ bch2_bkey_buf_realloc(&_insert, c, U8_MAX); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); ++ ++ bch2_trans_iter_init(&trans, &iter, m->btree_id, ++ bkey_start_pos(&bch2_keylist_front(keys)->k), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ while (1) { ++ struct bkey_s_c k; ++ struct bkey_s_c old = bkey_i_to_s_c(m->k.k); ++ struct bkey_i *insert; ++ struct bkey_i_extent *new; ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bpos next_pos; ++ bool did_work = false; ++ bool should_check_enospc; ++ s64 i_sectors_delta = 0, disk_sectors_delta = 0; ++ unsigned i; ++ ++ bch2_trans_begin(&trans); ++ ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ new = bkey_i_to_extent(bch2_keylist_front(keys)); ++ ++ if (!bch2_extents_match(k, old)) ++ goto nomatch; ++ ++ bkey_reassemble(_insert.k, k); ++ insert = _insert.k; ++ ++ bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); ++ new = bkey_i_to_extent(_new.k); ++ bch2_cut_front(iter.pos, &new->k_i); ++ ++ bch2_cut_front(iter.pos, insert); ++ bch2_cut_back(new->k.p, insert); ++ bch2_cut_back(insert->k.p, &new->k_i); ++ ++ /* ++ * @old: extent that we read from ++ * @insert: key that we're going to update, 
initialized from ++ * extent currently in btree - same as @old unless we raced with ++ * other updates ++ * @new: extent with new pointers that we'll be adding to @insert ++ * ++ * Fist, drop rewrite_ptrs from @new: ++ */ ++ i = 0; ++ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { ++ if (((1U << i) & m->data_opts.rewrite_ptrs) && ++ bch2_extent_has_ptr(old, p, bkey_i_to_s_c(insert))) { ++ /* ++ * If we're going to be adding a pointer to the ++ * same device, we have to drop the old one - ++ * otherwise, we can just mark it cached: ++ */ ++ if (bch2_bkey_has_device(bkey_i_to_s_c(&new->k_i), p.ptr.dev)) ++ bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), p.ptr.dev); ++ else ++ bch2_bkey_mark_dev_cached(bkey_i_to_s(insert), p.ptr.dev); ++ } ++ i++; ++ } ++ ++ ++ /* Add new ptrs: */ ++ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { ++ if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { ++ /* ++ * raced with another move op? extent already ++ * has a pointer to the device we just wrote ++ * data to ++ */ ++ continue; ++ } ++ ++ bch2_extent_ptr_decoded_append(insert, &p); ++ did_work = true; ++ } ++ ++ if (!did_work) ++ goto nomatch; ++ ++ bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); ++ bch2_extent_normalize(c, bkey_i_to_s(insert)); ++ ++ ret = bch2_sum_sector_overwrites(&trans, &iter, insert, ++ &should_check_enospc, ++ &i_sectors_delta, ++ &disk_sectors_delta); ++ if (ret) ++ goto err; ++ ++ if (disk_sectors_delta > (s64) op->res.sectors) { ++ ret = bch2_disk_reservation_add(c, &op->res, ++ disk_sectors_delta - op->res.sectors, ++ !should_check_enospc ++ ? BCH_DISK_RESERVATION_NOFAIL : 0); ++ if (ret) ++ goto out; ++ } ++ ++ next_pos = insert->k.p; ++ ++ ret = insert_snapshot_whiteouts(&trans, m->btree_id, ++ k.k->p, insert->k.p) ?: ++ bch2_trans_update(&trans, &iter, insert, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: ++ bch2_trans_commit(&trans, &op->res, ++ op_journal_seq(op), ++ BTREE_INSERT_NOFAIL| ++ m->data_opts.btree_insert_flags); ++ if (!ret) { ++ bch2_btree_iter_set_pos(&iter, next_pos); ++ atomic_long_inc(&c->extent_migrate_done); ++ if (ec_ob) ++ bch2_ob_add_backpointer(c, ec_ob, &insert->k); ++ } ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ ret = 0; ++ if (ret) ++ break; ++next: ++ while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { ++ bch2_keylist_pop_front(keys); ++ if (bch2_keylist_empty(keys)) ++ goto out; ++ } ++ continue; ++nomatch: ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_bkey_val_to_text(&buf, c, old); ++ bch_info(c, "no match for %s", buf.buf); ++ printbuf_exit(&buf); ++ } ++ ++ if (m->ctxt) { ++ BUG_ON(k.k->p.offset <= iter.pos.offset); ++ atomic64_inc(&m->ctxt->stats->keys_raced); ++ atomic64_add(k.k->p.offset - iter.pos.offset, ++ &m->ctxt->stats->sectors_raced); ++ } ++ atomic_long_inc(&c->extent_migrate_raced); ++ trace_move_race(&new->k); ++ bch2_btree_iter_advance(&iter); ++ goto next; ++ } ++out: ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&_insert, c); ++ bch2_bkey_buf_exit(&_new, c); ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); ++ return ret; ++} ++ ++void bch2_data_update_read_done(struct data_update *m, ++ struct bch_extent_crc_unpacked crc, ++ struct closure *cl) ++{ ++ /* write bio must own pages: */ ++ BUG_ON(!m->op.wbio.bio.bi_vcnt); ++ ++ m->op.crc = crc; ++ m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; ++ ++ 
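++	/*
++	 * Hand the write off to bch2_write(): it runs via m->op.cl, with
++	 * @cl installed as the parent closure, so the caller can wait on
++	 * @cl for completion of the data update write.
++	 */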
closure_call(&m->op.cl, bch2_write, NULL, cl); ++} ++ ++void bch2_data_update_exit(struct data_update *update) ++{ ++ struct bch_fs *c = update->op.c; ++ ++ bch2_bkey_buf_exit(&update->k, c); ++ bch2_disk_reservation_put(c, &update->op.res); ++ bch2_bio_free_pages_pool(c, &update->op.wbio.bio); ++} ++ ++int bch2_data_update_init(struct bch_fs *c, struct data_update *m, ++ struct write_point_specifier wp, ++ struct bch_io_opts io_opts, ++ struct data_update_opts data_opts, ++ enum btree_id btree_id, ++ struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; ++ int ret; ++ ++ bch2_bkey_buf_init(&m->k); ++ bch2_bkey_buf_reassemble(&m->k, c, k); ++ m->btree_id = btree_id; ++ m->data_opts = data_opts; ++ ++ bch2_write_op_init(&m->op, c, io_opts); ++ m->op.pos = bkey_start_pos(k.k); ++ m->op.version = k.k->version; ++ m->op.target = data_opts.target, ++ m->op.write_point = wp; ++ m->op.flags |= BCH_WRITE_PAGES_STABLE| ++ BCH_WRITE_PAGES_OWNED| ++ BCH_WRITE_DATA_ENCODED| ++ BCH_WRITE_FROM_INTERNAL| ++ m->data_opts.write_flags; ++ m->op.compression_type = ++ bch2_compression_opt_to_type[io_opts.background_compression ?: ++ io_opts.compression]; ++ if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) ++ m->op.alloc_reserve = RESERVE_movinggc; ++ m->op.index_update_fn = bch2_data_update_index_update; ++ ++ i = 0; ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ if (p.ptr.cached) ++ m->data_opts.rewrite_ptrs &= ~(1U << i); ++ ++ if (!((1U << i) & m->data_opts.rewrite_ptrs)) ++ bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); ++ ++ if (((1U << i) & m->data_opts.rewrite_ptrs) && ++ crc_is_compressed(p.crc)) ++ reserve_sectors += k.k->size; ++ ++ /* ++ * op->csum_type is normally initialized from the fs/file's ++ * current options - but if an extent is encrypted, we require ++ * that it stays encrypted: ++ */ ++ if (bch2_csum_type_is_encryption(p.crc.csum_type)) { ++ m->op.nonce = p.crc.nonce + p.crc.offset; ++ m->op.csum_type = p.crc.csum_type; ++ } ++ ++ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) ++ m->op.incompressible = true; ++ ++ i++; ++ } ++ ++ if (reserve_sectors) { ++ ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, ++ m->data_opts.extra_replicas ++ ? 
0 ++ : BCH_DISK_RESERVATION_NOFAIL); ++ if (ret) ++ return ret; ++ } ++ ++ m->op.nr_replicas = m->op.nr_replicas_required = ++ hweight32(m->data_opts.rewrite_ptrs) + m->data_opts.extra_replicas; ++ return 0; ++} +diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h +new file mode 100644 +index 000000000000..e64505453a55 +--- /dev/null ++++ b/fs/bcachefs/data_update.h +@@ -0,0 +1,38 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _BCACHEFS_DATA_UPDATE_H ++#define _BCACHEFS_DATA_UPDATE_H ++ ++#include "bkey_buf.h" ++#include "io_types.h" ++ ++struct moving_context; ++ ++struct data_update_opts { ++ unsigned rewrite_ptrs; ++ u16 target; ++ u8 extra_replicas; ++ unsigned btree_insert_flags; ++ unsigned write_flags; ++}; ++ ++struct data_update { ++ /* extent being updated: */ ++ enum btree_id btree_id; ++ struct bkey_buf k; ++ struct data_update_opts data_opts; ++ struct moving_context *ctxt; ++ struct bch_write_op op; ++}; ++ ++void bch2_data_update_read_done(struct data_update *, ++ struct bch_extent_crc_unpacked, ++ struct closure *); ++ ++void bch2_data_update_exit(struct data_update *); ++int bch2_data_update_init(struct bch_fs *, struct data_update *, ++ struct write_point_specifier, ++ struct bch_io_opts, struct data_update_opts, ++ enum btree_id, struct bkey_s_c); ++ ++#endif /* _BCACHEFS_DATA_UPDATE_H */ +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +new file mode 100644 +index 000000000000..cd37a1016e25 +--- /dev/null ++++ b/fs/bcachefs/debug.c +@@ -0,0 +1,764 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Assorted bcachefs debug code ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "debug.h" ++#include "error.h" ++#include "extents.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "super.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static struct dentry *bch_debug; ++ ++static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, ++ struct extent_ptr_decoded pick) ++{ ++ struct btree *v = c->verify_data; ++ struct btree_node *n_ondisk = c->verify_ondisk; ++ struct btree_node *n_sorted = c->verify_data->data; ++ struct bset *sorted, *inmemory = &b->data->keys; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ struct bio *bio; ++ bool failed = false; ++ ++ if (!bch2_dev_get_ioref(ca, READ)) ++ return false; ++ ++ bio = bio_alloc_bioset(ca->disk_sb.bdev, ++ buf_pages(n_sorted, btree_bytes(c)), ++ REQ_OP_READ|REQ_META, ++ GFP_NOIO, ++ &c->btree_bio); ++ bio->bi_iter.bi_sector = pick.ptr.offset; ++ bch2_bio_map(bio, n_sorted, btree_bytes(c)); ++ ++ submit_bio_wait(bio); ++ ++ bio_put(bio); ++ percpu_ref_put(&ca->io_ref); ++ ++ memcpy(n_ondisk, n_sorted, btree_bytes(c)); ++ ++ v->written = 0; ++ if (bch2_btree_node_read_done(c, ca, v, false)) ++ return false; ++ ++ n_sorted = c->verify_data->data; ++ sorted = &n_sorted->keys; ++ ++ if (inmemory->u64s != sorted->u64s || ++ memcmp(inmemory->start, ++ sorted->start, ++ vstruct_end(inmemory) - (void *) inmemory->start)) { ++ unsigned offset = 0, sectors; ++ struct bset *i; ++ unsigned j; ++ ++ console_lock(); ++ ++ printk(KERN_ERR "*** in memory:\n"); ++ bch2_dump_bset(c, b, inmemory, 0); ++ ++ printk(KERN_ERR "*** read back in:\n"); ++ bch2_dump_bset(c, v, sorted, 0); ++ ++ while (offset < 
v->written) { ++ if (!offset) { ++ i = &n_ondisk->keys; ++ sectors = vstruct_blocks(n_ondisk, c->block_bits) << ++ c->block_bits; ++ } else { ++ struct btree_node_entry *bne = ++ (void *) n_ondisk + (offset << 9); ++ i = &bne->keys; ++ ++ sectors = vstruct_blocks(bne, c->block_bits) << ++ c->block_bits; ++ } ++ ++ printk(KERN_ERR "*** on disk block %u:\n", offset); ++ bch2_dump_bset(c, b, i, offset); ++ ++ offset += sectors; ++ } ++ ++ for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) ++ if (inmemory->_data[j] != sorted->_data[j]) ++ break; ++ ++ console_unlock(); ++ bch_err(c, "verify failed at key %u", j); ++ ++ failed = true; ++ } ++ ++ if (v->written != b->written) { ++ bch_err(c, "written wrong: expected %u, got %u", ++ b->written, v->written); ++ failed = true; ++ } ++ ++ return failed; ++} ++ ++void __bch2_btree_verify(struct bch_fs *c, struct btree *b) ++{ ++ struct bkey_ptrs_c ptrs; ++ struct extent_ptr_decoded p; ++ const union bch_extent_entry *entry; ++ struct btree *v; ++ struct bset *inmemory = &b->data->keys; ++ struct bkey_packed *k; ++ bool failed = false; ++ ++ if (c->opts.nochanges) ++ return; ++ ++ bch2_btree_node_io_lock(b); ++ mutex_lock(&c->verify_lock); ++ ++ if (!c->verify_ondisk) { ++ c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); ++ if (!c->verify_ondisk) ++ goto out; ++ } ++ ++ if (!c->verify_data) { ++ c->verify_data = __bch2_btree_node_mem_alloc(c); ++ if (!c->verify_data) ++ goto out; ++ ++ list_del_init(&c->verify_data->list); ++ } ++ ++ BUG_ON(b->nsets != 1); ++ ++ for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_next(k)) ++ if (k->type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k); ++ v->mem_ptr = 0; ++ } ++ ++ v = c->verify_data; ++ bkey_copy(&v->key, &b->key); ++ v->c.level = b->c.level; ++ v->c.btree_id = b->c.btree_id; ++ bch2_btree_keys_init(v); ++ ++ ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); ++ bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry) ++ failed |= bch2_btree_verify_replica(c, b, p); ++ ++ if (failed) { ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); ++ bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf); ++ printbuf_exit(&buf); ++ } ++out: ++ mutex_unlock(&c->verify_lock); ++ bch2_btree_node_io_unlock(b); ++} ++ ++#ifdef CONFIG_DEBUG_FS ++ ++/* XXX: bch_fs refcounting */ ++ ++struct dump_iter { ++ struct bch_fs *c; ++ enum btree_id id; ++ struct bpos from; ++ struct bpos prev_node; ++ u64 iter; ++ ++ struct printbuf buf; ++ ++ char __user *ubuf; /* destination user buffer */ ++ size_t size; /* size of requested read */ ++ ssize_t ret; /* bytes read so far */ ++}; ++ ++static int flush_buf(struct dump_iter *i) ++{ ++ if (i->buf.pos) { ++ size_t bytes = min_t(size_t, i->buf.pos, i->size); ++ int err = copy_to_user(i->ubuf, i->buf.buf, bytes); ++ ++ if (err) ++ return err; ++ ++ i->ret += bytes; ++ i->ubuf += bytes; ++ i->size -= bytes; ++ i->buf.pos -= bytes; ++ memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos); ++ } ++ ++ return 0; ++} ++ ++static int bch2_dump_open(struct inode *inode, struct file *file) ++{ ++ struct btree_debug *bd = inode->i_private; ++ struct dump_iter *i; ++ ++ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); ++ if (!i) ++ return -ENOMEM; ++ ++ file->private_data = i; ++ i->from = POS_MIN; ++ i->iter = 0; ++ i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); ++ i->id = bd->id; ++ i->buf = PRINTBUF; ++ ++ return 0; ++} ++ ++static int bch2_dump_release(struct 
inode *inode, struct file *file) ++{ ++ struct dump_iter *i = file->private_data; ++ ++ printbuf_exit(&i->buf); ++ kfree(i); ++ return 0; ++} ++ ++static ssize_t bch2_read_btree(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ err = for_each_btree_key2(&trans, iter, i->id, i->from, ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ({ ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ if (!i->size) ++ break; ++ ++ bch2_bkey_val_to_text(&i->buf, i->c, k); ++ prt_newline(&i->buf); ++ 0; ++ })); ++ i->from = iter.pos; ++ ++ if (!err) ++ err = flush_buf(i); ++ ++ bch2_trans_exit(&trans); ++ ++ return err ?: i->ret; ++} ++ ++static const struct file_operations btree_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_btree, ++}; ++ ++static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct btree *b; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size || !bpos_cmp(SPOS_MAX, i->from)) ++ return i->ret; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) { ++ bch2_btree_node_to_text(&i->buf, i->c, b); ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ /* ++ * can't easily correctly restart a btree node traversal across ++ * all nodes, meh ++ */ ++ i->from = bpos_cmp(SPOS_MAX, b->key.k.p) ++ ? bpos_successor(b->key.k.p) ++ : b->key.k.p; ++ ++ if (!i->size) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ ++ return err < 0 ? 
err : i->ret; ++} ++ ++static const struct file_operations btree_format_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_btree_formats, ++}; ++ ++static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ return i->ret; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ err = for_each_btree_key2(&trans, iter, i->id, i->from, ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ({ ++ struct btree_path_level *l = &iter.path->l[0]; ++ struct bkey_packed *_k = ++ bch2_btree_node_iter_peek(&l->iter, l->b); ++ ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ if (!i->size) ++ break; ++ ++ if (bpos_cmp(l->b->key.k.p, i->prev_node) > 0) { ++ bch2_btree_node_to_text(&i->buf, i->c, l->b); ++ i->prev_node = l->b->key.k.p; ++ } ++ ++ bch2_bfloat_to_text(&i->buf, l->b, _k); ++ 0; ++ })); ++ i->from = iter.pos; ++ ++ if (!err) ++ err = flush_buf(i); ++ ++ bch2_trans_exit(&trans); ++ ++ return err ?: i->ret; ++} ++ ++static const struct file_operations bfloat_failed_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_bfloat_failed, ++}; ++ ++static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c, ++ struct btree *b) ++{ ++ out->tabstops[0] = 32; ++ ++ prt_printf(out, "%px btree=%s l=%u ", ++ b, ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level); ++ prt_newline(out); ++ ++ printbuf_indent_add(out, 2); ++ ++ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ prt_newline(out); ++ ++ prt_printf(out, "flags: "); ++ prt_tab(out); ++ prt_bitflags(out, bch2_btree_node_flags, b->flags); ++ prt_newline(out); ++ ++ prt_printf(out, "pcpu read locks: "); ++ prt_tab(out); ++ prt_printf(out, "%u", b->c.lock.readers != NULL); ++ prt_newline(out); ++ ++ prt_printf(out, "written:"); ++ prt_tab(out); ++ prt_printf(out, "%u", b->written); ++ prt_newline(out); ++ ++ prt_printf(out, "writes blocked:"); ++ prt_tab(out); ++ prt_printf(out, "%u", !list_empty_careful(&b->write_blocked)); ++ prt_newline(out); ++ ++ prt_printf(out, "will make reachable:"); ++ prt_tab(out); ++ prt_printf(out, "%lx", b->will_make_reachable); ++ prt_newline(out); ++ ++ prt_printf(out, "journal pin %px:", &b->writes[0].journal); ++ prt_tab(out); ++ prt_printf(out, "%llu", b->writes[0].journal.seq); ++ prt_newline(out); ++ ++ prt_printf(out, "journal pin %px:", &b->writes[1].journal); ++ prt_tab(out); ++ prt_printf(out, "%llu", b->writes[1].journal.seq); ++ prt_newline(out); ++ ++ printbuf_indent_sub(out, 2); ++} ++ ++static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct bch_fs *c = i->c; ++ bool done = false; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ do { ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ break; ++ ++ rcu_read_lock(); ++ i->buf.atomic++; ++ tbl = rht_dereference_rcu(c->btree_cache.table.tbl, ++ &c->btree_cache.table); ++ if (i->iter < tbl->size) { ++ rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) ++ 
bch2_cached_btree_node_to_text(&i->buf, c, b); ++ i->iter++;; ++ } else { ++ done = true; ++ } ++ --i->buf.atomic; ++ rcu_read_unlock(); ++ } while (!done); ++ ++ if (i->buf.allocation_failure) ++ return -ENOMEM; ++ ++ return i->ret; ++} ++ ++static const struct file_operations cached_btree_nodes_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_cached_btree_nodes_read, ++}; ++ ++static int prt_backtrace(struct printbuf *out, struct task_struct *task) ++{ ++ unsigned long entries[32]; ++ unsigned i, nr_entries; ++ int ret; ++ ++ ret = down_read_killable(&task->signal->exec_update_lock); ++ if (ret) ++ return ret; ++ ++ nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0); ++ for (i = 0; i < nr_entries; i++) { ++ prt_printf(out, "[<0>] %pB", (void *)entries[i]); ++ prt_newline(out); ++ } ++ ++ up_read(&task->signal->exec_update_lock); ++ return 0; ++} ++ ++static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct bch_fs *c = i->c; ++ struct btree_trans *trans; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ mutex_lock(&c->btree_trans_lock); ++ list_for_each_entry(trans, &c->btree_trans_list, list) { ++ if (trans->task->pid <= i->iter) ++ continue; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ break; ++ ++ bch2_btree_trans_to_text(&i->buf, trans); ++ ++ prt_printf(&i->buf, "backtrace:"); ++ prt_newline(&i->buf); ++ printbuf_indent_add(&i->buf, 2); ++ prt_backtrace(&i->buf, trans->task); ++ printbuf_indent_sub(&i->buf, 2); ++ prt_newline(&i->buf); ++ ++ i->iter = trans->task->pid; ++ } ++ mutex_unlock(&c->btree_trans_lock); ++ ++ if (i->buf.allocation_failure) ++ return -ENOMEM; ++ ++ return i->ret; ++} ++ ++static const struct file_operations btree_transactions_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_btree_transactions_read, ++}; ++ ++static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct bch_fs *c = i->c; ++ bool done = false; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ do { ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ break; ++ ++ done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); ++ i->iter++; ++ } while (!done); ++ ++ if (i->buf.allocation_failure) ++ return -ENOMEM; ++ ++ return i->ret; ++} ++ ++static const struct file_operations journal_pins_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_journal_pins_read, ++}; ++ ++static int lock_held_stats_open(struct inode *inode, struct file *file) ++{ ++ struct bch_fs *c = inode->i_private; ++ struct dump_iter *i; ++ ++ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); ++ ++ if (!i) ++ return -ENOMEM; ++ ++ i->iter = 0; ++ i->c = c; ++ i->buf = PRINTBUF; ++ file->private_data = i; ++ ++ return 0; ++} ++ ++static int lock_held_stats_release(struct inode *inode, struct file *file) ++{ ++ struct dump_iter *i = file->private_data; ++ ++ printbuf_exit(&i->buf); ++ kfree(i); ++ ++ return 0; ++} ++ ++static ssize_t lock_held_stats_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct lock_held_stats *lhs = 
&i->c->lock_held_stats; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ while (lhs->names[i->iter] != 0 && i->iter < BCH_LOCK_TIME_NR) { ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ break; ++ ++ prt_printf(&i->buf, "%s:", lhs->names[i->iter]); ++ prt_newline(&i->buf); ++ printbuf_indent_add(&i->buf, 8); ++ bch2_time_stats_to_text(&i->buf, &lhs->times[i->iter]); ++ printbuf_indent_sub(&i->buf, 8); ++ prt_newline(&i->buf); ++ i->iter++; ++ } ++ ++ if (i->buf.allocation_failure) ++ return -ENOMEM; ++ ++ return i->ret; ++} ++ ++static const struct file_operations lock_held_stats_op = { ++ .owner = THIS_MODULE, ++ .open = lock_held_stats_open, ++ .release = lock_held_stats_release, ++ .read = lock_held_stats_read, ++}; ++ ++void bch2_fs_debug_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->fs_debug_dir)) ++ debugfs_remove_recursive(c->fs_debug_dir); ++} ++ ++void bch2_fs_debug_init(struct bch_fs *c) ++{ ++ struct btree_debug *bd; ++ char name[100]; ++ ++ if (IS_ERR_OR_NULL(bch_debug)) ++ return; ++ ++ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); ++ c->fs_debug_dir = debugfs_create_dir(name, bch_debug); ++ if (IS_ERR_OR_NULL(c->fs_debug_dir)) ++ return; ++ ++ debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, ++ c->btree_debug, &cached_btree_nodes_ops); ++ ++ debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, ++ c->btree_debug, &btree_transactions_ops); ++ ++ debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, ++ c->btree_debug, &journal_pins_ops); ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { ++ debugfs_create_file("lock_held_stats", 0400, c->fs_debug_dir, ++ c, &lock_held_stats_op); ++ } ++ ++ c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); ++ if (IS_ERR_OR_NULL(c->btree_debug_dir)) ++ return; ++ ++ for (bd = c->btree_debug; ++ bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); ++ bd++) { ++ bd->id = bd - c->btree_debug; ++ debugfs_create_file(bch2_btree_ids[bd->id], ++ 0400, c->btree_debug_dir, bd, ++ &btree_debug_ops); ++ ++ snprintf(name, sizeof(name), "%s-formats", ++ bch2_btree_ids[bd->id]); ++ ++ debugfs_create_file(name, 0400, c->btree_debug_dir, bd, ++ &btree_format_debug_ops); ++ ++ snprintf(name, sizeof(name), "%s-bfloat-failed", ++ bch2_btree_ids[bd->id]); ++ ++ debugfs_create_file(name, 0400, c->btree_debug_dir, bd, ++ &bfloat_failed_debug_ops); ++ } ++} ++ ++#endif ++ ++void bch2_debug_exit(void) ++{ ++ if (!IS_ERR_OR_NULL(bch_debug)) ++ debugfs_remove_recursive(bch_debug); ++} ++ ++int __init bch2_debug_init(void) ++{ ++ int ret = 0; ++ ++ bch_debug = debugfs_create_dir("bcachefs", NULL); ++ return ret; ++} +diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h +new file mode 100644 +index 000000000000..0b86736e5e1b +--- /dev/null ++++ b/fs/bcachefs/debug.h +@@ -0,0 +1,30 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DEBUG_H ++#define _BCACHEFS_DEBUG_H ++ ++#include "bcachefs.h" ++ ++struct bio; ++struct btree; ++struct bch_fs; ++ ++void __bch2_btree_verify(struct bch_fs *, struct btree *); ++ ++static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) ++{ ++ if (bch2_verify_btree_ondisk) ++ __bch2_btree_verify(c, b); ++} ++ ++#ifdef CONFIG_DEBUG_FS ++void bch2_fs_debug_exit(struct bch_fs *); ++void bch2_fs_debug_init(struct bch_fs *); ++#else ++static inline void bch2_fs_debug_exit(struct bch_fs *c) {} ++static inline void bch2_fs_debug_init(struct bch_fs *c) {} ++#endif ++ ++void bch2_debug_exit(void); ++int 
bch2_debug_init(void); ++ ++#endif /* _BCACHEFS_DEBUG_H */ +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +new file mode 100644 +index 000000000000..4d942d224a08 +--- /dev/null ++++ b/fs/bcachefs/dirent.c +@@ -0,0 +1,565 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "extents.h" ++#include "dirent.h" ++#include "fs.h" ++#include "keylist.h" ++#include "str_hash.h" ++#include "subvolume.h" ++ ++#include ++ ++unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) ++{ ++ unsigned len = bkey_val_bytes(d.k) - ++ offsetof(struct bch_dirent, d_name); ++ ++ return strnlen(d.v->d_name, len); ++} ++ ++static u64 bch2_dirent_hash(const struct bch_hash_info *info, ++ const struct qstr *name) ++{ ++ struct bch_str_hash_ctx ctx; ++ ++ bch2_str_hash_init(&ctx, info); ++ bch2_str_hash_update(&ctx, info, name->name, name->len); ++ ++ /* [0,2) reserved for dots */ ++ return max_t(u64, bch2_str_hash_end(&ctx, info), 2); ++} ++ ++static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) ++{ ++ return bch2_dirent_hash(info, key); ++} ++ ++static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); ++ ++ return bch2_dirent_hash(info, &name); ++} ++ ++static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) ++{ ++ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); ++ int len = bch2_dirent_name_bytes(l); ++ const struct qstr *r = _r; ++ ++ return len - r->len ?: memcmp(l.v->d_name, r->name, len); ++} ++ ++static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); ++ struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); ++ int l_len = bch2_dirent_name_bytes(l); ++ int r_len = bch2_dirent_name_bytes(r); ++ ++ return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); ++} ++ ++static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ ++ if (d.v->d_type == DT_SUBVOL) ++ return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol; ++ return true; ++} ++ ++const struct bch_hash_desc bch2_dirent_hash_desc = { ++ .btree_id = BTREE_ID_dirents, ++ .key_type = KEY_TYPE_dirent, ++ .hash_key = dirent_hash_key, ++ .hash_bkey = dirent_hash_bkey, ++ .cmp_key = dirent_cmp_key, ++ .cmp_bkey = dirent_cmp_bkey, ++ .is_visible = dirent_is_visible, ++}; ++ ++int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ unsigned len; ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*d.v)); ++ return -EINVAL; ++ } ++ ++ len = bch2_dirent_name_bytes(d); ++ if (!len) { ++ prt_printf(err, "empty name"); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) { ++ prt_printf(err, "value too big (%zu > %u)", ++ bkey_val_u64s(k.k),dirent_val_u64s(len)); ++ return -EINVAL; ++ } ++ ++ if (len > BCH_NAME_MAX) { ++ prt_printf(err, "dirent name too big (%u > %u)", ++ len, BCH_NAME_MAX); ++ return -EINVAL; ++ } ++ ++ if (len == 1 && !memcmp(d.v->d_name, ".", 1)) { ++ prt_printf(err, "invalid name"); ++ return -EINVAL; ++ } ++ ++ if (len == 2 && !memcmp(d.v->d_name, "..", 2)) { ++ prt_printf(err, "invalid name"); ++ return 
-EINVAL; ++ } ++ ++ if (memchr(d.v->d_name, '/', len)) { ++ prt_printf(err, "invalid name"); ++ return -EINVAL; ++ } ++ ++ if (d.v->d_type != DT_SUBVOL && ++ le64_to_cpu(d.v->d_inum) == d.k->p.inode) { ++ prt_printf(err, "dirent points to own directory"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ ++ prt_printf(out, "%.*s -> %llu type %s", ++ bch2_dirent_name_bytes(d), ++ d.v->d_name, ++ d.v->d_type != DT_SUBVOL ++ ? le64_to_cpu(d.v->d_inum) ++ : le32_to_cpu(d.v->d_child_subvol), ++ bch2_d_type_str(d.v->d_type)); ++} ++ ++static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, ++ subvol_inum dir, u8 type, ++ const struct qstr *name, u64 dst) ++{ ++ struct bkey_i_dirent *dirent; ++ unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); ++ ++ if (name->len > BCH_NAME_MAX) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ BUG_ON(u64s > U8_MAX); ++ ++ dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(dirent)) ++ return dirent; ++ ++ bkey_dirent_init(&dirent->k_i); ++ dirent->k.u64s = u64s; ++ ++ if (type != DT_SUBVOL) { ++ dirent->v.d_inum = cpu_to_le64(dst); ++ } else { ++ dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); ++ dirent->v.d_child_subvol = cpu_to_le32(dst); ++ } ++ ++ dirent->v.d_type = type; ++ ++ memcpy(dirent->v.d_name, name->name, name->len); ++ memset(dirent->v.d_name + name->len, 0, ++ bkey_val_bytes(&dirent->k) - ++ offsetof(struct bch_dirent, d_name) - ++ name->len); ++ ++ EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); ++ ++ return dirent; ++} ++ ++int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, ++ const struct bch_hash_info *hash_info, ++ u8 type, const struct qstr *name, u64 dst_inum, ++ u64 *dir_offset, int flags) ++{ ++ struct bkey_i_dirent *dirent; ++ int ret; ++ ++ dirent = dirent_create_key(trans, dir, type, name, dst_inum); ++ ret = PTR_ERR_OR_ZERO(dirent); ++ if (ret) ++ return ret; ++ ++ ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, ++ dir, &dirent->k_i, flags); ++ *dir_offset = dirent->k.p.offset; ++ ++ return ret; ++} ++ ++static void dirent_copy_target(struct bkey_i_dirent *dst, ++ struct bkey_s_c_dirent src) ++{ ++ dst->v.d_inum = src.v->d_inum; ++ dst->v.d_type = src.v->d_type; ++} ++ ++int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, ++ struct bkey_s_c_dirent d, subvol_inum *target) ++{ ++ struct bch_subvolume s; ++ int ret = 0; ++ ++ if (d.v->d_type == DT_SUBVOL && ++ d.v->d_parent_subvol != dir.subvol) ++ return 1; ++ ++ if (likely(d.v->d_type != DT_SUBVOL)) { ++ target->subvol = dir.subvol; ++ target->inum = le64_to_cpu(d.v->d_inum); ++ } else { ++ target->subvol = le32_to_cpu(d.v->d_child_subvol); ++ ++ ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s); ++ ++ target->inum = le64_to_cpu(s.inode); ++ } ++ ++ return ret; ++} ++ ++int bch2_dirent_rename(struct btree_trans *trans, ++ subvol_inum src_dir, struct bch_hash_info *src_hash, ++ subvol_inum dst_dir, struct bch_hash_info *dst_hash, ++ const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, ++ const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, ++ enum bch_rename_mode mode) ++{ ++ struct btree_iter src_iter = { NULL }; ++ struct btree_iter dst_iter = { NULL }; ++ struct bkey_s_c old_src, old_dst = bkey_s_c_null; ++ struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; ++ struct bpos 
dst_pos = ++ POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); ++ unsigned src_type = 0, dst_type = 0, src_update_flags = 0; ++ int ret = 0; ++ ++ if (src_dir.subvol != dst_dir.subvol) ++ return -EXDEV; ++ ++ memset(src_inum, 0, sizeof(*src_inum)); ++ memset(dst_inum, 0, sizeof(*dst_inum)); ++ ++ /* Lookup src: */ ++ ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, ++ src_hash, src_dir, src_name, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto out; ++ ++ old_src = bch2_btree_iter_peek_slot(&src_iter); ++ ret = bkey_err(old_src); ++ if (ret) ++ goto out; ++ ++ ret = bch2_dirent_read_target(trans, src_dir, ++ bkey_s_c_to_dirent(old_src), src_inum); ++ if (ret) ++ goto out; ++ ++ src_type = bkey_s_c_to_dirent(old_src).v->d_type; ++ ++ if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) ++ return -EOPNOTSUPP; ++ ++ ++ /* Lookup dst: */ ++ if (mode == BCH_RENAME) { ++ /* ++ * Note that we're _not_ checking if the target already exists - ++ * we're relying on the VFS to do that check for us for ++ * correctness: ++ */ ++ ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, ++ dst_hash, dst_dir, dst_name); ++ if (ret) ++ goto out; ++ } else { ++ ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, ++ dst_hash, dst_dir, dst_name, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto out; ++ ++ old_dst = bch2_btree_iter_peek_slot(&dst_iter); ++ ret = bkey_err(old_dst); ++ if (ret) ++ goto out; ++ ++ ret = bch2_dirent_read_target(trans, dst_dir, ++ bkey_s_c_to_dirent(old_dst), dst_inum); ++ if (ret) ++ goto out; ++ ++ dst_type = bkey_s_c_to_dirent(old_dst).v->d_type; ++ ++ if (dst_type == DT_SUBVOL) ++ return -EOPNOTSUPP; ++ } ++ ++ if (mode != BCH_RENAME_EXCHANGE) ++ *src_offset = dst_iter.pos.offset; ++ ++ /* Create new dst key: */ ++ new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0); ++ ret = PTR_ERR_OR_ZERO(new_dst); ++ if (ret) ++ goto out; ++ ++ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); ++ new_dst->k.p = dst_iter.pos; ++ ++ /* Create new src key: */ ++ if (mode == BCH_RENAME_EXCHANGE) { ++ new_src = dirent_create_key(trans, src_dir, 0, src_name, 0); ++ ret = PTR_ERR_OR_ZERO(new_src); ++ if (ret) ++ goto out; ++ ++ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); ++ new_src->k.p = src_iter.pos; ++ } else { ++ new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ++ ret = PTR_ERR_OR_ZERO(new_src); ++ if (ret) ++ goto out; ++ ++ bkey_init(&new_src->k); ++ new_src->k.p = src_iter.pos; ++ ++ if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && ++ bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { ++ /* ++ * We have a hash collision for the new dst key, ++ * and new_src - the key we're deleting - is between ++ * new_dst's hashed slot and the slot we're going to be ++ * inserting it into - oops. 
This will break the hash ++ * table if we don't deal with it: ++ */ ++ if (mode == BCH_RENAME) { ++ /* ++ * If we're not overwriting, we can just insert ++ * new_dst at the src position: ++ */ ++ new_src = new_dst; ++ new_src->k.p = src_iter.pos; ++ goto out_set_src; ++ } else { ++ /* If we're overwriting, we can't insert new_dst ++ * at a different slot because it has to ++ * overwrite old_dst - just make sure to use a ++ * whiteout when deleting src: ++ */ ++ new_src->k.type = KEY_TYPE_hash_whiteout; ++ } ++ } else { ++ /* Check if we need a whiteout to delete src: */ ++ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, ++ src_hash, &src_iter); ++ if (ret < 0) ++ goto out; ++ ++ if (ret) ++ new_src->k.type = KEY_TYPE_hash_whiteout; ++ } ++ } ++ ++ ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); ++ if (ret) ++ goto out; ++out_set_src: ++ ++ /* ++ * If we're deleting a subvolume, we need to really delete the dirent, ++ * not just emit a whiteout in the current snapshot: ++ */ ++ if (src_type == DT_SUBVOL) { ++ bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); ++ ret = bch2_btree_iter_traverse(&src_iter); ++ if (ret) ++ goto out; ++ ++ new_src->k.p = src_iter.pos; ++ src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; ++ } ++ ++ ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); ++ if (ret) ++ goto out; ++ ++ if (mode == BCH_RENAME_EXCHANGE) ++ *src_offset = new_src->k.p.offset; ++ *dst_offset = new_dst->k.p.offset; ++out: ++ bch2_trans_iter_exit(trans, &src_iter); ++ bch2_trans_iter_exit(trans, &dst_iter); ++ return ret; ++} ++ ++int __bch2_dirent_lookup_trans(struct btree_trans *trans, ++ struct btree_iter *iter, ++ subvol_inum dir, ++ const struct bch_hash_info *hash_info, ++ const struct qstr *name, subvol_inum *inum, ++ unsigned flags) ++{ ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent d; ++ u32 snapshot; ++ int ret; ++ ++ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); ++ if (ret) ++ return ret; ++ ++ ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, ++ hash_info, dir, name, flags); ++ if (ret) ++ return ret; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ d = bkey_s_c_to_dirent(k); ++ ++ ret = bch2_dirent_read_target(trans, dir, d, inum); ++ if (ret > 0) ++ ret = -ENOENT; ++err: ++ if (ret) ++ bch2_trans_iter_exit(trans, iter); ++ ++ return ret; ++} ++ ++u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, ++ const struct bch_hash_info *hash_info, ++ const struct qstr *name, subvol_inum *inum) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, ++ name, inum, 0); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ if (!ret) ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u32 snapshot; ++ int ret; ++ ++ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); ++ if (ret) ++ return ret; ++ ++ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, ++ SPOS(dir.inum, 0, snapshot), ++ POS(dir.inum, U64_MAX), 0, k, ret) ++ if (k.k->type == KEY_TYPE_dirent) { ++ ret = -ENOTEMPTY; ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++int 
bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent dirent; ++ subvol_inum target; ++ u32 snapshot; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents, ++ SPOS(inum.inum, ctx->pos, snapshot), ++ POS(inum.inum, U64_MAX), 0, k, ret) { ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ dirent = bkey_s_c_to_dirent(k); ++ ++ ret = bch2_dirent_read_target(&trans, inum, dirent, &target); ++ if (ret < 0) ++ break; ++ if (ret) ++ continue; ++ ++ /* ++ * XXX: dir_emit() can fault and block, while we're holding ++ * locks ++ */ ++ ctx->pos = dirent.k->p.offset; ++ if (!dir_emit(ctx, dirent.v->d_name, ++ bch2_dirent_name_bytes(dirent), ++ target.inum, ++ vfs_d_type(dirent.v->d_type))) ++ break; ++ ctx->pos = dirent.k->p.offset + 1; ++ ++ /* ++ * read_target looks up subvolumes, we can overflow paths if the ++ * directory has many subvolumes in it ++ */ ++ ret = btree_trans_too_many_iters(&trans); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +new file mode 100644 +index 000000000000..b1466932c768 +--- /dev/null ++++ b/fs/bcachefs/dirent.h +@@ -0,0 +1,67 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DIRENT_H ++#define _BCACHEFS_DIRENT_H ++ ++#include "str_hash.h" ++ ++extern const struct bch_hash_desc bch2_dirent_hash_desc; ++ ++int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_dirent (struct bkey_ops) { \ ++ .key_invalid = bch2_dirent_invalid, \ ++ .val_to_text = bch2_dirent_to_text, \ ++} ++ ++struct qstr; ++struct file; ++struct dir_context; ++struct bch_fs; ++struct bch_hash_info; ++struct bch_inode_info; ++ ++unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); ++ ++static inline unsigned dirent_val_u64s(unsigned len) ++{ ++ return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, ++ sizeof(u64)); ++} ++ ++int bch2_dirent_read_target(struct btree_trans *, subvol_inum, ++ struct bkey_s_c_dirent, subvol_inum *); ++ ++int bch2_dirent_create(struct btree_trans *, subvol_inum, ++ const struct bch_hash_info *, u8, ++ const struct qstr *, u64, u64 *, int); ++ ++static inline unsigned vfs_d_type(unsigned type) ++{ ++ return type == DT_SUBVOL ? 
DT_DIR : type; ++} ++ ++enum bch_rename_mode { ++ BCH_RENAME, ++ BCH_RENAME_OVERWRITE, ++ BCH_RENAME_EXCHANGE, ++}; ++ ++int bch2_dirent_rename(struct btree_trans *, ++ subvol_inum, struct bch_hash_info *, ++ subvol_inum, struct bch_hash_info *, ++ const struct qstr *, subvol_inum *, u64 *, ++ const struct qstr *, subvol_inum *, u64 *, ++ enum bch_rename_mode); ++ ++int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, ++ subvol_inum, const struct bch_hash_info *, ++ const struct qstr *, subvol_inum *, unsigned); ++u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, ++ const struct bch_hash_info *, ++ const struct qstr *, subvol_inum *); ++ ++int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); ++int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); ++ ++#endif /* _BCACHEFS_DIRENT_H */ +diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c +new file mode 100644 +index 000000000000..7bd4413671d2 +--- /dev/null ++++ b/fs/bcachefs/disk_groups.c +@@ -0,0 +1,506 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "disk_groups.h" ++#include "super-io.h" ++ ++#include ++ ++static int group_cmp(const void *_l, const void *_r) ++{ ++ const struct bch_disk_group *l = _l; ++ const struct bch_disk_group *r = _r; ++ ++ return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - ++ (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: ++ ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - ++ (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: ++ strncmp(l->label, r->label, sizeof(l->label)); ++} ++ ++static int bch2_sb_disk_groups_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ field_to_type(f, disk_groups); ++ struct bch_disk_group *g, *sorted = NULL; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ unsigned nr_groups = disk_groups_nr(groups); ++ unsigned i, len; ++ int ret = -EINVAL; ++ ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ unsigned g; ++ ++ if (!BCH_MEMBER_GROUP(m)) ++ continue; ++ ++ g = BCH_MEMBER_GROUP(m) - 1; ++ ++ if (g >= nr_groups) { ++ prt_printf(err, "disk %u has invalid label %u (have %u)", ++ i, g, nr_groups); ++ return -EINVAL; ++ } ++ ++ if (BCH_GROUP_DELETED(&groups->entries[g])) { ++ prt_printf(err, "disk %u has deleted label %u", i, g); ++ return -EINVAL; ++ } ++ } ++ ++ if (!nr_groups) ++ return 0; ++ ++ for (i = 0; i < nr_groups; i++) { ++ g = groups->entries + i; ++ ++ if (BCH_GROUP_DELETED(g)) ++ continue; ++ ++ len = strnlen(g->label, sizeof(g->label)); ++ if (!len) { ++ prt_printf(err, "label %u empty", i); ++ return -EINVAL; ++ } ++ } ++ ++ sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); ++ if (!sorted) ++ return -ENOMEM; ++ ++ memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); ++ sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); ++ ++ for (g = sorted; g + 1 < sorted + nr_groups; g++) ++ if (!BCH_GROUP_DELETED(g) && ++ !group_cmp(&g[0], &g[1])) { ++ prt_printf(err, "duplicate label %llu.%.*s", ++ BCH_GROUP_PARENT(g), ++ (int) sizeof(g->label), g->label); ++ goto err; ++ } ++ ++ ret = 0; ++err: ++ kfree(sorted); ++ return 0; ++} ++ ++static void bch2_sb_disk_groups_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ field_to_type(f, disk_groups); ++ struct bch_disk_group *g; ++ unsigned nr_groups = disk_groups_nr(groups); ++ ++ for (g = groups->entries; 
++ g < groups->entries + nr_groups; ++ g++) { ++ if (g != groups->entries) ++ prt_printf(out, " "); ++ ++ if (BCH_GROUP_DELETED(g)) ++ prt_printf(out, "[deleted]"); ++ else ++ prt_printf(out, "[parent %llu name %s]", ++ BCH_GROUP_PARENT(g), g->label); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { ++ .validate = bch2_sb_disk_groups_validate, ++ .to_text = bch2_sb_disk_groups_to_text ++}; ++ ++int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_sb_field_disk_groups *groups; ++ struct bch_disk_groups_cpu *cpu_g, *old_g; ++ unsigned i, g, nr_groups; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ groups = bch2_sb_get_disk_groups(c->disk_sb.sb); ++ nr_groups = disk_groups_nr(groups); ++ ++ if (!groups) ++ return 0; ++ ++ cpu_g = kzalloc(sizeof(*cpu_g) + ++ sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); ++ if (!cpu_g) ++ return -ENOMEM; ++ ++ cpu_g->nr = nr_groups; ++ ++ for (i = 0; i < nr_groups; i++) { ++ struct bch_disk_group *src = &groups->entries[i]; ++ struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; ++ ++ dst->deleted = BCH_GROUP_DELETED(src); ++ dst->parent = BCH_GROUP_PARENT(src); ++ } ++ ++ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ struct bch_disk_group_cpu *dst = ++ &cpu_g->entries[BCH_MEMBER_GROUP(m)]; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ g = BCH_MEMBER_GROUP(m); ++ while (g) { ++ dst = &cpu_g->entries[g - 1]; ++ __set_bit(i, dst->devs.d); ++ g = dst->parent; ++ } ++ } ++ ++ old_g = rcu_dereference_protected(c->disk_groups, ++ lockdep_is_held(&c->sb_lock)); ++ rcu_assign_pointer(c->disk_groups, cpu_g); ++ if (old_g) ++ kfree_rcu(old_g, rcu); ++ ++ return 0; ++} ++ ++const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) ++{ ++ struct target t = target_decode(target); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ return NULL; ++ case TARGET_DEV: { ++ struct bch_dev *ca = t.dev < c->sb.nr_devices ++ ? rcu_dereference(c->devs[t.dev]) ++ : NULL; ++ return ca ? &ca->self : NULL; ++ } ++ case TARGET_GROUP: { ++ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); ++ ++ return g && t.group < g->nr && !g->entries[t.group].deleted ++ ? &g->entries[t.group].devs ++ : NULL; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) ++{ ++ struct target t = target_decode(target); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ return false; ++ case TARGET_DEV: ++ return dev == t.dev; ++ case TARGET_GROUP: { ++ struct bch_disk_groups_cpu *g; ++ const struct bch_devs_mask *m; ++ bool ret; ++ ++ rcu_read_lock(); ++ g = rcu_dereference(c->disk_groups); ++ m = g && t.group < g->nr && !g->entries[t.group].deleted ++ ? &g->entries[t.group].devs ++ : NULL; ++ ++ ret = m ? 
test_bit(dev, m->d) : false; ++ rcu_read_unlock(); ++ ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, ++ unsigned parent, ++ const char *name, unsigned namelen) ++{ ++ unsigned i, nr_groups = disk_groups_nr(groups); ++ ++ if (!namelen || namelen > BCH_SB_LABEL_SIZE) ++ return -EINVAL; ++ ++ for (i = 0; i < nr_groups; i++) { ++ struct bch_disk_group *g = groups->entries + i; ++ ++ if (BCH_GROUP_DELETED(g)) ++ continue; ++ ++ if (!BCH_GROUP_DELETED(g) && ++ BCH_GROUP_PARENT(g) == parent && ++ strnlen(g->label, sizeof(g->label)) == namelen && ++ !memcmp(name, g->label, namelen)) ++ return i; ++ } ++ ++ return -1; ++} ++ ++static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, ++ const char *name, unsigned namelen) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb->sb); ++ unsigned i, nr_groups = disk_groups_nr(groups); ++ struct bch_disk_group *g; ++ ++ if (!namelen || namelen > BCH_SB_LABEL_SIZE) ++ return -EINVAL; ++ ++ for (i = 0; ++ i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); ++ i++) ++ ; ++ ++ if (i == nr_groups) { ++ unsigned u64s = ++ (sizeof(struct bch_sb_field_disk_groups) + ++ sizeof(struct bch_disk_group) * (nr_groups + 1)) / ++ sizeof(u64); ++ ++ groups = bch2_sb_resize_disk_groups(sb, u64s); ++ if (!groups) ++ return -ENOSPC; ++ ++ nr_groups = disk_groups_nr(groups); ++ } ++ ++ BUG_ON(i >= nr_groups); ++ ++ g = &groups->entries[i]; ++ ++ memcpy(g->label, name, namelen); ++ if (namelen < sizeof(g->label)) ++ g->label[namelen] = '\0'; ++ SET_BCH_GROUP_DELETED(g, 0); ++ SET_BCH_GROUP_PARENT(g, parent); ++ SET_BCH_GROUP_DATA_ALLOWED(g, ~0); ++ ++ return i; ++} ++ ++int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb->sb); ++ int v = -1; ++ ++ do { ++ const char *next = strchrnul(name, '.'); ++ unsigned len = next - name; ++ ++ if (*next == '.') ++ next++; ++ ++ v = __bch2_disk_group_find(groups, v + 1, name, len); ++ name = next; ++ } while (*name && v >= 0); ++ ++ return v; ++} ++ ++int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) ++{ ++ struct bch_sb_field_disk_groups *groups; ++ unsigned parent = 0; ++ int v = -1; ++ ++ do { ++ const char *next = strchrnul(name, '.'); ++ unsigned len = next - name; ++ ++ if (*next == '.') ++ next++; ++ ++ groups = bch2_sb_get_disk_groups(sb->sb); ++ ++ v = __bch2_disk_group_find(groups, parent, name, len); ++ if (v < 0) ++ v = __bch2_disk_group_add(sb, parent, name, len); ++ if (v < 0) ++ return v; ++ ++ parent = v + 1; ++ name = next; ++ } while (*name && v >= 0); ++ ++ return v; ++} ++ ++void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb); ++ struct bch_disk_group *g; ++ unsigned nr = 0; ++ u16 path[32]; ++ ++ while (1) { ++ if (nr == ARRAY_SIZE(path)) ++ goto inval; ++ ++ if (v >= disk_groups_nr(groups)) ++ goto inval; ++ ++ g = groups->entries + v; ++ ++ if (BCH_GROUP_DELETED(g)) ++ goto inval; ++ ++ path[nr++] = v; ++ ++ if (!BCH_GROUP_PARENT(g)) ++ break; ++ ++ v = BCH_GROUP_PARENT(g) - 1; ++ } ++ ++ while (nr) { ++ v = path[--nr]; ++ g = groups->entries + v; ++ ++ prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); ++ if (nr) ++ prt_printf(out, "."); ++ } ++ return; ++inval: ++ prt_printf(out, "invalid label %u", v); ++} ++ ++int bch2_dev_group_set(struct 
bch_fs *c, struct bch_dev *ca, const char *name) ++{ ++ struct bch_member *mi; ++ int v = -1; ++ int ret = 0; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (!strlen(name) || !strcmp(name, "none")) ++ goto write_sb; ++ ++ v = bch2_disk_path_find_or_create(&c->disk_sb, name); ++ if (v < 0) { ++ mutex_unlock(&c->sb_lock); ++ return v; ++ } ++ ++ ret = bch2_sb_disk_groups_to_cpu(c); ++ if (ret) ++ goto unlock; ++write_sb: ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ SET_BCH_MEMBER_GROUP(mi, v + 1); ++ ++ bch2_write_super(c); ++unlock: ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) ++{ ++ struct bch_dev *ca; ++ int g; ++ ++ if (!strlen(buf) || !strcmp(buf, "none")) { ++ *v = 0; ++ return 0; ++ } ++ ++ /* Is it a device? */ ++ ca = bch2_dev_lookup(c, buf); ++ if (!IS_ERR(ca)) { ++ *v = dev_to_target(ca->dev_idx); ++ percpu_ref_put(&ca->ref); ++ return 0; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ g = bch2_disk_path_find(&c->disk_sb, buf); ++ mutex_unlock(&c->sb_lock); ++ ++ if (g >= 0) { ++ *v = group_to_target(g); ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++ ++void bch2_opt_target_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_sb *sb, ++ u64 v) ++{ ++ struct target t = target_decode(v); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ prt_printf(out, "none"); ++ break; ++ case TARGET_DEV: ++ if (c) { ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ ca = t.dev < c->sb.nr_devices ++ ? rcu_dereference(c->devs[t.dev]) ++ : NULL; ++ ++ if (ca && percpu_ref_tryget(&ca->io_ref)) { ++ char b[BDEVNAME_SIZE]; ++ ++ prt_printf(out, "/dev/%s", ++ bdevname(ca->disk_sb.bdev, b)); ++ percpu_ref_put(&ca->io_ref); ++ } else if (ca) { ++ prt_printf(out, "offline device %u", t.dev); ++ } else { ++ prt_printf(out, "invalid device %u", t.dev); ++ } ++ ++ rcu_read_unlock(); ++ } else { ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ struct bch_member *m = mi->members + t.dev; ++ ++ if (bch2_dev_exists(sb, mi, t.dev)) { ++ prt_printf(out, "Device "); ++ pr_uuid(out, m->uuid.b); ++ prt_printf(out, " (%u)", t.dev); ++ } else { ++ prt_printf(out, "Bad device %u", t.dev); ++ } ++ } ++ break; ++ case TARGET_GROUP: ++ if (c) { ++ mutex_lock(&c->sb_lock); ++ bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); ++ mutex_unlock(&c->sb_lock); ++ } else { ++ bch2_disk_path_to_text(out, sb, t.group); ++ } ++ break; ++ default: ++ BUG(); ++ } ++} +diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h +new file mode 100644 +index 000000000000..de915480514b +--- /dev/null ++++ b/fs/bcachefs/disk_groups.h +@@ -0,0 +1,90 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DISK_GROUPS_H ++#define _BCACHEFS_DISK_GROUPS_H ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; ++ ++static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) ++{ ++ return groups ++ ? 
(vstruct_end(&groups->field) - ++ (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) ++ : 0; ++} ++ ++struct target { ++ enum { ++ TARGET_NULL, ++ TARGET_DEV, ++ TARGET_GROUP, ++ } type; ++ union { ++ unsigned dev; ++ unsigned group; ++ }; ++}; ++ ++#define TARGET_DEV_START 1 ++#define TARGET_GROUP_START (256 + TARGET_DEV_START) ++ ++static inline u16 dev_to_target(unsigned dev) ++{ ++ return TARGET_DEV_START + dev; ++} ++ ++static inline u16 group_to_target(unsigned group) ++{ ++ return TARGET_GROUP_START + group; ++} ++ ++static inline struct target target_decode(unsigned target) ++{ ++ if (target >= TARGET_GROUP_START) ++ return (struct target) { ++ .type = TARGET_GROUP, ++ .group = target - TARGET_GROUP_START ++ }; ++ ++ if (target >= TARGET_DEV_START) ++ return (struct target) { ++ .type = TARGET_DEV, ++ .group = target - TARGET_DEV_START ++ }; ++ ++ return (struct target) { .type = TARGET_NULL }; ++} ++ ++const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); ++ ++static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, ++ enum bch_data_type data_type, ++ u16 target) ++{ ++ struct bch_devs_mask devs = c->rw_devs[data_type]; ++ const struct bch_devs_mask *t = bch2_target_to_mask(c, target); ++ ++ if (t) ++ bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); ++ return devs; ++} ++ ++bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); ++ ++int bch2_disk_path_find(struct bch_sb_handle *, const char *); ++ ++/* Exported for userspace bcachefs-tools: */ ++int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); ++ ++void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); ++ ++int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); ++void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); ++ ++int bch2_sb_disk_groups_to_cpu(struct bch_fs *); ++ ++int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); ++ ++const char *bch2_sb_validate_disk_groups(struct bch_sb *, ++ struct bch_sb_field *); ++ ++#endif /* _BCACHEFS_DISK_GROUPS_H */ +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +new file mode 100644 +index 000000000000..f33acf1af110 +--- /dev/null ++++ b/fs/bcachefs/ec.c +@@ -0,0 +1,1673 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++/* erasure coding */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_buf.h" ++#include "bset.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "io.h" ++#include "keylist.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "super-io.h" ++#include "util.h" ++ ++#include <linux/sort.h> ++ ++#ifdef __KERNEL__ ++ ++#include <linux/raid/pq.h> ++#include <linux/raid/xor.h> ++ ++static void raid5_recov(unsigned disks, unsigned failed_idx, ++ size_t size, void **data) ++{ ++ unsigned i = 2, nr; ++ ++ BUG_ON(failed_idx >= disks); ++ ++ swap(data[0], data[failed_idx]); ++ memcpy(data[0], data[1], size); ++ ++ while (i < disks) { ++ nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); ++ xor_blocks(nr, size, data[0], data + i); ++ i += nr; ++ } ++ ++ swap(data[0], data[failed_idx]); ++} ++ ++static void raid_gen(int nd, int np, size_t size, void **v) ++{ ++ if (np >= 1) ++ raid5_recov(nd + np, nd, size, v); ++ if (np >= 2) ++ raid6_call.gen_syndrome(nd + np, size, v); ++ BUG_ON(np > 2); ++} ++ ++static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) ++{ ++ switch (nr) { ++ case 0: ++ break; ++ case 1: ++ if (ir[0] < nd
+ 1) ++ raid5_recov(nd + 1, ir[0], size, v); ++ else ++ raid6_call.gen_syndrome(nd + np, size, v); ++ break; ++ case 2: ++ if (ir[1] < nd) { ++ /* data+data failure. */ ++ raid6_2data_recov(nd + np, size, ir[0], ir[1], v); ++ } else if (ir[0] < nd) { ++ /* data + p/q failure */ ++ ++ if (ir[1] == nd) /* data + p failure */ ++ raid6_datap_recov(nd + np, size, ir[0], v); ++ else { /* data + q failure */ ++ raid5_recov(nd + 1, ir[0], size, v); ++ raid6_call.gen_syndrome(nd + np, size, v); ++ } ++ } else { ++ raid_gen(nd, np, size, v); ++ } ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++#else ++ ++#include <raid/raid.h> ++ ++#endif ++ ++struct ec_bio { ++ struct bch_dev *ca; ++ struct ec_stripe_buf *buf; ++ size_t idx; ++ struct bio bio; ++}; ++ ++/* Stripes btree keys: */ ++ ++int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ ++ if (!bkey_cmp(k.k->p, POS_MIN)) { ++ prt_printf(err, "stripe at POS_MIN"); ++ return -EINVAL; ++ } ++ ++ if (k.k->p.inode) { ++ prt_printf(err, "nonzero inode field"); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_bytes(k.k) < sizeof(*s)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*s)); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) { ++ prt_printf(err, "incorrect value size (%zu < %u)", ++ bkey_val_u64s(k.k), stripe_val_u64s(s)); ++ return -EINVAL; ++ } ++ ++ return bch2_bkey_ptrs_invalid(c, k, rw, err); ++} ++ ++void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ unsigned i; ++ ++ prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", ++ s->algorithm, ++ le16_to_cpu(s->sectors), ++ s->nr_blocks - s->nr_redundant, ++ s->nr_redundant, ++ s->csum_type, ++ 1U << s->csum_granularity_bits); ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ prt_printf(out, " %u:%llu:%u", s->ptrs[i].dev, ++ (u64) s->ptrs[i].offset, ++ stripe_blockcount_get(s, i)); ++} ++ ++/* returns blocknr in stripe that we matched: */ ++static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, ++ struct bkey_s_c k, unsigned *block) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ unsigned i, nr_data = s->nr_blocks - s->nr_redundant; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ for (i = 0; i < nr_data; i++) ++ if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, ++ le16_to_cpu(s->sectors))) { ++ *block = i; ++ return ptr; ++ } ++ ++ return NULL; ++} ++ ++static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const union bch_extent_entry *entry; ++ ++ extent_for_each_entry(e, entry) ++ if (extent_entry_type(entry) == ++ BCH_EXTENT_ENTRY_stripe_ptr && ++ entry->stripe_ptr.idx == idx) ++ return true; ++ ++ break; ++ } ++ } ++ ++ return false; ++} ++ ++/* Stripe bufs: */ ++ ++static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) ++{ ++ unsigned i; ++ ++ for (i = 0; i < buf->key.v.nr_blocks; i++) { ++ kvpfree(buf->data[i], buf->size << 9); ++ buf->data[i] = NULL; ++ } ++} ++ ++static int ec_stripe_buf_init(struct ec_stripe_buf *buf, ++ unsigned offset, unsigned size) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned csum_granularity = 1U << v->csum_granularity_bits; ++ unsigned end = offset + size; ++ unsigned i; ++ ++ BUG_ON(end > le16_to_cpu(v->sectors)); ++ ++ 
offset = round_down(offset, csum_granularity); ++ end = min_t(unsigned, le16_to_cpu(v->sectors), ++ round_up(end, csum_granularity)); ++ ++ buf->offset = offset; ++ buf->size = end - offset; ++ ++ memset(buf->valid, 0xFF, sizeof(buf->valid)); ++ ++ for (i = 0; i < buf->key.v.nr_blocks; i++) { ++ buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); ++ if (!buf->data[i]) ++ goto err; ++ } ++ ++ return 0; ++err: ++ ec_stripe_buf_exit(buf); ++ return -ENOMEM; ++} ++ ++/* Checksumming: */ ++ ++static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, ++ unsigned block, unsigned offset) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned csum_granularity = 1 << v->csum_granularity_bits; ++ unsigned end = buf->offset + buf->size; ++ unsigned len = min(csum_granularity, end - offset); ++ ++ BUG_ON(offset >= end); ++ BUG_ON(offset < buf->offset); ++ BUG_ON(offset & (csum_granularity - 1)); ++ BUG_ON(offset + len != le16_to_cpu(v->sectors) && ++ (len & (csum_granularity - 1))); ++ ++ return bch2_checksum(NULL, v->csum_type, ++ null_nonce(), ++ buf->data[block] + ((offset - buf->offset) << 9), ++ len << 9); ++} ++ ++static void ec_generate_checksums(struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned i, j, csums_per_device = stripe_csums_per_device(v); ++ ++ if (!v->csum_type) ++ return; ++ ++ BUG_ON(buf->offset); ++ BUG_ON(buf->size != le16_to_cpu(v->sectors)); ++ ++ for (i = 0; i < v->nr_blocks; i++) ++ for (j = 0; j < csums_per_device; j++) ++ stripe_csum_set(v, i, j, ++ ec_block_checksum(buf, i, j << v->csum_granularity_bits)); ++} ++ ++static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned csum_granularity = 1 << v->csum_granularity_bits; ++ unsigned i; ++ ++ if (!v->csum_type) ++ return; ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ unsigned offset = buf->offset; ++ unsigned end = buf->offset + buf->size; ++ ++ if (!test_bit(i, buf->valid)) ++ continue; ++ ++ while (offset < end) { ++ unsigned j = offset >> v->csum_granularity_bits; ++ unsigned len = min(csum_granularity, end - offset); ++ struct bch_csum want = stripe_csum_get(v, i, j); ++ struct bch_csum got = ec_block_checksum(buf, i, offset); ++ ++ if (bch2_crc_cmp(want, got)) { ++ struct printbuf buf2 = PRINTBUF; ++ ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i)); ++ ++ bch_err_ratelimited(c, ++ "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", ++ (void *) _RET_IP_, i, j, v->csum_type, ++ want.lo, got.lo, buf2.buf); ++ printbuf_exit(&buf2); ++ clear_bit(i, buf->valid); ++ break; ++ } ++ ++ offset += len; ++ } ++ } ++} ++ ++/* Erasure coding: */ ++ ++static void ec_generate_ec(struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned nr_data = v->nr_blocks - v->nr_redundant; ++ unsigned bytes = le16_to_cpu(v->sectors) << 9; ++ ++ raid_gen(nr_data, v->nr_redundant, bytes, buf->data); ++} ++ ++static unsigned ec_nr_failed(struct ec_stripe_buf *buf) ++{ ++ return buf->key.v.nr_blocks - ++ bitmap_weight(buf->valid, buf->key.v.nr_blocks); ++} ++ ++static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; ++ unsigned nr_data = v->nr_blocks - v->nr_redundant; ++ unsigned bytes = buf->size << 9; ++ ++ if (ec_nr_failed(buf) > v->nr_redundant) { ++ bch_err_ratelimited(c, ++ "error doing reconstruct read: unable to read enough blocks"); ++ return 
-1; ++ } ++ ++ for (i = 0; i < nr_data; i++) ++ if (!test_bit(i, buf->valid)) ++ failed[nr_failed++] = i; ++ ++ raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); ++ return 0; ++} ++ ++/* IO: */ ++ ++static void ec_block_endio(struct bio *bio) ++{ ++ struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); ++ struct bch_stripe *v = &ec_bio->buf->key.v; ++ struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; ++ struct bch_dev *ca = ec_bio->ca; ++ struct closure *cl = bio->bi_private; ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s", ++ bio_data_dir(bio) ? "write" : "read", ++ bch2_blk_status_to_str(bio->bi_status))) ++ clear_bit(ec_bio->idx, ec_bio->buf->valid); ++ ++ if (ptr_stale(ca, ptr)) { ++ bch_err_ratelimited(ca->fs, ++ "error %s stripe: stale pointer after io", ++ bio_data_dir(bio) == READ ? "reading from" : "writing to"); ++ clear_bit(ec_bio->idx, ec_bio->buf->valid); ++ } ++ ++ bio_put(&ec_bio->bio); ++ percpu_ref_put(&ca->io_ref); ++ closure_put(cl); ++} ++ ++static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ++ unsigned rw, unsigned idx, struct closure *cl) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned offset = 0, bytes = buf->size << 9; ++ struct bch_extent_ptr *ptr = &v->ptrs[idx]; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant ++ ? BCH_DATA_user ++ : BCH_DATA_parity; ++ ++ if (ptr_stale(ca, ptr)) { ++ bch_err_ratelimited(c, ++ "error %s stripe: stale pointer", ++ rw == READ ? "reading from" : "writing to"); ++ clear_bit(idx, buf->valid); ++ return; ++ } ++ ++ if (!bch2_dev_get_ioref(ca, rw)) { ++ clear_bit(idx, buf->valid); ++ return; ++ } ++ ++ this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); ++ ++ while (offset < bytes) { ++ unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, ++ DIV_ROUND_UP(bytes, PAGE_SIZE)); ++ unsigned b = min_t(size_t, bytes - offset, ++ nr_iovecs << PAGE_SHIFT); ++ struct ec_bio *ec_bio; ++ ++ ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, ++ nr_iovecs, ++ rw, ++ GFP_KERNEL, ++ &c->ec_bioset), ++ struct ec_bio, bio); ++ ++ ec_bio->ca = ca; ++ ec_bio->buf = buf; ++ ec_bio->idx = idx; ++ ++ ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); ++ ec_bio->bio.bi_end_io = ec_block_endio; ++ ec_bio->bio.bi_private = cl; ++ ++ bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); ++ ++ closure_get(cl); ++ percpu_ref_get(&ca->io_ref); ++ ++ submit_bio(&ec_bio->bio); ++ ++ offset += b; ++ } ++ ++ percpu_ref_put(&ca->io_ref); ++} ++ ++static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, ++ POS(0, idx), BTREE_ITER_SLOTS); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ if (k.k->type != KEY_TYPE_stripe) { ++ ret = -ENOENT; ++ goto err; ++ } ++ bkey_reassemble(&stripe->key.k_i, k); ++err: ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++/* recovery read path: */ ++int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) ++{ ++ struct ec_stripe_buf *buf; ++ struct closure cl; ++ struct bch_stripe *v; ++ unsigned i, offset; ++ int ret = 0; ++ ++ closure_init_stack(&cl); ++ ++ BUG_ON(!rbio->pick.has_ec); ++ ++ buf = 
kzalloc(sizeof(*buf), GFP_NOIO); ++ if (!buf) ++ return -ENOMEM; ++ ++ ret = get_stripe_key(c, rbio->pick.ec.idx, buf); ++ if (ret) { ++ bch_err_ratelimited(c, ++ "error doing reconstruct read: error %i looking up stripe", ret); ++ kfree(buf); ++ return -EIO; ++ } ++ ++ v = &buf->key.v; ++ ++ if (!bch2_ptr_matches_stripe(v, rbio->pick)) { ++ bch_err_ratelimited(c, ++ "error doing reconstruct read: pointer doesn't match stripe"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; ++ if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { ++ bch_err_ratelimited(c, ++ "error doing reconstruct read: read is bigger than stripe"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); ++ if (ret) ++ goto err; ++ ++ for (i = 0; i < v->nr_blocks; i++) ++ ec_block_io(c, buf, REQ_OP_READ, i, &cl); ++ ++ closure_sync(&cl); ++ ++ if (ec_nr_failed(buf) > v->nr_redundant) { ++ bch_err_ratelimited(c, ++ "error doing reconstruct read: unable to read enough blocks"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ ec_validate_checksums(c, buf); ++ ++ ret = ec_do_recov(c, buf); ++ if (ret) ++ goto err; ++ ++ memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, ++ buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); ++err: ++ ec_stripe_buf_exit(buf); ++ kfree(buf); ++ return ret; ++} ++ ++/* stripe bucket accounting: */ ++ ++static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) ++{ ++ ec_stripes_heap n, *h = &c->ec_stripes_heap; ++ ++ if (idx >= h->size) { ++ if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) ++ return -ENOMEM; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ if (n.size > h->size) { ++ memcpy(n.data, h->data, h->used * sizeof(h->data[0])); ++ n.used = h->used; ++ swap(*h, n); ++ } ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ free_heap(&n); ++ } ++ ++ if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) ++ return -ENOMEM; ++ ++ if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && ++ !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static int ec_stripe_mem_alloc(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ size_t idx = iter->pos.offset; ++ ++ if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN)) ++ return 0; ++ ++ bch2_trans_unlock(trans); ++ ++ return __ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL) ?: ++ bch2_trans_relock(trans); ++} ++ ++static ssize_t stripe_idx_to_delete(struct bch_fs *c) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ ++ return h->used && h->data[0].blocks_nonempty == 0 ++ ? 
h->data[0].idx : -1; ++} ++ ++static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, ++ struct ec_stripe_heap_entry l, ++ struct ec_stripe_heap_entry r) ++{ ++ return ((l.blocks_nonempty > r.blocks_nonempty) - ++ (l.blocks_nonempty < r.blocks_nonempty)); ++} ++ ++static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, ++ size_t i) ++{ ++ struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); ++ ++ genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; ++} ++ ++static void heap_verify_backpointer(struct bch_fs *c, size_t idx) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ struct stripe *m = genradix_ptr(&c->stripes, idx); ++ ++ BUG_ON(!m->alive); ++ BUG_ON(m->heap_idx >= h->used); ++ BUG_ON(h->data[m->heap_idx].idx != idx); ++} ++ ++void bch2_stripes_heap_del(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ if (!m->on_heap) ++ return; ++ ++ m->on_heap = false; ++ ++ heap_verify_backpointer(c, idx); ++ ++ heap_del(&c->ec_stripes_heap, m->heap_idx, ++ ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++} ++ ++void bch2_stripes_heap_insert(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ if (m->on_heap) ++ return; ++ ++ BUG_ON(heap_full(&c->ec_stripes_heap)); ++ ++ m->on_heap = true; ++ ++ heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { ++ .idx = idx, ++ .blocks_nonempty = m->blocks_nonempty, ++ }), ++ ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ ++ heap_verify_backpointer(c, idx); ++} ++ ++void bch2_stripes_heap_update(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ size_t i; ++ ++ if (!m->on_heap) ++ return; ++ ++ heap_verify_backpointer(c, idx); ++ ++ h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; ++ ++ i = m->heap_idx; ++ heap_sift_up(h, i, ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ heap_sift_down(h, i, ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ ++ heap_verify_backpointer(c, idx); ++ ++ if (stripe_idx_to_delete(c) >= 0 && ++ !percpu_ref_is_dying(&c->writes)) ++ schedule_work(&c->ec_stripe_delete_work); ++} ++ ++/* stripe deletion */ ++ ++static int ec_stripe_delete(struct bch_fs *c, size_t idx) ++{ ++ return bch2_btree_delete_range(c, BTREE_ID_stripes, ++ POS(0, idx), ++ POS(0, idx + 1), ++ 0, NULL); ++} ++ ++static void ec_stripe_delete_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, ec_stripe_delete_work); ++ ssize_t idx; ++ ++ while (1) { ++ spin_lock(&c->ec_stripes_heap_lock); ++ idx = stripe_idx_to_delete(c); ++ if (idx < 0) { ++ spin_unlock(&c->ec_stripes_heap_lock); ++ break; ++ } ++ ++ bch2_stripes_heap_del(c, genradix_ptr(&c->stripes, idx), idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ if (ec_stripe_delete(c, idx)) ++ break; ++ } ++} ++ ++/* stripe creation: */ ++ ++static int ec_stripe_bkey_insert(struct btree_trans *trans, ++ struct bkey_i_stripe *stripe, ++ struct disk_reservation *res) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bpos min_pos = POS(0, 1); ++ struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); ++ int ret; ++ ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { ++ if (start_pos.offset) { ++ start_pos = min_pos; ++ bch2_btree_iter_set_pos(&iter, start_pos); ++ continue; ++ } ++ ++ ret = -ENOSPC; ++ break; ++ } ++ ++ if 
(bkey_deleted(k.k)) ++ break; ++ } ++ ++ c->ec_stripe_hint = iter.pos.offset; ++ ++ if (ret) ++ goto err; ++ ++ ret = ec_stripe_mem_alloc(trans, &iter); ++ if (ret) ++ goto err; ++ ++ stripe->k.p = iter.pos; ++ ++ ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++static int ec_stripe_bkey_update(struct btree_trans *trans, ++ struct bkey_i_stripe *new, ++ struct disk_reservation *res) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ const struct bch_stripe *existing; ++ unsigned i; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, ++ new->k.p, BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k || k.k->type != KEY_TYPE_stripe) { ++ bch_err(trans->c, "error updating stripe: not found"); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ existing = bkey_s_c_to_stripe(k).v; ++ ++ if (existing->nr_blocks != new->v.nr_blocks) { ++ bch_err(trans->c, "error updating stripe: nr_blocks does not match"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ for (i = 0; i < new->v.nr_blocks; i++) ++ stripe_blockcount_set(&new->v, i, ++ stripe_blockcount_get(existing, i)); ++ ++ ret = bch2_trans_update(trans, &iter, &new->k_i, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static void extent_stripe_ptr_add(struct bkey_s_extent e, ++ struct ec_stripe_buf *s, ++ struct bch_extent_ptr *ptr, ++ unsigned block) ++{ ++ struct bch_extent_stripe_ptr *dst = (void *) ptr; ++ union bch_extent_entry *end = extent_entry_last(e); ++ ++ memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); ++ e.k->u64s += sizeof(*dst) / sizeof(u64); ++ ++ *dst = (struct bch_extent_stripe_ptr) { ++ .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, ++ .block = block, ++ .redundancy = s->key.v.nr_redundant, ++ .idx = s->key.k.p.offset, ++ }; ++} ++ ++static int ec_stripe_update_extent(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct ec_stripe_buf *s, ++ struct bpos end) ++{ ++ const struct bch_extent_ptr *ptr_c; ++ struct bch_extent_ptr *ptr, *ec_ptr = NULL; ++ struct bkey_i *n; ++ int ret, dev, block; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ return 1; ++ ++ if (extent_has_stripe_ptr(k, s->key.k.p.offset)) ++ return 0; ++ ++ ptr_c = bkey_matches_stripe(&s->key.v, k, &block); ++ /* ++ * It doesn't generally make sense to erasure code cached ptrs: ++ * XXX: should we be incrementing a counter? 
++ */ ++ if (!ptr_c || ptr_c->cached) ++ return 0; ++ ++ dev = s->key.v.ptrs[block].dev; ++ ++ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(n, k); ++ ++ bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); ++ ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev); ++ BUG_ON(!ec_ptr); ++ ++ extent_stripe_ptr_add(bkey_i_to_s_extent(n), s, ec_ptr, block); ++ ++ return bch2_trans_update(trans, iter, n, 0); ++} ++ ++static int ec_stripe_update_extents(struct bch_fs *c, ++ struct ec_stripe_buf *s, ++ struct bkey *pos) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ ++ return bch2_trans_run(c, ++ for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_extents, bkey_start_pos(pos), ++ BTREE_ITER_NOT_EXTENTS|BTREE_ITER_INTENT, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ ec_stripe_update_extent(&trans, &iter, k, s, pos->p))); ++} ++ ++/* ++ * data buckets of new stripe all written: create the stripe ++ */ ++static void ec_stripe_create(struct ec_stripe_new *s) ++{ ++ struct bch_fs *c = s->c; ++ struct open_bucket *ob; ++ struct bkey_i *k; ++ struct stripe *m; ++ struct bch_stripe *v = &s->new_stripe.key.v; ++ unsigned i, nr_data = v->nr_blocks - v->nr_redundant; ++ int ret; ++ ++ BUG_ON(s->h->s == s); ++ ++ closure_sync(&s->iodone); ++ ++ if (s->err) { ++ if (s->err != -EROFS) ++ bch_err(c, "error creating stripe: error writing data buckets"); ++ goto err; ++ } ++ ++ if (s->have_existing_stripe) { ++ ec_validate_checksums(c, &s->existing_stripe); ++ ++ if (ec_do_recov(c, &s->existing_stripe)) { ++ bch_err(c, "error creating stripe: error reading existing stripe"); ++ goto err; ++ } ++ ++ for (i = 0; i < nr_data; i++) ++ if (stripe_blockcount_get(&s->existing_stripe.key.v, i)) ++ swap(s->new_stripe.data[i], ++ s->existing_stripe.data[i]); ++ ++ ec_stripe_buf_exit(&s->existing_stripe); ++ } ++ ++ BUG_ON(!s->allocated); ++ ++ if (!percpu_ref_tryget_live(&c->writes)) ++ goto err; ++ ++ ec_generate_ec(&s->new_stripe); ++ ++ ec_generate_checksums(&s->new_stripe); ++ ++ /* write p/q: */ ++ for (i = nr_data; i < v->nr_blocks; i++) ++ ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone); ++ closure_sync(&s->iodone); ++ ++ if (ec_nr_failed(&s->new_stripe)) { ++ bch_err(c, "error creating stripe: error writing redundancy buckets"); ++ goto err_put_writes; ++ } ++ ++ ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, ++ s->have_existing_stripe ++ ? 
ec_stripe_bkey_update(&trans, &s->new_stripe.key, &s->res) ++ : ec_stripe_bkey_insert(&trans, &s->new_stripe.key, &s->res)); ++ if (ret) { ++ bch_err(c, "error creating stripe: error creating stripe key"); ++ goto err_put_writes; ++ } ++ ++ for_each_keylist_key(&s->keys, k) { ++ ret = ec_stripe_update_extents(c, &s->new_stripe, &k->k); ++ if (ret) { ++ bch_err(c, "error creating stripe: error updating pointers: %s", ++ bch2_err_str(ret)); ++ break; ++ } ++ } ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset); ++ ++ BUG_ON(m->on_heap); ++ bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); ++ spin_unlock(&c->ec_stripes_heap_lock); ++err_put_writes: ++ percpu_ref_put(&c->writes); ++err: ++ bch2_disk_reservation_put(c, &s->res); ++ ++ for (i = 0; i < v->nr_blocks; i++) ++ if (s->blocks[i]) { ++ ob = c->open_buckets + s->blocks[i]; ++ ++ if (i < nr_data) { ++ ob->ec = NULL; ++ __bch2_open_bucket_put(c, ob); ++ } else { ++ bch2_open_bucket_put(c, ob); ++ } ++ } ++ ++ bch2_keylist_free(&s->keys, s->inline_keys); ++ ++ ec_stripe_buf_exit(&s->existing_stripe); ++ ec_stripe_buf_exit(&s->new_stripe); ++ closure_debug_destroy(&s->iodone); ++ kfree(s); ++} ++ ++static void ec_stripe_create_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, ++ struct bch_fs, ec_stripe_create_work); ++ struct ec_stripe_new *s, *n; ++restart: ++ mutex_lock(&c->ec_stripe_new_lock); ++ list_for_each_entry_safe(s, n, &c->ec_stripe_new_list, list) ++ if (!atomic_read(&s->pin)) { ++ list_del(&s->list); ++ mutex_unlock(&c->ec_stripe_new_lock); ++ ec_stripe_create(s); ++ goto restart; ++ } ++ mutex_unlock(&c->ec_stripe_new_lock); ++} ++ ++static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) ++{ ++ BUG_ON(atomic_read(&s->pin) <= 0); ++ ++ if (atomic_dec_and_test(&s->pin)) { ++ BUG_ON(!s->pending); ++ queue_work(system_long_wq, &c->ec_stripe_create_work); ++ } ++} ++ ++static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ struct ec_stripe_new *s = h->s; ++ ++ BUG_ON(!s->allocated && !s->err); ++ ++ h->s = NULL; ++ s->pending = true; ++ ++ mutex_lock(&c->ec_stripe_new_lock); ++ list_add(&s->list, &c->ec_stripe_new_list); ++ mutex_unlock(&c->ec_stripe_new_lock); ++ ++ ec_stripe_new_put(c, s); ++} ++ ++/* have a full bucket - hand it off to be erasure coded: */ ++void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct ec_stripe_new *s = ob->ec; ++ ++ if (ob->sectors_free) ++ s->err = -1; ++ ++ ec_stripe_new_put(c, s); ++} ++ ++void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct ec_stripe_new *s = ob->ec; ++ ++ s->err = -EIO; ++} ++ ++void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) ++{ ++ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); ++ struct bch_dev *ca; ++ unsigned offset; ++ ++ if (!ob) ++ return NULL; ++ ++ ca = bch_dev_bkey_exists(c, ob->dev); ++ offset = ca->mi.bucket_size - ob->sectors_free; ++ ++ return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); ++} ++ ++void bch2_ob_add_backpointer(struct bch_fs *c, struct open_bucket *ob, ++ struct bkey *k) ++{ ++ struct ec_stripe_new *ec = ob->ec; ++ ++ if (!ec) ++ return; ++ ++ mutex_lock(&ec->lock); ++ ++ if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, ++ ARRAY_SIZE(ec->inline_keys), ++ BKEY_U64s)) { ++ BUG(); ++ } ++ ++ bkey_init(&ec->keys.top->k); ++ ec->keys.top->k.p = k->p; ++ ec->keys.top->k.size = k->size; ++ bch2_keylist_push(&ec->keys); 
++ ++ mutex_unlock(&ec->lock); ++} ++ ++static int unsigned_cmp(const void *_l, const void *_r) ++{ ++ unsigned l = *((const unsigned *) _l); ++ unsigned r = *((const unsigned *) _r); ++ ++ return cmp_int(l, r); ++} ++ ++/* pick most common bucket size: */ ++static unsigned pick_blocksize(struct bch_fs *c, ++ struct bch_devs_mask *devs) ++{ ++ struct bch_dev *ca; ++ unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; ++ struct { ++ unsigned nr, size; ++ } cur = { 0, 0 }, best = { 0, 0 }; ++ ++ for_each_member_device_rcu(ca, c, i, devs) ++ sizes[nr++] = ca->mi.bucket_size; ++ ++ sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); ++ ++ for (i = 0; i < nr; i++) { ++ if (sizes[i] != cur.size) { ++ if (cur.nr > best.nr) ++ best = cur; ++ ++ cur.nr = 0; ++ cur.size = sizes[i]; ++ } ++ ++ cur.nr++; ++ } ++ ++ if (cur.nr > best.nr) ++ best = cur; ++ ++ return best.size; ++} ++ ++static bool may_create_new_stripe(struct bch_fs *c) ++{ ++ return false; ++} ++ ++static void ec_stripe_key_init(struct bch_fs *c, ++ struct bkey_i_stripe *s, ++ unsigned nr_data, ++ unsigned nr_parity, ++ unsigned stripe_size) ++{ ++ unsigned u64s; ++ ++ bkey_stripe_init(&s->k_i); ++ s->v.sectors = cpu_to_le16(stripe_size); ++ s->v.algorithm = 0; ++ s->v.nr_blocks = nr_data + nr_parity; ++ s->v.nr_redundant = nr_parity; ++ s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); ++ s->v.csum_type = BCH_CSUM_crc32c; ++ s->v.pad = 0; ++ ++ while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { ++ BUG_ON(1 << s->v.csum_granularity_bits >= ++ le16_to_cpu(s->v.sectors) || ++ s->v.csum_granularity_bits == U8_MAX); ++ s->v.csum_granularity_bits++; ++ } ++ ++ set_bkey_val_u64s(&s->k, u64s); ++} ++ ++static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ struct ec_stripe_new *s; ++ ++ lockdep_assert_held(&h->lock); ++ ++ s = kzalloc(sizeof(*s), GFP_KERNEL); ++ if (!s) ++ return -ENOMEM; ++ ++ mutex_init(&s->lock); ++ closure_init(&s->iodone, NULL); ++ atomic_set(&s->pin, 1); ++ s->c = c; ++ s->h = h; ++ s->nr_data = min_t(unsigned, h->nr_active_devs, ++ BCH_BKEY_PTRS_MAX) - h->redundancy; ++ s->nr_parity = h->redundancy; ++ ++ bch2_keylist_init(&s->keys, s->inline_keys); ++ ++ ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, ++ s->nr_parity, h->blocksize); ++ ++ h->s = s; ++ return 0; ++} ++ ++static struct ec_stripe_head * ++ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, ++ unsigned algo, unsigned redundancy, ++ bool copygc) ++{ ++ struct ec_stripe_head *h; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ h = kzalloc(sizeof(*h), GFP_KERNEL); ++ if (!h) ++ return NULL; ++ ++ mutex_init(&h->lock); ++ mutex_lock(&h->lock); ++ ++ h->target = target; ++ h->algo = algo; ++ h->redundancy = redundancy; ++ h->copygc = copygc; ++ ++ rcu_read_lock(); ++ h->devs = target_rw_devs(c, BCH_DATA_user, target); ++ ++ for_each_member_device_rcu(ca, c, i, &h->devs) ++ if (!ca->mi.durability) ++ __clear_bit(i, h->devs.d); ++ ++ h->blocksize = pick_blocksize(c, &h->devs); ++ ++ for_each_member_device_rcu(ca, c, i, &h->devs) ++ if (ca->mi.bucket_size == h->blocksize) ++ h->nr_active_devs++; ++ ++ rcu_read_unlock(); ++ list_add(&h->list, &c->ec_stripe_head_list); ++ return h; ++} ++ ++void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ if (h->s && ++ h->s->allocated && ++ bitmap_weight(h->s->blocks_allocated, ++ h->s->nr_data) == h->s->nr_data) ++ ec_stripe_set_pending(c, h); ++ ++ mutex_unlock(&h->lock); ++} ++ ++struct ec_stripe_head *__bch2_ec_stripe_head_get(struct 
bch_fs *c, ++ unsigned target, ++ unsigned algo, ++ unsigned redundancy, ++ bool copygc) ++{ ++ struct ec_stripe_head *h; ++ ++ if (!redundancy) ++ return NULL; ++ ++ mutex_lock(&c->ec_stripe_head_lock); ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) ++ if (h->target == target && ++ h->algo == algo && ++ h->redundancy == redundancy && ++ h->copygc == copygc) { ++ mutex_lock(&h->lock); ++ goto found; ++ } ++ ++ h = ec_new_stripe_head_alloc(c, target, algo, redundancy, copygc); ++found: ++ mutex_unlock(&c->ec_stripe_head_lock); ++ return h; ++} ++ ++static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, ++ struct closure *cl) ++{ ++ struct bch_devs_mask devs = h->devs; ++ struct open_bucket *ob; ++ struct open_buckets buckets; ++ unsigned i, j, nr_have_parity = 0, nr_have_data = 0; ++ bool have_cache = true; ++ int ret = 0; ++ ++ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { ++ if (test_bit(i, h->s->blocks_gotten)) { ++ __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); ++ if (i < h->s->nr_data) ++ nr_have_data++; ++ else ++ nr_have_parity++; ++ } ++ } ++ ++ BUG_ON(nr_have_data > h->s->nr_data); ++ BUG_ON(nr_have_parity > h->s->nr_parity); ++ ++ buckets.nr = 0; ++ if (nr_have_parity < h->s->nr_parity) { ++ ret = bch2_bucket_alloc_set(c, &buckets, ++ &h->parity_stripe, ++ &devs, ++ h->s->nr_parity, ++ &nr_have_parity, ++ &have_cache, ++ h->copygc ++ ? RESERVE_movinggc ++ : RESERVE_none, ++ 0, ++ cl); ++ ++ open_bucket_for_each(c, &buckets, ob, i) { ++ j = find_next_zero_bit(h->s->blocks_gotten, ++ h->s->nr_data + h->s->nr_parity, ++ h->s->nr_data); ++ BUG_ON(j >= h->s->nr_data + h->s->nr_parity); ++ ++ h->s->blocks[j] = buckets.v[i]; ++ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); ++ __set_bit(j, h->s->blocks_gotten); ++ } ++ ++ if (ret) ++ return ret; ++ } ++ ++ buckets.nr = 0; ++ if (nr_have_data < h->s->nr_data) { ++ ret = bch2_bucket_alloc_set(c, &buckets, ++ &h->block_stripe, ++ &devs, ++ h->s->nr_data, ++ &nr_have_data, ++ &have_cache, ++ h->copygc ++ ? 
RESERVE_movinggc ++ : RESERVE_none, ++ 0, ++ cl); ++ ++ open_bucket_for_each(c, &buckets, ob, i) { ++ j = find_next_zero_bit(h->s->blocks_gotten, ++ h->s->nr_data, 0); ++ BUG_ON(j >= h->s->nr_data); ++ ++ h->s->blocks[j] = buckets.v[i]; ++ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); ++ __set_bit(j, h->s->blocks_gotten); ++ } ++ ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* XXX: doesn't obey target: */ ++static s64 get_existing_stripe(struct bch_fs *c, ++ struct ec_stripe_head *head) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ struct stripe *m; ++ size_t heap_idx; ++ u64 stripe_idx; ++ s64 ret = -1; ++ ++ if (may_create_new_stripe(c)) ++ return -1; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ for (heap_idx = 0; heap_idx < h->used; heap_idx++) { ++ /* No blocks worth reusing, stripe will just be deleted: */ ++ if (!h->data[heap_idx].blocks_nonempty) ++ continue; ++ ++ stripe_idx = h->data[heap_idx].idx; ++ m = genradix_ptr(&c->stripes, stripe_idx); ++ ++ if (m->algorithm == head->algo && ++ m->nr_redundant == head->redundancy && ++ m->sectors == head->blocksize && ++ m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { ++ bch2_stripes_heap_del(c, m, stripe_idx); ++ ret = stripe_idx; ++ break; ++ } ++ } ++ spin_unlock(&c->ec_stripes_heap_lock); ++ return ret; ++} ++ ++static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, ++ struct ec_stripe_head *h) ++{ ++ unsigned i; ++ s64 idx; ++ int ret; ++ ++ idx = get_existing_stripe(c, h); ++ if (idx < 0) { ++ bch_err(c, "failed to find an existing stripe"); ++ return -ENOSPC; ++ } ++ ++ h->s->have_existing_stripe = true; ++ ret = get_stripe_key(c, idx, &h->s->existing_stripe); ++ if (ret) { ++ bch2_fs_fatal_error(c, "error reading stripe key: %i", ret); ++ return ret; ++ } ++ ++ if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) { ++ /* ++ * this is a problem: we have deleted from the ++ * stripes heap already ++ */ ++ BUG(); ++ } ++ ++ BUG_ON(h->s->existing_stripe.size != h->blocksize); ++ BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors); ++ ++ for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { ++ if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) { ++ __set_bit(i, h->s->blocks_gotten); ++ __set_bit(i, h->s->blocks_allocated); ++ } ++ ++ ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); ++ } ++ ++ bkey_copy(&h->s->new_stripe.key.k_i, ++ &h->s->existing_stripe.key.k_i); ++ ++ return 0; ++} ++ ++static int __bch2_ec_stripe_head_reserve(struct bch_fs *c, ++ struct ec_stripe_head *h) ++{ ++ int ret; ++ ++ ret = bch2_disk_reservation_get(c, &h->s->res, ++ h->blocksize, ++ h->s->nr_parity, 0); ++ ++ if (ret) { ++ /* ++ * This means we need to wait for copygc to ++ * empty out buckets from existing stripes: ++ */ ++ bch_err(c, "failed to reserve stripe"); ++ } ++ ++ return ret; ++} ++ ++struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, ++ unsigned target, ++ unsigned algo, ++ unsigned redundancy, ++ bool copygc, ++ struct closure *cl) ++{ ++ struct ec_stripe_head *h; ++ int ret; ++ bool needs_stripe_new; ++ ++ h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc); ++ if (!h) { ++ bch_err(c, "no stripe head"); ++ return NULL; ++ } ++ ++ needs_stripe_new = !h->s; ++ if (needs_stripe_new) { ++ if (ec_new_stripe_alloc(c, h)) { ++ ret = -ENOMEM; ++ bch_err(c, "failed to allocate new stripe"); ++ goto err; ++ } ++ ++ if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize)) ++ BUG(); ++ } ++ ++ /* ++ * Try reserve a new 
stripe before reusing an ++ * existing stripe. This will prevent unnecessary ++ * read amplification during write oriented workloads. ++ */ ++ ret = 0; ++ if (!h->s->allocated && !h->s->res.sectors && !h->s->have_existing_stripe) ++ ret = __bch2_ec_stripe_head_reserve(c, h); ++ if (ret && needs_stripe_new) ++ ret = __bch2_ec_stripe_head_reuse(c, h); ++ if (ret) ++ goto err; ++ ++ if (!h->s->allocated) { ++ ret = new_stripe_alloc_buckets(c, h, cl); ++ if (ret) ++ goto err; ++ ++ h->s->allocated = true; ++ } ++ ++ return h; ++ ++err: ++ bch2_ec_stripe_head_put(c, h); ++ return ERR_PTR(ret); ++} ++ ++void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct ec_stripe_head *h; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ mutex_lock(&c->ec_stripe_head_lock); ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) { ++ ++ mutex_lock(&h->lock); ++ if (!h->s) ++ goto unlock; ++ ++ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { ++ if (!h->s->blocks[i]) ++ continue; ++ ++ ob = c->open_buckets + h->s->blocks[i]; ++ if (ob->dev == ca->dev_idx) ++ goto found; ++ } ++ goto unlock; ++found: ++ h->s->err = -EROFS; ++ ec_stripe_set_pending(c, h); ++unlock: ++ mutex_unlock(&h->lock); ++ } ++ mutex_unlock(&c->ec_stripe_head_lock); ++} ++ ++void bch2_stripes_heap_start(struct bch_fs *c) ++{ ++ struct genradix_iter iter; ++ struct stripe *m; ++ ++ genradix_for_each(&c->stripes, iter, m) ++ if (m->alive) ++ bch2_stripes_heap_insert(c, m, iter.pos); ++} ++ ++int bch2_stripes_read(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ const struct bch_stripe *s; ++ struct stripe *m; ++ unsigned i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (k.k->type != KEY_TYPE_stripe) ++ continue; ++ ++ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); ++ if (ret) ++ break; ++ ++ s = bkey_s_c_to_stripe(k).v; ++ ++ m = genradix_ptr(&c->stripes, k.k->p.offset); ++ m->alive = true; ++ m->sectors = le16_to_cpu(s->sectors); ++ m->algorithm = s->algorithm; ++ m->nr_blocks = s->nr_blocks; ++ m->nr_redundant = s->nr_redundant; ++ m->blocks_nonempty = 0; ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ m->blocks_nonempty += !!stripe_blockcount_get(s, i); ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_update(c, m, k.k->p.offset); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error reading stripes: %i", ret); ++ ++ return ret; ++} ++ ++void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ struct stripe *m; ++ size_t i; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ for (i = 0; i < min_t(size_t, h->used, 20); i++) { ++ m = genradix_ptr(&c->stripes, h->data[i].idx); ++ ++ prt_printf(out, "%zu %u/%u+%u\n", h->data[i].idx, ++ h->data[i].blocks_nonempty, ++ m->nr_blocks - m->nr_redundant, ++ m->nr_redundant); ++ } ++ spin_unlock(&c->ec_stripes_heap_lock); ++} ++ ++void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct ec_stripe_head *h; ++ struct ec_stripe_new *s; ++ ++ mutex_lock(&c->ec_stripe_head_lock); ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) { ++ prt_printf(out, "target %u algo %u redundancy %u:\n", ++ h->target, h->algo, h->redundancy); ++ ++ if (h->s) ++ prt_printf(out, "\tpending: blocks %u+%u allocated %u\n", ++ 
h->s->nr_data, h->s->nr_parity, ++ bitmap_weight(h->s->blocks_allocated, ++ h->s->nr_data)); ++ } ++ mutex_unlock(&c->ec_stripe_head_lock); ++ ++ mutex_lock(&c->ec_stripe_new_lock); ++ list_for_each_entry(s, &c->ec_stripe_new_list, list) { ++ prt_printf(out, "\tin flight: blocks %u+%u pin %u\n", ++ s->nr_data, s->nr_parity, ++ atomic_read(&s->pin)); ++ } ++ mutex_unlock(&c->ec_stripe_new_lock); ++} ++ ++void bch2_fs_ec_exit(struct bch_fs *c) ++{ ++ struct ec_stripe_head *h; ++ ++ while (1) { ++ mutex_lock(&c->ec_stripe_head_lock); ++ h = list_first_entry_or_null(&c->ec_stripe_head_list, ++ struct ec_stripe_head, list); ++ if (h) ++ list_del(&h->list); ++ mutex_unlock(&c->ec_stripe_head_lock); ++ if (!h) ++ break; ++ ++ BUG_ON(h->s); ++ kfree(h); ++ } ++ ++ BUG_ON(!list_empty(&c->ec_stripe_new_list)); ++ ++ free_heap(&c->ec_stripes_heap); ++ genradix_free(&c->stripes); ++ bioset_exit(&c->ec_bioset); ++} ++ ++void bch2_fs_ec_init_early(struct bch_fs *c) ++{ ++ INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); ++ INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); ++} ++ ++int bch2_fs_ec_init(struct bch_fs *c) ++{ ++ return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), ++ BIOSET_NEED_BVECS); ++} +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +new file mode 100644 +index 000000000000..a4c13d61af10 +--- /dev/null ++++ b/fs/bcachefs/ec.h +@@ -0,0 +1,230 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EC_H ++#define _BCACHEFS_EC_H ++ ++#include "ec_types.h" ++#include "buckets_types.h" ++#include "keylist_types.h" ++ ++int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, ++ int rw, struct printbuf *); ++void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++#define bch2_bkey_ops_stripe (struct bkey_ops) { \ ++ .key_invalid = bch2_stripe_invalid, \ ++ .val_to_text = bch2_stripe_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .trans_trigger = bch2_trans_mark_stripe, \ ++ .atomic_trigger = bch2_mark_stripe, \ ++} ++ ++static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) ++{ ++ return DIV_ROUND_UP(le16_to_cpu(s->sectors), ++ 1 << s->csum_granularity_bits); ++} ++ ++static inline unsigned stripe_csum_offset(const struct bch_stripe *s, ++ unsigned dev, unsigned csum_idx) ++{ ++ unsigned csum_bytes = bch_crc_bytes[s->csum_type]; ++ ++ return sizeof(struct bch_stripe) + ++ sizeof(struct bch_extent_ptr) * s->nr_blocks + ++ (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; ++} ++ ++static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, ++ unsigned idx) ++{ ++ return stripe_csum_offset(s, s->nr_blocks, 0) + ++ sizeof(u16) * idx; ++} ++ ++static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, ++ unsigned idx) ++{ ++ return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); ++} ++ ++static inline void stripe_blockcount_set(struct bch_stripe *s, ++ unsigned idx, unsigned v) ++{ ++ __le16 *p = (void *) s + stripe_blockcount_offset(s, idx); ++ ++ *p = cpu_to_le16(v); ++} ++ ++static inline unsigned stripe_val_u64s(const struct bch_stripe *s) ++{ ++ return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), ++ sizeof(u64)); ++} ++ ++static inline void *stripe_csum(struct bch_stripe *s, ++ unsigned block, unsigned csum_idx) ++{ ++ EBUG_ON(block >= s->nr_blocks); ++ EBUG_ON(csum_idx >= stripe_csums_per_device(s)); ++ ++ return (void *) s + stripe_csum_offset(s, block, csum_idx); ++} ++ ++static inline struct bch_csum stripe_csum_get(struct 
bch_stripe *s, ++ unsigned block, unsigned csum_idx) ++{ ++ struct bch_csum csum = { 0 }; ++ ++ memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]); ++ return csum; ++} ++ ++static inline void stripe_csum_set(struct bch_stripe *s, ++ unsigned block, unsigned csum_idx, ++ struct bch_csum csum) ++{ ++ memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); ++} ++ ++static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr, ++ const struct bch_extent_ptr *data_ptr, ++ unsigned sectors) ++{ ++ return data_ptr->dev == stripe_ptr->dev && ++ data_ptr->gen == stripe_ptr->gen && ++ data_ptr->offset >= stripe_ptr->offset && ++ data_ptr->offset < stripe_ptr->offset + sectors; ++} ++ ++static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, ++ struct extent_ptr_decoded p) ++{ ++ unsigned nr_data = s->nr_blocks - s->nr_redundant; ++ ++ BUG_ON(!p.has_ec); ++ ++ if (p.ec.block >= nr_data) ++ return false; ++ ++ return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr, ++ le16_to_cpu(s->sectors)); ++} ++ ++static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, ++ struct extent_ptr_decoded p) ++{ ++ unsigned nr_data = m->nr_blocks - m->nr_redundant; ++ ++ BUG_ON(!p.has_ec); ++ ++ if (p.ec.block >= nr_data) ++ return false; ++ ++ return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr, ++ m->sectors); ++} ++ ++struct bch_read_bio; ++ ++struct ec_stripe_buf { ++ /* might not be buffering the entire stripe: */ ++ unsigned offset; ++ unsigned size; ++ unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; ++ ++ void *data[BCH_BKEY_PTRS_MAX]; ++ ++ union { ++ struct bkey_i_stripe key; ++ u64 pad[255]; ++ }; ++}; ++ ++struct ec_stripe_head; ++ ++struct ec_stripe_new { ++ struct bch_fs *c; ++ struct ec_stripe_head *h; ++ struct mutex lock; ++ struct list_head list; ++ struct closure iodone; ++ ++ /* counts in flight writes, stripe is created when pin == 0 */ ++ atomic_t pin; ++ ++ int err; ++ ++ u8 nr_data; ++ u8 nr_parity; ++ bool allocated; ++ bool pending; ++ bool have_existing_stripe; ++ ++ unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; ++ unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; ++ open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; ++ struct disk_reservation res; ++ ++ struct keylist keys; ++ u64 inline_keys[BKEY_U64s * 8]; ++ ++ struct ec_stripe_buf new_stripe; ++ struct ec_stripe_buf existing_stripe; ++}; ++ ++struct ec_stripe_head { ++ struct list_head list; ++ struct mutex lock; ++ ++ unsigned target; ++ unsigned algo; ++ unsigned redundancy; ++ bool copygc; ++ ++ struct bch_devs_mask devs; ++ unsigned nr_active_devs; ++ ++ unsigned blocksize; ++ ++ struct dev_stripe_state block_stripe; ++ struct dev_stripe_state parity_stripe; ++ ++ struct ec_stripe_new *s; ++}; ++ ++int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); ++ ++void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); ++void bch2_ob_add_backpointer(struct bch_fs *, struct open_bucket *, ++ struct bkey *); ++ ++void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); ++void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); ++ ++int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); ++ ++void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); ++struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, ++ unsigned, unsigned, unsigned, bool, struct closure *); ++ ++void bch2_stripes_heap_update(struct bch_fs *, 
struct stripe *, size_t); ++void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); ++void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); ++ ++void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); ++ ++void bch2_ec_flush_new_stripes(struct bch_fs *); ++ ++void bch2_stripes_heap_start(struct bch_fs *); ++ ++int bch2_stripes_read(struct bch_fs *); ++ ++void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); ++void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); ++ ++void bch2_fs_ec_exit(struct bch_fs *); ++void bch2_fs_ec_init_early(struct bch_fs *); ++int bch2_fs_ec_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_EC_H */ +diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h +new file mode 100644 +index 000000000000..edd93da663c1 +--- /dev/null ++++ b/fs/bcachefs/ec_types.h +@@ -0,0 +1,46 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EC_TYPES_H ++#define _BCACHEFS_EC_TYPES_H ++ ++#include ++ ++struct bch_replicas_padded { ++ struct bch_replicas_entry e; ++ u8 pad[BCH_BKEY_PTRS_MAX]; ++}; ++ ++struct stripe { ++ size_t heap_idx; ++ ++ u16 sectors; ++ u8 algorithm; ++ ++ u8 nr_blocks; ++ u8 nr_redundant; ++ ++ unsigned alive:1; /* does a corresponding key exist in stripes btree? */ ++ unsigned on_heap:1; ++ u8 blocks_nonempty; ++}; ++ ++struct gc_stripe { ++ u16 sectors; ++ ++ u8 nr_blocks; ++ u8 nr_redundant; ++ ++ unsigned alive:1; /* does a corresponding key exist in stripes btree? */ ++ u16 block_sectors[BCH_BKEY_PTRS_MAX]; ++ struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; ++ ++ struct bch_replicas_padded r; ++}; ++ ++struct ec_stripe_heap_entry { ++ size_t idx; ++ unsigned blocks_nonempty; ++}; ++ ++typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; ++ ++#endif /* _BCACHEFS_EC_TYPES_H */ +diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c +new file mode 100644 +index 000000000000..9da8a5973af0 +--- /dev/null ++++ b/fs/bcachefs/errcode.c +@@ -0,0 +1,51 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "errcode.h" ++ ++#include ++ ++static const char * const bch2_errcode_strs[] = { ++#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err, ++ BCH_ERRCODES() ++#undef x ++ NULL ++}; ++ ++#define BCH_ERR_0 0 ++ ++static unsigned bch2_errcode_parents[] = { ++#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = BCH_ERR_##class, ++ BCH_ERRCODES() ++#undef x ++}; ++ ++const char *bch2_err_str(int err) ++{ ++ const char *errstr; ++ err = abs(err); ++ ++ BUG_ON(err >= BCH_ERR_MAX); ++ ++ if (err >= BCH_ERR_START) ++ errstr = bch2_errcode_strs[err - BCH_ERR_START]; ++ else if (err) ++ errstr = errname(err); ++ else ++ errstr = "(No error)"; ++ return errstr ?: "(Invalid error)"; ++} ++ ++bool __bch2_err_matches(int err, int class) ++{ ++ err = abs(err); ++ class = abs(class); ++ ++ BUG_ON(err >= BCH_ERR_MAX); ++ BUG_ON(class >= BCH_ERR_MAX); ++ ++ while (err >= BCH_ERR_START && err != class) ++ err = bch2_errcode_parents[err - BCH_ERR_START]; ++ ++ return err == class; ++} +diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h +new file mode 100644 +index 000000000000..95925c8434b3 +--- /dev/null ++++ b/fs/bcachefs/errcode.h +@@ -0,0 +1,64 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ERRCODE_H ++#define _BCACHEFS_ERRCODE_H ++ ++#define BCH_ERRCODES() \ ++ x(0, open_buckets_empty) \ ++ x(0, freelist_empty) \ ++ x(freelist_empty, no_buckets_found) \ ++ x(0, insufficient_devices) \ ++ x(0, transaction_restart) \ ++ x(transaction_restart, 
transaction_restart_fault_inject) \ ++ x(transaction_restart, transaction_restart_relock) \ ++ x(transaction_restart, transaction_restart_relock_path) \ ++ x(transaction_restart, transaction_restart_relock_path_intent) \ ++ x(transaction_restart, transaction_restart_relock_after_fill) \ ++ x(transaction_restart, transaction_restart_too_many_iters) \ ++ x(transaction_restart, transaction_restart_lock_node_reused) \ ++ x(transaction_restart, transaction_restart_fill_relock) \ ++ x(transaction_restart, transaction_restart_fill_mem_alloc_fail)\ ++ x(transaction_restart, transaction_restart_mem_realloced) \ ++ x(transaction_restart, transaction_restart_in_traverse_all) \ ++ x(transaction_restart, transaction_restart_would_deadlock) \ ++ x(transaction_restart, transaction_restart_would_deadlock_write)\ ++ x(transaction_restart, transaction_restart_upgrade) \ ++ x(transaction_restart, transaction_restart_key_cache_fill) \ ++ x(transaction_restart, transaction_restart_key_cache_raced) \ ++ x(transaction_restart, transaction_restart_key_cache_realloced)\ ++ x(transaction_restart, transaction_restart_journal_preres_get) \ ++ x(transaction_restart, transaction_restart_nested) \ ++ x(0, lock_fail_node_reused) \ ++ x(0, lock_fail_root_changed) \ ++ x(0, journal_reclaim_would_deadlock) \ ++ x(0, fsck) \ ++ x(fsck, fsck_fix) \ ++ x(fsck, fsck_ignore) \ ++ x(fsck, fsck_errors_not_fixed) \ ++ x(fsck, fsck_repair_unimplemented) \ ++ x(fsck, fsck_repair_impossible) \ ++ x(0, need_snapshot_cleanup) \ ++ x(0, need_topology_repair) ++ ++enum bch_errcode { ++ BCH_ERR_START = 2048, ++#define x(class, err) BCH_ERR_##err, ++ BCH_ERRCODES() ++#undef x ++ BCH_ERR_MAX ++}; ++ ++const char *bch2_err_str(int); ++bool __bch2_err_matches(int, int); ++ ++static inline bool _bch2_err_matches(int err, int class) ++{ ++ return err && __bch2_err_matches(err, class); ++} ++ ++#define bch2_err_matches(_err, _class) \ ++({ \ ++ BUILD_BUG_ON(!__builtin_constant_p(_class)); \ ++ _bch2_err_matches(_err, _class); \ ++}) ++ ++#endif /* _BCACHFES_ERRCODE_H */ +diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c +new file mode 100644 +index 000000000000..f6a895b2ceb7 +--- /dev/null ++++ b/fs/bcachefs/error.c +@@ -0,0 +1,184 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "error.h" ++#include "io.h" ++#include "super.h" ++ ++#define FSCK_ERR_RATELIMIT_NR 10 ++ ++bool bch2_inconsistent_error(struct bch_fs *c) ++{ ++ set_bit(BCH_FS_ERROR, &c->flags); ++ ++ switch (c->opts.errors) { ++ case BCH_ON_ERROR_continue: ++ return false; ++ case BCH_ON_ERROR_ro: ++ if (bch2_fs_emergency_read_only(c)) ++ bch_err(c, "inconsistency detected - emergency read only"); ++ return true; ++ case BCH_ON_ERROR_panic: ++ panic(bch2_fmt(c, "panic after error")); ++ return true; ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_topology_error(struct bch_fs *c) ++{ ++ set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); ++ if (test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) ++ bch2_inconsistent_error(c); ++} ++ ++void bch2_fatal_error(struct bch_fs *c) ++{ ++ if (bch2_fs_emergency_read_only(c)) ++ bch_err(c, "fatal error - emergency read only"); ++} ++ ++void bch2_io_error_work(struct work_struct *work) ++{ ++ struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); ++ struct bch_fs *c = ca->fs; ++ bool dev; ++ ++ down_write(&c->state_lock); ++ dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro, ++ BCH_FORCE_IF_DEGRADED); ++ if (dev ++ ? 
__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, ++ BCH_FORCE_IF_DEGRADED) ++ : bch2_fs_emergency_read_only(c)) ++ bch_err(ca, ++ "too many IO errors, setting %s RO", ++ dev ? "device" : "filesystem"); ++ up_write(&c->state_lock); ++} ++ ++void bch2_io_error(struct bch_dev *ca) ++{ ++ //queue_work(system_long_wq, &ca->io_error_work); ++} ++ ++#ifdef __KERNEL__ ++#define ask_yn() false ++#else ++#include "tools-util.h" ++#endif ++ ++int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) ++{ ++ struct fsck_err_state *s = NULL; ++ va_list args; ++ bool fix = false, print = true, suppressing = false; ++ char _buf[sizeof(s->buf)], *buf = _buf; ++ ++ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { ++ va_start(args, fmt); ++ vprintk(fmt, args); ++ va_end(args); ++ ++ if (c->opts.errors == BCH_ON_ERROR_continue) { ++ bch_err(c, "fixing"); ++ return -BCH_ERR_fsck_fix; ++ } else { ++ bch2_inconsistent_error(c); ++ return -BCH_ERR_fsck_errors_not_fixed; ++ } ++ } ++ ++ mutex_lock(&c->fsck_error_lock); ++ ++ list_for_each_entry(s, &c->fsck_errors, list) ++ if (s->fmt == fmt) ++ goto found; ++ ++ s = kzalloc(sizeof(*s), GFP_NOFS); ++ if (!s) { ++ if (!c->fsck_alloc_err) ++ bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); ++ c->fsck_alloc_err = true; ++ buf = _buf; ++ goto print; ++ } ++ ++ INIT_LIST_HEAD(&s->list); ++ s->fmt = fmt; ++found: ++ list_move(&s->list, &c->fsck_errors); ++ s->nr++; ++ if (c->opts.ratelimit_errors && ++ !(flags & FSCK_NO_RATELIMIT) && ++ s->nr >= FSCK_ERR_RATELIMIT_NR) { ++ if (s->nr == FSCK_ERR_RATELIMIT_NR) ++ suppressing = true; ++ else ++ print = false; ++ } ++ buf = s->buf; ++print: ++ va_start(args, fmt); ++ vscnprintf(buf, sizeof(_buf), fmt, args); ++ va_end(args); ++ ++ if (c->opts.fix_errors == FSCK_OPT_EXIT) { ++ bch_err(c, "%s, exiting", buf); ++ } else if (flags & FSCK_CAN_FIX) { ++ if (c->opts.fix_errors == FSCK_OPT_ASK) { ++ printk(KERN_ERR "%s: fix?", buf); ++ fix = ask_yn(); ++ } else if (c->opts.fix_errors == FSCK_OPT_YES || ++ (c->opts.nochanges && ++ !(flags & FSCK_CAN_IGNORE))) { ++ if (print) ++ bch_err(c, "%s, fixing", buf); ++ fix = true; ++ } else { ++ if (print) ++ bch_err(c, "%s, not fixing", buf); ++ fix = false; ++ } ++ } else if (flags & FSCK_NEED_FSCK) { ++ if (print) ++ bch_err(c, "%s (run fsck to correct)", buf); ++ } else { ++ if (print) ++ bch_err(c, "%s (repair unimplemented)", buf); ++ } ++ ++ if (suppressing) ++ bch_err(c, "Ratelimiting new instances of previous error"); ++ ++ mutex_unlock(&c->fsck_error_lock); ++ ++ if (fix) { ++ set_bit(BCH_FS_ERRORS_FIXED, &c->flags); ++ return -BCH_ERR_fsck_fix; ++ } else { ++ set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); ++ set_bit(BCH_FS_ERROR, &c->flags); ++ return c->opts.fix_errors == FSCK_OPT_EXIT || ++ !(flags & FSCK_CAN_IGNORE) ++ ? 
-BCH_ERR_fsck_errors_not_fixed ++ : -BCH_ERR_fsck_ignore; ++ } ++} ++ ++void bch2_flush_fsck_errs(struct bch_fs *c) ++{ ++ struct fsck_err_state *s, *n; ++ ++ mutex_lock(&c->fsck_error_lock); ++ ++ list_for_each_entry_safe(s, n, &c->fsck_errors, list) { ++ if (s->ratelimited) ++ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); ++ ++ list_del(&s->list); ++ kfree(s); ++ } ++ ++ mutex_unlock(&c->fsck_error_lock); ++} +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +new file mode 100644 +index 000000000000..b603d738c549 +--- /dev/null ++++ b/fs/bcachefs/error.h +@@ -0,0 +1,223 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ERROR_H ++#define _BCACHEFS_ERROR_H ++ ++#include ++#include ++ ++struct bch_dev; ++struct bch_fs; ++struct work_struct; ++ ++/* ++ * XXX: separate out errors that indicate on disk data is inconsistent, and flag ++ * superblock as such ++ */ ++ ++/* Error messages: */ ++ ++/* ++ * Inconsistency errors: The on disk data is inconsistent. If these occur during ++ * initial recovery, they don't indicate a bug in the running code - we walk all ++ * the metadata before modifying anything. If they occur at runtime, they ++ * indicate either a bug in the running code or (less likely) data is being ++ * silently corrupted under us. ++ * ++ * XXX: audit all inconsistent errors and make sure they're all recoverable, in ++ * BCH_ON_ERROR_CONTINUE mode ++ */ ++ ++bool bch2_inconsistent_error(struct bch_fs *); ++ ++void bch2_topology_error(struct bch_fs *); ++ ++#define bch2_fs_inconsistent(c, ...) \ ++({ \ ++ bch_err(c, __VA_ARGS__); \ ++ bch2_inconsistent_error(c); \ ++}) ++ ++#define bch2_fs_inconsistent_on(cond, c, ...) \ ++({ \ ++ bool _ret = unlikely(!!(cond)); \ ++ \ ++ if (_ret) \ ++ bch2_fs_inconsistent(c, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * Later we might want to mark only the particular device inconsistent, not the ++ * entire filesystem: ++ */ ++ ++#define bch2_dev_inconsistent(ca, ...) \ ++do { \ ++ bch_err(ca, __VA_ARGS__); \ ++ bch2_inconsistent_error((ca)->fs); \ ++} while (0) ++ ++#define bch2_dev_inconsistent_on(cond, ca, ...) \ ++({ \ ++ bool _ret = unlikely(!!(cond)); \ ++ \ ++ if (_ret) \ ++ bch2_dev_inconsistent(ca, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * When a transaction update discovers or is causing a fs inconsistency, it's ++ * helpful to also dump the pending updates: ++ */ ++#define bch2_trans_inconsistent(trans, ...) \ ++({ \ ++ bch_err(trans->c, __VA_ARGS__); \ ++ bch2_inconsistent_error(trans->c); \ ++ bch2_dump_trans_updates(trans); \ ++}) ++ ++#define bch2_trans_inconsistent_on(cond, trans, ...) \ ++({ \ ++ bool _ret = unlikely(!!(cond)); \ ++ \ ++ if (_ret) \ ++ bch2_trans_inconsistent(trans, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * Fsck errors: inconsistency errors we detect at mount time, and should ideally ++ * be able to repair: ++ */ ++ ++enum fsck_err_opts { ++ FSCK_OPT_EXIT, ++ FSCK_OPT_YES, ++ FSCK_OPT_NO, ++ FSCK_OPT_ASK, ++}; ++ ++struct fsck_err_state { ++ struct list_head list; ++ const char *fmt; ++ u64 nr; ++ bool ratelimited; ++ char buf[512]; ++}; ++ ++#define FSCK_CAN_FIX (1 << 0) ++#define FSCK_CAN_IGNORE (1 << 1) ++#define FSCK_NEED_FSCK (1 << 2) ++#define FSCK_NO_RATELIMIT (1 << 3) ++ ++__printf(3, 4) __cold ++int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...); ++void bch2_flush_fsck_errs(struct bch_fs *); ++ ++#define __fsck_err(c, _flags, msg, ...) 
\ ++({ \ ++ int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__); \ ++ \ ++ if (_ret != -BCH_ERR_fsck_fix && \ ++ _ret != -BCH_ERR_fsck_ignore) { \ ++ bch_err(c, "Unable to continue, halting"); \ ++ ret = _ret; \ ++ goto fsck_err; \ ++ } \ ++ \ ++ _ret == -BCH_ERR_fsck_fix; \ ++}) ++ ++/* These macros return true if error should be fixed: */ ++ ++/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ ++ ++#define __fsck_err_on(cond, c, _flags, ...) \ ++ (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) ++ ++#define need_fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) ++ ++#define need_fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) ++ ++#define mustfix_fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) ++ ++#define mustfix_fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) ++ ++#define fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) ++ ++#define fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) ++ ++/* ++ * Fatal errors: these don't indicate a bug, but we can't continue running in RW ++ * mode - pretty much just due to metadata IO errors: ++ */ ++ ++void bch2_fatal_error(struct bch_fs *); ++ ++#define bch2_fs_fatal_error(c, ...) \ ++do { \ ++ bch_err(c, __VA_ARGS__); \ ++ bch2_fatal_error(c); \ ++} while (0) ++ ++#define bch2_fs_fatal_err_on(cond, c, ...) \ ++({ \ ++ bool _ret = unlikely(!!(cond)); \ ++ \ ++ if (_ret) \ ++ bch2_fs_fatal_error(c, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * IO errors: either recoverable metadata IO (because we have replicas), or data ++ * IO - we need to log it and print out a message, but we don't (necessarily) ++ * want to shut down the fs: ++ */ ++ ++void bch2_io_error_work(struct work_struct *); ++ ++/* Does the error handling without logging a message */ ++void bch2_io_error(struct bch_dev *); ++ ++/* Logs message and handles the error: */ ++#define bch2_dev_io_error(ca, fmt, ...) \ ++do { \ ++ printk_ratelimited(KERN_ERR "bcachefs (%s): " fmt, \ ++ (ca)->name, ##__VA_ARGS__); \ ++ bch2_io_error(ca); \ ++} while (0) ++ ++#define bch2_dev_inum_io_error(ca, _inum, _offset, fmt, ...) \ ++do { \ ++ printk_ratelimited(KERN_ERR "bcachefs (%s inum %llu offset %llu): " fmt,\ ++ (ca)->name, (_inum), (_offset), ##__VA_ARGS__); \ ++ bch2_io_error(ca); \ ++} while (0) ++ ++#define bch2_dev_io_err_on(cond, ca, ...) \ ++({ \ ++ bool _ret = (cond); \ ++ \ ++ if (_ret) \ ++ bch2_dev_io_error(ca, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++#define bch2_dev_inum_io_err_on(cond, ca, _inum, _offset, ...) 
\ ++({ \ ++ bool _ret = (cond); \ ++ \ ++ if (_ret) \ ++ bch2_dev_inum_io_error(ca, _inum, _offset, __VA_ARGS__);\ ++ _ret; \ ++}) ++ ++#endif /* _BCACHEFS_ERROR_H */ +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +new file mode 100644 +index 000000000000..2fd5d9672a44 +--- /dev/null ++++ b/fs/bcachefs/extent_update.c +@@ -0,0 +1,178 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "debug.h" ++#include "extents.h" ++#include "extent_update.h" ++ ++/* ++ * This counts the number of iterators to the alloc & ec btrees we'll need ++ * inserting/removing this extent: ++ */ ++static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ unsigned ret = 0, lru = 0; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ switch (__extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ /* Might also be updating LRU btree */ ++ if (entry->ptr.cached) ++ lru++; ++ ++ fallthrough; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ ret++; ++ } ++ } ++ ++ /* ++ * Updating keys in the alloc btree may also update keys in the ++ * freespace or discard btrees: ++ */ ++ return lru + ret * 2; ++} ++ ++static int count_iters_for_insert(struct btree_trans *trans, ++ struct bkey_s_c k, ++ unsigned offset, ++ struct bpos *end, ++ unsigned *nr_iters, ++ unsigned max_iters) ++{ ++ int ret = 0, ret2 = 0; ++ ++ if (*nr_iters >= max_iters) { ++ *end = bpos_min(*end, k.k->p); ++ ret = 1; ++ } ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ *nr_iters += bch2_bkey_nr_alloc_ptrs(k); ++ ++ if (*nr_iters >= max_iters) { ++ *end = bpos_min(*end, k.k->p); ++ ret = 1; ++ } ++ ++ break; ++ case KEY_TYPE_reflink_p: { ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ u64 idx = le64_to_cpu(p.v->idx); ++ unsigned sectors = bpos_min(*end, p.k->p).offset - ++ bkey_start_offset(p.k); ++ struct btree_iter iter; ++ struct bkey_s_c r_k; ++ ++ for_each_btree_key_norestart(trans, iter, ++ BTREE_ID_reflink, POS(0, idx + offset), ++ BTREE_ITER_SLOTS, r_k, ret2) { ++ if (bkey_cmp(bkey_start_pos(r_k.k), ++ POS(0, idx + sectors)) >= 0) ++ break; ++ ++ /* extent_update_to_keys(), for the reflink_v update */ ++ *nr_iters += 1; ++ ++ *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); ++ ++ if (*nr_iters >= max_iters) { ++ struct bpos pos = bkey_start_pos(k.k); ++ pos.offset += min_t(u64, k.k->size, ++ r_k.k->p.offset - idx); ++ ++ *end = bpos_min(*end, pos); ++ ret = 1; ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ break; ++ } ++ } ++ ++ return ret2 ?: ret; ++} ++ ++#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) ++ ++int bch2_extent_atomic_end(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct bpos *end) ++{ ++ struct btree_iter copy; ++ struct bkey_s_c k; ++ unsigned nr_iters = 0; ++ int ret; ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return ret; ++ ++ *end = insert->k.p; ++ ++ /* extent_update_to_keys(): */ ++ nr_iters += 1; ++ ++ ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, ++ &nr_iters, EXTENT_ITERS_MAX / 2); ++ if (ret < 0) ++ return ret; ++ ++ bch2_trans_copy_iter(©, iter); ++ ++ for_each_btree_key_continue_norestart(copy, 0, k, ret) { ++ unsigned offset = 0; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) ++ break; ++ ++ if (bkey_cmp(bkey_start_pos(&insert->k), ++ 
bkey_start_pos(k.k)) > 0) ++ offset = bkey_start_offset(&insert->k) - ++ bkey_start_offset(k.k); ++ ++ /* extent_handle_overwrites(): */ ++ switch (bch2_extent_overlap(&insert->k, k.k)) { ++ case BCH_EXTENT_OVERLAP_ALL: ++ case BCH_EXTENT_OVERLAP_FRONT: ++ nr_iters += 1; ++ break; ++ case BCH_EXTENT_OVERLAP_BACK: ++ case BCH_EXTENT_OVERLAP_MIDDLE: ++ nr_iters += 2; ++ break; ++ } ++ ++ ret = count_iters_for_insert(trans, k, offset, end, ++ &nr_iters, EXTENT_ITERS_MAX); ++ if (ret) ++ break; ++ } ++ ++ bch2_trans_iter_exit(trans, ©); ++ return ret < 0 ? ret : 0; ++} ++ ++int bch2_extent_trim_atomic(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *k) ++{ ++ struct bpos end; ++ int ret; ++ ++ ret = bch2_extent_atomic_end(trans, iter, k, &end); ++ if (ret) ++ return ret; ++ ++ bch2_cut_back(end, k); ++ return 0; ++} +diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h +new file mode 100644 +index 000000000000..6f5cf449361a +--- /dev/null ++++ b/fs/bcachefs/extent_update.h +@@ -0,0 +1,12 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EXTENT_UPDATE_H ++#define _BCACHEFS_EXTENT_UPDATE_H ++ ++#include "bcachefs.h" ++ ++int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, struct bpos *); ++int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *); ++ ++#endif /* _BCACHEFS_EXTENT_UPDATE_H */ +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +new file mode 100644 +index 000000000000..2ca13014b9c4 +--- /dev/null ++++ b/fs/bcachefs/extents.c +@@ -0,0 +1,1324 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2010 Kent Overstreet ++ * ++ * Code for managing the extent btree and dynamically updating the writeback ++ * dirty sector count. 
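bch2_extent_atomic_end() above walks the keys an insert would overlap, charges each one an iterator cost, and clamps *end as soon as the budget (EXTENT_ITERS_MAX) is exceeded, so a single transaction never needs more iterators than the btree code can hand out. The shape of that "scan with a budget, trim where it runs out" loop, reduced to plain C over an array of dummy extents (hypothetical types, not the bcachefs API):

#include <stddef.h>

struct ext {
	unsigned long start, end;	/* [start, end) in sectors */
	unsigned cost;			/* iterators this key would need */
};

/*
 * Trim the insert [ins_start, *ins_end) so the total cost of the
 * existing extents it overlaps stays within budget.
 */
static void trim_to_budget(const struct ext *keys, size_t nr,
			   unsigned long ins_start, unsigned long *ins_end,
			   unsigned budget)
{
	unsigned used = 0;

	for (size_t i = 0; i < nr; i++) {
		const struct ext *k = &keys[i];

		if (k->end <= ins_start || k->start >= *ins_end)
			continue;		/* no overlap with the insert */

		used += k->cost;
		if (used >= budget) {
			/* stop the insert at this key's end, or earlier */
			if (k->end < *ins_end)
				*ins_end = k->end;
			break;
		}
	}
}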
++ */ ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_gc.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "journal.h" ++#include "replicas.h" ++#include "super.h" ++#include "super-io.h" ++#include "util.h" ++ ++#include ++ ++static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); ++ ++static unsigned bch2_crc_field_size_max[] = { ++ [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, ++ [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, ++ [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, ++}; ++ ++static void bch2_extent_crc_pack(union bch_extent_crc *, ++ struct bch_extent_crc_unpacked, ++ enum bch_extent_entry_type); ++ ++static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, ++ unsigned dev) ++{ ++ struct bch_dev_io_failures *i; ++ ++ for (i = f->devs; i < f->devs + f->nr; i++) ++ if (i->dev == dev) ++ return i; ++ ++ return NULL; ++} ++ ++void bch2_mark_io_failure(struct bch_io_failures *failed, ++ struct extent_ptr_decoded *p) ++{ ++ struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); ++ ++ if (!f) { ++ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); ++ ++ f = &failed->devs[failed->nr++]; ++ f->dev = p->ptr.dev; ++ f->idx = p->idx; ++ f->nr_failed = 1; ++ f->nr_retries = 0; ++ } else if (p->idx != f->idx) { ++ f->idx = p->idx; ++ f->nr_failed = 1; ++ f->nr_retries = 0; ++ } else { ++ f->nr_failed++; ++ } ++} ++ ++/* ++ * returns true if p1 is better than p2: ++ */ ++static inline bool ptr_better(struct bch_fs *c, ++ const struct extent_ptr_decoded p1, ++ const struct extent_ptr_decoded p2) ++{ ++ if (likely(!p1.idx && !p2.idx)) { ++ struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); ++ struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); ++ ++ u64 l1 = atomic64_read(&dev1->cur_latency[READ]); ++ u64 l2 = atomic64_read(&dev2->cur_latency[READ]); ++ ++ /* Pick at random, biased in favor of the faster device: */ ++ ++ return bch2_rand_range(l1 + l2) > l1; ++ } ++ ++ if (bch2_force_reconstruct_read) ++ return p1.idx > p2.idx; ++ ++ return p1.idx < p2.idx; ++} ++ ++/* ++ * This picks a non-stale pointer, preferably from a device other than @avoid. ++ * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to ++ * other devices, it will still pick a pointer from avoid. ++ */ ++int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_io_failures *failed, ++ struct extent_ptr_decoded *pick) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_dev_io_failures *f; ++ struct bch_dev *ca; ++ int ret = 0; ++ ++ if (k.k->type == KEY_TYPE_error) ++ return -EIO; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ ++ /* ++ * If there are any dirty pointers it's an error if we can't ++ * read: ++ */ ++ if (!ret && !p.ptr.cached) ++ ret = -EIO; ++ ++ if (p.ptr.cached && ptr_stale(ca, &p.ptr)) ++ continue; ++ ++ f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; ++ if (f) ++ p.idx = f->nr_failed < f->nr_retries ++ ? 
f->idx ++ : f->idx + 1; ++ ++ if (!p.idx && ++ !bch2_dev_is_readable(ca)) ++ p.idx++; ++ ++ if (bch2_force_reconstruct_read && ++ !p.idx && p.has_ec) ++ p.idx++; ++ ++ if (p.idx >= (unsigned) p.has_ec + 1) ++ continue; ++ ++ if (ret > 0 && !ptr_better(c, p, *pick)) ++ continue; ++ ++ *pick = p; ++ ret = 1; ++ } ++ ++ return ret; ++} ++ ++/* KEY_TYPE_btree_ptr: */ ++ ++int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { ++ prt_printf(err, "value too big (%zu > %u)", ++ bkey_val_u64s(k.k), BCH_REPLICAS_MAX); ++ return -EINVAL; ++ } ++ ++ return bch2_bkey_ptrs_invalid(c, k, rw, err); ++} ++ ++void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) { ++ prt_printf(err, "value too small (%zu <= %zu)", ++ bkey_val_bytes(k.k), sizeof(*bp.v)); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { ++ prt_printf(err, "value too big (%zu > %zu)", ++ bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); ++ return -EINVAL; ++ } ++ ++ if (c->sb.version < bcachefs_metadata_version_snapshot && ++ bp.v->min_key.snapshot) { ++ prt_printf(err, "invalid min_key.snapshot (%u != 0)", ++ bp.v->min_key.snapshot); ++ return -EINVAL; ++ } ++ ++ return bch2_bkey_ptrs_invalid(c, k, rw, err); ++} ++ ++void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ prt_printf(out, "seq %llx written %u min_key %s", ++ le64_to_cpu(bp.v->seq), ++ le16_to_cpu(bp.v->sectors_written), ++ BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : ""); ++ ++ bch2_bpos_to_text(out, bp.v->min_key); ++ prt_printf(out, " "); ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, ++ unsigned big_endian, int write, ++ struct bkey_s k) ++{ ++ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); ++ ++ compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bkey_cmp(bp.v->min_key, POS_MIN)) ++ bp.v->min_key = write ++ ? 
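ptr_better() above breaks ties between two healthy replicas with a weighted coin flip: each device's chance of being read is proportional to the other device's observed read latency, so faster devices serve more reads without the slower ones being starved entirely. A standalone sketch of that selection rule (rand_range() is a hypothetical stand-in for bch2_rand_range()):

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Uniform random number in [0, max); a simple stand-in, not crypto-grade. */
static uint64_t rand_range(uint64_t max)
{
	return max ? (uint64_t)rand() % max : 0;
}

/*
 * Return true if replica 1 should win, given each replica's current read
 * latency: roughly P(pick 1) = l2 / (l1 + l2), i.e. biased toward the
 * device with the lower latency.
 */
static bool pick_replica1(uint64_t l1, uint64_t l2)
{
	return rand_range(l1 + l2) > l1;
}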
bpos_nosnap_predecessor(bp.v->min_key) ++ : bpos_nosnap_successor(bp.v->min_key); ++} ++ ++/* KEY_TYPE_extent: */ ++ ++bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) ++{ ++ struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); ++ struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); ++ union bch_extent_entry *en_l; ++ const union bch_extent_entry *en_r; ++ struct extent_ptr_decoded lp, rp; ++ bool use_right_ptr; ++ struct bch_dev *ca; ++ ++ en_l = l_ptrs.start; ++ en_r = r_ptrs.start; ++ while (en_l < l_ptrs.end && en_r < r_ptrs.end) { ++ if (extent_entry_type(en_l) != extent_entry_type(en_r)) ++ return false; ++ ++ en_l = extent_entry_next(en_l); ++ en_r = extent_entry_next(en_r); ++ } ++ ++ if (en_l < l_ptrs.end || en_r < r_ptrs.end) ++ return false; ++ ++ en_l = l_ptrs.start; ++ en_r = r_ptrs.start; ++ lp.crc = bch2_extent_crc_unpack(l.k, NULL); ++ rp.crc = bch2_extent_crc_unpack(r.k, NULL); ++ ++ while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && ++ __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { ++ if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != ++ rp.ptr.offset + rp.crc.offset || ++ lp.ptr.dev != rp.ptr.dev || ++ lp.ptr.gen != rp.ptr.gen || ++ lp.has_ec != rp.has_ec) ++ return false; ++ ++ /* Extents may not straddle buckets: */ ++ ca = bch_dev_bkey_exists(c, lp.ptr.dev); ++ if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr)) ++ return false; ++ ++ if (lp.has_ec != rp.has_ec || ++ (lp.has_ec && ++ (lp.ec.block != rp.ec.block || ++ lp.ec.redundancy != rp.ec.redundancy || ++ lp.ec.idx != rp.ec.idx))) ++ return false; ++ ++ if (lp.crc.compression_type != rp.crc.compression_type || ++ lp.crc.nonce != rp.crc.nonce) ++ return false; ++ ++ if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= ++ lp.crc.uncompressed_size) { ++ /* can use left extent's crc entry */ ++ } else if (lp.crc.live_size <= rp.crc.offset ) { ++ /* can use right extent's crc entry */ ++ } else { ++ /* check if checksums can be merged: */ ++ if (lp.crc.csum_type != rp.crc.csum_type || ++ lp.crc.nonce != rp.crc.nonce || ++ crc_is_compressed(lp.crc) || ++ !bch2_checksum_mergeable(lp.crc.csum_type)) ++ return false; ++ ++ if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || ++ rp.crc.offset) ++ return false; ++ ++ if (lp.crc.csum_type && ++ lp.crc.uncompressed_size + ++ rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) ++ return false; ++ } ++ ++ en_l = extent_entry_next(en_l); ++ en_r = extent_entry_next(en_r); ++ } ++ ++ en_l = l_ptrs.start; ++ en_r = r_ptrs.start; ++ while (en_l < l_ptrs.end && en_r < r_ptrs.end) { ++ if (extent_entry_is_crc(en_l)) { ++ struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); ++ struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ ++ if (crc_l.uncompressed_size + crc_r.uncompressed_size > ++ bch2_crc_field_size_max[extent_entry_type(en_l)]) ++ return false; ++ } ++ ++ en_l = extent_entry_next(en_l); ++ en_r = extent_entry_next(en_r); ++ } ++ ++ use_right_ptr = false; ++ en_l = l_ptrs.start; ++ en_r = r_ptrs.start; ++ while (en_l < l_ptrs.end) { ++ if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && ++ use_right_ptr) ++ en_l->ptr = en_r->ptr; ++ ++ if (extent_entry_is_crc(en_l)) { ++ struct bch_extent_crc_unpacked crc_l = ++ bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); ++ struct bch_extent_crc_unpacked crc_r = ++ bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ ++ use_right_ptr = false; ++ ++ if (crc_l.offset + crc_l.live_size + 
crc_r.live_size <= ++ crc_l.uncompressed_size) { ++ /* can use left extent's crc entry */ ++ } else if (crc_l.live_size <= crc_r.offset ) { ++ /* can use right extent's crc entry */ ++ crc_r.offset -= crc_l.live_size; ++ bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, ++ extent_entry_type(en_l)); ++ use_right_ptr = true; ++ } else { ++ crc_l.csum = bch2_checksum_merge(crc_l.csum_type, ++ crc_l.csum, ++ crc_r.csum, ++ crc_r.uncompressed_size << 9); ++ ++ crc_l.uncompressed_size += crc_r.uncompressed_size; ++ crc_l.compressed_size += crc_r.compressed_size; ++ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, ++ extent_entry_type(en_l)); ++ } ++ } ++ ++ en_l = extent_entry_next(en_l); ++ en_r = extent_entry_next(en_r); ++ } ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ return true; ++} ++ ++/* KEY_TYPE_reservation: */ ++ ++int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) { ++ prt_printf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(*r.v)); ++ return -EINVAL; ++ } ++ ++ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { ++ prt_printf(err, "invalid nr_replicas (%u)", ++ r.v->nr_replicas); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); ++ ++ prt_printf(out, "generation %u replicas %u", ++ le32_to_cpu(r.v->generation), ++ r.v->nr_replicas); ++} ++ ++bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_reservation l = bkey_s_to_reservation(_l); ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); ++ ++ if (l.v->generation != r.v->generation || ++ l.v->nr_replicas != r.v->nr_replicas) ++ return false; ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ return true; ++} ++ ++/* Extent checksum entries: */ ++ ++/* returns true if not equal */ ++static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, ++ struct bch_extent_crc_unpacked r) ++{ ++ return (l.csum_type != r.csum_type || ++ l.compression_type != r.compression_type || ++ l.compressed_size != r.compressed_size || ++ l.uncompressed_size != r.uncompressed_size || ++ l.offset != r.offset || ++ l.live_size != r.live_size || ++ l.nonce != r.nonce || ++ bch2_crc_cmp(l.csum, r.csum)); ++} ++ ++static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, ++ struct bch_extent_crc_unpacked n) ++{ ++ return !crc_is_compressed(u) && ++ u.csum_type && ++ u.uncompressed_size > u.live_size && ++ bch2_csum_type_is_encryption(u.csum_type) == ++ bch2_csum_type_is_encryption(n.csum_type); ++} ++ ++bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, ++ struct bch_extent_crc_unpacked n) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct bch_extent_crc_unpacked crc; ++ const union bch_extent_entry *i; ++ ++ if (!n.csum_type) ++ return false; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, i) ++ if (can_narrow_crc(crc, n)) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * We're writing another replica for this extent, so while we've got the data in ++ * memory we'll be computing a new checksum for the currently live data. 
++ * ++ * If there are other replicas we aren't moving, and they are checksummed but ++ * not compressed, we can modify them to point to only the data that is ++ * currently live (so that readers won't have to bounce) while we've got the ++ * checksum we need: ++ */ ++bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ struct bch_extent_crc_unpacked u; ++ struct extent_ptr_decoded p; ++ union bch_extent_entry *i; ++ bool ret = false; ++ ++ /* Find a checksum entry that covers only live data: */ ++ if (!n.csum_type) { ++ bkey_for_each_crc(&k->k, ptrs, u, i) ++ if (!crc_is_compressed(u) && ++ u.csum_type && ++ u.live_size == u.uncompressed_size) { ++ n = u; ++ goto found; ++ } ++ return false; ++ } ++found: ++ BUG_ON(crc_is_compressed(n)); ++ BUG_ON(n.offset); ++ BUG_ON(n.live_size != k->k.size); ++ ++restart_narrow_pointers: ++ ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ ++ bkey_for_each_ptr_decode(&k->k, ptrs, p, i) ++ if (can_narrow_crc(p.crc, n)) { ++ __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); ++ p.ptr.offset += p.crc.offset; ++ p.crc = n; ++ bch2_extent_ptr_decoded_append(k, &p); ++ ret = true; ++ goto restart_narrow_pointers; ++ } ++ ++ return ret; ++} ++ ++static void bch2_extent_crc_pack(union bch_extent_crc *dst, ++ struct bch_extent_crc_unpacked src, ++ enum bch_extent_entry_type type) ++{ ++#define set_common_fields(_dst, _src) \ ++ _dst.type = 1 << type; \ ++ _dst.csum_type = _src.csum_type, \ ++ _dst.compression_type = _src.compression_type, \ ++ _dst._compressed_size = _src.compressed_size - 1, \ ++ _dst._uncompressed_size = _src.uncompressed_size - 1, \ ++ _dst.offset = _src.offset ++ ++ switch (type) { ++ case BCH_EXTENT_ENTRY_crc32: ++ set_common_fields(dst->crc32, src); ++ dst->crc32.csum = *((__le32 *) &src.csum.lo); ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ set_common_fields(dst->crc64, src); ++ dst->crc64.nonce = src.nonce; ++ dst->crc64.csum_lo = src.csum.lo; ++ dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ set_common_fields(dst->crc128, src); ++ dst->crc128.nonce = src.nonce; ++ dst->crc128.csum = src.csum; ++ break; ++ default: ++ BUG(); ++ } ++#undef set_common_fields ++} ++ ++void bch2_extent_crc_append(struct bkey_i *k, ++ struct bch_extent_crc_unpacked new) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ union bch_extent_crc *crc = (void *) ptrs.end; ++ enum bch_extent_entry_type type; ++ ++ if (bch_crc_bytes[new.csum_type] <= 4 && ++ new.uncompressed_size <= CRC32_SIZE_MAX && ++ new.nonce <= CRC32_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc32; ++ else if (bch_crc_bytes[new.csum_type] <= 10 && ++ new.uncompressed_size <= CRC64_SIZE_MAX && ++ new.nonce <= CRC64_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc64; ++ else if (bch_crc_bytes[new.csum_type] <= 16 && ++ new.uncompressed_size <= CRC128_SIZE_MAX && ++ new.nonce <= CRC128_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc128; ++ else ++ BUG(); ++ ++ bch2_extent_crc_pack(crc, new, type); ++ ++ k->k.u64s += extent_entry_u64s(ptrs.end); ++ ++ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); ++} ++ ++/* Generic code for keys with pointers: */ ++ ++unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) ++{ ++ return bch2_bkey_devs(k).nr; ++} ++ ++unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) ++{ ++ return k.k->type == KEY_TYPE_reservation ++ ? 
bkey_s_c_to_reservation(k).v->nr_replicas ++ : bch2_bkey_dirty_devs(k).nr; ++} ++ ++unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) ++{ ++ unsigned ret = 0; ++ ++ if (k.k->type == KEY_TYPE_reservation) { ++ ret = bkey_s_c_to_reservation(k).v->nr_replicas; ++ } else { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ ret += !p.ptr.cached && !crc_is_compressed(p.crc); ++ } ++ ++ return ret; ++} ++ ++unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned ret = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && crc_is_compressed(p.crc)) ++ ret += p.crc.compressed_size; ++ ++ return ret; ++} ++ ++bool bch2_bkey_is_incompressible(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, entry) ++ if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) ++ return true; ++ return false; ++} ++ ++unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p = { 0 }; ++ unsigned replicas = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ if (p.ptr.cached) ++ continue; ++ ++ if (p.has_ec) ++ replicas += p.ec.redundancy; ++ ++ replicas++; ++ ++ } ++ ++ return replicas; ++} ++ ++static unsigned bch2_extent_ptr_durability(struct bch_fs *c, ++ struct extent_ptr_decoded p) ++{ ++ unsigned durability = 0; ++ struct bch_dev *ca; ++ ++ if (p.ptr.cached) ++ return 0; ++ ++ ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ ++ if (ca->mi.state != BCH_MEMBER_STATE_failed) ++ durability = max_t(unsigned, durability, ca->mi.durability); ++ ++ if (p.has_ec) ++ durability += p.ec.redundancy; ++ ++ return durability; ++} ++ ++unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned durability = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ durability += bch2_extent_ptr_durability(c, p); ++ ++ return durability; ++} ++ ++void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) ++{ ++ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); ++ union bch_extent_entry *next = extent_entry_next(entry); ++ ++ memmove_u64s(entry, next, (u64 *) end - (u64 *) next); ++ k->k.u64s -= extent_entry_u64s(entry); ++} ++ ++void bch2_bkey_append_ptr(struct bkey_i *k, ++ struct bch_extent_ptr ptr) ++{ ++ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); ++ ++ switch (k->k.type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ case KEY_TYPE_extent: ++ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); ++ ++ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; ++ ++ memcpy((void *) &k->v + bkey_val_bytes(&k->k), ++ &ptr, ++ sizeof(ptr)); ++ k->u64s++; ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline void __extent_entry_insert(struct bkey_i *k, ++ union bch_extent_entry *dst, ++ union bch_extent_entry *new) ++{ ++ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); ++ ++ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), ++ dst, (u64 *) 
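bch2_bkey_replicas() and bch2_bkey_durability() above reduce a key's pointer list to a single number: cached pointers count for nothing, each dirty pointer contributes its device's durability (nothing if the device has failed), and an erasure-coded pointer additionally contributes the stripe's redundancy. A reduced sketch of that accounting over a plain array of pointer descriptions (hypothetical struct, not the bcachefs types):

#include <stdbool.h>
#include <stddef.h>

struct ptr_desc {
	bool cached;		/* cached copies don't add durability */
	bool dev_failed;	/* failed devices don't either */
	unsigned dev_durability;/* typically 1; 0 if the device provides none */
	unsigned ec_redundancy;	/* extra failures the stripe can absorb */
};

static unsigned key_durability(const struct ptr_desc *ptrs, size_t nr)
{
	unsigned total = 0;

	for (size_t i = 0; i < nr; i++) {
		const struct ptr_desc *p = &ptrs[i];

		if (p->cached)
			continue;

		if (!p->dev_failed)
			total += p->dev_durability;

		total += p->ec_redundancy;
	}

	return total;
}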
end - (u64 *) dst); ++ k->k.u64s += extent_entry_u64s(new); ++ memcpy(dst, new, extent_entry_bytes(new)); ++} ++ ++void bch2_extent_ptr_decoded_append(struct bkey_i *k, ++ struct extent_ptr_decoded *p) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ struct bch_extent_crc_unpacked crc = ++ bch2_extent_crc_unpack(&k->k, NULL); ++ union bch_extent_entry *pos; ++ ++ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { ++ pos = ptrs.start; ++ goto found; ++ } ++ ++ bkey_for_each_crc(&k->k, ptrs, crc, pos) ++ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { ++ pos = extent_entry_next(pos); ++ goto found; ++ } ++ ++ bch2_extent_crc_append(k, p->crc); ++ pos = bkey_val_end(bkey_i_to_s(k)); ++found: ++ p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; ++ __extent_entry_insert(k, pos, to_entry(&p->ptr)); ++ ++ if (p->has_ec) { ++ p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; ++ __extent_entry_insert(k, pos, to_entry(&p->ec)); ++ } ++} ++ ++static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, ++ union bch_extent_entry *entry) ++{ ++ union bch_extent_entry *i = ptrs.start; ++ ++ if (i == entry) ++ return NULL; ++ ++ while (extent_entry_next(i) != entry) ++ i = extent_entry_next(i); ++ return i; ++} ++ ++static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) ++{ ++ union bch_extent_entry *next = extent_entry_next(entry); ++ ++ /* stripes have ptrs, but their layout doesn't work with this code */ ++ BUG_ON(k.k->type == KEY_TYPE_stripe); ++ ++ memmove_u64s_down(entry, next, ++ (u64 *) bkey_val_end(k) - (u64 *) next); ++ k.k->u64s -= (u64 *) next - (u64 *) entry; ++} ++ ++/* ++ * Returns pointer to the next entry after the one being dropped: ++ */ ++static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k, ++ struct bch_extent_ptr *ptr) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry = to_entry(ptr), *next; ++ union bch_extent_entry *ret = entry; ++ bool drop_crc = true; ++ ++ EBUG_ON(ptr < &ptrs.start->ptr || ++ ptr >= &ptrs.end->ptr); ++ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); ++ ++ for (next = extent_entry_next(entry); ++ next != ptrs.end; ++ next = extent_entry_next(next)) { ++ if (extent_entry_is_crc(next)) { ++ break; ++ } else if (extent_entry_is_ptr(next)) { ++ drop_crc = false; ++ break; ++ } ++ } ++ ++ extent_entry_drop(k, entry); ++ ++ while ((entry = extent_entry_prev(ptrs, entry))) { ++ if (extent_entry_is_ptr(entry)) ++ break; ++ ++ if ((extent_entry_is_crc(entry) && drop_crc) || ++ extent_entry_is_stripe_ptr(entry)) { ++ ret = (void *) ret - extent_entry_bytes(entry); ++ extent_entry_drop(k, entry); ++ } ++ } ++ ++ return ret; ++} ++ ++union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, ++ struct bch_extent_ptr *ptr) ++{ ++ bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; ++ union bch_extent_entry *ret = ++ __bch2_bkey_drop_ptr(k, ptr); ++ ++ /* ++ * If we deleted all the dirty pointers and there's still cached ++ * pointers, we could set the cached pointers to dirty if they're not ++ * stale - but to do that correctly we'd need to grab an open_bucket ++ * reference so that we don't race with bucket reuse: ++ */ ++ if (have_dirty && ++ !bch2_bkey_dirty_devs(k.s_c).nr) { ++ k.k->type = KEY_TYPE_error; ++ set_bkey_val_u64s(k.k, 0); ++ ret = NULL; ++ } else if (!bch2_bkey_nr_ptrs(k.s_c)) { ++ k.k->type = KEY_TYPE_deleted; ++ set_bkey_val_u64s(k.k, 0); ++ ret = NULL; ++ } ++ ++ return ret; ++} ++ ++void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) ++{ ++ struct bch_extent_ptr *ptr; ++ ++ 
bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); ++} ++ ++void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) ++{ ++ struct bch_extent_ptr *ptr = (void *) bch2_bkey_has_device(k.s_c, dev); ++ ++ if (ptr) ++ __bch2_bkey_drop_ptr(k, ptr); ++} ++ ++const struct bch_extent_ptr * ++bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ if (ptr->dev == dev) ++ return ptr; ++ ++ return NULL; ++} ++ ++bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ if (bch2_dev_in_target(c, ptr->dev, target) && ++ (!ptr->cached || ++ !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) ++ return true; ++ ++ return false; ++} ++ ++bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_extent_ptr m, u64 offset) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (p.ptr.dev == m.dev && ++ p.ptr.gen == m.gen && ++ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == ++ (s64) m.offset - offset) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * Returns true if two extents refer to the same data: ++ */ ++bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) ++{ ++ struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); ++ struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); ++ const union bch_extent_entry *entry1, *entry2; ++ struct extent_ptr_decoded p1, p2; ++ ++ bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) ++ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) ++ if (p1.ptr.dev == p2.ptr.dev && ++ p1.ptr.gen == p2.ptr.gen && ++ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == ++ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) ++ return true; ++ ++ return false; ++} ++ ++bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, ++ struct bkey_s_c k2) ++{ ++ struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); ++ const union bch_extent_entry *entry2; ++ struct extent_ptr_decoded p2; ++ ++ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) ++ if (p1.ptr.dev == p2.ptr.dev && ++ p1.ptr.gen == p2.ptr.gen && ++ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == ++ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * bch_extent_normalize - clean up an extent, dropping stale pointers etc. ++ * ++ * Returns true if @k should be dropped entirely ++ * ++ * For existing keys, only called when btree nodes are being rewritten, not when ++ * they're merely being compacted/resorted in memory. 
++ */ ++bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) ++{ ++ struct bch_extent_ptr *ptr; ++ ++ bch2_bkey_drop_ptrs(k, ptr, ++ ptr->cached && ++ ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); ++ ++ return bkey_deleted(k.k); ++} ++ ++void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ const struct bch_extent_ptr *ptr; ++ const struct bch_extent_stripe_ptr *ec; ++ struct bch_dev *ca; ++ bool first = true; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (!first) ++ prt_printf(out, " "); ++ ++ switch (__extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ ptr = entry_to_ptr(entry); ++ ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ++ ? bch_dev_bkey_exists(c, ptr->dev) ++ : NULL; ++ ++ if (!ca) { ++ prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, ++ (u64) ptr->offset, ptr->gen, ++ ptr->cached ? " cached" : ""); ++ } else { ++ u32 offset; ++ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); ++ ++ prt_printf(out, "ptr: %u:%llu:%u gen %u%s", ptr->dev, ++ b, offset, ptr->gen, ++ ptr->cached ? " cached" : ""); ++ ++ if (ca && ptr_stale(ca, ptr)) ++ prt_printf(out, " stale"); ++ } ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); ++ ++ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", ++ crc.compressed_size, ++ crc.uncompressed_size, ++ crc.offset, crc.nonce, ++ bch2_csum_types[crc.csum_type], ++ bch2_compression_types[crc.compression_type]); ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ ec = &entry->stripe_ptr; ++ ++ prt_printf(out, "ec: idx %llu block %u", ++ (u64) ec->idx, ec->block); ++ break; ++ default: ++ prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); ++ return; ++ } ++ ++ first = false; ++ } ++} ++ ++static int extent_ptr_invalid(const struct bch_fs *c, ++ struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ unsigned size_ondisk, ++ bool metadata, ++ struct printbuf *err) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr2; ++ u64 bucket; ++ u32 bucket_offset; ++ struct bch_dev *ca; ++ ++ if (!bch2_dev_exists2(c, ptr->dev)) { ++ prt_printf(err, "pointer to invalid device (%u)", ptr->dev); ++ return -EINVAL; ++ } ++ ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ bkey_for_each_ptr(ptrs, ptr2) ++ if (ptr != ptr2 && ptr->dev == ptr2->dev) { ++ prt_printf(err, "multiple pointers to same device (%u)", ptr->dev); ++ return -EINVAL; ++ } ++ ++ bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); ++ ++ if (bucket >= ca->mi.nbuckets) { ++ prt_printf(err, "pointer past last bucket (%llu > %llu)", ++ bucket, ca->mi.nbuckets); ++ return -EINVAL; ++ } ++ ++ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) { ++ prt_printf(err, "pointer before first bucket (%llu < %u)", ++ bucket, ca->mi.first_bucket); ++ return -EINVAL; ++ } ++ ++ if (bucket_offset + size_ondisk > ca->mi.bucket_size) { ++ prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)", ++ bucket_offset, size_ondisk, ca->mi.bucket_size); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union 
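extent_ptr_invalid() above checks each pointer against the device's bucket geometry: the bucket it lands in must lie in [first_bucket, nbuckets), and the offset within that bucket plus the on-disk size must not spill into the next bucket. A standalone sketch of those range checks (hypothetical geometry struct; the real code also rejects duplicate pointers to the same device):

#include <stdbool.h>
#include <stdint.h>

struct dev_geometry {
	uint64_t first_bucket;	/* first usable bucket */
	uint64_t nbuckets;	/* one past the last bucket */
	uint32_t bucket_size;	/* sectors per bucket */
};

static bool ptr_in_bounds(const struct dev_geometry *g,
			  uint64_t offset, uint32_t size_ondisk)
{
	uint64_t bucket = offset / g->bucket_size;
	uint32_t bucket_offset = offset % g->bucket_size;

	if (bucket >= g->nbuckets)
		return false;	/* past the last bucket */
	if (bucket < g->first_bucket)
		return false;	/* before the first usable bucket */
	if (bucket_offset + size_ondisk > g->bucket_size)
		return false;	/* extent would span two buckets */

	return true;
}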
bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ unsigned size_ondisk = k.k->size; ++ unsigned nonce = UINT_MAX; ++ unsigned nr_ptrs = 0; ++ int ret; ++ ++ if (bkey_is_btree_ptr(k.k)) ++ size_ondisk = btree_sectors(c); ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) { ++ prt_printf(err, "invalid extent entry type (got %u, max %u)", ++ __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); ++ return -EINVAL; ++ } ++ ++ if (bkey_is_btree_ptr(k.k) && ++ !extent_entry_is_ptr(entry)) { ++ prt_printf(err, "has non ptr field"); ++ return -EINVAL; ++ } ++ ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk, ++ false, err); ++ if (ret) ++ return ret; ++ nr_ptrs++; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); ++ ++ if (crc.offset + crc.live_size > ++ crc.uncompressed_size) { ++ prt_printf(err, "checksum offset + key size > uncompressed size"); ++ return -EINVAL; ++ } ++ ++ size_ondisk = crc.compressed_size; ++ ++ if (!bch2_checksum_type_valid(c, crc.csum_type)) { ++ prt_printf(err, "invalid checksum type"); ++ return -EINVAL; ++ } ++ ++ if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) { ++ prt_printf(err, "invalid compression type"); ++ return -EINVAL; ++ } ++ ++ if (bch2_csum_type_is_encryption(crc.csum_type)) { ++ if (nonce == UINT_MAX) ++ nonce = crc.offset + crc.nonce; ++ else if (nonce != crc.offset + crc.nonce) { ++ prt_printf(err, "incorrect nonce"); ++ return -EINVAL; ++ } ++ } ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ } ++ ++ if (nr_ptrs >= BCH_BKEY_PTRS_MAX) { ++ prt_str(err, "too many ptrs"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_ptr_swab(struct bkey_s k) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ u64 *d; ++ ++ for (d = (u64 *) ptrs.start; ++ d != (u64 *) ptrs.end; ++ d++) ++ *d = swab64(*d); ++ ++ for (entry = ptrs.start; ++ entry < ptrs.end; ++ entry = extent_entry_next(entry)) { ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ entry->crc32.csum = swab32(entry->crc32.csum); ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); ++ entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ entry->crc128.csum.hi = (__force __le64) ++ swab64((__force u64) entry->crc128.csum.hi); ++ entry->crc128.csum.lo = (__force __le64) ++ swab64((__force u64) entry->crc128.csum.lo); ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ } ++} ++ ++/* Generic extent code: */ ++ ++int bch2_cut_front_s(struct bpos where, struct bkey_s k) ++{ ++ unsigned new_val_u64s = bkey_val_u64s(k.k); ++ int val_u64s_delta; ++ u64 sub; ++ ++ if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) ++ return 0; ++ ++ EBUG_ON(bkey_cmp(where, k.k->p) > 0); ++ ++ sub = where.offset - bkey_start_offset(k.k); ++ ++ k.k->size -= sub; ++ ++ if (!k.k->size) { ++ k.k->type = KEY_TYPE_deleted; ++ new_val_u64s = 0; ++ } ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: { ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ bool seen_crc = false; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ if 
(!seen_crc) ++ entry->ptr.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ entry->crc32.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ entry->crc64.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ entry->crc128.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ ++ if (extent_entry_is_crc(entry)) ++ seen_crc = true; ++ } ++ ++ break; ++ } ++ case KEY_TYPE_reflink_p: { ++ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); ++ ++ le64_add_cpu(&p.v->idx, sub); ++ break; ++ } ++ case KEY_TYPE_inline_data: ++ case KEY_TYPE_indirect_inline_data: { ++ void *p = bkey_inline_data_p(k); ++ unsigned bytes = bkey_inline_data_bytes(k.k); ++ ++ sub = min_t(u64, sub << 9, bytes); ++ ++ memmove(p, p + sub, bytes - sub); ++ ++ new_val_u64s -= sub >> 3; ++ break; ++ } ++ } ++ ++ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; ++ BUG_ON(val_u64s_delta < 0); ++ ++ set_bkey_val_u64s(k.k, new_val_u64s); ++ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); ++ return -val_u64s_delta; ++} ++ ++int bch2_cut_back_s(struct bpos where, struct bkey_s k) ++{ ++ unsigned new_val_u64s = bkey_val_u64s(k.k); ++ int val_u64s_delta; ++ u64 len = 0; ++ ++ if (bkey_cmp(where, k.k->p) >= 0) ++ return 0; ++ ++ EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); ++ ++ len = where.offset - bkey_start_offset(k.k); ++ ++ k.k->p.offset = where.offset; ++ k.k->size = len; ++ ++ if (!len) { ++ k.k->type = KEY_TYPE_deleted; ++ new_val_u64s = 0; ++ } ++ ++ switch (k.k->type) { ++ case KEY_TYPE_inline_data: ++ case KEY_TYPE_indirect_inline_data: ++ new_val_u64s = (bkey_inline_data_offset(k.k) + ++ min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3; ++ break; ++ } ++ ++ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; ++ BUG_ON(val_u64s_delta < 0); ++ ++ set_bkey_val_u64s(k.k, new_val_u64s); ++ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); ++ return -val_u64s_delta; ++} +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +new file mode 100644 +index 000000000000..3c17b81130bb +--- /dev/null ++++ b/fs/bcachefs/extents.h +@@ -0,0 +1,685 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EXTENTS_H ++#define _BCACHEFS_EXTENTS_H ++ ++#include "bcachefs.h" ++#include "bkey.h" ++#include "extents_types.h" ++ ++struct bch_fs; ++struct btree_trans; ++ ++/* extent entries: */ ++ ++#define extent_entry_last(_e) \ ++ ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) ++ ++#define entry_to_ptr(_entry) \ ++({ \ ++ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ ++ \ ++ __builtin_choose_expr( \ ++ type_is_exact(_entry, const union bch_extent_entry *), \ ++ (const struct bch_extent_ptr *) (_entry), \ ++ (struct bch_extent_ptr *) (_entry)); \ ++}) ++ ++/* downcast, preserves const */ ++#define to_entry(_entry) \ ++({ \ ++ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ ++ !type_is(_entry, struct bch_extent_ptr *) && \ ++ !type_is(_entry, struct bch_extent_stripe_ptr *)); \ ++ \ ++ __builtin_choose_expr( \ ++ (type_is_exact(_entry, const union bch_extent_crc *) || \ ++ type_is_exact(_entry, const struct bch_extent_ptr *) ||\ ++ type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ ++ (const union bch_extent_entry *) (_entry), \ ++ (union bch_extent_entry *) (_entry)); \ ++}) ++ ++#define extent_entry_next(_entry) \ ++ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) ++ ++static inline unsigned ++__extent_entry_type(const union bch_extent_entry *e) ++{ ++ return e->type ? 
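bch2_cut_front_s() above trims sectors off the front of an extent: the key shrinks by sub, pointers not covered by a checksum entry are advanced by sub sectors, while checksummed data instead advances the crc entry's offset so the stored checksum still covers the original on-disk block. A reduced sketch of that front-trim bookkeeping for one pointer plus one optional crc entry (hypothetical struct, not the bcachefs key layout):

#include <stdbool.h>
#include <stdint.h>

struct mini_extent {
	uint64_t start, size;	/* logical start and length, in sectors */
	uint64_t ptr_offset;	/* where the data lives on the device */
	bool has_crc;
	uint32_t crc_offset;	/* offset of live data into checksummed region */
};

/* Drop everything before `where`; returns false if nothing is left. */
static bool cut_front(uint64_t where, struct mini_extent *e)
{
	if (where <= e->start)
		return true;		/* nothing to trim */

	uint64_t sub = where - e->start;

	if (sub >= e->size)
		return false;		/* whole extent trimmed away */

	e->start += sub;
	e->size -= sub;

	if (e->has_crc)
		e->crc_offset += sub;	/* checksum still covers the old range */
	else
		e->ptr_offset += sub;	/* point straight at the new start */

	return true;
}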
__ffs(e->type) : BCH_EXTENT_ENTRY_MAX; ++} ++ ++static inline enum bch_extent_entry_type ++extent_entry_type(const union bch_extent_entry *e) ++{ ++ int ret = __ffs(e->type); ++ ++ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); ++ ++ return ret; ++} ++ ++static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) ++{ ++ switch (extent_entry_type(entry)) { ++#define x(f, n) \ ++ case BCH_EXTENT_ENTRY_##f: \ ++ return sizeof(struct bch_extent_##f); ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) ++{ ++ return extent_entry_bytes(entry) / sizeof(u64); ++} ++ ++static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) ++{ ++ return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; ++} ++ ++static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e) ++{ ++ return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; ++} ++ ++static inline bool extent_entry_is_crc(const union bch_extent_entry *e) ++{ ++ switch (extent_entry_type(e)) { ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++union bch_extent_crc { ++ u8 type; ++ struct bch_extent_crc32 crc32; ++ struct bch_extent_crc64 crc64; ++ struct bch_extent_crc128 crc128; ++}; ++ ++#define __entry_to_crc(_entry) \ ++ __builtin_choose_expr( \ ++ type_is_exact(_entry, const union bch_extent_entry *), \ ++ (const union bch_extent_crc *) (_entry), \ ++ (union bch_extent_crc *) (_entry)) ++ ++#define entry_to_crc(_entry) \ ++({ \ ++ EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ ++ \ ++ __entry_to_crc(_entry); \ ++}) ++ ++static inline struct bch_extent_crc_unpacked ++bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) ++{ ++#define common_fields(_crc) \ ++ .csum_type = _crc.csum_type, \ ++ .compression_type = _crc.compression_type, \ ++ .compressed_size = _crc._compressed_size + 1, \ ++ .uncompressed_size = _crc._uncompressed_size + 1, \ ++ .offset = _crc.offset, \ ++ .live_size = k->size ++ ++ if (!crc) ++ return (struct bch_extent_crc_unpacked) { ++ .compressed_size = k->size, ++ .uncompressed_size = k->size, ++ .live_size = k->size, ++ }; ++ ++ switch (extent_entry_type(to_entry(crc))) { ++ case BCH_EXTENT_ENTRY_crc32: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc32), ++ }; ++ ++ *((__le32 *) &ret.csum.lo) = crc->crc32.csum; ++ ++ memcpy(&ret.csum.lo, &crc->crc32.csum, ++ sizeof(crc->crc32.csum)); ++ ++ return ret; ++ } ++ case BCH_EXTENT_ENTRY_crc64: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc64), ++ .nonce = crc->crc64.nonce, ++ .csum.lo = (__force __le64) crc->crc64.csum_lo, ++ }; ++ ++ *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi; ++ ++ return ret; ++ } ++ case BCH_EXTENT_ENTRY_crc128: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc128), ++ .nonce = crc->crc128.nonce, ++ .csum = crc->crc128.csum, ++ }; ++ ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++#undef common_fields ++} ++ ++static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) ++{ ++ return (crc.compression_type != BCH_COMPRESSION_TYPE_none && ++ crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); ++} ++ ++/* bkey_ptrs: generically over any key type that has ptrs */ ++ ++struct bkey_ptrs_c { ++ const 
union bch_extent_entry *start; ++ const union bch_extent_entry *end; ++}; ++ ++struct bkey_ptrs { ++ union bch_extent_entry *start; ++ union bch_extent_entry *end; ++}; ++ ++static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: { ++ struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&e.v->start[0]), ++ to_entry(extent_entry_last(e)) ++ }; ++ } ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ return (struct bkey_ptrs_c) { ++ e.v->start, ++ extent_entry_last(e) ++ }; ++ } ++ case KEY_TYPE_stripe: { ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&s.v->ptrs[0]), ++ to_entry(&s.v->ptrs[s.v->nr_blocks]), ++ }; ++ } ++ case KEY_TYPE_reflink_v: { ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ return (struct bkey_ptrs_c) { ++ r.v->start, ++ bkey_val_end(r), ++ }; ++ } ++ case KEY_TYPE_btree_ptr_v2: { ++ struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&e.v->start[0]), ++ to_entry(extent_entry_last(e)) ++ }; ++ } ++ default: ++ return (struct bkey_ptrs_c) { NULL, NULL }; ++ } ++} ++ ++static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) ++{ ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); ++ ++ return (struct bkey_ptrs) { ++ (void *) p.start, ++ (void *) p.end ++ }; ++} ++ ++#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ ++ for ((_entry) = (_start); \ ++ (_entry) < (_end); \ ++ (_entry) = extent_entry_next(_entry)) ++ ++#define __bkey_ptr_next(_ptr, _end) \ ++({ \ ++ typeof(_end) _entry; \ ++ \ ++ __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ ++ if (extent_entry_is_ptr(_entry)) \ ++ break; \ ++ \ ++ _entry < (_end) ? 
entry_to_ptr(_entry) : NULL; \ ++}) ++ ++#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ ++ __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) ++ ++#define bkey_extent_entry_for_each(_p, _entry) \ ++ bkey_extent_entry_for_each_from(_p, _entry, _p.start) ++ ++#define __bkey_for_each_ptr(_start, _end, _ptr) \ ++ for ((_ptr) = (_start); \ ++ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ ++ (_ptr)++) ++ ++#define bkey_ptr_next(_p, _ptr) \ ++ __bkey_ptr_next(_ptr, (_p).end) ++ ++#define bkey_for_each_ptr(_p, _ptr) \ ++ __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) ++ ++#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ ++({ \ ++ __label__ out; \ ++ \ ++ (_ptr).idx = 0; \ ++ (_ptr).has_ec = false; \ ++ \ ++ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ ++ switch (extent_entry_type(_entry)) { \ ++ case BCH_EXTENT_ENTRY_ptr: \ ++ (_ptr).ptr = _entry->ptr; \ ++ goto out; \ ++ case BCH_EXTENT_ENTRY_crc32: \ ++ case BCH_EXTENT_ENTRY_crc64: \ ++ case BCH_EXTENT_ENTRY_crc128: \ ++ (_ptr).crc = bch2_extent_crc_unpack(_k, \ ++ entry_to_crc(_entry)); \ ++ break; \ ++ case BCH_EXTENT_ENTRY_stripe_ptr: \ ++ (_ptr).ec = _entry->stripe_ptr; \ ++ (_ptr).has_ec = true; \ ++ break; \ ++ } \ ++out: \ ++ _entry < (_end); \ ++}) ++ ++#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ ++ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ ++ (_entry) = _start; \ ++ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ ++ (_entry) = extent_entry_next(_entry)) ++ ++#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ ++ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ ++ _ptr, _entry) ++ ++#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ ++({ \ ++ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ ++ if (extent_entry_is_crc(_iter)) { \ ++ (_crc) = bch2_extent_crc_unpack(_k, \ ++ entry_to_crc(_iter)); \ ++ break; \ ++ } \ ++ \ ++ (_iter) < (_end); \ ++}) ++ ++#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ ++ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ ++ (_iter) = (_start); \ ++ bkey_crc_next(_k, _start, _end, _crc, _iter); \ ++ (_iter) = extent_entry_next(_iter)) ++ ++#define bkey_for_each_crc(_k, _p, _crc, _iter) \ ++ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) ++ ++/* Iterate over pointers in KEY_TYPE_extent: */ ++ ++#define extent_for_each_entry_from(_e, _entry, _start) \ ++ __bkey_extent_entry_for_each_from(_start, \ ++ extent_entry_last(_e),_entry) ++ ++#define extent_for_each_entry(_e, _entry) \ ++ extent_for_each_entry_from(_e, _entry, (_e).v->start) ++ ++#define extent_ptr_next(_e, _ptr) \ ++ __bkey_ptr_next(_ptr, extent_entry_last(_e)) ++ ++#define extent_for_each_ptr(_e, _ptr) \ ++ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) ++ ++#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ ++ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ ++ extent_entry_last(_e), _ptr, _entry) ++ ++/* utility code common to all keys with pointers: */ ++ ++void bch2_mark_io_failure(struct bch_io_failures *, ++ struct extent_ptr_decoded *); ++int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, ++ struct bch_io_failures *, ++ struct extent_ptr_decoded *); ++ ++/* KEY_TYPE_btree_ptr: */ ++ ++int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++void 
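The bkey_for_each_ptr()/bkey_for_each_ptr_decode() macros above all walk the same on-disk layout: a packed run of variable-size entries, where each entry's type byte identifies what it is and therefore how far to step to reach the next one. A reduced sketch of walking such a packed buffer (hypothetical entry layout with only two entry kinds, not the bch_extent_entry format):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

enum entry_type { ENTRY_PTR = 0, ENTRY_CRC = 1 };

struct entry_hdr { uint8_t type; };			/* holds 1 << entry_type */
struct ptr_entry { struct entry_hdr h; uint64_t offset; };
struct crc_entry { struct entry_hdr h; uint32_t csum; };

/* Size of the entry starting at h, or 0 if the type byte is unrecognized. */
static size_t entry_bytes(const struct entry_hdr *h)
{
	if (h->type == (1u << ENTRY_PTR))
		return sizeof(struct ptr_entry);
	if (h->type == (1u << ENTRY_CRC))
		return sizeof(struct crc_entry);
	return 0;
}

/* Walk the packed entries in [start, end), printing the pointers. */
static void walk(const void *start, const void *end)
{
	const char *p = start;

	while ((const void *)p < end) {
		const struct entry_hdr *h = (const void *)p;
		size_t bytes = entry_bytes(h);

		if (!bytes)
			break;				/* unknown entry: stop */

		if (h->type == (1u << ENTRY_PTR)) {
			const struct ptr_entry *ptr = (const void *)p;

			printf("ptr at offset %llu\n",
			       (unsigned long long)ptr->offset);
		}

		p += bytes;				/* step by this entry's size */
	}
}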
bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, ++ int, struct bkey_s); ++ ++#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ ++ .key_invalid = bch2_btree_ptr_invalid, \ ++ .val_to_text = bch2_btree_ptr_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .trans_trigger = bch2_trans_mark_extent, \ ++ .atomic_trigger = bch2_mark_extent, \ ++} ++ ++#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ ++ .key_invalid = bch2_btree_ptr_v2_invalid, \ ++ .val_to_text = bch2_btree_ptr_v2_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .compat = bch2_btree_ptr_v2_compat, \ ++ .trans_trigger = bch2_trans_mark_extent, \ ++ .atomic_trigger = bch2_mark_extent, \ ++} ++ ++/* KEY_TYPE_extent: */ ++ ++bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); ++ ++#define bch2_bkey_ops_extent (struct bkey_ops) { \ ++ .key_invalid = bch2_bkey_ptrs_invalid, \ ++ .val_to_text = bch2_bkey_ptrs_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .key_normalize = bch2_extent_normalize, \ ++ .key_merge = bch2_extent_merge, \ ++ .trans_trigger = bch2_trans_mark_extent, \ ++ .atomic_trigger = bch2_mark_extent, \ ++} ++ ++/* KEY_TYPE_reservation: */ ++ ++int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); ++void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); ++ ++#define bch2_bkey_ops_reservation (struct bkey_ops) { \ ++ .key_invalid = bch2_reservation_invalid, \ ++ .val_to_text = bch2_reservation_to_text, \ ++ .key_merge = bch2_reservation_merge, \ ++ .trans_trigger = bch2_trans_mark_reservation, \ ++ .atomic_trigger = bch2_mark_reservation, \ ++} ++ ++/* Extent checksum entries: */ ++ ++bool bch2_can_narrow_extent_crcs(struct bkey_s_c, ++ struct bch_extent_crc_unpacked); ++bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); ++void bch2_extent_crc_append(struct bkey_i *, ++ struct bch_extent_crc_unpacked); ++ ++/* Generic code for keys with pointers: */ ++ ++static inline bool bkey_is_btree_ptr(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool bkey_extent_is_direct_data(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool bkey_extent_is_inline_data(const struct bkey *k) ++{ ++ return k->type == KEY_TYPE_inline_data || ++ k->type == KEY_TYPE_indirect_inline_data; ++} ++ ++static inline unsigned bkey_inline_data_offset(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_inline_data: ++ return sizeof(struct bch_inline_data); ++ case KEY_TYPE_indirect_inline_data: ++ return sizeof(struct bch_indirect_inline_data); ++ default: ++ BUG(); ++ } ++} ++ ++static inline unsigned bkey_inline_data_bytes(const struct bkey *k) ++{ ++ return bkey_val_bytes(k) - bkey_inline_data_offset(k); ++} ++ ++#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k)) ++ ++static inline bool bkey_extent_is_data(const struct bkey *k) ++{ ++ return bkey_extent_is_direct_data(k) || ++ bkey_extent_is_inline_data(k) || ++ k->type == KEY_TYPE_reflink_p; ++} ++ ++/* ++ * Should extent be counted under inode->i_sectors? 
++ */ ++static inline bool bkey_extent_is_allocation(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reservation: ++ case KEY_TYPE_reflink_p: ++ case KEY_TYPE_reflink_v: ++ case KEY_TYPE_inline_data: ++ case KEY_TYPE_indirect_inline_data: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ if (!ptr->cached) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ if (ptr->cached) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ return BCH_DATA_btree; ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ return BCH_DATA_user; ++ case KEY_TYPE_stripe: { ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ ++ BUG_ON(ptr < s.v->ptrs || ++ ptr >= s.v->ptrs + s.v->nr_blocks); ++ ++ return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant ++ ? 
BCH_DATA_parity ++ : BCH_DATA_user; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); ++unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); ++unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); ++bool bch2_bkey_is_incompressible(struct bkey_s_c); ++unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); ++ ++unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); ++unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); ++ ++void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); ++void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); ++void bch2_extent_ptr_decoded_append(struct bkey_i *, ++ struct extent_ptr_decoded *); ++union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, ++ struct bch_extent_ptr *); ++ ++#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ ++do { \ ++ struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ ++ \ ++ _ptr = &_ptrs.start->ptr; \ ++ \ ++ while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ ++ if (_cond) { \ ++ _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ ++ _ptrs = bch2_bkey_ptrs(_k); \ ++ continue; \ ++ } \ ++ \ ++ (_ptr)++; \ ++ } \ ++} while (0) ++ ++void bch2_bkey_drop_device(struct bkey_s, unsigned); ++void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); ++const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); ++bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); ++ ++bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, ++ struct bch_extent_ptr, u64); ++bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); ++bool bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s_c); ++ ++bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); ++void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); ++ ++void bch2_ptr_swab(struct bkey_s); ++ ++/* Generic extent code: */ ++ ++enum bch_extent_overlap { ++ BCH_EXTENT_OVERLAP_ALL = 0, ++ BCH_EXTENT_OVERLAP_BACK = 1, ++ BCH_EXTENT_OVERLAP_FRONT = 2, ++ BCH_EXTENT_OVERLAP_MIDDLE = 3, ++}; ++ ++/* Returns how k overlaps with m */ ++static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, ++ const struct bkey *m) ++{ ++ int cmp1 = bkey_cmp(k->p, m->p) < 0; ++ int cmp2 = bkey_cmp(bkey_start_pos(k), ++ bkey_start_pos(m)) > 0; ++ ++ return (cmp1 << 1) + cmp2; ++} ++ ++int bch2_cut_front_s(struct bpos, struct bkey_s); ++int bch2_cut_back_s(struct bpos, struct bkey_s); ++ ++static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) ++{ ++ bch2_cut_front_s(where, bkey_i_to_s(k)); ++} ++ ++static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) ++{ ++ bch2_cut_back_s(where, bkey_i_to_s(k)); ++} ++ ++/** ++ * bch_key_resize - adjust size of @k ++ * ++ * bkey_start_offset(k) will be preserved, modifies where the extent ends ++ */ ++static inline void bch2_key_resize(struct bkey *k, unsigned new_size) ++{ ++ k->p.offset -= k->size; ++ k->p.offset += new_size; ++ k->size = new_size; ++} ++ ++/* ++ * In extent_sort_fix_overlapping(), insert_fixup_extent(), ++ * extent_merge_inline() - we're modifying keys in place that are packed. To do ++ * that we have to unpack the key, modify the unpacked key - then this ++ * copies/repacks the unpacked to the original as necessary. 
++ */ ++static inline void extent_save(struct btree *b, struct bkey_packed *dst, ++ struct bkey *src) ++{ ++ struct bkey_format *f = &b->format; ++ struct bkey_i *dst_unpacked; ++ ++ if ((dst_unpacked = packed_to_bkey(dst))) ++ dst_unpacked->k = *src; ++ else ++ BUG_ON(!bch2_bkey_pack_key(dst, src, f)); ++} ++ ++#endif /* _BCACHEFS_EXTENTS_H */ +diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h +new file mode 100644 +index 000000000000..43d6c341ecca +--- /dev/null ++++ b/fs/bcachefs/extents_types.h +@@ -0,0 +1,40 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EXTENTS_TYPES_H ++#define _BCACHEFS_EXTENTS_TYPES_H ++ ++#include "bcachefs_format.h" ++ ++struct bch_extent_crc_unpacked { ++ u32 compressed_size; ++ u32 uncompressed_size; ++ u32 live_size; ++ ++ u8 csum_type; ++ u8 compression_type; ++ ++ u16 offset; ++ ++ u16 nonce; ++ ++ struct bch_csum csum; ++}; ++ ++struct extent_ptr_decoded { ++ unsigned idx; ++ bool has_ec; ++ struct bch_extent_crc_unpacked crc; ++ struct bch_extent_ptr ptr; ++ struct bch_extent_stripe_ptr ec; ++}; ++ ++struct bch_io_failures { ++ u8 nr; ++ struct bch_dev_io_failures { ++ u8 dev; ++ u8 idx; ++ u8 nr_failed; ++ u8 nr_retries; ++ } devs[BCH_REPLICAS_MAX]; ++}; ++ ++#endif /* _BCACHEFS_EXTENTS_TYPES_H */ +diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h +new file mode 100644 +index 000000000000..05429c9631cd +--- /dev/null ++++ b/fs/bcachefs/eytzinger.h +@@ -0,0 +1,281 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _EYTZINGER_H ++#define _EYTZINGER_H ++ ++#include ++#include ++ ++#include "util.h" ++ ++/* ++ * Traversal for trees in eytzinger layout - a full binary tree layed out in an ++ * array ++ */ ++ ++/* ++ * One based indexing version: ++ * ++ * With one based indexing each level of the tree starts at a power of two - ++ * good for cacheline alignment: ++ */ ++ ++static inline unsigned eytzinger1_child(unsigned i, unsigned child) ++{ ++ EBUG_ON(child > 1); ++ ++ return (i << 1) + child; ++} ++ ++static inline unsigned eytzinger1_left_child(unsigned i) ++{ ++ return eytzinger1_child(i, 0); ++} ++ ++static inline unsigned eytzinger1_right_child(unsigned i) ++{ ++ return eytzinger1_child(i, 1); ++} ++ ++static inline unsigned eytzinger1_first(unsigned size) ++{ ++ return rounddown_pow_of_two(size); ++} ++ ++static inline unsigned eytzinger1_last(unsigned size) ++{ ++ return rounddown_pow_of_two(size + 1) - 1; ++} ++ ++/* ++ * eytzinger1_next() and eytzinger1_prev() have the nice properties that ++ * ++ * eytzinger1_next(0) == eytzinger1_first()) ++ * eytzinger1_prev(0) == eytzinger1_last()) ++ * ++ * eytzinger1_prev(eytzinger1_first()) == 0 ++ * eytzinger1_next(eytzinger1_last()) == 0 ++ */ ++ ++static inline unsigned eytzinger1_next(unsigned i, unsigned size) ++{ ++ EBUG_ON(i > size); ++ ++ if (eytzinger1_right_child(i) <= size) { ++ i = eytzinger1_right_child(i); ++ ++ i <<= __fls(size + 1) - __fls(i); ++ i >>= i > size; ++ } else { ++ i >>= ffz(i) + 1; ++ } ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_prev(unsigned i, unsigned size) ++{ ++ EBUG_ON(i > size); ++ ++ if (eytzinger1_left_child(i) <= size) { ++ i = eytzinger1_left_child(i) + 1; ++ ++ i <<= __fls(size + 1) - __fls(i); ++ i -= 1; ++ i >>= i > size; ++ } else { ++ i >>= __ffs(i) + 1; ++ } ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_extra(unsigned size) ++{ ++ return (size + 1 - rounddown_pow_of_two(size)) << 1; ++} ++ ++static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, ++ unsigned 
extra) ++{ ++ unsigned b = __fls(i); ++ unsigned shift = __fls(size) - b; ++ int s; ++ ++ EBUG_ON(!i || i > size); ++ ++ i ^= 1U << b; ++ i <<= 1; ++ i |= 1; ++ i <<= shift; ++ ++ /* ++ * sign bit trick: ++ * ++ * if (i > extra) ++ * i -= (i - extra) >> 1; ++ */ ++ s = extra - i; ++ i += (s >> 1) & (s >> 31); ++ ++ return i; ++} ++ ++static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ unsigned shift; ++ int s; ++ ++ EBUG_ON(!i || i > size); ++ ++ /* ++ * sign bit trick: ++ * ++ * if (i > extra) ++ * i += i - extra; ++ */ ++ s = extra - i; ++ i -= s & (s >> 31); ++ ++ shift = __ffs(i); ++ ++ i >>= shift + 1; ++ i |= 1U << (__fls(size) - shift); ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) ++{ ++ return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); ++} ++ ++static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) ++{ ++ return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); ++} ++ ++#define eytzinger1_for_each(_i, _size) \ ++ for ((_i) = eytzinger1_first((_size)); \ ++ (_i) != 0; \ ++ (_i) = eytzinger1_next((_i), (_size))) ++ ++/* Zero based indexing version: */ ++ ++static inline unsigned eytzinger0_child(unsigned i, unsigned child) ++{ ++ EBUG_ON(child > 1); ++ ++ return (i << 1) + 1 + child; ++} ++ ++static inline unsigned eytzinger0_left_child(unsigned i) ++{ ++ return eytzinger0_child(i, 0); ++} ++ ++static inline unsigned eytzinger0_right_child(unsigned i) ++{ ++ return eytzinger0_child(i, 1); ++} ++ ++static inline unsigned eytzinger0_first(unsigned size) ++{ ++ return eytzinger1_first(size) - 1; ++} ++ ++static inline unsigned eytzinger0_last(unsigned size) ++{ ++ return eytzinger1_last(size) - 1; ++} ++ ++static inline unsigned eytzinger0_next(unsigned i, unsigned size) ++{ ++ return eytzinger1_next(i + 1, size) - 1; ++} ++ ++static inline unsigned eytzinger0_prev(unsigned i, unsigned size) ++{ ++ return eytzinger1_prev(i + 1, size) - 1; ++} ++ ++static inline unsigned eytzinger0_extra(unsigned size) ++{ ++ return eytzinger1_extra(size); ++} ++ ++static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ return __eytzinger1_to_inorder(i + 1, size, extra) - 1; ++} ++ ++static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ return __inorder_to_eytzinger1(i + 1, size, extra) - 1; ++} ++ ++static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) ++{ ++ return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); ++} ++ ++static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) ++{ ++ return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); ++} ++ ++#define eytzinger0_for_each(_i, _size) \ ++ for ((_i) = eytzinger0_first((_size)); \ ++ (_i) != -1; \ ++ (_i) = eytzinger0_next((_i), (_size))) ++ ++typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); ++ ++/* return greatest node <= @search, or -1 if not found */ ++static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, ++ eytzinger_cmp_fn cmp, const void *search) ++{ ++ unsigned i, n = 0; ++ ++ if (!nr) ++ return -1; ++ ++ do { ++ i = n; ++ n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); ++ } while (n < nr); ++ ++ if (n & 1) { ++ /* @i was greater than @search, return previous node: */ ++ ++ if (i == eytzinger0_first(nr)) ++ return -1; ++ ++ return eytzinger0_prev(i, nr); ++ } else { ++ return i; ++ } ++} ++ ++#define 
eytzinger0_find(base, nr, size, _cmp, search) \ ++({ \ ++ void *_base = (base); \ ++ void *_search = (search); \ ++ size_t _nr = (nr); \ ++ size_t _size = (size); \ ++ size_t _i = 0; \ ++ int _res; \ ++ \ ++ while (_i < _nr && \ ++ (_res = _cmp(_search, _base + _i * _size, _size))) \ ++ _i = eytzinger0_child(_i, _res > 0); \ ++ _i; \ ++}) ++ ++void eytzinger0_sort(void *, size_t, size_t, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)); ++ ++#endif /* _EYTZINGER_H */ +diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h +new file mode 100644 +index 000000000000..cdb272708a4b +--- /dev/null ++++ b/fs/bcachefs/fifo.h +@@ -0,0 +1,127 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FIFO_H ++#define _BCACHEFS_FIFO_H ++ ++#include "util.h" ++ ++#define FIFO(type) \ ++struct { \ ++ size_t front, back, size, mask; \ ++ type *data; \ ++} ++ ++#define DECLARE_FIFO(type, name) FIFO(type) name ++ ++#define fifo_buf_size(fifo) \ ++ ((fifo)->size \ ++ ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ ++ : 0) ++ ++#define init_fifo(fifo, _size, _gfp) \ ++({ \ ++ (fifo)->front = (fifo)->back = 0; \ ++ (fifo)->size = (_size); \ ++ (fifo)->mask = (fifo)->size \ ++ ? roundup_pow_of_two((fifo)->size) - 1 \ ++ : 0; \ ++ (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ ++}) ++ ++#define free_fifo(fifo) \ ++do { \ ++ kvpfree((fifo)->data, fifo_buf_size(fifo)); \ ++ (fifo)->data = NULL; \ ++} while (0) ++ ++#define fifo_swap(l, r) \ ++do { \ ++ swap((l)->front, (r)->front); \ ++ swap((l)->back, (r)->back); \ ++ swap((l)->size, (r)->size); \ ++ swap((l)->mask, (r)->mask); \ ++ swap((l)->data, (r)->data); \ ++} while (0) ++ ++#define fifo_move(dest, src) \ ++do { \ ++ typeof(*((dest)->data)) _t; \ ++ while (!fifo_full(dest) && \ ++ fifo_pop(src, _t)) \ ++ fifo_push(dest, _t); \ ++} while (0) ++ ++#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) ++#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) ++ ++#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) ++#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) ++ ++#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) ++#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) ++ ++#define fifo_entry_idx_abs(fifo, p) \ ++ ((((p) >= &fifo_peek_front(fifo) \ ++ ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ ++ (((p) - (fifo)->data))) ++ ++#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) ++#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask] ++ ++#define fifo_push_back_ref(f) \ ++ (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) ++ ++#define fifo_push_front_ref(f) \ ++ (fifo_full((f)) ? 
NULL : &(f)->data[--(f)->front & (f)->mask]) ++ ++#define fifo_push_back(fifo, new) \ ++({ \ ++ typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \ ++ if (_r) \ ++ *_r = (new); \ ++ _r != NULL; \ ++}) ++ ++#define fifo_push_front(fifo, new) \ ++({ \ ++ typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \ ++ if (_r) \ ++ *_r = (new); \ ++ _r != NULL; \ ++}) ++ ++#define fifo_pop_front(fifo, i) \ ++({ \ ++ bool _r = !fifo_empty((fifo)); \ ++ if (_r) \ ++ (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ ++ _r; \ ++}) ++ ++#define fifo_pop_back(fifo, i) \ ++({ \ ++ bool _r = !fifo_empty((fifo)); \ ++ if (_r) \ ++ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ ++ _r; \ ++}) ++ ++#define fifo_push_ref(fifo) fifo_push_back_ref(fifo) ++#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) ++#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) ++#define fifo_peek(fifo) fifo_peek_front(fifo) ++ ++#define fifo_for_each_entry(_entry, _fifo, _iter) \ ++ for (typecheck(typeof((_fifo)->front), _iter), \ ++ (_iter) = (_fifo)->front; \ ++ ((_iter != (_fifo)->back) && \ ++ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ ++ (_iter)++) ++ ++#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ ++ for (typecheck(typeof((_fifo)->front), _iter), \ ++ (_iter) = (_fifo)->front; \ ++ ((_iter != (_fifo)->back) && \ ++ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ ++ (_iter)++) ++ ++#endif /* _BCACHEFS_FIFO_H */ +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +new file mode 100644 +index 000000000000..53ffc684223c +--- /dev/null ++++ b/fs/bcachefs/fs-common.c +@@ -0,0 +1,496 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "acl.h" ++#include "btree_update.h" ++#include "dirent.h" ++#include "fs-common.h" ++#include "inode.h" ++#include "subvolume.h" ++#include "xattr.h" ++ ++#include ++ ++static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) ++{ ++ return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; ++} ++ ++int bch2_create_trans(struct btree_trans *trans, ++ subvol_inum dir, ++ struct bch_inode_unpacked *dir_u, ++ struct bch_inode_unpacked *new_inode, ++ const struct qstr *name, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct posix_acl *default_acl, ++ struct posix_acl *acl, ++ subvol_inum snapshot_src, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter dir_iter = { NULL }; ++ struct btree_iter inode_iter = { NULL }; ++ subvol_inum new_inum = dir; ++ u64 now = bch2_current_time(c); ++ u64 cpu = raw_smp_processor_id(); ++ u64 dir_target; ++ u32 snapshot; ++ unsigned dir_type = mode_to_type(mode); ++ int ret; ++ ++ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ if (!(flags & BCH_CREATE_SNAPSHOT)) { ++ /* Normal create path - allocate a new inode: */ ++ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); ++ ++ if (flags & BCH_CREATE_TMPFILE) ++ new_inode->bi_flags |= BCH_INODE_UNLINKED; ++ ++ ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); ++ if (ret) ++ goto err; ++ ++ snapshot_src = (subvol_inum) { 0 }; ++ } else { ++ /* ++ * Creating a snapshot - we're not allocating a new inode, but ++ * we do have to lookup the root inode of the subvolume we're ++ * snapshotting and update it (in the new snapshot): ++ */ ++ ++ if (!snapshot_src.inum) { ++ /* Inode wasn't specified, just snapshot: */ ++ 
struct bch_subvolume s; ++ ++ ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, ++ BTREE_ITER_CACHED, &s); ++ if (ret) ++ goto err; ++ ++ snapshot_src.inum = le64_to_cpu(s.inode); ++ } ++ ++ ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ if (new_inode->bi_subvol != snapshot_src.subvol) { ++ /* Not a subvolume root: */ ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ /* ++ * If we're not root, we have to own the subvolume being ++ * snapshotted: ++ */ ++ if (uid && new_inode->bi_uid != uid) { ++ ret = -EPERM; ++ goto err; ++ } ++ ++ flags |= BCH_CREATE_SUBVOL; ++ } ++ ++ new_inum.inum = new_inode->bi_inum; ++ dir_target = new_inode->bi_inum; ++ ++ if (flags & BCH_CREATE_SUBVOL) { ++ u32 new_subvol, dir_snapshot; ++ ++ ret = bch2_subvolume_create(trans, new_inode->bi_inum, ++ snapshot_src.subvol, ++ &new_subvol, &snapshot, ++ (flags & BCH_CREATE_SNAPSHOT_RO) != 0); ++ if (ret) ++ goto err; ++ ++ new_inode->bi_parent_subvol = dir.subvol; ++ new_inode->bi_subvol = new_subvol; ++ new_inum.subvol = new_subvol; ++ dir_target = new_subvol; ++ dir_type = DT_SUBVOL; ++ ++ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot); ++ ret = bch2_btree_iter_traverse(&dir_iter); ++ if (ret) ++ goto err; ++ } ++ ++ if (!(flags & BCH_CREATE_SNAPSHOT)) { ++ if (default_acl) { ++ ret = bch2_set_acl_trans(trans, new_inum, new_inode, ++ default_acl, ACL_TYPE_DEFAULT); ++ if (ret) ++ goto err; ++ } ++ ++ if (acl) { ++ ret = bch2_set_acl_trans(trans, new_inum, new_inode, ++ acl, ACL_TYPE_ACCESS); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ if (!(flags & BCH_CREATE_TMPFILE)) { ++ struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); ++ u64 dir_offset; ++ ++ if (is_subdir_for_nlink(new_inode)) ++ dir_u->bi_nlink++; ++ dir_u->bi_mtime = dir_u->bi_ctime = now; ++ ++ ret = bch2_inode_write(trans, &dir_iter, dir_u); ++ if (ret) ++ goto err; ++ ++ ret = bch2_dirent_create(trans, dir, &dir_hash, ++ dir_type, ++ name, ++ dir_target, ++ &dir_offset, ++ BCH_HASH_SET_MUST_CREATE); ++ if (ret) ++ goto err; ++ ++ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { ++ new_inode->bi_dir = dir_u->bi_inum; ++ new_inode->bi_dir_offset = dir_offset; ++ } ++ } ++ ++ inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; ++ bch2_btree_iter_set_snapshot(&inode_iter, snapshot); ++ ++ ret = bch2_btree_iter_traverse(&inode_iter) ?: ++ bch2_inode_write(trans, &inode_iter, new_inode); ++err: ++ bch2_trans_iter_exit(trans, &inode_iter); ++ bch2_trans_iter_exit(trans, &dir_iter); ++ return ret; ++} ++ ++int bch2_link_trans(struct btree_trans *trans, ++ subvol_inum dir, struct bch_inode_unpacked *dir_u, ++ subvol_inum inum, struct bch_inode_unpacked *inode_u, ++ const struct qstr *name) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter dir_iter = { NULL }; ++ struct btree_iter inode_iter = { NULL }; ++ struct bch_hash_info dir_hash; ++ u64 now = bch2_current_time(c); ++ u64 dir_offset = 0; ++ int ret; ++ ++ if (dir.subvol != inum.subvol) ++ return -EXDEV; ++ ++ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ inode_u->bi_ctime = now; ++ ret = bch2_inode_nlink_inc(inode_u); ++ if (ret) ++ return ret; ++ ++ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ dir_u->bi_mtime = dir_u->bi_ctime = now; ++ ++ dir_hash = bch2_hash_info_init(c, dir_u); 
++ ++ ret = bch2_dirent_create(trans, dir, &dir_hash, ++ mode_to_type(inode_u->bi_mode), ++ name, inum.inum, &dir_offset, ++ BCH_HASH_SET_MUST_CREATE); ++ if (ret) ++ goto err; ++ ++ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { ++ inode_u->bi_dir = dir.inum; ++ inode_u->bi_dir_offset = dir_offset; ++ } ++ ++ ret = bch2_inode_write(trans, &dir_iter, dir_u) ?: ++ bch2_inode_write(trans, &inode_iter, inode_u); ++err: ++ bch2_trans_iter_exit(trans, &dir_iter); ++ bch2_trans_iter_exit(trans, &inode_iter); ++ return ret; ++} ++ ++int bch2_unlink_trans(struct btree_trans *trans, ++ subvol_inum dir, ++ struct bch_inode_unpacked *dir_u, ++ struct bch_inode_unpacked *inode_u, ++ const struct qstr *name, ++ bool deleting_snapshot) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter dir_iter = { NULL }; ++ struct btree_iter dirent_iter = { NULL }; ++ struct btree_iter inode_iter = { NULL }; ++ struct bch_hash_info dir_hash; ++ subvol_inum inum; ++ u64 now = bch2_current_time(c); ++ struct bkey_s_c k; ++ int ret; ++ ++ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ dir_hash = bch2_hash_info_init(c, dir_u); ++ ++ ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, ++ name, &inum, BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { ++ ret = bch2_empty_dir_trans(trans, inum); ++ if (ret) ++ goto err; ++ } ++ ++ if (deleting_snapshot && !inode_u->bi_subvol) { ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ if (deleting_snapshot || inode_u->bi_subvol) { ++ ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); ++ if (ret) ++ goto err; ++ ++ k = bch2_btree_iter_peek_slot(&dirent_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ /* ++ * If we're deleting a subvolume, we need to really delete the ++ * dirent, not just emit a whiteout in the current snapshot: ++ */ ++ bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); ++ ret = bch2_btree_iter_traverse(&dirent_iter); ++ if (ret) ++ goto err; ++ } else { ++ bch2_inode_nlink_dec(trans, inode_u); ++ } ++ ++ if (inode_u->bi_dir == dirent_iter.pos.inode && ++ inode_u->bi_dir_offset == dirent_iter.pos.offset) { ++ inode_u->bi_dir = 0; ++ inode_u->bi_dir_offset = 0; ++ } ++ ++ dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; ++ dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); ++ ++ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, ++ &dir_hash, &dirent_iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: ++ bch2_inode_write(trans, &dir_iter, dir_u) ?: ++ bch2_inode_write(trans, &inode_iter, inode_u); ++err: ++ bch2_trans_iter_exit(trans, &inode_iter); ++ bch2_trans_iter_exit(trans, &dirent_iter); ++ bch2_trans_iter_exit(trans, &dir_iter); ++ return ret; ++} ++ ++bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, ++ struct bch_inode_unpacked *src_u) ++{ ++ u64 src, dst; ++ unsigned id; ++ bool ret = false; ++ ++ for (id = 0; id < Inode_opt_nr; id++) { ++ /* Skip attributes that were explicitly set on this inode */ ++ if (dst_u->bi_fields_set & (1 << id)) ++ continue; ++ ++ src = bch2_inode_opt_get(src_u, id); ++ dst = bch2_inode_opt_get(dst_u, id); ++ ++ if (src == dst) ++ continue; ++ ++ bch2_inode_opt_set(dst_u, id, src); ++ ret = true; ++ } ++ ++ return ret; ++} ++ ++int bch2_rename_trans(struct btree_trans *trans, ++ subvol_inum src_dir, struct bch_inode_unpacked 
*src_dir_u, ++ subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, ++ struct bch_inode_unpacked *src_inode_u, ++ struct bch_inode_unpacked *dst_inode_u, ++ const struct qstr *src_name, ++ const struct qstr *dst_name, ++ enum bch_rename_mode mode) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter src_dir_iter = { NULL }; ++ struct btree_iter dst_dir_iter = { NULL }; ++ struct btree_iter src_inode_iter = { NULL }; ++ struct btree_iter dst_inode_iter = { NULL }; ++ struct bch_hash_info src_hash, dst_hash; ++ subvol_inum src_inum, dst_inum; ++ u64 src_offset, dst_offset; ++ u64 now = bch2_current_time(c); ++ int ret; ++ ++ ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ src_hash = bch2_hash_info_init(c, src_dir_u); ++ ++ if (dst_dir.inum != src_dir.inum || ++ dst_dir.subvol != src_dir.subvol) { ++ ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ dst_hash = bch2_hash_info_init(c, dst_dir_u); ++ } else { ++ dst_dir_u = src_dir_u; ++ dst_hash = src_hash; ++ } ++ ++ ret = bch2_dirent_rename(trans, ++ src_dir, &src_hash, ++ dst_dir, &dst_hash, ++ src_name, &src_inum, &src_offset, ++ dst_name, &dst_inum, &dst_offset, ++ mode); ++ if (ret) ++ goto err; ++ ++ ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ ++ if (dst_inum.inum) { ++ ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto err; ++ } ++ ++ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { ++ src_inode_u->bi_dir = dst_dir_u->bi_inum; ++ src_inode_u->bi_dir_offset = dst_offset; ++ ++ if (mode == BCH_RENAME_EXCHANGE) { ++ dst_inode_u->bi_dir = src_dir_u->bi_inum; ++ dst_inode_u->bi_dir_offset = src_offset; ++ } ++ ++ if (mode == BCH_RENAME_OVERWRITE && ++ dst_inode_u->bi_dir == dst_dir_u->bi_inum && ++ dst_inode_u->bi_dir_offset == src_offset) { ++ dst_inode_u->bi_dir = 0; ++ dst_inode_u->bi_dir_offset = 0; ++ } ++ } ++ ++ if (mode == BCH_RENAME_OVERWRITE) { ++ if (S_ISDIR(src_inode_u->bi_mode) != ++ S_ISDIR(dst_inode_u->bi_mode)) { ++ ret = -ENOTDIR; ++ goto err; ++ } ++ ++ if (S_ISDIR(dst_inode_u->bi_mode) && ++ bch2_empty_dir_trans(trans, dst_inum)) { ++ ret = -ENOTEMPTY; ++ goto err; ++ } ++ } ++ ++ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && ++ S_ISDIR(src_inode_u->bi_mode)) { ++ ret = -EXDEV; ++ goto err; ++ } ++ ++ if (mode == BCH_RENAME_EXCHANGE && ++ bch2_reinherit_attrs(dst_inode_u, src_dir_u) && ++ S_ISDIR(dst_inode_u->bi_mode)) { ++ ret = -EXDEV; ++ goto err; ++ } ++ ++ if (is_subdir_for_nlink(src_inode_u)) { ++ src_dir_u->bi_nlink--; ++ dst_dir_u->bi_nlink++; ++ } ++ ++ if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { ++ dst_dir_u->bi_nlink--; ++ src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; ++ } ++ ++ if (mode == BCH_RENAME_OVERWRITE) ++ bch2_inode_nlink_dec(trans, dst_inode_u); ++ ++ src_dir_u->bi_mtime = now; ++ src_dir_u->bi_ctime = now; ++ ++ if (src_dir.inum != dst_dir.inum) { ++ dst_dir_u->bi_mtime = now; ++ dst_dir_u->bi_ctime = now; ++ } ++ ++ src_inode_u->bi_ctime = now; ++ ++ if (dst_inum.inum) ++ dst_inode_u->bi_ctime = now; ++ ++ ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: ++ (src_dir.inum != dst_dir.inum ++ ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) ++ : 0 ) ?: ++ bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: ++ (dst_inum.inum ++ ? 
bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) ++ : 0 ); ++err: ++ bch2_trans_iter_exit(trans, &dst_inode_iter); ++ bch2_trans_iter_exit(trans, &src_inode_iter); ++ bch2_trans_iter_exit(trans, &dst_dir_iter); ++ bch2_trans_iter_exit(trans, &src_dir_iter); ++ return ret; ++} +diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h +new file mode 100644 +index 000000000000..dde237859514 +--- /dev/null ++++ b/fs/bcachefs/fs-common.h +@@ -0,0 +1,43 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_COMMON_H ++#define _BCACHEFS_FS_COMMON_H ++ ++struct posix_acl; ++ ++#define BCH_CREATE_TMPFILE (1U << 0) ++#define BCH_CREATE_SUBVOL (1U << 1) ++#define BCH_CREATE_SNAPSHOT (1U << 2) ++#define BCH_CREATE_SNAPSHOT_RO (1U << 3) ++ ++int bch2_create_trans(struct btree_trans *, subvol_inum, ++ struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *, ++ uid_t, gid_t, umode_t, dev_t, ++ struct posix_acl *, ++ struct posix_acl *, ++ subvol_inum, unsigned); ++ ++int bch2_link_trans(struct btree_trans *, ++ subvol_inum, struct bch_inode_unpacked *, ++ subvol_inum, struct bch_inode_unpacked *, ++ const struct qstr *); ++ ++int bch2_unlink_trans(struct btree_trans *, subvol_inum, ++ struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *, bool); ++ ++int bch2_rename_trans(struct btree_trans *, ++ subvol_inum, struct bch_inode_unpacked *, ++ subvol_inum, struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *, ++ const struct qstr *, ++ enum bch_rename_mode); ++ ++bool bch2_reinherit_attrs(struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *); ++ ++#endif /* _BCACHEFS_FS_COMMON_H */ +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +new file mode 100644 +index 000000000000..f37bc43e27f4 +--- /dev/null ++++ b/fs/bcachefs/fs-io.c +@@ -0,0 +1,3496 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_buf.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "clock.h" ++#include "error.h" ++#include "extents.h" ++#include "extent_update.h" ++#include "fs.h" ++#include "fs-io.h" ++#include "fsck.h" ++#include "inode.h" ++#include "journal.h" ++#include "io.h" ++#include "keylist.h" ++#include "quota.h" ++#include "reflink.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++static inline bool bio_full(struct bio *bio, unsigned len) ++{ ++ if (bio->bi_vcnt >= bio->bi_max_vecs) ++ return true; ++ if (bio->bi_iter.bi_size > UINT_MAX - len) ++ return true; ++ return false; ++} ++ ++static inline struct address_space *faults_disabled_mapping(void) ++{ ++ return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); ++} ++ ++static inline void set_fdm_dropped_locks(void) ++{ ++ current->faults_disabled_mapping = ++ (void *) (((unsigned long) current->faults_disabled_mapping)|1); ++} ++ ++static inline bool fdm_dropped_locks(void) ++{ ++ return ((unsigned long) current->faults_disabled_mapping) & 1; ++} ++ ++struct quota_res { ++ u64 sectors; ++}; ++ ++struct bch_writepage_io { ++ struct closure cl; ++ struct bch_inode_info *inode; ++ ++ /* must be last: */ ++ struct bch_write_op op; ++}; ++ ++struct dio_write { ++ struct completion done; ++ struct kiocb *req; ++ struct mm_struct *mm; ++ unsigned loop:1, ++ sync:1, ++ free_iov:1; ++ struct quota_res 
quota_res; ++ u64 written; ++ ++ struct iov_iter iter; ++ struct iovec inline_vecs[2]; ++ ++ /* must be last: */ ++ struct bch_write_op op; ++}; ++ ++struct dio_read { ++ struct closure cl; ++ struct kiocb *req; ++ long ret; ++ bool should_dirty; ++ struct bch_read_bio rbio; ++}; ++ ++/* pagecache_block must be held */ ++static int write_invalidate_inode_pages_range(struct address_space *mapping, ++ loff_t start, loff_t end) ++{ ++ int ret; ++ ++ /* ++ * XXX: the way this is currently implemented, we can spin if a process ++ * is continually redirtying a specific page ++ */ ++ do { ++ if (!mapping->nrpages) ++ return 0; ++ ++ ret = filemap_write_and_wait_range(mapping, start, end); ++ if (ret) ++ break; ++ ++ if (!mapping->nrpages) ++ return 0; ++ ++ ret = invalidate_inode_pages2_range(mapping, ++ start >> PAGE_SHIFT, ++ end >> PAGE_SHIFT); ++ } while (ret == -EBUSY); ++ ++ return ret; ++} ++ ++/* quotas */ ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++static void bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) ++{ ++ if (!res->sectors) ++ return; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ BUG_ON(res->sectors > inode->ei_quota_reserved); ++ ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, ++ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); ++ inode->ei_quota_reserved -= res->sectors; ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ res->sectors = 0; ++} ++ ++static int bch2_quota_reservation_add(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res, ++ unsigned sectors, ++ bool check_enospc) ++{ ++ int ret; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, ++ check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); ++ if (likely(!ret)) { ++ inode->ei_quota_reserved += sectors; ++ res->sectors += sectors; ++ } ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ return ret; ++} ++ ++#else ++ ++static void bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) ++{ ++} ++ ++static int bch2_quota_reservation_add(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res, ++ unsigned sectors, ++ bool check_enospc) ++{ ++ return 0; ++} ++ ++#endif ++ ++/* i_size updates: */ ++ ++struct inode_new_size { ++ loff_t new_size; ++ u64 now; ++ unsigned fields; ++}; ++ ++static int inode_set_size(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct inode_new_size *s = p; ++ ++ bi->bi_size = s->new_size; ++ if (s->fields & ATTR_ATIME) ++ bi->bi_atime = s->now; ++ if (s->fields & ATTR_MTIME) ++ bi->bi_mtime = s->now; ++ if (s->fields & ATTR_CTIME) ++ bi->bi_ctime = s->now; ++ ++ return 0; ++} ++ ++int __must_check bch2_write_inode_size(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ loff_t new_size, unsigned fields) ++{ ++ struct inode_new_size s = { ++ .new_size = new_size, ++ .now = bch2_current_time(c), ++ .fields = fields, ++ }; ++ ++ return bch2_write_inode(c, inode, inode_set_size, &s, fields); ++} ++ ++static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, ++ struct quota_res *quota_res, s64 sectors) ++{ ++ if (!sectors) ++ return; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, ++ "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", ++ inode->v.i_ino, (u64) inode->v.i_blocks, sectors, ++ inode->ei_inode.bi_sectors); ++ inode->v.i_blocks += sectors; ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ 
if (quota_res && sectors > 0) { ++ BUG_ON(sectors > quota_res->sectors); ++ BUG_ON(sectors > inode->ei_quota_reserved); ++ ++ quota_res->sectors -= sectors; ++ inode->ei_quota_reserved -= sectors; ++ } else { ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); ++ } ++#endif ++ mutex_unlock(&inode->ei_quota_lock); ++} ++ ++/* page state: */ ++ ++/* stored in page->private: */ ++ ++struct bch_page_sector { ++ /* Uncompressed, fully allocated replicas (or on disk reservation): */ ++ unsigned nr_replicas:4; ++ ++ /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ ++ unsigned replicas_reserved:4; ++ ++ /* i_sectors: */ ++ enum { ++ SECTOR_UNALLOCATED, ++ SECTOR_RESERVED, ++ SECTOR_DIRTY, ++ SECTOR_DIRTY_RESERVED, ++ SECTOR_ALLOCATED, ++ } state:8; ++}; ++ ++struct bch_page_state { ++ spinlock_t lock; ++ atomic_t write_count; ++ bool uptodate; ++ struct bch_page_sector s[PAGE_SECTORS]; ++}; ++ ++static inline struct bch_page_state *__bch2_page_state(struct page *page) ++{ ++ return page_has_private(page) ++ ? (struct bch_page_state *) page_private(page) ++ : NULL; ++} ++ ++static inline struct bch_page_state *bch2_page_state(struct page *page) ++{ ++ EBUG_ON(!PageLocked(page)); ++ ++ return __bch2_page_state(page); ++} ++ ++/* for newly allocated pages: */ ++static void __bch2_page_state_release(struct page *page) ++{ ++ kfree(detach_page_private(page)); ++} ++ ++static void bch2_page_state_release(struct page *page) ++{ ++ EBUG_ON(!PageLocked(page)); ++ __bch2_page_state_release(page); ++} ++ ++/* for newly allocated pages: */ ++static struct bch_page_state *__bch2_page_state_create(struct page *page, ++ gfp_t gfp) ++{ ++ struct bch_page_state *s; ++ ++ s = kzalloc(sizeof(*s), GFP_NOFS|gfp); ++ if (!s) ++ return NULL; ++ ++ spin_lock_init(&s->lock); ++ attach_page_private(page, s); ++ return s; ++} ++ ++static struct bch_page_state *bch2_page_state_create(struct page *page, ++ gfp_t gfp) ++{ ++ return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); ++} ++ ++static unsigned bkey_to_sector_state(const struct bkey *k) ++{ ++ if (k->type == KEY_TYPE_reservation) ++ return SECTOR_RESERVED; ++ if (bkey_extent_is_allocation(k)) ++ return SECTOR_ALLOCATED; ++ return SECTOR_UNALLOCATED; ++} ++ ++static void __bch2_page_state_set(struct page *page, ++ unsigned pg_offset, unsigned pg_len, ++ unsigned nr_ptrs, unsigned state) ++{ ++ struct bch_page_state *s = bch2_page_state_create(page, __GFP_NOFAIL); ++ unsigned i; ++ ++ BUG_ON(pg_offset >= PAGE_SECTORS); ++ BUG_ON(pg_offset + pg_len > PAGE_SECTORS); ++ ++ spin_lock(&s->lock); ++ ++ for (i = pg_offset; i < pg_offset + pg_len; i++) { ++ s->s[i].nr_replicas = nr_ptrs; ++ s->s[i].state = state; ++ } ++ ++ if (i == PAGE_SECTORS) ++ s->uptodate = true; ++ ++ spin_unlock(&s->lock); ++} ++ ++static int bch2_page_state_set(struct bch_fs *c, subvol_inum inum, ++ struct page **pages, unsigned nr_pages) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 offset = pages[0]->index << PAGE_SECTORS_SHIFT; ++ unsigned pg_idx = 0; ++ u32 snapshot; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, ++ SPOS(inum.inum, offset, snapshot), ++ BTREE_ITER_SLOTS, k, ret) { ++ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); ++ unsigned state = bkey_to_sector_state(k.k); ++ ++ 
while (pg_idx < nr_pages) { ++ struct page *page = pages[pg_idx]; ++ u64 pg_start = page->index << PAGE_SECTORS_SHIFT; ++ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; ++ unsigned pg_offset = max(bkey_start_offset(k.k), pg_start) - pg_start; ++ unsigned pg_len = min(k.k->p.offset, pg_end) - pg_offset - pg_start; ++ ++ BUG_ON(k.k->p.offset < pg_start); ++ BUG_ON(bkey_start_offset(k.k) > pg_end); ++ ++ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) ++ __bch2_page_state_set(page, pg_offset, pg_len, nr_ptrs, state); ++ ++ if (k.k->p.offset < pg_end) ++ break; ++ pg_idx++; ++ } ++ ++ if (pg_idx == nr_pages) ++ break; ++ } ++ ++ offset = iter.pos.offset; ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) ++{ ++ struct bvec_iter iter; ++ struct bio_vec bv; ++ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v ++ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); ++ unsigned state = bkey_to_sector_state(k.k); ++ ++ bio_for_each_segment(bv, bio, iter) ++ __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9, ++ bv.bv_len >> 9, nr_ptrs, state); ++} ++ ++static void mark_pagecache_unallocated(struct bch_inode_info *inode, ++ u64 start, u64 end) ++{ ++ pgoff_t index = start >> PAGE_SECTORS_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; ++ struct pagevec pvec; ++ ++ if (end <= start) ++ return; ++ ++ pagevec_init(&pvec); ++ ++ do { ++ unsigned nr_pages, i, j; ++ ++ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, ++ &index, end_index); ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = pvec.pages[i]; ++ u64 pg_start = page->index << PAGE_SECTORS_SHIFT; ++ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; ++ unsigned pg_offset = max(start, pg_start) - pg_start; ++ unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; ++ struct bch_page_state *s; ++ ++ BUG_ON(end <= pg_start); ++ BUG_ON(pg_offset >= PAGE_SECTORS); ++ BUG_ON(pg_offset + pg_len > PAGE_SECTORS); ++ ++ lock_page(page); ++ s = bch2_page_state(page); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = pg_offset; j < pg_offset + pg_len; j++) ++ s->s[j].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ ++ unlock_page(page); ++ } ++ pagevec_release(&pvec); ++ } while (index <= end_index); ++} ++ ++static void mark_pagecache_reserved(struct bch_inode_info *inode, ++ u64 start, u64 end) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ pgoff_t index = start >> PAGE_SECTORS_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; ++ struct pagevec pvec; ++ s64 i_sectors_delta = 0; ++ ++ if (end <= start) ++ return; ++ ++ pagevec_init(&pvec); ++ ++ do { ++ unsigned nr_pages, i, j; ++ ++ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, ++ &index, end_index); ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = pvec.pages[i]; ++ u64 pg_start = page->index << PAGE_SECTORS_SHIFT; ++ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; ++ unsigned pg_offset = max(start, pg_start) - pg_start; ++ unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; ++ struct bch_page_state *s; ++ ++ BUG_ON(end <= pg_start); ++ BUG_ON(pg_offset >= PAGE_SECTORS); ++ BUG_ON(pg_offset + pg_len > PAGE_SECTORS); ++ ++ lock_page(page); ++ s = bch2_page_state(page); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = pg_offset; j < pg_offset + pg_len; j++) ++ switch (s->s[j].state) { ++ case 
SECTOR_UNALLOCATED: ++ s->s[j].state = SECTOR_RESERVED; ++ break; ++ case SECTOR_DIRTY: ++ s->s[j].state = SECTOR_DIRTY_RESERVED; ++ i_sectors_delta--; ++ break; ++ default: ++ break; ++ } ++ spin_unlock(&s->lock); ++ } ++ ++ unlock_page(page); ++ } ++ pagevec_release(&pvec); ++ } while (index <= end_index); ++ ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++} ++ ++static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) ++{ ++ /* XXX: this should not be open coded */ ++ return inode->ei_inode.bi_data_replicas ++ ? inode->ei_inode.bi_data_replicas - 1 ++ : c->opts.data_replicas; ++} ++ ++static inline unsigned sectors_to_reserve(struct bch_page_sector *s, ++ unsigned nr_replicas) ++{ ++ return max(0, (int) nr_replicas - ++ s->nr_replicas - ++ s->replicas_reserved); ++} ++ ++static int bch2_get_page_disk_reservation(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct page *page, bool check_enospc) ++{ ++ struct bch_page_state *s = bch2_page_state_create(page, 0); ++ unsigned nr_replicas = inode_nr_replicas(c, inode); ++ struct disk_reservation disk_res = { 0 }; ++ unsigned i, disk_res_sectors = 0; ++ int ret; ++ ++ if (!s) ++ return -ENOMEM; ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) ++ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); ++ ++ if (!disk_res_sectors) ++ return 0; ++ ++ ret = bch2_disk_reservation_get(c, &disk_res, ++ disk_res_sectors, 1, ++ !check_enospc ++ ? BCH_DISK_RESERVATION_NOFAIL ++ : 0); ++ if (unlikely(ret)) ++ return ret; ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) ++ s->s[i].replicas_reserved += ++ sectors_to_reserve(&s->s[i], nr_replicas); ++ ++ return 0; ++} ++ ++struct bch2_page_reservation { ++ struct disk_reservation disk; ++ struct quota_res quota; ++}; ++ ++static void bch2_page_reservation_init(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch2_page_reservation *res) ++{ ++ memset(res, 0, sizeof(*res)); ++ ++ res->disk.nr_replicas = inode_nr_replicas(c, inode); ++} ++ ++static void bch2_page_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch2_page_reservation *res) ++{ ++ bch2_disk_reservation_put(c, &res->disk); ++ bch2_quota_reservation_put(c, inode, &res->quota); ++} ++ ++static int bch2_page_reservation_get(struct bch_fs *c, ++ struct bch_inode_info *inode, struct page *page, ++ struct bch2_page_reservation *res, ++ unsigned offset, unsigned len, bool check_enospc) ++{ ++ struct bch_page_state *s = bch2_page_state_create(page, 0); ++ unsigned i, disk_sectors = 0, quota_sectors = 0; ++ int ret; ++ ++ if (!s) ++ return -ENOMEM; ++ ++ BUG_ON(!s->uptodate); ++ ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ disk_sectors += sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); ++ quota_sectors += s->s[i].state == SECTOR_UNALLOCATED; ++ } ++ ++ if (disk_sectors) { ++ ret = bch2_disk_reservation_add(c, &res->disk, ++ disk_sectors, ++ !check_enospc ++ ? 
BCH_DISK_RESERVATION_NOFAIL ++ : 0); ++ if (unlikely(ret)) ++ return ret; ++ } ++ ++ if (quota_sectors) { ++ ret = bch2_quota_reservation_add(c, inode, &res->quota, ++ quota_sectors, ++ check_enospc); ++ if (unlikely(ret)) { ++ struct disk_reservation tmp = { ++ .sectors = disk_sectors ++ }; ++ ++ bch2_disk_reservation_put(c, &tmp); ++ res->disk.sectors -= disk_sectors; ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++static void bch2_clear_page_bits(struct page *page) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_page_state *s = bch2_page_state(page); ++ struct disk_reservation disk_res = { 0 }; ++ int i, dirty_sectors = 0; ++ ++ if (!s) ++ return; ++ ++ EBUG_ON(!PageLocked(page)); ++ EBUG_ON(PageWriteback(page)); ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) { ++ disk_res.sectors += s->s[i].replicas_reserved; ++ s->s[i].replicas_reserved = 0; ++ ++ switch (s->s[i].state) { ++ case SECTOR_DIRTY: ++ s->s[i].state = SECTOR_UNALLOCATED; ++ --dirty_sectors; ++ break; ++ case SECTOR_DIRTY_RESERVED: ++ s->s[i].state = SECTOR_RESERVED; ++ break; ++ default: ++ break; ++ } ++ } ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ i_sectors_acct(c, inode, NULL, dirty_sectors); ++ ++ bch2_page_state_release(page); ++} ++ ++static void bch2_set_page_dirty(struct bch_fs *c, ++ struct bch_inode_info *inode, struct page *page, ++ struct bch2_page_reservation *res, ++ unsigned offset, unsigned len) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i, dirty_sectors = 0; ++ ++ WARN_ON((u64) page_offset(page) + offset + len > ++ round_up((u64) i_size_read(&inode->v), block_bytes(c))); ++ ++ spin_lock(&s->lock); ++ ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ unsigned sectors = sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); ++ ++ /* ++ * This can happen if we race with the error path in ++ * bch2_writepage_io_done(): ++ */ ++ sectors = min_t(unsigned, sectors, res->disk.sectors); ++ ++ s->s[i].replicas_reserved += sectors; ++ res->disk.sectors -= sectors; ++ ++ switch (s->s[i].state) { ++ case SECTOR_UNALLOCATED: ++ s->s[i].state = SECTOR_DIRTY; ++ dirty_sectors++; ++ break; ++ case SECTOR_RESERVED: ++ s->s[i].state = SECTOR_DIRTY_RESERVED; ++ break; ++ default: ++ break; ++ } ++ } ++ ++ spin_unlock(&s->lock); ++ ++ i_sectors_acct(c, inode, &res->quota, dirty_sectors); ++ ++ if (!PageDirty(page)) ++ __set_page_dirty_nobuffers(page); ++} ++ ++vm_fault_t bch2_page_fault(struct vm_fault *vmf) ++{ ++ struct file *file = vmf->vma->vm_file; ++ struct address_space *mapping = file->f_mapping; ++ struct address_space *fdm = faults_disabled_mapping(); ++ struct bch_inode_info *inode = file_bch_inode(file); ++ int ret; ++ ++ if (fdm == mapping) ++ return VM_FAULT_SIGBUS; ++ ++ /* Lock ordering: */ ++ if (fdm > mapping) { ++ struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); ++ ++ if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock)) ++ goto got_lock; ++ ++ bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock); ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock); ++ ++ /* Signal that lock has been dropped: */ ++ set_fdm_dropped_locks(); ++ return VM_FAULT_SIGBUS; ++ } ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++got_lock: ++ ret = filemap_fault(vmf); ++ 
bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ return ret; ++} ++ ++vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) ++{ ++ struct page *page = vmf->page; ++ struct file *file = vmf->vma->vm_file; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct address_space *mapping = file->f_mapping; ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation res; ++ unsigned len; ++ loff_t isize; ++ int ret; ++ ++ bch2_page_reservation_init(c, inode, &res); ++ ++ sb_start_pagefault(inode->v.i_sb); ++ file_update_time(file); ++ ++ /* ++ * Not strictly necessary, but helps avoid dio writes livelocking in ++ * write_invalidate_inode_pages_range() - can drop this if/when we get ++ * a write_invalidate_inode_pages_range() that works without dropping ++ * page lock before invalidating page ++ */ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ lock_page(page); ++ isize = i_size_read(&inode->v); ++ ++ if (page->mapping != mapping || page_offset(page) >= isize) { ++ unlock_page(page); ++ ret = VM_FAULT_NOPAGE; ++ goto out; ++ } ++ ++ len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); ++ ++ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { ++ if (bch2_page_state_set(c, inode_inum(inode), &page, 1)) { ++ unlock_page(page); ++ ret = VM_FAULT_SIGBUS; ++ goto out; ++ } ++ } ++ ++ if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { ++ unlock_page(page); ++ ret = VM_FAULT_SIGBUS; ++ goto out; ++ } ++ ++ bch2_set_page_dirty(c, inode, page, &res, 0, len); ++ bch2_page_reservation_put(c, inode, &res); ++ ++ wait_for_stable_page(page); ++ ret = VM_FAULT_LOCKED; ++out: ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ sb_end_pagefault(inode->v.i_sb); ++ ++ return ret; ++} ++ ++void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) ++{ ++ if (offset || length < folio_size(folio)) ++ return; ++ ++ bch2_clear_page_bits(&folio->page); ++} ++ ++int bch2_releasepage(struct page *page, gfp_t gfp_mask) ++{ ++ if (PageDirty(page)) ++ return 0; ++ ++ bch2_clear_page_bits(page); ++ return 1; ++} ++ ++#ifdef CONFIG_MIGRATION ++int bch2_migrate_page(struct address_space *mapping, struct page *newpage, ++ struct page *page, enum migrate_mode mode) ++{ ++ int ret; ++ ++ EBUG_ON(!PageLocked(page)); ++ EBUG_ON(!PageLocked(newpage)); ++ ++ ret = migrate_page_move_mapping(mapping, newpage, page, 0); ++ if (ret != MIGRATEPAGE_SUCCESS) ++ return ret; ++ ++ if (PagePrivate(page)) ++ attach_page_private(newpage, detach_page_private(page)); ++ ++ if (mode != MIGRATE_SYNC_NO_COPY) ++ migrate_page_copy(newpage, page); ++ else ++ migrate_page_states(newpage, page); ++ return MIGRATEPAGE_SUCCESS; ++} ++#endif ++ ++/* readpage(s): */ ++ ++static void bch2_readpages_end_io(struct bio *bio) ++{ ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ ++ bio_for_each_segment_all(bv, bio, iter) { ++ struct page *page = bv->bv_page; ++ ++ if (!bio->bi_status) { ++ SetPageUptodate(page); ++ } else { ++ ClearPageUptodate(page); ++ SetPageError(page); ++ } ++ unlock_page(page); ++ } ++ ++ bio_put(bio); ++} ++ ++struct readpages_iter { ++ struct address_space *mapping; ++ struct page **pages; ++ unsigned nr_pages; ++ unsigned idx; ++ pgoff_t offset; ++}; ++ ++static int readpages_iter_init(struct readpages_iter *iter, ++ struct readahead_control *ractl) ++{ ++ unsigned i, nr_pages = readahead_count(ractl); ++ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->mapping = ractl->mapping; ++ iter->offset = readahead_index(ractl); ++ 
iter->nr_pages = nr_pages; ++ ++ iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); ++ if (!iter->pages) ++ return -ENOMEM; ++ ++ nr_pages = __readahead_batch(ractl, iter->pages, nr_pages); ++ for (i = 0; i < nr_pages; i++) { ++ __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); ++ put_page(iter->pages[i]); ++ } ++ ++ return 0; ++} ++ ++static inline struct page *readpage_iter_next(struct readpages_iter *iter) ++{ ++ if (iter->idx >= iter->nr_pages) ++ return NULL; ++ ++ EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); ++ ++ return iter->pages[iter->idx]; ++} ++ ++static bool extent_partial_reads_expensive(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct bch_extent_crc_unpacked crc; ++ const union bch_extent_entry *i; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, i) ++ if (crc.csum_type || crc.compression_type) ++ return true; ++ return false; ++} ++ ++static void readpage_bio_extend(struct readpages_iter *iter, ++ struct bio *bio, ++ unsigned sectors_this_extent, ++ bool get_more) ++{ ++ while (bio_sectors(bio) < sectors_this_extent && ++ bio->bi_vcnt < bio->bi_max_vecs) { ++ pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; ++ struct page *page = readpage_iter_next(iter); ++ int ret; ++ ++ if (page) { ++ if (iter->offset + iter->idx != page_offset) ++ break; ++ ++ iter->idx++; ++ } else { ++ if (!get_more) ++ break; ++ ++ page = xa_load(&iter->mapping->i_pages, page_offset); ++ if (page && !xa_is_value(page)) ++ break; ++ ++ page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); ++ if (!page) ++ break; ++ ++ if (!__bch2_page_state_create(page, 0)) { ++ put_page(page); ++ break; ++ } ++ ++ ret = add_to_page_cache_lru(page, iter->mapping, ++ page_offset, GFP_NOFS); ++ if (ret) { ++ __bch2_page_state_release(page); ++ put_page(page); ++ break; ++ } ++ ++ put_page(page); ++ } ++ ++ BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); ++ } ++} ++ ++static void bchfs_read(struct btree_trans *trans, ++ struct bch_read_bio *rbio, ++ subvol_inum inum, ++ struct readpages_iter *readpages_iter) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_buf sk; ++ int flags = BCH_READ_RETRY_IF_STALE| ++ BCH_READ_MAY_PROMOTE; ++ u32 snapshot; ++ int ret = 0; ++ ++ rbio->c = c; ++ rbio->start_time = local_clock(); ++ rbio->subvol = inum.subvol; ++ ++ bch2_bkey_buf_init(&sk); ++retry: ++ bch2_trans_begin(trans); ++ iter = (struct btree_iter) { NULL }; ++ ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, ++ SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), ++ BTREE_ITER_SLOTS); ++ while (1) { ++ struct bkey_s_c k; ++ unsigned bytes, sectors, offset_into_extent; ++ enum btree_id data_btree = BTREE_ID_extents; ++ ++ /* ++ * read_extent -> io_time_reset may cause a transaction restart ++ * without returning an error, we need to check for that here: ++ */ ++ ret = bch2_trans_relock(trans); ++ if (ret) ++ break; ++ ++ bch2_btree_iter_set_pos(&iter, ++ POS(inum.inum, rbio->bio.bi_iter.bi_sector)); ++ ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ offset_into_extent = iter.pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ bch2_bkey_buf_reassemble(&sk, c, k); ++ ++ ret = bch2_read_indirect_extent(trans, &data_btree, ++ &offset_into_extent, &sk); ++ if (ret) ++ break; ++ ++ k = bkey_i_to_s_c(sk.k); ++ ++ sectors = 
min(sectors, k.k->size - offset_into_extent); ++ ++ if (readpages_iter) ++ readpage_bio_extend(readpages_iter, &rbio->bio, sectors, ++ extent_partial_reads_expensive(k)); ++ ++ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ ++ if (rbio->bio.bi_iter.bi_size == bytes) ++ flags |= BCH_READ_LAST_FRAGMENT; ++ ++ bch2_bio_page_state_set(&rbio->bio, k); ++ ++ bch2_read_extent(trans, rbio, iter.pos, ++ data_btree, k, offset_into_extent, flags); ++ ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ break; ++ ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ bio_advance(&rbio->bio, bytes); ++ ++ ret = btree_trans_too_many_iters(trans); ++ if (ret) ++ break; ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ if (ret) { ++ bch_err_inum_ratelimited(c, inum.inum, ++ "read error %i from btree lookup", ret); ++ rbio->bio.bi_status = BLK_STS_IOERR; ++ bio_endio(&rbio->bio); ++ } ++ ++ bch2_bkey_buf_exit(&sk, c); ++} ++ ++void bch2_readahead(struct readahead_control *ractl) ++{ ++ struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct btree_trans trans; ++ struct page *page; ++ struct readpages_iter readpages_iter; ++ int ret; ++ ++ ret = readpages_iter_init(&readpages_iter, ractl); ++ BUG_ON(ret); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ while ((page = readpage_iter_next(&readpages_iter))) { ++ pgoff_t index = readpages_iter.offset + readpages_iter.idx; ++ unsigned n = min_t(unsigned, ++ readpages_iter.nr_pages - ++ readpages_iter.idx, ++ BIO_MAX_VECS); ++ struct bch_read_bio *rbio = ++ rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, ++ GFP_NOFS, &c->bio_read), ++ opts); ++ ++ readpages_iter.idx++; ++ ++ rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT; ++ rbio->bio.bi_end_io = bch2_readpages_end_io; ++ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); ++ ++ bchfs_read(&trans, rbio, inode_inum(inode), ++ &readpages_iter); ++ } ++ ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ bch2_trans_exit(&trans); ++ kfree(readpages_iter.pages); ++} ++ ++static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, ++ subvol_inum inum, struct page *page) ++{ ++ struct btree_trans trans; ++ ++ bch2_page_state_create(page, __GFP_NOFAIL); ++ ++ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); ++ rbio->bio.bi_iter.bi_sector = ++ (sector_t) page->index << PAGE_SECTORS_SHIFT; ++ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bchfs_read(&trans, rbio, inum, NULL); ++ bch2_trans_exit(&trans); ++} ++ ++int bch2_readpage(struct file *file, struct page *page) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct bch_read_bio *rbio; ++ ++ rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read), opts); ++ rbio->bio.bi_end_io = bch2_readpages_end_io; ++ ++ __bchfs_readpage(c, rbio, inode_inum(inode), page); ++ return 0; ++} ++ ++static void bch2_read_single_page_end_io(struct bio *bio) ++{ ++ complete(bio->bi_private); ++} ++ ++static int bch2_read_single_page(struct page *page, ++ struct address_space *mapping) ++{ ++ struct bch_inode_info *inode = 
to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_read_bio *rbio; ++ int ret; ++ DECLARE_COMPLETION_ONSTACK(done); ++ ++ rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read), ++ io_opts(c, &inode->ei_inode)); ++ rbio->bio.bi_private = &done; ++ rbio->bio.bi_end_io = bch2_read_single_page_end_io; ++ ++ __bchfs_readpage(c, rbio, inode_inum(inode), page); ++ wait_for_completion(&done); ++ ++ ret = blk_status_to_errno(rbio->bio.bi_status); ++ bio_put(&rbio->bio); ++ ++ if (ret < 0) ++ return ret; ++ ++ SetPageUptodate(page); ++ return 0; ++} ++ ++/* writepages: */ ++ ++struct bch_writepage_state { ++ struct bch_writepage_io *io; ++ struct bch_io_opts opts; ++}; ++ ++static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, ++ struct bch_inode_info *inode) ++{ ++ return (struct bch_writepage_state) { ++ .opts = io_opts(c, &inode->ei_inode) ++ }; ++} ++ ++static void bch2_writepage_io_free(struct closure *cl) ++{ ++ struct bch_writepage_io *io = container_of(cl, ++ struct bch_writepage_io, cl); ++ ++ bio_put(&io->op.wbio.bio); ++} ++ ++static void bch2_writepage_io_done(struct closure *cl) ++{ ++ struct bch_writepage_io *io = container_of(cl, ++ struct bch_writepage_io, cl); ++ struct bch_fs *c = io->op.c; ++ struct bio *bio = &io->op.wbio.bio; ++ struct bvec_iter_all iter; ++ struct bio_vec *bvec; ++ unsigned i; ++ ++ up(&io->op.c->io_in_flight); ++ ++ if (io->op.error) { ++ set_bit(EI_INODE_ERROR, &io->inode->ei_flags); ++ ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s; ++ ++ SetPageError(bvec->bv_page); ++ mapping_set_error(bvec->bv_page->mapping, -EIO); ++ ++ s = __bch2_page_state(bvec->bv_page); ++ spin_lock(&s->lock); ++ for (i = 0; i < PAGE_SECTORS; i++) ++ s->s[i].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ } ++ ++ if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s; ++ ++ s = __bch2_page_state(bvec->bv_page); ++ spin_lock(&s->lock); ++ for (i = 0; i < PAGE_SECTORS; i++) ++ s->s[i].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ } ++ ++ /* ++ * racing with fallocate can cause us to add fewer sectors than ++ * expected - but we shouldn't add more sectors than expected: ++ */ ++ WARN_ON_ONCE(io->op.i_sectors_delta > 0); ++ ++ /* ++ * (error (due to going RO) halfway through a page can screw that up ++ * slightly) ++ * XXX wtf? 
++ BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); ++ */ ++ ++ /* ++ * PageWriteback is effectively our ref on the inode - fixup i_blocks ++ * before calling end_page_writeback: ++ */ ++ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); ++ ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s = __bch2_page_state(bvec->bv_page); ++ ++ if (atomic_dec_and_test(&s->write_count)) ++ end_page_writeback(bvec->bv_page); ++ } ++ ++ closure_return_with_destructor(&io->cl, bch2_writepage_io_free); ++} ++ ++static void bch2_writepage_do_io(struct bch_writepage_state *w) ++{ ++ struct bch_writepage_io *io = w->io; ++ ++ down(&io->op.c->io_in_flight); ++ ++ w->io = NULL; ++ closure_call(&io->op.cl, bch2_write, NULL, &io->cl); ++ continue_at(&io->cl, bch2_writepage_io_done, NULL); ++} ++ ++/* ++ * Get a bch_writepage_io and add @page to it - appending to an existing one if ++ * possible, else allocating a new one: ++ */ ++static void bch2_writepage_io_alloc(struct bch_fs *c, ++ struct writeback_control *wbc, ++ struct bch_writepage_state *w, ++ struct bch_inode_info *inode, ++ u64 sector, ++ unsigned nr_replicas) ++{ ++ struct bch_write_op *op; ++ ++ w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, ++ REQ_OP_WRITE, ++ GFP_NOFS, ++ &c->writepage_bioset), ++ struct bch_writepage_io, op.wbio.bio); ++ ++ closure_init(&w->io->cl, NULL); ++ w->io->inode = inode; ++ ++ op = &w->io->op; ++ bch2_write_op_init(op, c, w->opts); ++ op->target = w->opts.foreground_target; ++ op->nr_replicas = nr_replicas; ++ op->res.nr_replicas = nr_replicas; ++ op->write_point = writepoint_hashed(inode->ei_last_dirtied); ++ op->subvol = inode->ei_subvol; ++ op->pos = POS(inode->v.i_ino, sector); ++ op->wbio.bio.bi_iter.bi_sector = sector; ++ op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); ++} ++ ++static int __bch2_writepage(struct page *page, ++ struct writeback_control *wbc, ++ void *data) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_writepage_state *w = data; ++ struct bch_page_state *s, orig; ++ unsigned i, offset, nr_replicas_this_write = U32_MAX; ++ loff_t i_size = i_size_read(&inode->v); ++ pgoff_t end_index = i_size >> PAGE_SHIFT; ++ int ret; ++ ++ EBUG_ON(!PageUptodate(page)); ++ ++ /* Is the page fully inside i_size? */ ++ if (page->index < end_index) ++ goto do_io; ++ ++ /* Is the page fully outside i_size? (truncate in progress) */ ++ offset = i_size & (PAGE_SIZE - 1); ++ if (page->index > end_index || !offset) { ++ unlock_page(page); ++ return 0; ++ } ++ ++ /* ++ * The page straddles i_size. It must be zeroed out on each and every ++ * writepage invocation because it may be mmapped. "A file is mapped ++ * in multiples of the page size. For a file that is not a multiple of ++ * the page size, the remaining memory is zeroed when mapped, and ++ * writes to that region are not written out to the file." 
++ */ ++ zero_user_segment(page, offset, PAGE_SIZE); ++do_io: ++ s = bch2_page_state_create(page, __GFP_NOFAIL); ++ ++ /* ++ * Things get really hairy with errors during writeback: ++ */ ++ ret = bch2_get_page_disk_reservation(c, inode, page, false); ++ BUG_ON(ret); ++ ++ /* Before unlocking the page, get copy of reservations: */ ++ spin_lock(&s->lock); ++ orig = *s; ++ spin_unlock(&s->lock); ++ ++ for (i = 0; i < PAGE_SECTORS; i++) { ++ if (s->s[i].state < SECTOR_DIRTY) ++ continue; ++ ++ nr_replicas_this_write = ++ min_t(unsigned, nr_replicas_this_write, ++ s->s[i].nr_replicas + ++ s->s[i].replicas_reserved); ++ } ++ ++ for (i = 0; i < PAGE_SECTORS; i++) { ++ if (s->s[i].state < SECTOR_DIRTY) ++ continue; ++ ++ s->s[i].nr_replicas = w->opts.compression ++ ? 0 : nr_replicas_this_write; ++ ++ s->s[i].replicas_reserved = 0; ++ s->s[i].state = SECTOR_ALLOCATED; ++ } ++ ++ BUG_ON(atomic_read(&s->write_count)); ++ atomic_set(&s->write_count, 1); ++ ++ BUG_ON(PageWriteback(page)); ++ set_page_writeback(page); ++ ++ unlock_page(page); ++ ++ offset = 0; ++ while (1) { ++ unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; ++ u64 sector; ++ ++ while (offset < PAGE_SECTORS && ++ orig.s[offset].state < SECTOR_DIRTY) ++ offset++; ++ ++ if (offset == PAGE_SECTORS) ++ break; ++ ++ while (offset + sectors < PAGE_SECTORS && ++ orig.s[offset + sectors].state >= SECTOR_DIRTY) { ++ reserved_sectors += orig.s[offset + sectors].replicas_reserved; ++ dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY; ++ sectors++; ++ } ++ BUG_ON(!sectors); ++ ++ sector = ((u64) page->index << PAGE_SECTORS_SHIFT) + offset; ++ ++ if (w->io && ++ (w->io->op.res.nr_replicas != nr_replicas_this_write || ++ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || ++ w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= ++ (BIO_MAX_VECS * PAGE_SIZE) || ++ bio_end_sector(&w->io->op.wbio.bio) != sector)) ++ bch2_writepage_do_io(w); ++ ++ if (!w->io) ++ bch2_writepage_io_alloc(c, wbc, w, inode, sector, ++ nr_replicas_this_write); ++ ++ atomic_inc(&s->write_count); ++ ++ BUG_ON(inode != w->io->inode); ++ BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page, ++ sectors << 9, offset << 9)); ++ ++ /* Check for writing past i_size: */ ++ WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > ++ round_up(i_size, block_bytes(c))); ++ ++ w->io->op.res.sectors += reserved_sectors; ++ w->io->op.i_sectors_delta -= dirty_sectors; ++ w->io->op.new_i_size = i_size; ++ ++ offset += sectors; ++ } ++ ++ if (atomic_dec_and_test(&s->write_count)) ++ end_page_writeback(page); ++ ++ return 0; ++} ++ ++int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) ++{ ++ struct bch_fs *c = mapping->host->i_sb->s_fs_info; ++ struct bch_writepage_state w = ++ bch_writepage_state_init(c, to_bch_ei(mapping->host)); ++ struct blk_plug plug; ++ int ret; ++ ++ blk_start_plug(&plug); ++ ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); ++ if (w.io) ++ bch2_writepage_do_io(&w); ++ blk_finish_plug(&plug); ++ return ret; ++} ++ ++/* buffered writes: */ ++ ++int bch2_write_begin(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned flags, ++ struct page **pagep, void **fsdata) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation *res; ++ pgoff_t index = pos >> PAGE_SHIFT; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ struct page *page; ++ int ret = -ENOMEM; ++ ++ res = kmalloc(sizeof(*res), GFP_KERNEL); ++ 
if (!res) ++ return -ENOMEM; ++ ++ bch2_page_reservation_init(c, inode, res); ++ *fsdata = res; ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ page = grab_cache_page_write_begin(mapping, index, flags); ++ if (!page) ++ goto err_unlock; ++ ++ if (PageUptodate(page)) ++ goto out; ++ ++ /* If we're writing entire page, don't need to read it in first: */ ++ if (len == PAGE_SIZE) ++ goto out; ++ ++ if (!offset && pos + len >= inode->v.i_size) { ++ zero_user_segment(page, len, PAGE_SIZE); ++ flush_dcache_page(page); ++ goto out; ++ } ++ ++ if (index > inode->v.i_size >> PAGE_SHIFT) { ++ zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); ++ flush_dcache_page(page); ++ goto out; ++ } ++readpage: ++ ret = bch2_read_single_page(page, mapping); ++ if (ret) ++ goto err; ++out: ++ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { ++ ret = bch2_page_state_set(c, inode_inum(inode), &page, 1); ++ if (ret) ++ goto out; ++ } ++ ++ ret = bch2_page_reservation_get(c, inode, page, res, ++ offset, len, true); ++ if (ret) { ++ if (!PageUptodate(page)) { ++ /* ++ * If the page hasn't been read in, we won't know if we ++ * actually need a reservation - we don't actually need ++ * to read here, we just need to check if the page is ++ * fully backed by uncompressed data: ++ */ ++ goto readpage; ++ } ++ ++ goto err; ++ } ++ ++ *pagep = page; ++ return 0; ++err: ++ unlock_page(page); ++ put_page(page); ++ *pagep = NULL; ++err_unlock: ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ kfree(res); ++ *fsdata = NULL; ++ return ret; ++} ++ ++int bch2_write_end(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned copied, ++ struct page *page, void *fsdata) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation *res = fsdata; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ if (unlikely(copied < len && !PageUptodate(page))) { ++ /* ++ * The page needs to be read in, but that would destroy ++ * our partial write - simplest thing is to just force ++ * userspace to redo the write: ++ */ ++ zero_user(page, 0, PAGE_SIZE); ++ flush_dcache_page(page); ++ copied = 0; ++ } ++ ++ spin_lock(&inode->v.i_lock); ++ if (pos + copied > inode->v.i_size) ++ i_size_write(&inode->v, pos + copied); ++ spin_unlock(&inode->v.i_lock); ++ ++ if (copied) { ++ if (!PageUptodate(page)) ++ SetPageUptodate(page); ++ ++ bch2_set_page_dirty(c, inode, page, res, offset, copied); ++ ++ inode->ei_last_dirtied = (unsigned long) current; ++ } ++ ++ unlock_page(page); ++ put_page(page); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ bch2_page_reservation_put(c, inode, res); ++ kfree(res); ++ ++ return copied; ++} ++ ++#define WRITE_BATCH_PAGES 32 ++ ++static int __bch2_buffered_write(struct bch_inode_info *inode, ++ struct address_space *mapping, ++ struct iov_iter *iter, ++ loff_t pos, unsigned len) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct page *pages[WRITE_BATCH_PAGES]; ++ struct bch2_page_reservation res; ++ unsigned long index = pos >> PAGE_SHIFT; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); ++ unsigned i, reserved = 0, set_dirty = 0; ++ unsigned copied = 0, nr_pages_copied = 0; ++ int ret = 0; ++ ++ BUG_ON(!len); ++ BUG_ON(nr_pages > ARRAY_SIZE(pages)); ++ ++ bch2_page_reservation_init(c, inode, &res); ++ ++ for (i = 0; i < nr_pages; i++) { ++ pages[i] 
= grab_cache_page_write_begin(mapping, index + i, 0); ++ if (!pages[i]) { ++ nr_pages = i; ++ if (!i) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ len = min_t(unsigned, len, ++ nr_pages * PAGE_SIZE - offset); ++ break; ++ } ++ } ++ ++ if (offset && !PageUptodate(pages[0])) { ++ ret = bch2_read_single_page(pages[0], mapping); ++ if (ret) ++ goto out; ++ } ++ ++ if ((pos + len) & (PAGE_SIZE - 1) && ++ !PageUptodate(pages[nr_pages - 1])) { ++ if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) { ++ zero_user(pages[nr_pages - 1], 0, PAGE_SIZE); ++ } else { ++ ret = bch2_read_single_page(pages[nr_pages - 1], mapping); ++ if (ret) ++ goto out; ++ } ++ } ++ ++ while (reserved < len) { ++ unsigned i = (offset + reserved) >> PAGE_SHIFT; ++ struct page *page = pages[i]; ++ unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, len - reserved, ++ PAGE_SIZE - pg_offset); ++ ++ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { ++ ret = bch2_page_state_set(c, inode_inum(inode), ++ pages + i, nr_pages - i); ++ if (ret) ++ goto out; ++ } ++ ++ ret = bch2_page_reservation_get(c, inode, page, &res, ++ pg_offset, pg_len, true); ++ if (ret) ++ goto out; ++ ++ reserved += pg_len; ++ } ++ ++ if (mapping_writably_mapped(mapping)) ++ for (i = 0; i < nr_pages; i++) ++ flush_dcache_page(pages[i]); ++ ++ while (copied < len) { ++ struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; ++ unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, len - copied, ++ PAGE_SIZE - pg_offset); ++ unsigned pg_copied = copy_page_from_iter_atomic(page, ++ pg_offset, pg_len,iter); ++ ++ if (!pg_copied) ++ break; ++ ++ if (!PageUptodate(page) && ++ pg_copied != PAGE_SIZE && ++ pos + copied + pg_copied < inode->v.i_size) { ++ zero_user(page, 0, PAGE_SIZE); ++ break; ++ } ++ ++ flush_dcache_page(page); ++ copied += pg_copied; ++ ++ if (pg_copied != pg_len) ++ break; ++ } ++ ++ if (!copied) ++ goto out; ++ ++ spin_lock(&inode->v.i_lock); ++ if (pos + copied > inode->v.i_size) ++ i_size_write(&inode->v, pos + copied); ++ spin_unlock(&inode->v.i_lock); ++ ++ while (set_dirty < copied) { ++ struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; ++ unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, copied - set_dirty, ++ PAGE_SIZE - pg_offset); ++ ++ if (!PageUptodate(page)) ++ SetPageUptodate(page); ++ ++ bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len); ++ unlock_page(page); ++ put_page(page); ++ ++ set_dirty += pg_len; ++ } ++ ++ nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); ++ inode->ei_last_dirtied = (unsigned long) current; ++out: ++ for (i = nr_pages_copied; i < nr_pages; i++) { ++ unlock_page(pages[i]); ++ put_page(pages[i]); ++ } ++ ++ bch2_page_reservation_put(c, inode, &res); ++ ++ return copied ?: ret; ++} ++ ++static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) ++{ ++ struct file *file = iocb->ki_filp; ++ struct address_space *mapping = file->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ loff_t pos = iocb->ki_pos; ++ ssize_t written = 0; ++ int ret = 0; ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ do { ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ unsigned bytes = min_t(unsigned long, iov_iter_count(iter), ++ PAGE_SIZE * WRITE_BATCH_PAGES - offset); ++again: ++ /* ++ * Bring in the user page that we will copy from _first_. 
++ * Otherwise there's a nasty deadlock on copying from the ++ * same page as we're writing to, without it being marked ++ * up-to-date. ++ * ++ * Not only is this an optimisation, but it is also required ++ * to check that the address is actually valid, when atomic ++ * usercopies are used, below. ++ */ ++ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { ++ bytes = min_t(unsigned long, iov_iter_count(iter), ++ PAGE_SIZE - offset); ++ ++ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { ++ ret = -EFAULT; ++ break; ++ } ++ } ++ ++ if (unlikely(fatal_signal_pending(current))) { ++ ret = -EINTR; ++ break; ++ } ++ ++ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); ++ if (unlikely(ret < 0)) ++ break; ++ ++ cond_resched(); ++ ++ if (unlikely(ret == 0)) { ++ /* ++ * If we were unable to copy any data at all, we must ++ * fall back to a single segment length write. ++ * ++ * If we didn't fallback here, we could livelock ++ * because not all segments in the iov can be copied at ++ * once without a pagefault. ++ */ ++ bytes = min_t(unsigned long, PAGE_SIZE - offset, ++ iov_iter_single_seg_count(iter)); ++ goto again; ++ } ++ pos += ret; ++ written += ret; ++ ret = 0; ++ ++ balance_dirty_pages_ratelimited(mapping); ++ } while (iov_iter_count(iter)); ++ ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ return written ? written : ret; ++} ++ ++/* O_DIRECT reads */ ++ ++static void bio_check_or_release(struct bio *bio, bool check_dirty) ++{ ++ if (check_dirty) { ++ bio_check_pages_dirty(bio); ++ } else { ++ bio_release_pages(bio, false); ++ bio_put(bio); ++ } ++} ++ ++static void bch2_dio_read_complete(struct closure *cl) ++{ ++ struct dio_read *dio = container_of(cl, struct dio_read, cl); ++ ++ dio->req->ki_complete(dio->req, dio->ret); ++ bio_check_or_release(&dio->rbio.bio, dio->should_dirty); ++} ++ ++static void bch2_direct_IO_read_endio(struct bio *bio) ++{ ++ struct dio_read *dio = bio->bi_private; ++ ++ if (bio->bi_status) ++ dio->ret = blk_status_to_errno(bio->bi_status); ++ ++ closure_put(&dio->cl); ++} ++ ++static void bch2_direct_IO_read_split_endio(struct bio *bio) ++{ ++ struct dio_read *dio = bio->bi_private; ++ bool should_dirty = dio->should_dirty; ++ ++ bch2_direct_IO_read_endio(bio); ++ bio_check_or_release(bio, should_dirty); ++} ++ ++static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) ++{ ++ struct file *file = req->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct dio_read *dio; ++ struct bio *bio; ++ loff_t offset = req->ki_pos; ++ bool sync = is_sync_kiocb(req); ++ size_t shorten; ++ ssize_t ret; ++ ++ if ((offset|iter->count) & (block_bytes(c) - 1)) ++ return -EINVAL; ++ ++ ret = min_t(loff_t, iter->count, ++ max_t(loff_t, 0, i_size_read(&inode->v) - offset)); ++ ++ if (!ret) ++ return ret; ++ ++ shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); ++ iter->count -= shorten; ++ ++ bio = bio_alloc_bioset(NULL, ++ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), ++ REQ_OP_READ, ++ GFP_KERNEL, ++ &c->dio_read_bioset); ++ ++ bio->bi_end_io = bch2_direct_IO_read_endio; ++ ++ dio = container_of(bio, struct dio_read, rbio.bio); ++ closure_init(&dio->cl, NULL); ++ ++ /* ++ * this is a _really_ horrible hack just to avoid an atomic sub at the ++ * end: ++ */ ++ if (!sync) { ++ set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); ++ atomic_set(&dio->cl.remaining, ++ 
CLOSURE_REMAINING_INITIALIZER - ++ CLOSURE_RUNNING + ++ CLOSURE_DESTRUCTOR); ++ } else { ++ atomic_set(&dio->cl.remaining, ++ CLOSURE_REMAINING_INITIALIZER + 1); ++ } ++ ++ dio->req = req; ++ dio->ret = ret; ++ /* ++ * This is one of the sketchier things I've encountered: we have to skip ++ * the dirtying of requests that are internal from the kernel (i.e. from ++ * loopback), because we'll deadlock on page_lock. ++ */ ++ dio->should_dirty = iter_is_iovec(iter); ++ ++ goto start; ++ while (iter->count) { ++ bio = bio_alloc_bioset(NULL, ++ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), ++ REQ_OP_READ, ++ GFP_KERNEL, ++ &c->bio_read); ++ bio->bi_end_io = bch2_direct_IO_read_split_endio; ++start: ++ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); ++ bio->bi_iter.bi_sector = offset >> 9; ++ bio->bi_private = dio; ++ ++ ret = bio_iov_iter_get_pages(bio, iter); ++ if (ret < 0) { ++ /* XXX: fault inject this path */ ++ bio->bi_status = BLK_STS_RESOURCE; ++ bio_endio(bio); ++ break; ++ } ++ ++ offset += bio->bi_iter.bi_size; ++ ++ if (dio->should_dirty) ++ bio_set_pages_dirty(bio); ++ ++ if (iter->count) ++ closure_get(&dio->cl); ++ ++ bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); ++ } ++ ++ iter->count += shorten; ++ ++ if (sync) { ++ closure_sync(&dio->cl); ++ closure_debug_destroy(&dio->cl); ++ ret = dio->ret; ++ bio_check_or_release(&dio->rbio.bio, dio->should_dirty); ++ return ret; ++ } else { ++ return -EIOCBQUEUED; ++ } ++} ++ ++ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) ++{ ++ struct file *file = iocb->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct address_space *mapping = file->f_mapping; ++ size_t count = iov_iter_count(iter); ++ ssize_t ret; ++ ++ if (!count) ++ return 0; /* skip atime */ ++ ++ if (iocb->ki_flags & IOCB_DIRECT) { ++ struct blk_plug plug; ++ ++ ret = filemap_write_and_wait_range(mapping, ++ iocb->ki_pos, ++ iocb->ki_pos + count - 1); ++ if (ret < 0) ++ return ret; ++ ++ file_accessed(file); ++ ++ blk_start_plug(&plug); ++ ret = bch2_direct_IO_read(iocb, iter); ++ blk_finish_plug(&plug); ++ ++ if (ret >= 0) ++ iocb->ki_pos += ret; ++ } else { ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ret = generic_file_read_iter(iocb, iter); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ } ++ ++ return ret; ++} ++ ++/* O_DIRECT writes */ ++ ++static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, ++ u64 offset, u64 size, ++ unsigned nr_replicas, bool compressed) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 end = offset + size; ++ u32 snapshot; ++ bool ret = true; ++ int err; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (err) ++ goto err; ++ ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, ++ SPOS(inum.inum, offset, snapshot), ++ BTREE_ITER_SLOTS, k, err) { ++ if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0) ++ break; ++ ++ if (k.k->p.snapshot != snapshot || ++ nr_replicas > bch2_bkey_replicas(c, k) || ++ (!compressed && bch2_bkey_sectors_compressed(k))) { ++ ret = false; ++ break; ++ } ++ } ++ ++ offset = iter.pos.offset; ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(err, BCH_ERR_transaction_restart)) ++ goto retry; ++ bch2_trans_exit(&trans); ++ ++ return err ? 
false : ret; ++} ++ ++static void bch2_dio_write_loop_async(struct bch_write_op *); ++ ++static long bch2_dio_write_loop(struct dio_write *dio) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct kiocb *req = dio->req; ++ struct address_space *mapping = req->ki_filp->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(req->ki_filp); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bio *bio = &dio->op.wbio.bio; ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ unsigned unaligned, iter_count; ++ bool sync = dio->sync, dropped_locks; ++ long ret; ++ ++ if (dio->loop) ++ goto loop; ++ ++ down(&c->io_in_flight); ++ ++ while (1) { ++ iter_count = dio->iter.count; ++ ++ if (kthread && dio->mm) ++ kthread_use_mm(dio->mm); ++ BUG_ON(current->faults_disabled_mapping); ++ current->faults_disabled_mapping = mapping; ++ ++ ret = bio_iov_iter_get_pages(bio, &dio->iter); ++ ++ dropped_locks = fdm_dropped_locks(); ++ ++ current->faults_disabled_mapping = NULL; ++ if (kthread && dio->mm) ++ kthread_unuse_mm(dio->mm); ++ ++ /* ++ * If the fault handler returned an error but also signalled ++ * that it dropped & retook ei_pagecache_lock, we just need to ++ * re-shoot down the page cache and retry: ++ */ ++ if (dropped_locks && ret) ++ ret = 0; ++ ++ if (unlikely(ret < 0)) ++ goto err; ++ ++ if (unlikely(dropped_locks)) { ++ ret = write_invalidate_inode_pages_range(mapping, ++ req->ki_pos, ++ req->ki_pos + iter_count - 1); ++ if (unlikely(ret)) ++ goto err; ++ ++ if (!bio->bi_iter.bi_size) ++ continue; ++ } ++ ++ unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); ++ bio->bi_iter.bi_size -= unaligned; ++ iov_iter_revert(&dio->iter, unaligned); ++ ++ if (!bio->bi_iter.bi_size) { ++ /* ++ * bio_iov_iter_get_pages was only able to get < ++ * blocksize worth of pages: ++ */ ++ ret = -EFAULT; ++ goto err; ++ } ++ ++ bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); ++ dio->op.end_io = bch2_dio_write_loop_async; ++ dio->op.target = dio->op.opts.foreground_target; ++ dio->op.write_point = writepoint_hashed((unsigned long) current); ++ dio->op.nr_replicas = dio->op.opts.data_replicas; ++ dio->op.subvol = inode->ei_subvol; ++ dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); ++ ++ if ((req->ki_flags & IOCB_DSYNC) && ++ !c->opts.journal_flush_disabled) ++ dio->op.flags |= BCH_WRITE_FLUSH; ++ dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; ++ ++ ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), ++ dio->op.opts.data_replicas, 0); ++ if (unlikely(ret) && ++ !bch2_check_range_allocated(c, inode_inum(inode), ++ dio->op.pos.offset, bio_sectors(bio), ++ dio->op.opts.data_replicas, ++ dio->op.opts.compression != 0)) ++ goto err; ++ ++ task_io_account_write(bio->bi_iter.bi_size); ++ ++ if (!dio->sync && !dio->loop && dio->iter.count) { ++ struct iovec *iov = dio->inline_vecs; ++ ++ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { ++ iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), ++ GFP_KERNEL); ++ if (unlikely(!iov)) { ++ dio->sync = sync = true; ++ goto do_io; ++ } ++ ++ dio->free_iov = true; ++ } ++ ++ memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); ++ dio->iter.iov = iov; ++ } ++do_io: ++ dio->loop = true; ++ closure_call(&dio->op.cl, bch2_write, NULL, NULL); ++ ++ if (sync) ++ wait_for_completion(&dio->done); ++ else ++ return -EIOCBQUEUED; ++loop: ++ i_sectors_acct(c, inode, &dio->quota_res, ++ dio->op.i_sectors_delta); ++ req->ki_pos += (u64) dio->op.written << 9; ++ dio->written += dio->op.written; ++ ++ 
spin_lock(&inode->v.i_lock); ++ if (req->ki_pos > inode->v.i_size) ++ i_size_write(&inode->v, req->ki_pos); ++ spin_unlock(&inode->v.i_lock); ++ ++ if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) ++ bio_for_each_segment_all(bv, bio, iter) ++ put_page(bv->bv_page); ++ bio->bi_vcnt = 0; ++ ++ if (dio->op.error) { ++ set_bit(EI_INODE_ERROR, &inode->ei_flags); ++ break; ++ } ++ ++ if (!dio->iter.count) ++ break; ++ ++ bio_reset(bio, NULL, REQ_OP_WRITE); ++ reinit_completion(&dio->done); ++ } ++ ++ ret = dio->op.error ?: ((long) dio->written << 9); ++err: ++ up(&c->io_in_flight); ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ bch2_quota_reservation_put(c, inode, &dio->quota_res); ++ ++ if (dio->free_iov) ++ kfree(dio->iter.iov); ++ ++ if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) ++ bio_for_each_segment_all(bv, bio, iter) ++ put_page(bv->bv_page); ++ bio_put(bio); ++ ++ /* inode->i_dio_count is our ref on inode and thus bch_fs */ ++ inode_dio_end(&inode->v); ++ ++ if (!sync) { ++ req->ki_complete(req, ret); ++ ret = -EIOCBQUEUED; ++ } ++ return ret; ++} ++ ++static void bch2_dio_write_loop_async(struct bch_write_op *op) ++{ ++ struct dio_write *dio = container_of(op, struct dio_write, op); ++ ++ if (dio->sync) ++ complete(&dio->done); ++ else ++ bch2_dio_write_loop(dio); ++} ++ ++static noinline ++ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) ++{ ++ struct file *file = req->ki_filp; ++ struct address_space *mapping = file->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct dio_write *dio; ++ struct bio *bio; ++ bool locked = true, extending; ++ ssize_t ret; ++ ++ prefetch(&c->opts); ++ prefetch((void *) &c->opts + 64); ++ prefetch(&inode->ei_inode); ++ prefetch((void *) &inode->ei_inode + 64); ++ ++ inode_lock(&inode->v); ++ ++ ret = generic_write_checks(req, iter); ++ if (unlikely(ret <= 0)) ++ goto err; ++ ++ ret = file_remove_privs(file); ++ if (unlikely(ret)) ++ goto err; ++ ++ ret = file_update_time(file); ++ if (unlikely(ret)) ++ goto err; ++ ++ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) ++ goto err; ++ ++ inode_dio_begin(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ extending = req->ki_pos + iter->count > inode->v.i_size; ++ if (!extending) { ++ inode_unlock(&inode->v); ++ locked = false; ++ } ++ ++ bio = bio_alloc_bioset(NULL, ++ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), ++ REQ_OP_WRITE, ++ GFP_KERNEL, ++ &c->dio_write_bioset); ++ dio = container_of(bio, struct dio_write, op.wbio.bio); ++ init_completion(&dio->done); ++ dio->req = req; ++ dio->mm = current->mm; ++ dio->loop = false; ++ dio->sync = is_sync_kiocb(req) || extending; ++ dio->free_iov = false; ++ dio->quota_res.sectors = 0; ++ dio->written = 0; ++ dio->iter = *iter; ++ ++ ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, ++ iter->count >> 9, true); ++ if (unlikely(ret)) ++ goto err_put_bio; ++ ++ ret = write_invalidate_inode_pages_range(mapping, ++ req->ki_pos, ++ req->ki_pos + iter->count - 1); ++ if (unlikely(ret)) ++ goto err_put_bio; ++ ++ ret = bch2_dio_write_loop(dio); ++err: ++ if (locked) ++ inode_unlock(&inode->v); ++ return ret; ++err_put_bio: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ bch2_quota_reservation_put(c, inode, &dio->quota_res); ++ bio_put(bio); ++ inode_dio_end(&inode->v); ++ goto err; ++} ++ ++ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct file *file = iocb->ki_filp; ++ struct 
bch_inode_info *inode = file_bch_inode(file); ++ ssize_t ret; ++ ++ if (iocb->ki_flags & IOCB_DIRECT) ++ return bch2_direct_write(iocb, from); ++ ++ /* We can write back this queue in page reclaim */ ++ current->backing_dev_info = inode_to_bdi(&inode->v); ++ inode_lock(&inode->v); ++ ++ ret = generic_write_checks(iocb, from); ++ if (ret <= 0) ++ goto unlock; ++ ++ ret = file_remove_privs(file); ++ if (ret) ++ goto unlock; ++ ++ ret = file_update_time(file); ++ if (ret) ++ goto unlock; ++ ++ ret = bch2_buffered_write(iocb, from); ++ if (likely(ret > 0)) ++ iocb->ki_pos += ret; ++unlock: ++ inode_unlock(&inode->v); ++ current->backing_dev_info = NULL; ++ ++ if (ret > 0) ++ ret = generic_write_sync(iocb, ret); ++ ++ return ret; ++} ++ ++/* fsync: */ ++ ++/* ++ * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an ++ * insert trigger: look up the btree inode instead ++ */ ++static int bch2_flush_inode(struct bch_fs *c, subvol_inum inum) ++{ ++ struct bch_inode_unpacked inode; ++ int ret; ++ ++ if (c->opts.journal_flush_disabled) ++ return 0; ++ ++ ret = bch2_inode_find_by_inum(c, inum, &inode); ++ if (ret) ++ return ret; ++ ++ return bch2_journal_flush_seq(&c->journal, inode.bi_journal_seq); ++} ++ ++int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ int ret, ret2, ret3; ++ ++ ret = file_write_and_wait_range(file, start, end); ++ ret2 = sync_inode_metadata(&inode->v, 1); ++ ret3 = bch2_flush_inode(c, inode_inum(inode)); ++ ++ return ret ?: ret2 ?: ret3; ++} ++ ++/* truncate: */ ++ ++static inline int range_has_data(struct bch_fs *c, u32 subvol, ++ struct bpos start, ++ struct bpos end) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot); ++ if (ret) ++ goto err; ++ ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; ++ ++ if (bkey_extent_is_data(k.k)) { ++ ret = 1; ++ break; ++ } ++ } ++ start = iter.pos; ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int __bch2_truncate_page(struct bch_inode_info *inode, ++ pgoff_t index, loff_t start, loff_t end) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bch_page_state *s; ++ unsigned start_offset = start & (PAGE_SIZE - 1); ++ unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; ++ unsigned i; ++ struct page *page; ++ s64 i_sectors_delta = 0; ++ int ret = 0; ++ ++ /* Page boundary? Nothing to do */ ++ if (!((index == start >> PAGE_SHIFT && start_offset) || ++ (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) ++ return 0; ++ ++ /* Above i_size? 
*/ ++ if (index << PAGE_SHIFT >= inode->v.i_size) ++ return 0; ++ ++ page = find_lock_page(mapping, index); ++ if (!page) { ++ /* ++ * XXX: we're doing two index lookups when we end up reading the ++ * page ++ */ ++ ret = range_has_data(c, inode->ei_subvol, ++ POS(inode->v.i_ino, index << PAGE_SECTORS_SHIFT), ++ POS(inode->v.i_ino, (index + 1) << PAGE_SECTORS_SHIFT)); ++ if (ret <= 0) ++ return ret; ++ ++ page = find_or_create_page(mapping, index, GFP_KERNEL); ++ if (unlikely(!page)) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ } ++ ++ s = bch2_page_state_create(page, 0); ++ if (!s) { ++ ret = -ENOMEM; ++ goto unlock; ++ } ++ ++ if (!PageUptodate(page)) { ++ ret = bch2_read_single_page(page, mapping); ++ if (ret) ++ goto unlock; ++ } ++ ++ if (index != start >> PAGE_SHIFT) ++ start_offset = 0; ++ if (index != end >> PAGE_SHIFT) ++ end_offset = PAGE_SIZE; ++ ++ for (i = round_up(start_offset, block_bytes(c)) >> 9; ++ i < round_down(end_offset, block_bytes(c)) >> 9; ++ i++) { ++ s->s[i].nr_replicas = 0; ++ if (s->s[i].state == SECTOR_DIRTY) ++ i_sectors_delta--; ++ s->s[i].state = SECTOR_UNALLOCATED; ++ } ++ ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ ++ /* ++ * Caller needs to know whether this page will be written out by ++ * writeback - doing an i_size update if necessary - or whether it will ++ * be responsible for the i_size update: ++ */ ++ ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT), ++ PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY; ++ ++ zero_user_segment(page, start_offset, end_offset); ++ ++ /* ++ * Bit of a hack - we don't want truncate to fail due to -ENOSPC. ++ * ++ * XXX: because we aren't currently tracking whether the page has actual ++ * data in it (vs. just 0s, or only partially written) this is wrong. ick. ++ */ ++ BUG_ON(bch2_get_page_disk_reservation(c, inode, page, false)); ++ ++ /* ++ * This removes any writeable userspace mappings; we need to force ++ * .page_mkwrite to be called again before any mmapped writes, to ++ * redirty the full page: ++ */ ++ page_mkclean(page); ++ __set_page_dirty_nobuffers(page); ++unlock: ++ unlock_page(page); ++ put_page(page); ++out: ++ return ret; ++} ++ ++static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) ++{ ++ return __bch2_truncate_page(inode, from >> PAGE_SHIFT, ++ from, round_up(from, PAGE_SIZE)); ++} ++ ++static int bch2_truncate_pages(struct bch_inode_info *inode, ++ loff_t start, loff_t end) ++{ ++ int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT, ++ start, end); ++ ++ if (ret >= 0 && ++ start >> PAGE_SHIFT != end >> PAGE_SHIFT) ++ ret = __bch2_truncate_page(inode, ++ end >> PAGE_SHIFT, ++ start, end); ++ return ret; ++} ++ ++static int bch2_extend(struct user_namespace *mnt_userns, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *inode_u, ++ struct iattr *iattr) ++{ ++ struct address_space *mapping = inode->v.i_mapping; ++ int ret; ++ ++ /* ++ * sync appends: ++ * ++ * this has to be done _before_ extending i_size: ++ */ ++ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); ++ if (ret) ++ return ret; ++ ++ truncate_setsize(&inode->v, iattr->ia_size); ++ ++ return bch2_setattr_nonsize(mnt_userns, inode, iattr); ++} ++ ++static int bch2_truncate_finish_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; ++ return 0; ++} ++ ++static int bch2_truncate_start_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, void *p) ++{ ++ u64 *new_i_size = p; ++
bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; ++ bi->bi_size = *new_i_size; ++ return 0; ++} ++ ++int bch2_truncate(struct user_namespace *mnt_userns, ++ struct bch_inode_info *inode, struct iattr *iattr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bch_inode_unpacked inode_u; ++ u64 new_i_size = iattr->ia_size; ++ s64 i_sectors_delta = 0; ++ int ret = 0; ++ ++ /* ++ * If the truncate call will change the size of the file, the ++ * cmtimes should be updated. If the size will not change, we ++ * do not need to update the cmtimes. ++ */ ++ if (iattr->ia_size != inode->v.i_size) { ++ if (!(iattr->ia_valid & ATTR_MTIME)) ++ ktime_get_coarse_real_ts64(&iattr->ia_mtime); ++ if (!(iattr->ia_valid & ATTR_CTIME)) ++ ktime_get_coarse_real_ts64(&iattr->ia_ctime); ++ iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; ++ } ++ ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); ++ if (ret) ++ goto err; ++ ++ /* ++ * check this before next assertion; on filesystem error our normal ++ * invariants are a bit broken (truncate has to truncate the page cache ++ * before the inode). ++ */ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ goto err; ++ ++ WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && ++ inode->v.i_size < inode_u.bi_size); ++ ++ if (iattr->ia_size > inode->v.i_size) { ++ ret = bch2_extend(mnt_userns, inode, &inode_u, iattr); ++ goto err; ++ } ++ ++ iattr->ia_valid &= ~ATTR_SIZE; ++ ++ ret = bch2_truncate_page(inode, iattr->ia_size); ++ if (unlikely(ret < 0)) ++ goto err; ++ ++ /* ++ * When extending, we're going to write the new i_size to disk ++ * immediately so we need to flush anything above the current on disk ++ * i_size first: ++ * ++ * Also, when extending we need to flush the page that i_size currently ++ * straddles - if it's mapped to userspace, we need to ensure that ++ * userspace has to redirty it and call .mkwrite -> set_page_dirty ++ * again to allocate the part of the page that was extended.
++ */ ++ if (iattr->ia_size > inode_u.bi_size) ++ ret = filemap_write_and_wait_range(mapping, ++ inode_u.bi_size, ++ iattr->ia_size - 1); ++ else if (iattr->ia_size & (PAGE_SIZE - 1)) ++ ret = filemap_write_and_wait_range(mapping, ++ round_down(iattr->ia_size, PAGE_SIZE), ++ iattr->ia_size - 1); ++ if (ret) ++ goto err; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, ++ &new_i_size, 0); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ truncate_setsize(&inode->v, iattr->ia_size); ++ ++ ret = bch2_fpunch(c, inode_inum(inode), ++ round_up(iattr->ia_size, block_bytes(c)) >> 9, ++ U64_MAX, &i_sectors_delta); ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ ++ bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && ++ !bch2_journal_error(&c->journal), c, ++ "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", ++ inode->v.i_ino, (u64) inode->v.i_blocks, ++ inode->ei_inode.bi_sectors); ++ if (unlikely(ret)) ++ goto err; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ ret = bch2_setattr_nonsize(mnt_userns, inode, iattr); ++err: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ return ret; ++} ++ ++/* fallocate: */ ++ ++static int inode_update_times_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); ++ return 0; ++} ++ ++static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ u64 end = offset + len; ++ u64 block_start = round_up(offset, block_bytes(c)); ++ u64 block_end = round_down(end, block_bytes(c)); ++ bool truncated_last_page; ++ int ret = 0; ++ ++ ret = bch2_truncate_pages(inode, offset, end); ++ if (unlikely(ret < 0)) ++ goto err; ++ ++ truncated_last_page = ret; ++ ++ truncate_pagecache_range(&inode->v, offset, end - 1); ++ ++ if (block_start < block_end ) { ++ s64 i_sectors_delta = 0; ++ ++ ret = bch2_fpunch(c, inode_inum(inode), ++ block_start >> 9, block_end >> 9, ++ &i_sectors_delta); ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ if (end >= inode->v.i_size && !truncated_last_page) { ++ ret = bch2_write_inode_size(c, inode, inode->v.i_size, ++ ATTR_MTIME|ATTR_CTIME); ++ } else { ++ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ++ ATTR_MTIME|ATTR_CTIME); ++ } ++ mutex_unlock(&inode->ei_update_lock); ++err: ++ return ret; ++} ++ ++static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, ++ loff_t offset, loff_t len, ++ bool insert) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bkey_buf copy; ++ struct btree_trans trans; ++ struct btree_iter src, dst, del; ++ loff_t shift, new_size; ++ u64 src_start; ++ int ret = 0; ++ ++ if ((offset | len) & (block_bytes(c) - 1)) ++ return -EINVAL; ++ ++ if (insert) { ++ if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) ++ return -EFBIG; ++ ++ if (offset >= inode->v.i_size) ++ return -EINVAL; ++ ++ src_start = U64_MAX; ++ shift = len; ++ } else { ++ if (offset + len >= inode->v.i_size) ++ return -EINVAL; ++ ++ src_start = offset + len; ++ shift = -len; ++ } ++ ++ new_size = inode->v.i_size + shift; ++ ++ ret = write_invalidate_inode_pages_range(mapping, offset, 
LLONG_MAX); ++ if (ret) ++ return ret; ++ ++ if (insert) { ++ i_size_write(&inode->v, new_size); ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode_size(c, inode, new_size, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ } else { ++ s64 i_sectors_delta = 0; ++ ++ ret = bch2_fpunch(c, inode_inum(inode), ++ offset >> 9, (offset + len) >> 9, ++ &i_sectors_delta); ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ ++ if (ret) ++ return ret; ++ } ++ ++ bch2_bkey_buf_init(©); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); ++ bch2_trans_iter_init(&trans, &src, BTREE_ID_extents, ++ POS(inode->v.i_ino, src_start >> 9), ++ BTREE_ITER_INTENT); ++ bch2_trans_copy_iter(&dst, &src); ++ bch2_trans_copy_iter(&del, &src); ++ ++ while (ret == 0 || ++ bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(c, 0); ++ struct bkey_i delete; ++ struct bkey_s_c k; ++ struct bpos next_pos; ++ struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); ++ struct bpos atomic_end; ++ unsigned trigger_flags = 0; ++ u32 snapshot; ++ ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, ++ inode->ei_subvol, &snapshot); ++ if (ret) ++ continue; ++ ++ bch2_btree_iter_set_snapshot(&src, snapshot); ++ bch2_btree_iter_set_snapshot(&dst, snapshot); ++ bch2_btree_iter_set_snapshot(&del, snapshot); ++ ++ bch2_trans_begin(&trans); ++ ++ k = insert ++ ? bch2_btree_iter_peek_prev(&src) ++ : bch2_btree_iter_peek(&src); ++ if ((ret = bkey_err(k))) ++ continue; ++ ++ if (!k.k || k.k->p.inode != inode->v.i_ino) ++ break; ++ ++ if (insert && ++ bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) ++ break; ++reassemble: ++ bch2_bkey_buf_reassemble(©, c, k); ++ ++ if (insert && ++ bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) ++ bch2_cut_front(move_pos, copy.k); ++ ++ copy.k->k.p.offset += shift >> 9; ++ bch2_btree_iter_set_pos(&dst, bkey_start_pos(©.k->k)); ++ ++ ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end); ++ if (ret) ++ continue; ++ ++ if (bkey_cmp(atomic_end, copy.k->k.p)) { ++ if (insert) { ++ move_pos = atomic_end; ++ move_pos.offset -= shift >> 9; ++ goto reassemble; ++ } else { ++ bch2_cut_back(atomic_end, copy.k); ++ } ++ } ++ ++ bkey_init(&delete.k); ++ delete.k.p = copy.k->k.p; ++ delete.k.size = copy.k->k.size; ++ delete.k.p.offset -= shift >> 9; ++ bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k)); ++ ++ next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; ++ ++ if (copy.k->k.size != k.k->size) { ++ /* We might end up splitting compressed extents: */ ++ unsigned nr_ptrs = ++ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); ++ ++ ret = bch2_disk_reservation_get(c, &disk_res, ++ copy.k->k.size, nr_ptrs, ++ BCH_DISK_RESERVATION_NOFAIL); ++ BUG_ON(ret); ++ } ++ ++ ret = bch2_btree_iter_traverse(&del) ?: ++ bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: ++ bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: ++ bch2_trans_commit(&trans, &disk_res, NULL, ++ BTREE_INSERT_NOFAIL); ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ if (!ret) ++ bch2_btree_iter_set_pos(&src, next_pos); ++ } ++ bch2_trans_iter_exit(&trans, &del); ++ bch2_trans_iter_exit(&trans, &dst); ++ bch2_trans_iter_exit(&trans, &src); ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(©, c); ++ ++ if (ret) ++ return ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ if (!insert) { ++ i_size_write(&inode->v, new_size); ++ ret = bch2_write_inode_size(c, inode, new_size, ++ ATTR_MTIME|ATTR_CTIME); ++ } else { ++ /* We need an inode update to update bi_journal_seq for fsync: */ ++ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ++ ATTR_MTIME|ATTR_CTIME); ++ } ++ mutex_unlock(&inode->ei_update_lock); ++ return ret; ++} ++ ++static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, ++ u64 start_sector, u64 end_sector) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bpos end_pos = POS(inode->v.i_ino, end_sector); ++ unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ POS(inode->v.i_ino, start_sector), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ while (!ret && bkey_cmp(iter.pos, end_pos) < 0) { ++ s64 i_sectors_delta = 0; ++ struct disk_reservation disk_res = { 0 }; ++ struct quota_res quota_res = { 0 }; ++ struct bkey_i_reservation reservation; ++ struct bkey_s_c k; ++ unsigned sectors; ++ u32 snapshot; ++ ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, ++ inode->ei_subvol, &snapshot); ++ if (ret) ++ goto bkey_err; ++ ++ bch2_btree_iter_set_snapshot(&iter, snapshot); ++ ++ k = bch2_btree_iter_peek_slot(&iter); ++ if ((ret = bkey_err(k))) ++ goto bkey_err; ++ ++ /* already reserved */ ++ if (k.k->type == KEY_TYPE_reservation && ++ bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { ++ bch2_btree_iter_advance(&iter); ++ continue; ++ } ++ ++ if (bkey_extent_is_data(k.k) && ++ !(mode & FALLOC_FL_ZERO_RANGE)) { ++ bch2_btree_iter_advance(&iter); ++ continue; ++ } ++ ++ bkey_reservation_init(&reservation.k_i); ++ reservation.k.type = KEY_TYPE_reservation; ++ reservation.k.p = k.k->p; ++ reservation.k.size = k.k->size; ++ ++ bch2_cut_front(iter.pos, &reservation.k_i); ++ bch2_cut_back(end_pos, &reservation.k_i); ++ ++ sectors = reservation.k.size; ++ reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k); ++ ++ if (!bkey_extent_is_allocation(k.k)) { ++ ret = bch2_quota_reservation_add(c, inode, ++ "a_res, ++ sectors, true); ++ if (unlikely(ret)) ++ goto bkey_err; ++ } ++ ++ if (reservation.v.nr_replicas < replicas || ++ bch2_bkey_sectors_compressed(k)) { ++ ret = bch2_disk_reservation_get(c, &disk_res, sectors, ++ replicas, 0); ++ if (unlikely(ret)) ++ goto bkey_err; ++ ++ reservation.v.nr_replicas = disk_res.nr_replicas; ++ } ++ ++ ret = 
bch2_extent_update(&trans, inode_inum(inode), &iter, ++ &reservation.k_i, ++ &disk_res, NULL, ++ 0, &i_sectors_delta, true); ++ if (ret) ++ goto bkey_err; ++ i_sectors_acct(c, inode, "a_res, i_sectors_delta); ++bkey_err: ++ bch2_quota_reservation_put(c, inode, "a_res); ++ bch2_disk_reservation_put(c, &disk_res); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ ret = 0; ++ } ++ ++ bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */ ++ mark_pagecache_reserved(inode, start_sector, iter.pos.offset); ++ ++ if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) { ++ struct quota_res quota_res = { 0 }; ++ s64 i_sectors_delta = 0; ++ ++ bch2_fpunch_at(&trans, &iter, inode_inum(inode), ++ end_sector, &i_sectors_delta); ++ i_sectors_acct(c, inode, "a_res, i_sectors_delta); ++ bch2_quota_reservation_put(c, inode, "a_res); ++ } ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static long bchfs_fallocate(struct bch_inode_info *inode, int mode, ++ loff_t offset, loff_t len) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ u64 end = offset + len; ++ u64 block_start = round_down(offset, block_bytes(c)); ++ u64 block_end = round_up(end, block_bytes(c)); ++ bool truncated_last_page = false; ++ int ret, ret2 = 0; ++ ++ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { ++ ret = inode_newsize_ok(&inode->v, end); ++ if (ret) ++ return ret; ++ } ++ ++ if (mode & FALLOC_FL_ZERO_RANGE) { ++ ret = bch2_truncate_pages(inode, offset, end); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ truncated_last_page = ret; ++ ++ truncate_pagecache_range(&inode->v, offset, end - 1); ++ ++ block_start = round_up(offset, block_bytes(c)); ++ block_end = round_down(end, block_bytes(c)); ++ } ++ ++ ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); ++ ++ /* ++ * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, ++ * so that the VFS cache i_size is consistent with the btree i_size: ++ */ ++ if (ret && ++ !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE))) ++ return ret; ++ ++ if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) ++ end = inode->v.i_size; ++ ++ if (end >= inode->v.i_size && ++ (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || ++ !(mode & FALLOC_FL_KEEP_SIZE))) { ++ spin_lock(&inode->v.i_lock); ++ i_size_write(&inode->v, end); ++ spin_unlock(&inode->v.i_lock); ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret2 = bch2_write_inode_size(c, inode, end, 0); ++ mutex_unlock(&inode->ei_update_lock); ++ } ++ ++ return ret ?: ret2; ++} ++ ++long bch2_fallocate_dispatch(struct file *file, int mode, ++ loff_t offset, loff_t len) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ long ret; ++ ++ if (!percpu_ref_tryget_live(&c->writes)) ++ return -EROFS; ++ ++ inode_lock(&inode->v); ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) ++ ret = bchfs_fallocate(inode, mode, offset, len); ++ else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) ++ ret = bchfs_fpunch(inode, offset, len); ++ else if (mode == FALLOC_FL_INSERT_RANGE) ++ ret = bchfs_fcollapse_finsert(inode, offset, len, true); ++ else if (mode == FALLOC_FL_COLLAPSE_RANGE) ++ ret = bchfs_fcollapse_finsert(inode, offset, len, false); ++ else ++ ret = -EOPNOTSUPP; ++ ++ ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ inode_unlock(&inode->v); ++ 
percpu_ref_put(&c->writes); ++ ++ return ret; ++} ++ ++loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, ++ struct file *file_dst, loff_t pos_dst, ++ loff_t len, unsigned remap_flags) ++{ ++ struct bch_inode_info *src = file_bch_inode(file_src); ++ struct bch_inode_info *dst = file_bch_inode(file_dst); ++ struct bch_fs *c = src->v.i_sb->s_fs_info; ++ s64 i_sectors_delta = 0; ++ u64 aligned_len; ++ loff_t ret = 0; ++ ++ if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) ++ return -EINVAL; ++ ++ if (remap_flags & REMAP_FILE_DEDUP) ++ return -EOPNOTSUPP; ++ ++ if ((pos_src & (block_bytes(c) - 1)) || ++ (pos_dst & (block_bytes(c) - 1))) ++ return -EINVAL; ++ ++ if (src == dst && ++ abs(pos_src - pos_dst) < len) ++ return -EINVAL; ++ ++ bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); ++ ++ file_update_time(file_dst); ++ ++ inode_dio_wait(&src->v); ++ inode_dio_wait(&dst->v); ++ ++ ret = generic_remap_file_range_prep(file_src, pos_src, ++ file_dst, pos_dst, ++ &len, remap_flags); ++ if (ret < 0 || len == 0) ++ goto err; ++ ++ aligned_len = round_up((u64) len, block_bytes(c)); ++ ++ ret = write_invalidate_inode_pages_range(dst->v.i_mapping, ++ pos_dst, pos_dst + len - 1); ++ if (ret) ++ goto err; ++ ++ mark_pagecache_unallocated(src, pos_src >> 9, ++ (pos_src + aligned_len) >> 9); ++ ++ ret = bch2_remap_range(c, ++ inode_inum(dst), pos_dst >> 9, ++ inode_inum(src), pos_src >> 9, ++ aligned_len >> 9, ++ pos_dst + len, &i_sectors_delta); ++ if (ret < 0) ++ goto err; ++ ++ /* ++ * due to alignment, we might have remapped slightly more than requested ++ */ ++ ret = min((u64) ret << 9, (u64) len); ++ ++ /* XXX get a quota reservation */ ++ i_sectors_acct(c, dst, NULL, i_sectors_delta); ++ ++ spin_lock(&dst->v.i_lock); ++ if (pos_dst + ret > dst->v.i_size) ++ i_size_write(&dst->v, pos_dst + ret); ++ spin_unlock(&dst->v.i_lock); ++ ++ if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || ++ IS_SYNC(file_inode(file_dst))) ++ ret = bch2_flush_inode(c, inode_inum(dst)); ++err: ++ bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); ++ ++ return ret; ++} ++ ++/* fseek: */ ++ ++static int page_data_offset(struct page *page, unsigned offset) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i; ++ ++ if (s) ++ for (i = offset >> 9; i < PAGE_SECTORS; i++) ++ if (s->s[i].state >= SECTOR_DIRTY) ++ return i << 9; ++ ++ return -1; ++} ++ ++static loff_t bch2_seek_pagecache_data(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset) ++{ ++ struct address_space *mapping = vinode->i_mapping; ++ struct page *page; ++ pgoff_t start_index = start_offset >> PAGE_SHIFT; ++ pgoff_t end_index = end_offset >> PAGE_SHIFT; ++ pgoff_t index = start_index; ++ loff_t ret; ++ int offset; ++ ++ while (index <= end_index) { ++ if (find_get_pages_range(mapping, &index, end_index, 1, &page)) { ++ lock_page(page); ++ ++ offset = page_data_offset(page, ++ page->index == start_index ++ ?
start_offset & (PAGE_SIZE - 1) ++ : 0); ++ if (offset >= 0) { ++ ret = clamp(((loff_t) page->index << PAGE_SHIFT) + ++ offset, ++ start_offset, end_offset); ++ unlock_page(page); ++ put_page(page); ++ return ret; ++ } ++ ++ unlock_page(page); ++ put_page(page); ++ } else { ++ break; ++ } ++ } ++ ++ return end_offset; ++} ++ ++static loff_t bch2_seek_data(struct file *file, u64 offset) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ subvol_inum inum = inode_inum(inode); ++ u64 isize, next_data = MAX_LFS_FILESIZE; ++ u32 snapshot; ++ int ret; ++ ++ isize = i_size_read(&inode->v); ++ if (offset >= isize) ++ return -ENXIO; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, ++ SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) { ++ if (k.k->p.inode != inode->v.i_ino) { ++ break; ++ } else if (bkey_extent_is_data(k.k)) { ++ next_data = max(offset, bkey_start_offset(k.k) << 9); ++ break; ++ } else if (k.k->p.offset >> 9 > isize) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ if (ret) ++ return ret; ++ ++ if (next_data > offset) ++ next_data = bch2_seek_pagecache_data(&inode->v, ++ offset, next_data); ++ ++ if (next_data >= isize) ++ return -ENXIO; ++ ++ return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); ++} ++ ++static int __page_hole_offset(struct page *page, unsigned offset) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i; ++ ++ if (!s) ++ return 0; ++ ++ for (i = offset >> 9; i < PAGE_SECTORS; i++) ++ if (s->s[i].state < SECTOR_DIRTY) ++ return i << 9; ++ ++ return -1; ++} ++ ++static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) ++{ ++ pgoff_t index = offset >> PAGE_SHIFT; ++ struct page *page; ++ int pg_offset; ++ loff_t ret = -1; ++ ++ page = find_lock_page(mapping, index); ++ if (!page) ++ return offset; ++ ++ pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); ++ if (pg_offset >= 0) ++ ret = ((loff_t) index << PAGE_SHIFT) + pg_offset; ++ ++ unlock_page(page); ++ ++ return ret; ++} ++ ++static loff_t bch2_seek_pagecache_hole(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset) ++{ ++ struct address_space *mapping = vinode->i_mapping; ++ loff_t offset = start_offset, hole; ++ ++ while (offset < end_offset) { ++ hole = page_hole_offset(mapping, offset); ++ if (hole >= 0 && hole <= end_offset) ++ return max(start_offset, hole); ++ ++ offset += PAGE_SIZE; ++ offset &= PAGE_MASK; ++ } ++ ++ return end_offset; ++} ++ ++static loff_t bch2_seek_hole(struct file *file, u64 offset) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ subvol_inum inum = inode_inum(inode); ++ u64 isize, next_hole = MAX_LFS_FILESIZE; ++ u32 snapshot; ++ int ret; ++ ++ isize = i_size_read(&inode->v); ++ if (offset >= isize) ++ return -ENXIO; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ for_each_btree_key_norestart(&trans, 
iter, BTREE_ID_extents, ++ SPOS(inode->v.i_ino, offset >> 9, snapshot), ++ BTREE_ITER_SLOTS, k, ret) { ++ if (k.k->p.inode != inode->v.i_ino) { ++ next_hole = bch2_seek_pagecache_hole(&inode->v, ++ offset, MAX_LFS_FILESIZE); ++ break; ++ } else if (!bkey_extent_is_data(k.k)) { ++ next_hole = bch2_seek_pagecache_hole(&inode->v, ++ max(offset, bkey_start_offset(k.k) << 9), ++ k.k->p.offset << 9); ++ ++ if (next_hole < k.k->p.offset << 9) ++ break; ++ } else { ++ offset = max(offset, bkey_start_offset(k.k) << 9); ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ if (ret) ++ return ret; ++ ++ if (next_hole > isize) ++ next_hole = isize; ++ ++ return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); ++} ++ ++loff_t bch2_llseek(struct file *file, loff_t offset, int whence) ++{ ++ switch (whence) { ++ case SEEK_SET: ++ case SEEK_CUR: ++ case SEEK_END: ++ return generic_file_llseek(file, offset, whence); ++ case SEEK_DATA: ++ return bch2_seek_data(file, offset); ++ case SEEK_HOLE: ++ return bch2_seek_hole(file, offset); ++ } ++ ++ return -EINVAL; ++} ++ ++void bch2_fs_fsio_exit(struct bch_fs *c) ++{ ++ bioset_exit(&c->dio_write_bioset); ++ bioset_exit(&c->dio_read_bioset); ++ bioset_exit(&c->writepage_bioset); ++} ++ ++int bch2_fs_fsio_init(struct bch_fs *c) ++{ ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ if (bioset_init(&c->writepage_bioset, ++ 4, offsetof(struct bch_writepage_io, op.wbio.bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->dio_read_bioset, ++ 4, offsetof(struct dio_read, rbio.bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->dio_write_bioset, ++ 4, offsetof(struct dio_write, op.wbio.bio), ++ BIOSET_NEED_BVECS)) ++ ret = -ENOMEM; ++ ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h +new file mode 100644 +index 000000000000..7f2d7f454be4 +--- /dev/null ++++ b/fs/bcachefs/fs-io.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IO_H ++#define _BCACHEFS_FS_IO_H ++ ++#ifndef NO_BCACHEFS_FS ++ ++#include "buckets.h" ++#include "io_types.h" ++ ++#include ++ ++struct quota_res; ++ ++int __must_check bch2_write_inode_size(struct bch_fs *, ++ struct bch_inode_info *, ++ loff_t, unsigned); ++ ++int bch2_readpage(struct file *, struct page *); ++ ++int bch2_writepages(struct address_space *, struct writeback_control *); ++void bch2_readahead(struct readahead_control *); ++ ++int bch2_write_begin(struct file *, struct address_space *, loff_t, ++ unsigned, unsigned, struct page **, void **); ++int bch2_write_end(struct file *, struct address_space *, loff_t, ++ unsigned, unsigned, struct page *, void *); ++ ++ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); ++ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); ++ ++int bch2_fsync(struct file *, loff_t, loff_t, int); ++ ++int bch2_truncate(struct user_namespace *, ++ struct bch_inode_info *, struct iattr *); ++long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); ++ ++loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, ++ loff_t, loff_t, unsigned); ++ ++loff_t bch2_llseek(struct file *, loff_t, int); ++ ++vm_fault_t bch2_page_fault(struct vm_fault *); ++vm_fault_t bch2_page_mkwrite(struct vm_fault *); ++void bch2_invalidate_folio(struct folio *, size_t, size_t); ++int bch2_releasepage(struct page *, gfp_t); ++int bch2_migrate_page(struct 
address_space *, struct page *, ++ struct page *, enum migrate_mode); ++ ++void bch2_fs_fsio_exit(struct bch_fs *); ++int bch2_fs_fsio_init(struct bch_fs *); ++#else ++static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} ++static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } ++#endif ++ ++#endif /* _BCACHEFS_FS_IO_H */ +diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c +new file mode 100644 +index 000000000000..9f329a624c12 +--- /dev/null ++++ b/fs/bcachefs/fs-ioctl.c +@@ -0,0 +1,523 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "chardev.h" ++#include "dirent.h" ++#include "fs.h" ++#include "fs-common.h" ++#include "fs-ioctl.h" ++#include "quota.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) ++#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ ++#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ ++#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ ++ ++struct flags_set { ++ unsigned mask; ++ unsigned flags; ++ ++ unsigned projid; ++}; ++ ++static int bch2_inode_flags_set(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ /* ++ * We're relying on btree locking here for exclusion with other ioctl ++ * calls - use the flags in the btree (@bi), not inode->i_flags: ++ */ ++ struct flags_set *s = p; ++ unsigned newflags = s->flags; ++ unsigned oldflags = bi->bi_flags & s->mask; ++ ++ if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) && ++ !capable(CAP_LINUX_IMMUTABLE)) ++ return -EPERM; ++ ++ if (!S_ISREG(bi->bi_mode) && ++ !S_ISDIR(bi->bi_mode) && ++ (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) ++ return -EINVAL; ++ ++ bi->bi_flags &= ~s->mask; ++ bi->bi_flags |= newflags; ++ ++ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); ++ return 0; ++} ++ ++static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) ++{ ++ unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); ++ ++ return put_user(flags, arg); ++} ++ ++static int bch2_ioc_setflags(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *inode, ++ void __user *arg) ++{ ++ struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; ++ unsigned uflags; ++ int ret; ++ ++ if (get_user(uflags, (int __user *) arg)) ++ return -EFAULT; ++ ++ s.flags = map_flags_rev(bch_flags_to_uflags, uflags); ++ if (uflags) ++ return -EOPNOTSUPP; ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ return ret; ++ ++ inode_lock(&inode->v); ++ if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) { ++ ret = -EACCES; ++ goto setflags_out; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, ++ ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ ++setflags_out: ++ inode_unlock(&inode->v); ++ mnt_drop_write_file(file); ++ return ret; ++} ++ ++static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, ++ struct fsxattr __user *arg) ++{ ++ struct fsxattr fa = { 0 }; ++ ++ fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); ++ fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; ++ ++ return copy_to_user(arg, &fa, sizeof(fa)); ++} ++ ++static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct flags_set *s = p; ++ ++ if (s->projid 
!= bi->bi_project) { ++ bi->bi_fields_set |= 1U << Inode_opt_project; ++ bi->bi_project = s->projid; ++ } ++ ++ return bch2_inode_flags_set(inode, bi, p); ++} ++ ++static int bch2_ioc_fssetxattr(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *inode, ++ struct fsxattr __user *arg) ++{ ++ struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; ++ struct fsxattr fa; ++ int ret; ++ ++ if (copy_from_user(&fa, arg, sizeof(fa))) ++ return -EFAULT; ++ ++ s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); ++ if (fa.fsx_xflags) ++ return -EOPNOTSUPP; ++ ++ if (fa.fsx_projid >= U32_MAX) ++ return -EINVAL; ++ ++ /* ++ * inode fields accessible via the xattr interface are stored with a +1 ++ * bias, so that 0 means unset: ++ */ ++ s.projid = fa.fsx_projid + 1; ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ return ret; ++ ++ inode_lock(&inode->v); ++ if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) { ++ ret = -EACCES; ++ goto err; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_set_projid(c, inode, fa.fsx_projid); ++ if (ret) ++ goto err_unlock; ++ ++ ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, ++ ATTR_CTIME); ++err_unlock: ++ mutex_unlock(&inode->ei_update_lock); ++err: ++ inode_unlock(&inode->v); ++ mnt_drop_write_file(file); ++ return ret; ++} ++ ++static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_inode_info *dir = p; ++ ++ return !bch2_reinherit_attrs(bi, &dir->ei_inode); ++} ++ ++static int bch2_ioc_reinherit_attrs(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *src, ++ const char __user *name) ++{ ++ struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode); ++ struct bch_inode_info *dst; ++ struct inode *vinode = NULL; ++ char *kname = NULL; ++ struct qstr qstr; ++ int ret = 0; ++ subvol_inum inum; ++ ++ kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); ++ if (!kname) ++ return -ENOMEM; ++ ++ ret = strncpy_from_user(kname, name, BCH_NAME_MAX); ++ if (unlikely(ret < 0)) ++ goto err1; ++ ++ qstr.len = ret; ++ qstr.name = kname; ++ ++ ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum); ++ if (ret) ++ goto err1; ++ ++ vinode = bch2_vfs_inode_get(c, inum); ++ ret = PTR_ERR_OR_ZERO(vinode); ++ if (ret) ++ goto err1; ++ ++ dst = to_bch_ei(vinode); ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ goto err2; ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); ++ ++ if (inode_attr_changing(src, dst, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, dst, ++ src->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err3; ++ } ++ ++ ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); ++err3: ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); ++ ++ /* return true if we did work */ ++ if (ret >= 0) ++ ret = !ret; ++ ++ mnt_drop_write_file(file); ++err2: ++ iput(vinode); ++err1: ++ kfree(kname); ++ ++ return ret; ++} ++ ++static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) ++{ ++ u32 flags; ++ int ret = 0; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (get_user(flags, arg)) ++ return -EFAULT; ++ ++ bch_notice(c, "shutdown by ioctl type %u", flags); ++ ++ down_write(&c->vfs_sb->s_umount); ++ ++ switch (flags) { ++ case FSOP_GOING_FLAGS_DEFAULT: ++ ret = freeze_bdev(c->vfs_sb->s_bdev); ++ if (ret) ++ goto err; ++ ++ bch2_journal_flush(&c->journal); ++ c->vfs_sb->s_flags |= SB_RDONLY; ++ bch2_fs_emergency_read_only(c); ++ 
thaw_bdev(c->vfs_sb->s_bdev); ++ break; ++ ++ case FSOP_GOING_FLAGS_LOGFLUSH: ++ bch2_journal_flush(&c->journal); ++ fallthrough; ++ ++ case FSOP_GOING_FLAGS_NOLOGFLUSH: ++ c->vfs_sb->s_flags |= SB_RDONLY; ++ bch2_fs_emergency_read_only(c); ++ break; ++ default: ++ ret = -EINVAL; ++ break; ++ } ++err: ++ up_write(&c->vfs_sb->s_umount); ++ return ret; ++} ++ ++static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, ++ struct bch_ioctl_subvolume arg) ++{ ++ struct inode *dir; ++ struct bch_inode_info *inode; ++ struct user_namespace *s_user_ns; ++ struct dentry *dst_dentry; ++ struct path src_path, dst_path; ++ int how = LOOKUP_FOLLOW; ++ int error; ++ subvol_inum snapshot_src = { 0 }; ++ unsigned lookup_flags = 0; ++ unsigned create_flags = BCH_CREATE_SUBVOL; ++ ++ if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE| ++ BCH_SUBVOL_SNAPSHOT_RO)) ++ return -EINVAL; ++ ++ if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && ++ (arg.src_ptr || ++ (arg.flags & BCH_SUBVOL_SNAPSHOT_RO))) ++ return -EINVAL; ++ ++ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) ++ create_flags |= BCH_CREATE_SNAPSHOT; ++ ++ if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) ++ create_flags |= BCH_CREATE_SNAPSHOT_RO; ++ ++ /* why do we need this lock? */ ++ down_read(&c->vfs_sb->s_umount); ++ ++ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) ++ sync_inodes_sb(c->vfs_sb); ++retry: ++ if (arg.src_ptr) { ++ error = user_path_at(arg.dirfd, ++ (const char __user *)(unsigned long)arg.src_ptr, ++ how, &src_path); ++ if (error) ++ goto err1; ++ ++ if (src_path.dentry->d_sb->s_fs_info != c) { ++ path_put(&src_path); ++ error = -EXDEV; ++ goto err1; ++ } ++ ++ snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); ++ } ++ ++ dst_dentry = user_path_create(arg.dirfd, ++ (const char __user *)(unsigned long)arg.dst_ptr, ++ &dst_path, lookup_flags); ++ error = PTR_ERR_OR_ZERO(dst_dentry); ++ if (error) ++ goto err2; ++ ++ if (dst_dentry->d_sb->s_fs_info != c) { ++ error = -EXDEV; ++ goto err3; ++ } ++ ++ if (dst_dentry->d_inode) { ++ error = -EEXIST; ++ goto err3; ++ } ++ ++ dir = dst_path.dentry->d_inode; ++ if (IS_DEADDIR(dir)) { ++ error = -ENOENT; ++ goto err3; ++ } ++ ++ s_user_ns = dir->i_sb->s_user_ns; ++ if (!kuid_has_mapping(s_user_ns, current_fsuid()) || ++ !kgid_has_mapping(s_user_ns, current_fsgid())) { ++ error = -EOVERFLOW; ++ goto err3; ++ } ++ ++ error = inode_permission(file_mnt_user_ns(filp), ++ dir, MAY_WRITE | MAY_EXEC); ++ if (error) ++ goto err3; ++ ++ if (!IS_POSIXACL(dir)) ++ arg.mode &= ~current_umask(); ++ ++ error = security_path_mkdir(&dst_path, dst_dentry, arg.mode); ++ if (error) ++ goto err3; ++ ++ if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && ++ !arg.src_ptr) ++ snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol; ++ ++ inode = __bch2_create(file_mnt_user_ns(filp), to_bch_ei(dir), ++ dst_dentry, arg.mode|S_IFDIR, ++ 0, snapshot_src, create_flags); ++ error = PTR_ERR_OR_ZERO(inode); ++ if (error) ++ goto err3; ++ ++ d_instantiate(dst_dentry, &inode->v); ++ fsnotify_mkdir(dir, dst_dentry); ++err3: ++ done_path_create(&dst_path, dst_dentry); ++err2: ++ if (arg.src_ptr) ++ path_put(&src_path); ++ ++ if (retry_estale(error, lookup_flags)) { ++ lookup_flags |= LOOKUP_REVAL; ++ goto retry; ++ } ++err1: ++ up_read(&c->vfs_sb->s_umount); ++ ++ return error; ++} ++ ++static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, ++ struct bch_ioctl_subvolume arg) ++{ ++ struct path path; ++ struct inode *dir; ++ int ret = 0; ++ ++ if (arg.flags) ++ return -EINVAL; ++ ++ ret = 
user_path_at(arg.dirfd, ++ (const char __user *)(unsigned long)arg.dst_ptr, ++ LOOKUP_FOLLOW, &path); ++ if (ret) ++ return ret; ++ ++ if (path.dentry->d_sb->s_fs_info != c) { ++ path_put(&path); ++ return -EXDEV; ++ } ++ ++ dir = path.dentry->d_parent->d_inode; ++ ++ ret = __bch2_unlink(dir, path.dentry, true); ++ if (!ret) { ++ fsnotify_rmdir(dir, path.dentry); ++ d_delete(path.dentry); ++ } ++ path_put(&path); ++ ++ return ret; ++} ++ ++long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ switch (cmd) { ++ case FS_IOC_GETFLAGS: ++ return bch2_ioc_getflags(inode, (int __user *) arg); ++ ++ case FS_IOC_SETFLAGS: ++ return bch2_ioc_setflags(c, file, inode, (int __user *) arg); ++ ++ case FS_IOC_FSGETXATTR: ++ return bch2_ioc_fsgetxattr(inode, (void __user *) arg); ++ case FS_IOC_FSSETXATTR: ++ return bch2_ioc_fssetxattr(c, file, inode, ++ (void __user *) arg); ++ ++ case BCHFS_IOC_REINHERIT_ATTRS: ++ return bch2_ioc_reinherit_attrs(c, file, inode, ++ (void __user *) arg); ++ ++ case FS_IOC_GETVERSION: ++ return -ENOTTY; ++ case FS_IOC_SETVERSION: ++ return -ENOTTY; ++ ++ case FS_IOC_GOINGDOWN: ++ return bch2_ioc_goingdown(c, (u32 __user *) arg); ++ ++ case BCH_IOCTL_SUBVOLUME_CREATE: { ++ struct bch_ioctl_subvolume i; ++ ++ if (copy_from_user(&i, (void __user *) arg, sizeof(i))) ++ return -EFAULT; ++ return bch2_ioctl_subvolume_create(c, file, i); ++ } ++ ++ case BCH_IOCTL_SUBVOLUME_DESTROY: { ++ struct bch_ioctl_subvolume i; ++ ++ if (copy_from_user(&i, (void __user *) arg, sizeof(i))) ++ return -EFAULT; ++ return bch2_ioctl_subvolume_destroy(c, file, i); ++ } ++ ++ default: ++ return bch2_fs_ioctl(c, cmd, (void __user *) arg); ++ } ++} ++ ++#ifdef CONFIG_COMPAT ++long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) ++{ ++ /* These are just misnamed, they actually get/put from/to user an int */ ++ switch (cmd) { ++ case FS_IOC32_GETFLAGS: ++ cmd = FS_IOC_GETFLAGS; ++ break; ++ case FS_IOC32_SETFLAGS: ++ cmd = FS_IOC_SETFLAGS; ++ break; ++ default: ++ return -ENOIOCTLCMD; ++ } ++ return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); ++} ++#endif ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h +new file mode 100644 +index 000000000000..f201980ef2c3 +--- /dev/null ++++ b/fs/bcachefs/fs-ioctl.h +@@ -0,0 +1,81 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IOCTL_H ++#define _BCACHEFS_FS_IOCTL_H ++ ++/* Inode flags: */ ++ ++/* bcachefs inode flags -> vfs inode flags: */ ++static const unsigned bch_flags_to_vfs[] = { ++ [__BCH_INODE_SYNC] = S_SYNC, ++ [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, ++ [__BCH_INODE_APPEND] = S_APPEND, ++ [__BCH_INODE_NOATIME] = S_NOATIME, ++}; ++ ++/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ ++static const unsigned bch_flags_to_uflags[] = { ++ [__BCH_INODE_SYNC] = FS_SYNC_FL, ++ [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, ++ [__BCH_INODE_APPEND] = FS_APPEND_FL, ++ [__BCH_INODE_NODUMP] = FS_NODUMP_FL, ++ [__BCH_INODE_NOATIME] = FS_NOATIME_FL, ++}; ++ ++/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ ++static const unsigned bch_flags_to_xflags[] = { ++ [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, ++ [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, ++ [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, ++ [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, ++ [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, ++ //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; ++}; ++ 
++#define set_flags(_map, _in, _out) \ ++do { \ ++ unsigned _i; \ ++ \ ++ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ ++ if ((_in) & (1 << _i)) \ ++ (_out) |= _map[_i]; \ ++ else \ ++ (_out) &= ~_map[_i]; \ ++} while (0) ++ ++#define map_flags(_map, _in) \ ++({ \ ++ unsigned _out = 0; \ ++ \ ++ set_flags(_map, _in, _out); \ ++ _out; \ ++}) ++ ++#define map_flags_rev(_map, _in) \ ++({ \ ++ unsigned _i, _out = 0; \ ++ \ ++ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ ++ if ((_in) & _map[_i]) { \ ++ (_out) |= 1 << _i; \ ++ (_in) &= ~_map[_i]; \ ++ } \ ++ (_out); \ ++}) ++ ++#define map_defined(_map) \ ++({ \ ++ unsigned _in = ~0; \ ++ \ ++ map_flags_rev(_map, _in); \ ++}) ++ ++/* Set VFS inode flags from bcachefs inode: */ ++static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) ++{ ++ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); ++} ++ ++long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); ++long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); ++ ++#endif /* _BCACHEFS_FS_IOCTL_H */ +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +new file mode 100644 +index 000000000000..6d57bd87bfd5 +--- /dev/null ++++ b/fs/bcachefs/fs.c +@@ -0,0 +1,1939 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "acl.h" ++#include "bkey_buf.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "chardev.h" ++#include "dirent.h" ++#include "errcode.h" ++#include "extents.h" ++#include "fs.h" ++#include "fs-common.h" ++#include "fs-io.h" ++#include "fs-ioctl.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "quota.h" ++#include "super.h" ++#include "xattr.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static struct kmem_cache *bch2_inode_cache; ++ ++static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, ++ struct bch_inode_info *, ++ struct bch_inode_unpacked *, ++ struct bch_subvolume *); ++ ++static void __pagecache_lock_put(struct pagecache_lock *lock, long i) ++{ ++ BUG_ON(atomic_long_read(&lock->v) == 0); ++ ++ if (atomic_long_sub_return_release(i, &lock->v) == 0) ++ wake_up_all(&lock->wait); ++} ++ ++static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i) ++{ ++ long v = atomic_long_read(&lock->v), old; ++ ++ do { ++ old = v; ++ ++ if (i > 0 ? 
v < 0 : v > 0) ++ return false; ++ } while ((v = atomic_long_cmpxchg_acquire(&lock->v, ++ old, old + i)) != old); ++ return true; ++} ++ ++static void __pagecache_lock_get(struct pagecache_lock *lock, long i) ++{ ++ wait_event(lock->wait, __pagecache_lock_tryget(lock, i)); ++} ++ ++void bch2_pagecache_add_put(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_put(lock, 1); ++} ++ ++bool bch2_pagecache_add_tryget(struct pagecache_lock *lock) ++{ ++ return __pagecache_lock_tryget(lock, 1); ++} ++ ++void bch2_pagecache_add_get(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_get(lock, 1); ++} ++ ++void bch2_pagecache_block_put(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_put(lock, -1); ++} ++ ++void bch2_pagecache_block_get(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_get(lock, -1); ++} ++ ++void bch2_inode_update_after_write(struct btree_trans *trans, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ unsigned fields) ++{ ++ struct bch_fs *c = trans->c; ++ ++ BUG_ON(bi->bi_inum != inode->v.i_ino); ++ ++ bch2_assert_pos_locked(trans, BTREE_ID_inodes, ++ POS(0, bi->bi_inum), ++ c->opts.inodes_use_key_cache); ++ ++ set_nlink(&inode->v, bch2_inode_nlink_get(bi)); ++ i_uid_write(&inode->v, bi->bi_uid); ++ i_gid_write(&inode->v, bi->bi_gid); ++ inode->v.i_mode = bi->bi_mode; ++ ++ if (fields & ATTR_ATIME) ++ inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); ++ if (fields & ATTR_MTIME) ++ inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); ++ if (fields & ATTR_CTIME) ++ inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); ++ ++ inode->ei_inode = *bi; ++ ++ bch2_inode_flags_to_vfs(inode); ++} ++ ++int __must_check bch2_write_inode(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ inode_set_fn set, ++ void *p, unsigned fields) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter = { NULL }; ++ struct bch_inode_unpacked inode_u; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 512); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), ++ BTREE_ITER_INTENT) ?: ++ (set ? set(inode, &inode_u, p) : 0) ?: ++ bch2_inode_write(&trans, &iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); ++ ++ /* ++ * the btree node lock protects inode->ei_inode, not ei_update_lock; ++ * this is important for inode updates via bchfs_write_index_update ++ */ ++ if (!ret) ++ bch2_inode_update_after_write(&trans, inode, &inode_u, fields); ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ return ret < 0 ? 
ret : 0; ++} ++ ++int bch2_fs_quota_transfer(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_qid new_qid, ++ unsigned qtypes, ++ enum quota_acct_mode mode) ++{ ++ unsigned i; ++ int ret; ++ ++ qtypes &= enabled_qtypes(c); ++ ++ for (i = 0; i < QTYP_NR; i++) ++ if (new_qid.q[i] == inode->ei_qid.q[i]) ++ qtypes &= ~(1U << i); ++ ++ if (!qtypes) ++ return 0; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ ++ ret = bch2_quota_transfer(c, qtypes, new_qid, ++ inode->ei_qid, ++ inode->v.i_blocks + ++ inode->ei_quota_reserved, ++ mode); ++ if (!ret) ++ for (i = 0; i < QTYP_NR; i++) ++ if (qtypes & (1 << i)) ++ inode->ei_qid.q[i] = new_qid.q[i]; ++ ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ return ret; ++} ++ ++static int bch2_iget5_test(struct inode *vinode, void *p) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ subvol_inum *inum = p; ++ ++ return inode->ei_subvol == inum->subvol && ++ inode->ei_inode.bi_inum == inum->inum; ++} ++ ++static int bch2_iget5_set(struct inode *vinode, void *p) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ subvol_inum *inum = p; ++ ++ inode->v.i_ino = inum->inum; ++ inode->ei_subvol = inum->subvol; ++ inode->ei_inode.bi_inum = inum->inum; ++ return 0; ++} ++ ++static unsigned bch2_inode_hash(subvol_inum inum) ++{ ++ return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); ++} ++ ++struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) ++{ ++ struct bch_inode_unpacked inode_u; ++ struct bch_inode_info *inode; ++ struct btree_trans trans; ++ struct bch_subvolume subvol; ++ int ret; ++ ++ inode = to_bch_ei(iget5_locked(c->vfs_sb, ++ bch2_inode_hash(inum), ++ bch2_iget5_test, ++ bch2_iget5_set, ++ &inum)); ++ if (unlikely(!inode)) ++ return ERR_PTR(-ENOMEM); ++ if (!(inode->v.i_state & I_NEW)) ++ return &inode->v; ++ ++ bch2_trans_init(&trans, c, 8, 0); ++ ret = lockrestart_do(&trans, ++ bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?: ++ bch2_inode_find_by_inum_trans(&trans, inum, &inode_u)); ++ ++ if (!ret) ++ bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); ++ bch2_trans_exit(&trans); ++ ++ if (ret) { ++ iget_failed(&inode->v); ++ return ERR_PTR(ret); ++ } ++ ++ unlock_new_inode(&inode->v); ++ ++ return &inode->v; ++} ++ ++struct bch_inode_info * ++__bch2_create(struct user_namespace *mnt_userns, ++ struct bch_inode_info *dir, struct dentry *dentry, ++ umode_t mode, dev_t rdev, subvol_inum snapshot_src, ++ unsigned flags) ++{ ++ struct bch_fs *c = dir->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct bch_inode_unpacked dir_u; ++ struct bch_inode_info *inode, *old; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *default_acl = NULL, *acl = NULL; ++ subvol_inum inum; ++ struct bch_subvolume subvol; ++ u64 journal_seq = 0; ++ int ret; ++ ++ /* ++ * preallocate acls + vfs inode before btree transaction, so that ++ * nothing can fail after the transaction succeeds: ++ */ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); ++ if (ret) ++ return ERR_PTR(ret); ++#endif ++ inode = to_bch_ei(new_inode(c->vfs_sb)); ++ if (unlikely(!inode)) { ++ inode = ERR_PTR(-ENOMEM); ++ goto err; ++ } ++ ++ bch2_inode_init_early(c, &inode_u); ++ ++ if (!(flags & BCH_CREATE_TMPFILE)) ++ mutex_lock(&dir->ei_update_lock); ++ ++ bch2_trans_init(&trans, c, 8, ++ 2048 + (!(flags & BCH_CREATE_TMPFILE) ++ ? 
dentry->d_name.len : 0)); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_create_trans(&trans, ++ inode_inum(dir), &dir_u, &inode_u, ++ !(flags & BCH_CREATE_TMPFILE) ++ ? &dentry->d_name : NULL, ++ from_kuid(mnt_userns, current_fsuid()), ++ from_kgid(mnt_userns, current_fsgid()), ++ mode, rdev, ++ default_acl, acl, snapshot_src, flags) ?: ++ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (unlikely(ret)) ++ goto err_before_quota; ++ ++ inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; ++ inum.inum = inode_u.bi_inum; ++ ++ ret = bch2_subvolume_get(&trans, inum.subvol, true, ++ BTREE_ITER_WITH_UPDATES, &subvol) ?: ++ bch2_trans_commit(&trans, NULL, &journal_seq, 0); ++ if (unlikely(ret)) { ++ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, ++ KEY_TYPE_QUOTA_WARN); ++err_before_quota: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ goto err_trans; ++ } ++ ++ if (!(flags & BCH_CREATE_TMPFILE)) { ++ bch2_inode_update_after_write(&trans, dir, &dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&dir->ei_update_lock); ++ } ++ ++ bch2_iget5_set(&inode->v, &inum); ++ bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); ++ ++ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); ++ set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); ++ ++ /* ++ * we must insert the new inode into the inode cache before calling ++ * bch2_trans_exit() and dropping locks, else we could race with another ++ * thread pulling the inode in and modifying it: ++ */ ++ ++ inode->v.i_state |= I_CREATING; ++ ++ old = to_bch_ei(inode_insert5(&inode->v, ++ bch2_inode_hash(inum), ++ bch2_iget5_test, ++ bch2_iget5_set, ++ &inum)); ++ BUG_ON(!old); ++ ++ if (unlikely(old != inode)) { ++ /* ++ * We raced, another process pulled the new inode into cache ++ * before us: ++ */ ++ make_bad_inode(&inode->v); ++ iput(&inode->v); ++ ++ inode = old; ++ } else { ++ /* ++ * we really don't want insert_inode_locked2() to be setting ++ * I_NEW... 
++ */ ++ unlock_new_inode(&inode->v); ++ } ++ ++ bch2_trans_exit(&trans); ++err: ++ posix_acl_release(default_acl); ++ posix_acl_release(acl); ++ return inode; ++err_trans: ++ if (!(flags & BCH_CREATE_TMPFILE)) ++ mutex_unlock(&dir->ei_update_lock); ++ ++ bch2_trans_exit(&trans); ++ make_bad_inode(&inode->v); ++ iput(&inode->v); ++ inode = ERR_PTR(ret); ++ goto err; ++} ++ ++/* methods */ ++ ++static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, ++ unsigned int flags) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); ++ struct inode *vinode = NULL; ++ subvol_inum inum = { .subvol = 1 }; ++ int ret; ++ ++ ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, ++ &dentry->d_name, &inum); ++ ++ if (!ret) ++ vinode = bch2_vfs_inode_get(c, inum); ++ ++ return d_splice_alias(vinode, dentry); ++} ++ ++static int bch2_mknod(struct user_namespace *mnt_userns, ++ struct inode *vdir, struct dentry *dentry, ++ umode_t mode, dev_t rdev) ++{ ++ struct bch_inode_info *inode = ++ __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, ++ (subvol_inum) { 0 }, 0); ++ ++ if (IS_ERR(inode)) ++ return PTR_ERR(inode); ++ ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++static int bch2_create(struct user_namespace *mnt_userns, ++ struct inode *vdir, struct dentry *dentry, ++ umode_t mode, bool excl) ++{ ++ return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFREG, 0); ++} ++ ++static int __bch2_link(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_inode_info *dir, ++ struct dentry *dentry) ++{ ++ struct btree_trans trans; ++ struct bch_inode_unpacked dir_u, inode_u; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ bch2_trans_init(&trans, c, 4, 1024); ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_link_trans(&trans, ++ inode_inum(dir), &dir_u, ++ inode_inum(inode), &inode_u, ++ &dentry->d_name)); ++ ++ if (likely(!ret)) { ++ bch2_inode_update_after_write(&trans, dir, &dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); ++ } ++ ++ bch2_trans_exit(&trans); ++ mutex_unlock(&inode->ei_update_lock); ++ return ret; ++} ++ ++static int bch2_link(struct dentry *old_dentry, struct inode *vdir, ++ struct dentry *dentry) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); ++ int ret; ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ ret = __bch2_link(c, inode, dir, dentry); ++ if (unlikely(ret)) ++ return ret; ++ ++ ihold(&inode->v); ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++int __bch2_unlink(struct inode *vdir, struct dentry *dentry, ++ bool deleting_snapshot) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ struct bch_inode_unpacked dir_u, inode_u; ++ struct btree_trans trans; ++ int ret; ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); ++ bch2_trans_init(&trans, c, 4, 1024); ++ ++ ret = commit_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL, ++ bch2_unlink_trans(&trans, ++ inode_inum(dir), &dir_u, ++ &inode_u, &dentry->d_name, ++ deleting_snapshot)); ++ ++ if (likely(!ret)) { ++ bch2_inode_update_after_write(&trans, dir, &dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ bch2_inode_update_after_write(&trans, inode, &inode_u, ++ ATTR_MTIME); ++ } ++ ++ 
bch2_trans_exit(&trans); ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); ++ ++ return ret; ++} ++ ++static int bch2_unlink(struct inode *vdir, struct dentry *dentry) ++{ ++ return __bch2_unlink(vdir, dentry, false); ++} ++ ++static int bch2_symlink(struct user_namespace *mnt_userns, ++ struct inode *vdir, struct dentry *dentry, ++ const char *symname) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir), *inode; ++ int ret; ++ ++ inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, ++ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); ++ if (unlikely(IS_ERR(inode))) ++ return PTR_ERR(inode); ++ ++ inode_lock(&inode->v); ++ ret = page_symlink(&inode->v, symname, strlen(symname) + 1); ++ inode_unlock(&inode->v); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); ++ if (unlikely(ret)) ++ goto err; ++ ++ ret = __bch2_link(c, inode, dir, dentry); ++ if (unlikely(ret)) ++ goto err; ++ ++ d_instantiate(dentry, &inode->v); ++ return 0; ++err: ++ iput(&inode->v); ++ return ret; ++} ++ ++static int bch2_mkdir(struct user_namespace *mnt_userns, ++ struct inode *vdir, struct dentry *dentry, umode_t mode) ++{ ++ return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFDIR, 0); ++} ++ ++static int bch2_rename2(struct user_namespace *mnt_userns, ++ struct inode *src_vdir, struct dentry *src_dentry, ++ struct inode *dst_vdir, struct dentry *dst_dentry, ++ unsigned flags) ++{ ++ struct bch_fs *c = src_vdir->i_sb->s_fs_info; ++ struct bch_inode_info *src_dir = to_bch_ei(src_vdir); ++ struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); ++ struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); ++ struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); ++ struct bch_inode_unpacked dst_dir_u, src_dir_u; ++ struct bch_inode_unpacked src_inode_u, dst_inode_u; ++ struct btree_trans trans; ++ enum bch_rename_mode mode = flags & RENAME_EXCHANGE ++ ? BCH_RENAME_EXCHANGE ++ : dst_dentry->d_inode ++ ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; ++ int ret; ++ ++ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) ++ return -EINVAL; ++ ++ if (mode == BCH_RENAME_OVERWRITE) { ++ ret = filemap_write_and_wait_range(src_inode->v.i_mapping, ++ 0, LLONG_MAX); ++ if (ret) ++ return ret; ++ } ++ ++ bch2_trans_init(&trans, c, 8, 2048); ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, ++ src_dir, ++ dst_dir, ++ src_inode, ++ dst_inode); ++ ++ if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, src_inode, ++ dst_dir->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ } ++ ++ if (mode == BCH_RENAME_EXCHANGE && ++ inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, dst_inode, ++ src_dir->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ } ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_rename_trans(&trans, ++ inode_inum(src_dir), &src_dir_u, ++ inode_inum(dst_dir), &dst_dir_u, ++ &src_inode_u, ++ &dst_inode_u, ++ &src_dentry->d_name, ++ &dst_dentry->d_name, ++ mode)); ++ if (unlikely(ret)) ++ goto err; ++ ++ BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); ++ BUG_ON(dst_inode && ++ dst_inode->v.i_ino != dst_inode_u.bi_inum); ++ ++ bch2_inode_update_after_write(&trans, src_dir, &src_dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ ++ if (src_dir != dst_dir) ++ bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ ++ bch2_inode_update_after_write(&trans, src_inode, &src_inode_u, ++ ATTR_CTIME); ++ ++ if (dst_inode) ++ bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u, ++ ATTR_CTIME); ++err: ++ bch2_trans_exit(&trans); ++ ++ bch2_fs_quota_transfer(c, src_inode, ++ bch_qid(&src_inode->ei_inode), ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_NOCHECK); ++ if (dst_inode) ++ bch2_fs_quota_transfer(c, dst_inode, ++ bch_qid(&dst_inode->ei_inode), ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_NOCHECK); ++ ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, ++ src_dir, ++ dst_dir, ++ src_inode, ++ dst_inode); ++ ++ return ret; ++} ++ ++static void bch2_setattr_copy(struct user_namespace *mnt_userns, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ struct iattr *attr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ unsigned int ia_valid = attr->ia_valid; ++ ++ if (ia_valid & ATTR_UID) ++ bi->bi_uid = from_kuid(mnt_userns, attr->ia_uid); ++ if (ia_valid & ATTR_GID) ++ bi->bi_gid = from_kgid(mnt_userns, attr->ia_gid); ++ ++ if (ia_valid & ATTR_SIZE) ++ bi->bi_size = attr->ia_size; ++ ++ if (ia_valid & ATTR_ATIME) ++ bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); ++ if (ia_valid & ATTR_MTIME) ++ bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); ++ if (ia_valid & ATTR_CTIME) ++ bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); ++ ++ if (ia_valid & ATTR_MODE) { ++ umode_t mode = attr->ia_mode; ++ kgid_t gid = ia_valid & ATTR_GID ++ ? 
attr->ia_gid ++ : inode->v.i_gid; ++ ++ if (!in_group_p(gid) && ++ !capable_wrt_inode_uidgid(mnt_userns, &inode->v, CAP_FSETID)) ++ mode &= ~S_ISGID; ++ bi->bi_mode = mode; ++ } ++} ++ ++int bch2_setattr_nonsize(struct user_namespace *mnt_userns, ++ struct bch_inode_info *inode, ++ struct iattr *attr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_qid qid; ++ struct btree_trans trans; ++ struct btree_iter inode_iter = { NULL }; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *acl = NULL; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ++ qid = inode->ei_qid; ++ ++ if (attr->ia_valid & ATTR_UID) ++ qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid); ++ ++ if (attr->ia_valid & ATTR_GID) ++ qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid); ++ ++ ret = bch2_fs_quota_transfer(c, inode, qid, ~0, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ kfree(acl); ++ acl = NULL; ++ ++ ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), ++ BTREE_ITER_INTENT); ++ if (ret) ++ goto btree_err; ++ ++ bch2_setattr_copy(mnt_userns, inode, &inode_u, attr); ++ ++ if (attr->ia_valid & ATTR_MODE) { ++ ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u, ++ inode_u.bi_mode, &acl); ++ if (ret) ++ goto btree_err; ++ } ++ ++ ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++btree_err: ++ bch2_trans_iter_exit(&trans, &inode_iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ if (unlikely(ret)) ++ goto err_trans; ++ ++ bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid); ++ ++ if (acl) ++ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); ++err_trans: ++ bch2_trans_exit(&trans); ++err: ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++static int bch2_getattr(struct user_namespace *mnt_userns, ++ const struct path *path, struct kstat *stat, ++ u32 request_mask, unsigned query_flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ stat->dev = inode->v.i_sb->s_dev; ++ stat->ino = inode->v.i_ino; ++ stat->mode = inode->v.i_mode; ++ stat->nlink = inode->v.i_nlink; ++ stat->uid = inode->v.i_uid; ++ stat->gid = inode->v.i_gid; ++ stat->rdev = inode->v.i_rdev; ++ stat->size = i_size_read(&inode->v); ++ stat->atime = inode->v.i_atime; ++ stat->mtime = inode->v.i_mtime; ++ stat->ctime = inode->v.i_ctime; ++ stat->blksize = block_bytes(c); ++ stat->blocks = inode->v.i_blocks; ++ ++ if (request_mask & STATX_BTIME) { ++ stat->result_mask |= STATX_BTIME; ++ stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); ++ } ++ ++ if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) ++ stat->attributes |= STATX_ATTR_IMMUTABLE; ++ stat->attributes_mask |= STATX_ATTR_IMMUTABLE; ++ ++ if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) ++ stat->attributes |= STATX_ATTR_APPEND; ++ stat->attributes_mask |= STATX_ATTR_APPEND; ++ ++ if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) ++ stat->attributes |= STATX_ATTR_NODUMP; ++ stat->attributes_mask |= STATX_ATTR_NODUMP; ++ ++ return 0; ++} ++ ++static int bch2_setattr(struct user_namespace *mnt_userns, ++ struct dentry *dentry, struct iattr *iattr) ++{ ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ int ret; ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ ret = setattr_prepare(mnt_userns, dentry, 
iattr); ++ if (ret) ++ return ret; ++ ++ return iattr->ia_valid & ATTR_SIZE ++ ? bch2_truncate(mnt_userns, inode, iattr) ++ : bch2_setattr_nonsize(mnt_userns, inode, iattr); ++} ++ ++static int bch2_tmpfile(struct user_namespace *mnt_userns, ++ struct inode *vdir, struct dentry *dentry, umode_t mode) ++{ ++ struct bch_inode_info *inode = ++ __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, ++ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); ++ ++ if (IS_ERR(inode)) ++ return PTR_ERR(inode); ++ ++ d_mark_tmpfile(dentry, &inode->v); ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++static int bch2_fill_extent(struct bch_fs *c, ++ struct fiemap_extent_info *info, ++ struct bkey_s_c k, unsigned flags) ++{ ++ if (bkey_extent_is_direct_data(k.k)) { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ int ret; ++ ++ if (k.k->type == KEY_TYPE_reflink_v) ++ flags |= FIEMAP_EXTENT_SHARED; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ int flags2 = 0; ++ u64 offset = p.ptr.offset; ++ ++ if (p.crc.compression_type) ++ flags2 |= FIEMAP_EXTENT_ENCODED; ++ else ++ offset += p.crc.offset; ++ ++ if ((offset & (block_sectors(c) - 1)) || ++ (k.k->size & (block_sectors(c) - 1))) ++ flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ++ ++ ret = fiemap_fill_next_extent(info, ++ bkey_start_offset(k.k) << 9, ++ offset << 9, ++ k.k->size << 9, flags|flags2); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++ } else if (bkey_extent_is_inline_data(k.k)) { ++ return fiemap_fill_next_extent(info, ++ bkey_start_offset(k.k) << 9, ++ 0, k.k->size << 9, ++ flags| ++ FIEMAP_EXTENT_DATA_INLINE); ++ } else if (k.k->type == KEY_TYPE_reservation) { ++ return fiemap_fill_next_extent(info, ++ bkey_start_offset(k.k) << 9, ++ 0, k.k->size << 9, ++ flags| ++ FIEMAP_EXTENT_DELALLOC| ++ FIEMAP_EXTENT_UNWRITTEN); ++ } else { ++ BUG(); ++ } ++} ++ ++static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, ++ u64 start, u64 len) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *ei = to_bch_ei(vinode); ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_buf cur, prev; ++ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); ++ unsigned offset_into_extent, sectors; ++ bool have_extent = false; ++ u32 snapshot; ++ int ret = 0; ++ ++ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); ++ if (ret) ++ return ret; ++ ++ if (start + len < start) ++ return -EINVAL; ++ ++ start >>= 9; ++ ++ bch2_bkey_buf_init(&cur); ++ bch2_bkey_buf_init(&prev); ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ SPOS(ei->v.i_ino, start, snapshot), 0); ++ ++ while (!(ret = btree_trans_too_many_iters(&trans)) && ++ (k = bch2_btree_iter_peek_upto(&iter, end)).k && ++ !(ret = bkey_err(k))) { ++ enum btree_id data_btree = BTREE_ID_extents; ++ ++ if (!bkey_extent_is_data(k.k) && ++ k.k->type != KEY_TYPE_reservation) { ++ bch2_btree_iter_advance(&iter); ++ continue; ++ } ++ ++ offset_into_extent = iter.pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ bch2_bkey_buf_reassemble(&cur, c, k); ++ ++ ret = bch2_read_indirect_extent(&trans, &data_btree, ++ &offset_into_extent, &cur); ++ if (ret) ++ break; ++ ++ k = bkey_i_to_s_c(cur.k); ++ bch2_bkey_buf_realloc(&prev, c, 
k.k->u64s); ++ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ bch2_cut_front(POS(k.k->p.inode, ++ bkey_start_offset(k.k) + ++ offset_into_extent), ++ cur.k); ++ bch2_key_resize(&cur.k->k, sectors); ++ cur.k->k.p = iter.pos; ++ cur.k->k.p.offset += cur.k->k.size; ++ ++ if (have_extent) { ++ ret = bch2_fill_extent(c, info, ++ bkey_i_to_s_c(prev.k), 0); ++ if (ret) ++ break; ++ } ++ ++ bkey_copy(prev.k, cur.k); ++ have_extent = true; ++ ++ bch2_btree_iter_set_pos(&iter, ++ POS(iter.pos.inode, iter.pos.offset + sectors)); ++ } ++ start = iter.pos.offset; ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ if (!ret && have_extent) ++ ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), ++ FIEMAP_EXTENT_LAST); ++ ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&cur, c); ++ bch2_bkey_buf_exit(&prev, c); ++ return ret < 0 ? ret : 0; ++} ++ ++static const struct vm_operations_struct bch_vm_ops = { ++ .fault = bch2_page_fault, ++ .map_pages = filemap_map_pages, ++ .page_mkwrite = bch2_page_mkwrite, ++}; ++ ++static int bch2_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ file_accessed(file); ++ ++ vma->vm_ops = &bch_vm_ops; ++ return 0; ++} ++ ++/* Directories: */ ++ ++static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) ++{ ++ return generic_file_llseek_size(file, offset, whence, ++ S64_MAX, S64_MAX); ++} ++ ++static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ if (!dir_emit_dots(file, ctx)) ++ return 0; ++ ++ return bch2_readdir(c, inode_inum(inode), ctx); ++} ++ ++static const struct file_operations bch_file_operations = { ++ .llseek = bch2_llseek, ++ .read_iter = bch2_read_iter, ++ .write_iter = bch2_write_iter, ++ .mmap = bch2_mmap, ++ .open = generic_file_open, ++ .fsync = bch2_fsync, ++ .splice_read = generic_file_splice_read, ++ .splice_write = iter_file_splice_write, ++ .fallocate = bch2_fallocate_dispatch, ++ .unlocked_ioctl = bch2_fs_file_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = bch2_compat_fs_ioctl, ++#endif ++ .remap_file_range = bch2_remap_file_range, ++}; ++ ++static const struct inode_operations bch_file_inode_operations = { ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .fiemap = bch2_fiemap, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct inode_operations bch_dir_inode_operations = { ++ .lookup = bch2_lookup, ++ .create = bch2_create, ++ .link = bch2_link, ++ .unlink = bch2_unlink, ++ .symlink = bch2_symlink, ++ .mkdir = bch2_mkdir, ++ .rmdir = bch2_unlink, ++ .mknod = bch2_mknod, ++ .rename = bch2_rename2, ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .tmpfile = bch2_tmpfile, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct file_operations bch_dir_file_operations = { ++ .llseek = bch2_dir_llseek, ++ .read = generic_read_dir, ++ .iterate_shared = bch2_vfs_readdir, ++ .fsync = bch2_fsync, ++ .unlocked_ioctl = bch2_fs_file_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = bch2_compat_fs_ioctl, ++#endif ++}; ++ ++static const struct inode_operations bch_symlink_inode_operations = { ++ .get_link = page_get_link, ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ 
.listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct inode_operations bch_special_inode_operations = { ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct address_space_operations bch_address_space_operations = { ++ .readpage = bch2_readpage, ++ .writepages = bch2_writepages, ++ .readahead = bch2_readahead, ++ .dirty_folio = filemap_dirty_folio, ++ .write_begin = bch2_write_begin, ++ .write_end = bch2_write_end, ++ .invalidate_folio = bch2_invalidate_folio, ++ .releasepage = bch2_releasepage, ++ .direct_IO = noop_direct_IO, ++#ifdef CONFIG_MIGRATION ++ .migratepage = bch2_migrate_page, ++#endif ++ .error_remove_page = generic_error_remove_page, ++}; ++ ++struct bcachefs_fid { ++ u64 inum; ++ u32 subvol; ++ u32 gen; ++} __packed; ++ ++struct bcachefs_fid_with_parent { ++ struct bcachefs_fid fid; ++ struct bcachefs_fid dir; ++} __packed; ++ ++static int bcachefs_fid_valid(int fh_len, int fh_type) ++{ ++ switch (fh_type) { ++ case FILEID_BCACHEFS_WITHOUT_PARENT: ++ return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); ++ case FILEID_BCACHEFS_WITH_PARENT: ++ return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); ++ default: ++ return false; ++ } ++} ++ ++static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) ++{ ++ return (struct bcachefs_fid) { ++ .inum = inode->ei_inode.bi_inum, ++ .subvol = inode->ei_subvol, ++ .gen = inode->ei_inode.bi_generation, ++ }; ++} ++ ++static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, ++ struct inode *vdir) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ ++ if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32)) ++ return FILEID_INVALID; ++ ++ if (!S_ISDIR(inode->v.i_mode) && dir) { ++ struct bcachefs_fid_with_parent *fid = (void *) fh; ++ ++ fid->fid = bch2_inode_to_fid(inode); ++ fid->dir = bch2_inode_to_fid(dir); ++ ++ *len = sizeof(*fid) / sizeof(u32); ++ return FILEID_BCACHEFS_WITH_PARENT; ++ } else { ++ struct bcachefs_fid *fid = (void *) fh; ++ ++ *fid = bch2_inode_to_fid(inode); ++ ++ *len = sizeof(*fid) / sizeof(u32); ++ return FILEID_BCACHEFS_WITHOUT_PARENT; ++ } ++} ++ ++static struct inode *bch2_nfs_get_inode(struct super_block *sb, ++ struct bcachefs_fid fid) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { ++ .subvol = fid.subvol, ++ .inum = fid.inum, ++ }); ++ if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { ++ iput(vinode); ++ vinode = ERR_PTR(-ESTALE); ++ } ++ return vinode; ++} ++ ++static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, ++ int fh_len, int fh_type) ++{ ++ struct bcachefs_fid *fid = (void *) _fid; ++ ++ if (!bcachefs_fid_valid(fh_len, fh_type)) ++ return NULL; ++ ++ return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); ++} ++ ++static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, ++ int fh_len, int fh_type) ++{ ++ struct bcachefs_fid_with_parent *fid = (void *) _fid; ++ ++ if (!bcachefs_fid_valid(fh_len, fh_type) || ++ fh_type != FILEID_BCACHEFS_WITH_PARENT) ++ return NULL; ++ ++ return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); ++} ++ ++static struct dentry *bch2_get_parent(struct dentry *child) ++{ ++ 
struct bch_inode_info *inode = to_bch_ei(child->d_inode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ subvol_inum parent_inum = { ++ .subvol = inode->ei_inode.bi_parent_subvol ?: ++ inode->ei_subvol, ++ .inum = inode->ei_inode.bi_dir, ++ }; ++ ++ if (!parent_inum.inum) ++ return NULL; ++ ++ return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); ++} ++ ++static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) ++{ ++ struct bch_inode_info *inode = to_bch_ei(child->d_inode); ++ struct bch_inode_info *dir = to_bch_ei(parent->d_inode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter iter1; ++ struct btree_iter iter2; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent d; ++ struct bch_inode_unpacked inode_u; ++ subvol_inum target; ++ u32 snapshot; ++ unsigned name_len; ++ int ret; ++ ++ if (!S_ISDIR(dir->v.i_mode)) ++ return -EINVAL; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents, ++ POS(dir->ei_inode.bi_inum, 0), 0); ++ bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents, ++ POS(dir->ei_inode.bi_inum, 0), 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_btree_iter_set_snapshot(&iter1, snapshot); ++ bch2_btree_iter_set_snapshot(&iter2, snapshot); ++ ++ ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u); ++ if (ret) ++ goto err; ++ ++ if (inode_u.bi_dir == dir->ei_inode.bi_inum) { ++ bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); ++ ++ k = bch2_btree_iter_peek_slot(&iter1); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_dirent) { ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ d = bkey_s_c_to_dirent(k); ++ ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); ++ if (ret > 0) ++ ret = -ENOENT; ++ if (ret) ++ goto err; ++ ++ if (target.subvol == inode->ei_subvol && ++ target.inum == inode->ei_inode.bi_inum) ++ goto found; ++ } else { ++ /* ++ * File with multiple hardlinks and our backref is to the wrong ++ * directory - linear search: ++ */ ++ for_each_btree_key_continue_norestart(iter2, 0, k, ret) { ++ if (k.k->p.inode > dir->ei_inode.bi_inum) ++ break; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ d = bkey_s_c_to_dirent(k); ++ ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); ++ if (ret < 0) ++ break; ++ if (ret) ++ continue; ++ ++ if (target.subvol == inode->ei_subvol && ++ target.inum == inode->ei_inode.bi_inum) ++ goto found; ++ } ++ } ++ ++ ret = -ENOENT; ++ goto err; ++found: ++ name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX); ++ ++ memcpy(name, d.v->d_name, name_len); ++ name[name_len] = '\0'; ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_iter_exit(&trans, &iter1); ++ bch2_trans_iter_exit(&trans, &iter2); ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++static const struct export_operations bch_export_ops = { ++ .encode_fh = bch2_encode_fh, ++ .fh_to_dentry = bch2_fh_to_dentry, ++ .fh_to_parent = bch2_fh_to_parent, ++ .get_parent = bch2_get_parent, ++ .get_name = bch2_get_name, ++}; ++ ++static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ struct bch_subvolume *subvol) ++{ ++ bch2_inode_update_after_write(trans, inode, bi, ~0); ++ ++ if (BCH_SUBVOLUME_SNAP(subvol)) 
++ set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); ++ else ++ clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); ++ ++ inode->v.i_blocks = bi->bi_sectors; ++ inode->v.i_ino = bi->bi_inum; ++ inode->v.i_rdev = bi->bi_dev; ++ inode->v.i_generation = bi->bi_generation; ++ inode->v.i_size = bi->bi_size; ++ ++ inode->ei_flags = 0; ++ inode->ei_quota_reserved = 0; ++ inode->ei_qid = bch_qid(bi); ++ inode->ei_subvol = inum.subvol; ++ ++ inode->v.i_mapping->a_ops = &bch_address_space_operations; ++ ++ switch (inode->v.i_mode & S_IFMT) { ++ case S_IFREG: ++ inode->v.i_op = &bch_file_inode_operations; ++ inode->v.i_fop = &bch_file_operations; ++ break; ++ case S_IFDIR: ++ inode->v.i_op = &bch_dir_inode_operations; ++ inode->v.i_fop = &bch_dir_file_operations; ++ break; ++ case S_IFLNK: ++ inode_nohighmem(&inode->v); ++ inode->v.i_op = &bch_symlink_inode_operations; ++ break; ++ default: ++ init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); ++ inode->v.i_op = &bch_special_inode_operations; ++ break; ++ } ++} ++ ++static struct inode *bch2_alloc_inode(struct super_block *sb) ++{ ++ struct bch_inode_info *inode; ++ ++ inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); ++ if (!inode) ++ return NULL; ++ ++ inode_init_once(&inode->v); ++ mutex_init(&inode->ei_update_lock); ++ pagecache_lock_init(&inode->ei_pagecache_lock); ++ mutex_init(&inode->ei_quota_lock); ++ ++ return &inode->v; ++} ++ ++static void bch2_i_callback(struct rcu_head *head) ++{ ++ struct inode *vinode = container_of(head, struct inode, i_rcu); ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ ++ kmem_cache_free(bch2_inode_cache, inode); ++} ++ ++static void bch2_destroy_inode(struct inode *vinode) ++{ ++ call_rcu(&vinode->i_rcu, bch2_i_callback); ++} ++ ++static int inode_update_times_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); ++ bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime); ++ bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime); ++ ++ return 0; ++} ++ ++static int bch2_vfs_write_inode(struct inode *vinode, ++ struct writeback_control *wbc) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ++ ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++static void bch2_evict_inode(struct inode *vinode) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ ++ truncate_inode_pages_final(&inode->v.i_data); ++ ++ clear_inode(&inode->v); ++ ++ BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); ++ ++ if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), ++ KEY_TYPE_QUOTA_WARN); ++ bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, ++ KEY_TYPE_QUOTA_WARN); ++ bch2_inode_rm(c, inode_inum(inode)); ++ } ++} ++ ++void bch2_evict_subvolume_inodes(struct bch_fs *c, ++ snapshot_id_list *s) ++{ ++ struct super_block *sb = c->vfs_sb; ++ struct inode *inode; ++ ++ spin_lock(&sb->s_inode_list_lock); ++ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { ++ if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || ++ (inode->i_state & I_FREEING)) ++ continue; ++ ++ d_mark_dontcache(inode); ++ d_prune_aliases(inode); ++ } ++ 
spin_unlock(&sb->s_inode_list_lock); ++again: ++ cond_resched(); ++ spin_lock(&sb->s_inode_list_lock); ++ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { ++ if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || ++ (inode->i_state & I_FREEING)) ++ continue; ++ ++ if (!(inode->i_state & I_DONTCACHE)) { ++ d_mark_dontcache(inode); ++ d_prune_aliases(inode); ++ } ++ ++ spin_lock(&inode->i_lock); ++ if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && ++ !(inode->i_state & I_FREEING)) { ++ wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW); ++ DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); ++ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); ++ spin_unlock(&inode->i_lock); ++ spin_unlock(&sb->s_inode_list_lock); ++ schedule(); ++ finish_wait(wq, &wait.wq_entry); ++ goto again; ++ } ++ ++ spin_unlock(&inode->i_lock); ++ } ++ spin_unlock(&sb->s_inode_list_lock); ++} ++ ++static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) ++{ ++ struct super_block *sb = dentry->d_sb; ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); ++ unsigned shift = sb->s_blocksize_bits - 9; ++ /* ++ * this assumes inodes take up 64 bytes, which is a decent average ++ * number: ++ */ ++ u64 avail_inodes = ((usage.capacity - usage.used) << 3); ++ u64 fsid; ++ ++ buf->f_type = BCACHEFS_STATFS_MAGIC; ++ buf->f_bsize = sb->s_blocksize; ++ buf->f_blocks = usage.capacity >> shift; ++ buf->f_bfree = usage.free >> shift; ++ buf->f_bavail = avail_factor(usage.free) >> shift; ++ ++ buf->f_files = usage.nr_inodes + avail_inodes; ++ buf->f_ffree = avail_inodes; ++ ++ fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ ++ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); ++ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; ++ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; ++ buf->f_namelen = BCH_NAME_MAX; ++ ++ return 0; ++} ++ ++static int bch2_sync_fs(struct super_block *sb, int wait) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ if (c->opts.journal_flush_disabled) ++ return 0; ++ ++ if (!wait) { ++ bch2_journal_flush_async(&c->journal, NULL); ++ return 0; ++ } ++ ++ return bch2_journal_flush(&c->journal); ++} ++ ++static struct bch_fs *bch2_path_to_fs(const char *path) ++{ ++ struct bch_fs *c; ++ dev_t dev; ++ int ret; ++ ++ ret = lookup_bdev(path, &dev); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ c = bch2_dev_to_fs(dev); ++ if (c) ++ closure_put(&c->cl); ++ return c ?: ERR_PTR(-ENOENT); ++} ++ ++static char **split_devs(const char *_dev_name, unsigned *nr) ++{ ++ char *dev_name = NULL, **devs = NULL, *s; ++ size_t i, nr_devs = 0; ++ ++ dev_name = kstrdup(_dev_name, GFP_KERNEL); ++ if (!dev_name) ++ return NULL; ++ ++ for (s = dev_name; s; s = strchr(s + 1, ':')) ++ nr_devs++; ++ ++ devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL); ++ if (!devs) { ++ kfree(dev_name); ++ return NULL; ++ } ++ ++ for (i = 0, s = dev_name; ++ s; ++ (s = strchr(s, ':')) && (*s++ = '\0')) ++ devs[i++] = s; ++ ++ *nr = nr_devs; ++ return devs; ++} ++ ++static int bch2_remount(struct super_block *sb, int *flags, char *data) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_opts opts = bch2_opts_empty(); ++ int ret; ++ ++ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); ++ ++ ret = bch2_parse_mount_opts(c, &opts, data); ++ if (ret) ++ return ret; ++ ++ if (opts.read_only != c->opts.read_only) { ++ down_write(&c->state_lock); ++ ++ if (opts.read_only) { ++ bch2_fs_read_only(c); ++ ++ sb->s_flags |= SB_RDONLY; ++ } else { ++ ret = 
bch2_fs_read_write(c); ++ if (ret) { ++ bch_err(c, "error going rw: %i", ret); ++ up_write(&c->state_lock); ++ return -EINVAL; ++ } ++ ++ sb->s_flags &= ~SB_RDONLY; ++ } ++ ++ c->opts.read_only = opts.read_only; ++ ++ up_write(&c->state_lock); ++ } ++ ++ if (opts.errors >= 0) ++ c->opts.errors = opts.errors; ++ ++ return ret; ++} ++ ++static int bch2_show_devname(struct seq_file *seq, struct dentry *root) ++{ ++ struct bch_fs *c = root->d_sb->s_fs_info; ++ struct bch_dev *ca; ++ unsigned i; ++ bool first = true; ++ ++ for_each_online_member(ca, c, i) { ++ if (!first) ++ seq_putc(seq, ':'); ++ first = false; ++ seq_puts(seq, "/dev/"); ++ seq_puts(seq, ca->name); ++ } ++ ++ return 0; ++} ++ ++static int bch2_show_options(struct seq_file *seq, struct dentry *root) ++{ ++ struct bch_fs *c = root->d_sb->s_fs_info; ++ enum bch_opt_id i; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ const struct bch_option *opt = &bch2_opt_table[i]; ++ u64 v = bch2_opt_get_by_id(&c->opts, i); ++ ++ if (!(opt->flags & OPT_MOUNT)) ++ continue; ++ ++ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) ++ continue; ++ ++ printbuf_reset(&buf); ++ bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v, ++ OPT_SHOW_MOUNT_STYLE); ++ seq_putc(seq, ','); ++ seq_puts(seq, buf.buf); ++ } ++ ++ if (buf.allocation_failure) ++ ret = -ENOMEM; ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static void bch2_put_super(struct super_block *sb) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ __bch2_fs_stop(c); ++} ++ ++static const struct super_operations bch_super_operations = { ++ .alloc_inode = bch2_alloc_inode, ++ .destroy_inode = bch2_destroy_inode, ++ .write_inode = bch2_vfs_write_inode, ++ .evict_inode = bch2_evict_inode, ++ .sync_fs = bch2_sync_fs, ++ .statfs = bch2_statfs, ++ .show_devname = bch2_show_devname, ++ .show_options = bch2_show_options, ++ .remount_fs = bch2_remount, ++ .put_super = bch2_put_super, ++#if 0 ++ .freeze_fs = bch2_freeze, ++ .unfreeze_fs = bch2_unfreeze, ++#endif ++}; ++ ++static int bch2_set_super(struct super_block *s, void *data) ++{ ++ s->s_fs_info = data; ++ return 0; ++} ++ ++static int bch2_noset_super(struct super_block *s, void *data) ++{ ++ return -EBUSY; ++} ++ ++static int bch2_test_super(struct super_block *s, void *data) ++{ ++ struct bch_fs *c = s->s_fs_info; ++ struct bch_fs **devs = data; ++ unsigned i; ++ ++ if (!c) ++ return false; ++ ++ for (i = 0; devs[i]; i++) ++ if (c != devs[i]) ++ return false; ++ return true; ++} ++ ++static struct dentry *bch2_mount(struct file_system_type *fs_type, ++ int flags, const char *dev_name, void *data) ++{ ++ struct bch_fs *c; ++ struct bch_dev *ca; ++ struct super_block *sb; ++ struct inode *vinode; ++ struct bch_opts opts = bch2_opts_empty(); ++ char **devs; ++ struct bch_fs **devs_to_fs = NULL; ++ unsigned i, nr_devs; ++ int ret; ++ ++ opt_set(opts, read_only, (flags & SB_RDONLY) != 0); ++ ++ ret = bch2_parse_mount_opts(NULL, &opts, data); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ if (!dev_name || strlen(dev_name) == 0) ++ return ERR_PTR(-EINVAL); ++ ++ devs = split_devs(dev_name, &nr_devs); ++ if (!devs) ++ return ERR_PTR(-ENOMEM); ++ ++ devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL); ++ if (!devs_to_fs) { ++ sb = ERR_PTR(-ENOMEM); ++ goto got_sb; ++ } ++ ++ for (i = 0; i < nr_devs; i++) ++ devs_to_fs[i] = bch2_path_to_fs(devs[i]); ++ ++ sb = sget(fs_type, bch2_test_super, bch2_noset_super, ++ flags|SB_NOSEC, devs_to_fs); ++ if (!IS_ERR(sb)) ++ goto got_sb; ++ ++ c = bch2_fs_open(devs, 
nr_devs, opts); ++ if (IS_ERR(c)) { ++ sb = ERR_CAST(c); ++ goto got_sb; ++ } ++ ++ /* Some options can't be parsed until after the fs is started: */ ++ ret = bch2_parse_mount_opts(c, &opts, data); ++ if (ret) { ++ bch2_fs_stop(c); ++ sb = ERR_PTR(ret); ++ goto got_sb; ++ } ++ ++ bch2_opts_apply(&c->opts, opts); ++ ++ sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); ++ if (IS_ERR(sb)) ++ bch2_fs_stop(c); ++got_sb: ++ kfree(devs_to_fs); ++ kfree(devs[0]); ++ kfree(devs); ++ ++ if (IS_ERR(sb)) ++ return ERR_CAST(sb); ++ ++ c = sb->s_fs_info; ++ ++ if (sb->s_root) { ++ if ((flags ^ sb->s_flags) & SB_RDONLY) { ++ ret = -EBUSY; ++ goto err_put_super; ++ } ++ goto out; ++ } ++ ++ sb->s_blocksize = block_bytes(c); ++ sb->s_blocksize_bits = ilog2(block_bytes(c)); ++ sb->s_maxbytes = MAX_LFS_FILESIZE; ++ sb->s_op = &bch_super_operations; ++ sb->s_export_op = &bch_export_ops; ++#ifdef CONFIG_BCACHEFS_QUOTA ++ sb->s_qcop = &bch2_quotactl_operations; ++ sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; ++#endif ++ sb->s_xattr = bch2_xattr_handlers; ++ sb->s_magic = BCACHEFS_STATFS_MAGIC; ++ sb->s_time_gran = c->sb.nsec_per_time_unit; ++ sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; ++ sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); ++ c->vfs_sb = sb; ++ strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); ++ ++ ret = super_setup_bdi(sb); ++ if (ret) ++ goto err_put_super; ++ ++ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; ++ ++ for_each_online_member(ca, c, i) { ++ struct block_device *bdev = ca->disk_sb.bdev; ++ ++ /* XXX: create an anonymous device for multi device filesystems */ ++ sb->s_bdev = bdev; ++ sb->s_dev = bdev->bd_dev; ++ percpu_ref_put(&ca->io_ref); ++ break; ++ } ++ ++ c->dev = sb->s_dev; ++ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ if (c->opts.acl) ++ sb->s_flags |= SB_POSIXACL; ++#endif ++ ++ sb->s_shrink.seeks = 0; ++ ++ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ++ ret = PTR_ERR_OR_ZERO(vinode); ++ if (ret) { ++ bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret)); ++ goto err_put_super; ++ } ++ ++ sb->s_root = d_make_root(vinode); ++ if (!sb->s_root) { ++ bch_err(c, "error mounting: error allocating root dentry"); ++ ret = -ENOMEM; ++ goto err_put_super; ++ } ++ ++ sb->s_flags |= SB_ACTIVE; ++out: ++ return dget(sb->s_root); ++ ++err_put_super: ++ deactivate_locked_super(sb); ++ return ERR_PTR(ret); ++} ++ ++static void bch2_kill_sb(struct super_block *sb) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ generic_shutdown_super(sb); ++ bch2_fs_free(c); ++} ++ ++static struct file_system_type bcache_fs_type = { ++ .owner = THIS_MODULE, ++ .name = "bcachefs", ++ .mount = bch2_mount, ++ .kill_sb = bch2_kill_sb, ++ .fs_flags = FS_REQUIRES_DEV, ++}; ++ ++MODULE_ALIAS_FS("bcachefs"); ++ ++void bch2_vfs_exit(void) ++{ ++ unregister_filesystem(&bcache_fs_type); ++ if (bch2_inode_cache) ++ kmem_cache_destroy(bch2_inode_cache); ++} ++ ++int __init bch2_vfs_init(void) ++{ ++ int ret = -ENOMEM; ++ ++ bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0); ++ if (!bch2_inode_cache) ++ goto err; ++ ++ ret = register_filesystem(&bcache_fs_type); ++ if (ret) ++ goto err; ++ ++ return 0; ++err: ++ bch2_vfs_exit(); ++ return ret; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +new file mode 100644 +index 000000000000..9f4b57e30e2a +--- /dev/null ++++ b/fs/bcachefs/fs.h +@@ -0,0 +1,208 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_H ++#define _BCACHEFS_FS_H ++ 
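The bch2_mount()/split_devs() code above takes a colon-separated device list in dev_name (e.g. "/dev/sda:/dev/sdb") and opens all members as a single filesystem. Purely as an illustration of that parsing step, and not part of the patch itself, the following minimal standalone userspace sketch (the file name parse_devs.c and all names are illustrative, plain libc only) shows the same colon-splitting idea:

/* parse_devs.c - illustrative userspace sketch of colon-separated
 * device-list parsing, mirroring the idea of split_devs() above.
 * Build: cc -o parse_devs parse_devs.c
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char **argv)
{
	const char *arg = argc > 1 ? argv[1] : "/dev/sda:/dev/sdb:/dev/sdc";
	char *list = strdup(arg);	/* private writable copy, like the kstrdup() in split_devs() */
	char *s, *next;
	unsigned nr = 0;

	if (!list)
		return 1;

	for (s = list; s; s = next) {
		next = strchr(s, ':');	/* each ':' separates two device paths */
		if (next)
			*next++ = '\0';
		printf("device %u: %s\n", nr++, s);
	}

	printf("%u device(s)\n", nr);
	free(list);
	return 0;
}

The kernel code above keeps both the pointer array and the backing kstrdup() buffer so the got_sb path can release them with kfree(devs[0]) and kfree(devs).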
++#include "inode.h" ++#include "opts.h" ++#include "str_hash.h" ++#include "quota_types.h" ++ ++#include ++#include ++ ++/* ++ * Two-state lock - can be taken for add or block - both states are shared, ++ * like read side of rwsem, but conflict with other state: ++ */ ++struct pagecache_lock { ++ atomic_long_t v; ++ wait_queue_head_t wait; ++}; ++ ++static inline void pagecache_lock_init(struct pagecache_lock *lock) ++{ ++ atomic_long_set(&lock->v, 0); ++ init_waitqueue_head(&lock->wait); ++} ++ ++void bch2_pagecache_add_put(struct pagecache_lock *); ++bool bch2_pagecache_add_tryget(struct pagecache_lock *); ++void bch2_pagecache_add_get(struct pagecache_lock *); ++void bch2_pagecache_block_put(struct pagecache_lock *); ++void bch2_pagecache_block_get(struct pagecache_lock *); ++ ++struct bch_inode_info { ++ struct inode v; ++ unsigned long ei_flags; ++ ++ struct mutex ei_update_lock; ++ u64 ei_quota_reserved; ++ unsigned long ei_last_dirtied; ++ ++ struct pagecache_lock ei_pagecache_lock; ++ ++ struct mutex ei_quota_lock; ++ struct bch_qid ei_qid; ++ ++ u32 ei_subvol; ++ ++ /* copy of inode in btree: */ ++ struct bch_inode_unpacked ei_inode; ++}; ++ ++static inline subvol_inum inode_inum(struct bch_inode_info *inode) ++{ ++ return (subvol_inum) { ++ .subvol = inode->ei_subvol, ++ .inum = inode->ei_inode.bi_inum, ++ }; ++} ++ ++/* ++ * Set if we've gotten a btree error for this inode, and thus the vfs inode and ++ * btree inode may be inconsistent: ++ */ ++#define EI_INODE_ERROR 0 ++ ++/* ++ * Set in the inode is in a snapshot subvolume - we don't do quota accounting in ++ * those: ++ */ ++#define EI_INODE_SNAPSHOT 1 ++ ++#define to_bch_ei(_inode) \ ++ container_of_or_null(_inode, struct bch_inode_info, v) ++ ++static inline int ptrcmp(void *l, void *r) ++{ ++ return cmp_int(l, r); ++} ++ ++enum bch_inode_lock_op { ++ INODE_LOCK = (1U << 0), ++ INODE_PAGECACHE_BLOCK = (1U << 1), ++ INODE_UPDATE_LOCK = (1U << 2), ++}; ++ ++#define bch2_lock_inodes(_locks, ...) \ ++do { \ ++ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ ++ unsigned i; \ ++ \ ++ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ ++ \ ++ for (i = 1; i < ARRAY_SIZE(a); i++) \ ++ if (a[i] != a[i - 1]) { \ ++ if ((_locks) & INODE_LOCK) \ ++ down_write_nested(&a[i]->v.i_rwsem, i); \ ++ if ((_locks) & INODE_PAGECACHE_BLOCK) \ ++ bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ ++ if ((_locks) & INODE_UPDATE_LOCK) \ ++ mutex_lock_nested(&a[i]->ei_update_lock, i);\ ++ } \ ++} while (0) ++ ++#define bch2_unlock_inodes(_locks, ...) 
\ ++do { \ ++ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ ++ unsigned i; \ ++ \ ++ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ ++ \ ++ for (i = 1; i < ARRAY_SIZE(a); i++) \ ++ if (a[i] != a[i - 1]) { \ ++ if ((_locks) & INODE_LOCK) \ ++ up_write(&a[i]->v.i_rwsem); \ ++ if ((_locks) & INODE_PAGECACHE_BLOCK) \ ++ bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ ++ if ((_locks) & INODE_UPDATE_LOCK) \ ++ mutex_unlock(&a[i]->ei_update_lock); \ ++ } \ ++} while (0) ++ ++static inline struct bch_inode_info *file_bch_inode(struct file *file) ++{ ++ return to_bch_ei(file_inode(file)); ++} ++ ++static inline bool inode_attr_changing(struct bch_inode_info *dir, ++ struct bch_inode_info *inode, ++ enum inode_opt_id id) ++{ ++ return !(inode->ei_inode.bi_fields_set & (1 << id)) && ++ bch2_inode_opt_get(&dir->ei_inode, id) != ++ bch2_inode_opt_get(&inode->ei_inode, id); ++} ++ ++static inline bool inode_attrs_changing(struct bch_inode_info *dir, ++ struct bch_inode_info *inode) ++{ ++ unsigned id; ++ ++ for (id = 0; id < Inode_opt_nr; id++) ++ if (inode_attr_changing(dir, inode, id)) ++ return true; ++ ++ return false; ++} ++ ++struct bch_inode_unpacked; ++ ++#ifndef NO_BCACHEFS_FS ++ ++struct bch_inode_info * ++__bch2_create(struct user_namespace *, struct bch_inode_info *, ++ struct dentry *, umode_t, dev_t, subvol_inum, unsigned); ++ ++int bch2_fs_quota_transfer(struct bch_fs *, ++ struct bch_inode_info *, ++ struct bch_qid, ++ unsigned, ++ enum quota_acct_mode); ++ ++static inline int bch2_set_projid(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ u32 projid) ++{ ++ struct bch_qid qid = inode->ei_qid; ++ ++ qid.q[QTYP_PRJ] = projid; ++ ++ return bch2_fs_quota_transfer(c, inode, qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++} ++ ++struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); ++ ++/* returns 0 if we want to do the update, or error is passed up */ ++typedef int (*inode_set_fn)(struct bch_inode_info *, ++ struct bch_inode_unpacked *, void *); ++ ++void bch2_inode_update_after_write(struct btree_trans *, ++ struct bch_inode_info *, ++ struct bch_inode_unpacked *, ++ unsigned); ++int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, ++ inode_set_fn, void *, unsigned); ++ ++int bch2_setattr_nonsize(struct user_namespace *, ++ struct bch_inode_info *, ++ struct iattr *); ++int __bch2_unlink(struct inode *, struct dentry *, bool); ++ ++void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); ++ ++void bch2_vfs_exit(void); ++int bch2_vfs_init(void); ++ ++#else ++ ++static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, ++ snapshot_id_list *s) {} ++static inline void bch2_vfs_exit(void) {} ++static inline int bch2_vfs_init(void) { return 0; } ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++#endif /* _BCACHEFS_FS_H */ +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +new file mode 100644 +index 000000000000..bb8cab7cb405 +--- /dev/null ++++ b/fs/bcachefs/fsck.c +@@ -0,0 +1,2390 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_buf.h" ++#include "btree_update.h" ++#include "darray.h" ++#include "dirent.h" ++#include "error.h" ++#include "fs-common.h" ++#include "fsck.h" ++#include "inode.h" ++#include "keylist.h" ++#include "subvolume.h" ++#include "super.h" ++#include "xattr.h" ++ ++#include ++#include /* struct qstr */ ++ ++#define QSTR(n) { { { .len = strlen(n) } }, .name = n } ++ ++static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, ++ u32 snapshot) ++{ ++ 
struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 sectors = 0; ++ int ret; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_extents, ++ SPOS(inum, 0, snapshot), 0, k, ret) { ++ if (k.k->p.inode != inum) ++ break; ++ ++ if (bkey_extent_is_allocation(k.k)) ++ sectors += k.k->size; ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret ?: sectors; ++} ++ ++static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, ++ u32 snapshot) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent d; ++ u64 subdirs = 0; ++ int ret; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_dirents, ++ SPOS(inum, 0, snapshot), 0, k, ret) { ++ if (k.k->p.inode != inum) ++ break; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ d = bkey_s_c_to_dirent(k); ++ if (d.v->d_type == DT_DIR) ++ subdirs++; ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret ?: subdirs; ++} ++ ++static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, ++ u32 *subvol) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, ++ POS(0, snapshot), 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_snapshot) { ++ bch_err(trans->c, "snapshot %u not found", snapshot); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++ ++} ++ ++static int __subvol_lookup(struct btree_trans *trans, u32 subvol, ++ u32 *snapshot, u64 *inum) ++{ ++ struct bch_subvolume s; ++ int ret; ++ ++ ret = bch2_subvolume_get(trans, subvol, false, 0, &s); ++ ++ *snapshot = le32_to_cpu(s.snapshot); ++ *inum = le64_to_cpu(s.inode); ++ return ret; ++} ++ ++static int subvol_lookup(struct btree_trans *trans, u32 subvol, ++ u32 *snapshot, u64 *inum) ++{ ++ return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum)); ++} ++ ++static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, ++ POS(0, inode_nr), ++ BTREE_ITER_ALL_SNAPSHOTS); ++ k = bch2_btree_iter_peek(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k || bkey_cmp(k.k->p, POS(0, inode_nr))) { ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ ret = bch2_inode_unpack(k, inode); ++err: ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(trans->c, "error fetching inode %llu: %s", ++ inode_nr, bch2_err_str(ret)); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode, ++ u32 *snapshot) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, ++ SPOS(0, inode_nr, *snapshot), 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ ret = bkey_is_inode(k.k) ++ ? 
bch2_inode_unpack(k, inode) ++ : -ENOENT; ++ if (!ret) ++ *snapshot = iter.pos.snapshot; ++err: ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(trans->c, "error fetching inode %llu:%u: %s", ++ inode_nr, *snapshot, bch2_err_str(ret)); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int lookup_inode(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode, ++ u32 *snapshot) ++{ ++ return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot)); ++} ++ ++static int __lookup_dirent(struct btree_trans *trans, ++ struct bch_hash_info hash_info, ++ subvol_inum dir, struct qstr *name, ++ u64 *target, unsigned *type) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c_dirent d; ++ int ret; ++ ++ ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc, ++ &hash_info, dir, name, 0); ++ if (ret) ++ return ret; ++ ++ d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); ++ *target = le64_to_cpu(d.v->d_inum); ++ *type = d.v->d_type; ++ bch2_trans_iter_exit(trans, &iter); ++ return 0; ++} ++ ++static int __write_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u32 snapshot) ++{ ++ struct btree_iter iter; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, ++ SPOS(0, inode->bi_inum, snapshot), ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_inode_write(trans, &iter, inode); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int write_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u32 snapshot) ++{ ++ int ret = commit_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ __write_inode(trans, inode, snapshot)); ++ if (ret) ++ bch_err(trans->c, "error in fsck: error updating inode: %s", ++ bch2_err_str(ret)); ++ return ret; ++} ++ ++static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter = { NULL }; ++ struct bkey_i_inode_generation delete; ++ struct bch_inode_unpacked inode_u; ++ struct bkey_s_c k; ++ int ret; ++ ++ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, ++ SPOS(inum, 0, snapshot), ++ SPOS(inum, U64_MAX, snapshot), ++ 0, NULL) ?: ++ bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, ++ SPOS(inum, 0, snapshot), ++ SPOS(inum, U64_MAX, snapshot), ++ 0, NULL) ?: ++ bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, ++ SPOS(inum, 0, snapshot), ++ SPOS(inum, U64_MAX, snapshot), ++ 0, NULL); ++ if (ret) ++ goto err; ++retry: ++ bch2_trans_begin(trans); ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, ++ SPOS(0, inum, snapshot), BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!bkey_is_inode(k.k)) { ++ bch2_fs_inconsistent(c, ++ "inode %llu:%u not found when deleting", ++ inum, snapshot); ++ ret = -EIO; ++ goto err; ++ } ++ ++ bch2_inode_unpack(k, &inode_u); ++ ++ /* Subvolume root? 
*/ ++ if (inode_u.bi_subvol) ++ bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); ++ ++ bkey_inode_generation_init(&delete.k_i); ++ delete.k.p = iter.pos; ++ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); ++ ++ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ return ret; ++} ++ ++static int __remove_dirent(struct btree_trans *trans, struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bch_inode_unpacked dir_inode; ++ struct bch_hash_info dir_hash_info; ++ int ret; ++ ++ ret = lookup_first_inode(trans, pos.inode, &dir_inode); ++ if (ret) ++ goto err; ++ ++ dir_hash_info = bch2_hash_info_init(c, &dir_inode); ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); ++ ++ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, ++ &dir_hash_info, &iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ bch2_trans_iter_exit(trans, &iter); ++err: ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from __remove_dirent(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++/* Get lost+found, create if it doesn't exist: */ ++static int lookup_lostfound(struct btree_trans *trans, u32 subvol, ++ struct bch_inode_unpacked *lostfound) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked root; ++ struct bch_hash_info root_hash_info; ++ struct qstr lostfound_str = QSTR("lost+found"); ++ subvol_inum root_inum = { .subvol = subvol }; ++ u64 inum = 0; ++ unsigned d_type = 0; ++ u32 snapshot; ++ int ret; ++ ++ ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); ++ if (ret) ++ return ret; ++ ++ ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot); ++ if (ret) ++ return ret; ++ ++ root_hash_info = bch2_hash_info_init(c, &root); ++ ++ ret = __lookup_dirent(trans, root_hash_info, root_inum, ++ &lostfound_str, &inum, &d_type); ++ if (ret == -ENOENT) { ++ bch_notice(c, "creating lost+found"); ++ goto create_lostfound; ++ } ++ ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret)); ++ if (ret) ++ return ret; ++ ++ if (d_type != DT_DIR) { ++ bch_err(c, "error looking up lost+found: not a directory"); ++ return ret; ++ } ++ ++ /* ++ * The check_dirents pass has already run, dangling dirents ++ * shouldn't exist here: ++ */ ++ return __lookup_inode(trans, inum, lostfound, &snapshot); ++ ++create_lostfound: ++ bch2_inode_init_early(c, lostfound); ++ ++ ret = bch2_create_trans(trans, root_inum, &root, ++ lostfound, &lostfound_str, ++ 0, 0, S_IFDIR|0700, 0, NULL, NULL, ++ (subvol_inum) { }, 0); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error creating lost+found: %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++static int __reattach_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u32 inode_snapshot) ++{ ++ struct bch_hash_info dir_hash; ++ struct bch_inode_unpacked lostfound; ++ char name_buf[20]; ++ struct qstr name; ++ u64 dir_offset = 0; ++ u32 subvol; ++ int ret; ++ ++ ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol); ++ if (ret) ++ return ret; ++ ++ ret = lookup_lostfound(trans, subvol, &lostfound); ++ if (ret) ++ return ret; ++ ++ if (S_ISDIR(inode->bi_mode)) { ++ 
lostfound.bi_nlink++; ++ ++ ret = __write_inode(trans, &lostfound, U32_MAX); ++ if (ret) ++ return ret; ++ } ++ ++ dir_hash = bch2_hash_info_init(trans->c, &lostfound); ++ ++ snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); ++ name = (struct qstr) QSTR(name_buf); ++ ++ ret = bch2_dirent_create(trans, ++ (subvol_inum) { ++ .subvol = subvol, ++ .inum = lostfound.bi_inum, ++ }, ++ &dir_hash, ++ inode_d_type(inode), ++ &name, inode->bi_inum, &dir_offset, ++ BCH_HASH_SET_MUST_CREATE); ++ if (ret) ++ return ret; ++ ++ inode->bi_dir = lostfound.bi_inum; ++ inode->bi_dir_offset = dir_offset; ++ ++ return __write_inode(trans, inode, inode_snapshot); ++} ++ ++static int reattach_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u32 inode_snapshot) ++{ ++ int ret = commit_do(trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ __reattach_inode(trans, inode, inode_snapshot)); ++ if (ret) { ++ bch_err(trans->c, "error reattaching inode %llu: %s", ++ inode->bi_inum, bch2_err_str(ret)); ++ return ret; ++ } ++ ++ return ret; ++} ++ ++static int remove_backpointer(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, ++ POS(inode->bi_dir, inode->bi_dir_offset), 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ if (k.k->type != KEY_TYPE_dirent) { ++ ret = -ENOENT; ++ goto out; ++ } ++ ++ ret = __remove_dirent(trans, k.k->p); ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++struct snapshots_seen_entry { ++ u32 id; ++ u32 equiv; ++}; ++ ++struct snapshots_seen { ++ struct bpos pos; ++ DARRAY(struct snapshots_seen_entry) ids; ++}; ++ ++static inline void snapshots_seen_exit(struct snapshots_seen *s) ++{ ++ darray_exit(&s->ids); ++} ++ ++static inline void snapshots_seen_init(struct snapshots_seen *s) ++{ ++ memset(s, 0, sizeof(*s)); ++} ++ ++static int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) ++{ ++ struct snapshots_seen_entry *i, n = { id, id }; ++ int ret; ++ ++ darray_for_each(s->ids, i) { ++ if (n.equiv < i->equiv) ++ break; ++ ++ if (i->equiv == n.equiv) { ++ bch_err(c, "adding duplicate snapshot in snapshots_seen_add()"); ++ return -EINVAL; ++ } ++ } ++ ++ ret = darray_insert_item(&s->ids, i - s->ids.data, n); ++ if (ret) ++ bch_err(c, "error reallocating snapshots_seen table (size %zu)", ++ s->ids.size); ++ return ret; ++} ++ ++static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, ++ enum btree_id btree_id, struct bpos pos) ++{ ++ struct snapshots_seen_entry *i, n = { ++ .id = pos.snapshot, ++ .equiv = bch2_snapshot_equiv(c, pos.snapshot), ++ }; ++ int ret; ++ ++ if (bkey_cmp(s->pos, pos)) ++ s->ids.nr = 0; ++ ++ pos.snapshot = n.equiv; ++ s->pos = pos; ++ ++ darray_for_each(s->ids, i) ++ if (i->equiv == n.equiv) { ++ if (i->id != n.id) { ++ bch_err(c, "snapshot deletion did not run correctly:\n" ++ " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", ++ bch2_btree_ids[btree_id], ++ pos.inode, pos.offset, ++ i->id, n.id, n.equiv); ++ return -BCH_ERR_need_snapshot_cleanup; ++ } ++ ++ return 0; ++ } ++ ++ ret = darray_push(&s->ids, n); ++ if (ret) ++ bch_err(c, "error reallocating snapshots_seen table (size %zu)", ++ s->ids.size); ++ return ret; ++} ++ ++/** ++ * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor, ++ * and @ancestor hasn't been overwritten in 
@seen ++ * ++ * That is, returns whether key in @ancestor snapshot is visible in @id snapshot ++ */ ++static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, ++ u32 id, u32 ancestor) ++{ ++ ssize_t i; ++ u32 top = seen->ids.nr ? seen->ids.data[seen->ids.nr - 1].equiv : 0; ++ ++ BUG_ON(id > ancestor); ++ BUG_ON(!bch2_snapshot_is_equiv(c, id)); ++ BUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); ++ ++ /* @ancestor should be the snapshot most recently added to @seen */ ++ BUG_ON(ancestor != seen->pos.snapshot); ++ BUG_ON(ancestor != top); ++ ++ if (id == ancestor) ++ return true; ++ ++ if (!bch2_snapshot_is_ancestor(c, id, ancestor)) ++ return false; ++ ++ for (i = seen->ids.nr - 2; ++ i >= 0 && seen->ids.data[i].equiv >= id; ++ --i) ++ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv) && ++ bch2_snapshot_is_ancestor(c, seen->ids.data[i].equiv, ancestor)) ++ return false; ++ ++ return true; ++} ++ ++/** ++ * ref_visible - given a key with snapshot id @src that points to a key with ++ * snapshot id @dst, test whether there is some snapshot in which @dst is ++ * visible. ++ * ++ * This assumes we're visiting @src keys in natural key order. ++ * ++ * @s - list of snapshot IDs already seen at @src ++ * @src - snapshot ID of src key ++ * @dst - snapshot ID of dst key ++ */ ++static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, ++ u32 src, u32 dst) ++{ ++ return dst <= src ++ ? key_visible_in_snapshot(c, s, dst, src) ++ : bch2_snapshot_is_ancestor(c, src, dst); ++} ++ ++#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ ++ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ ++ (_i)->snapshot <= (_snapshot); _i++) \ ++ if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) ++ ++struct inode_walker_entry { ++ struct bch_inode_unpacked inode; ++ u32 snapshot; ++ u64 count; ++}; ++ ++struct inode_walker { ++ bool first_this_inode; ++ u64 cur_inum; ++ ++ DARRAY(struct inode_walker_entry) inodes; ++}; ++ ++static void inode_walker_exit(struct inode_walker *w) ++{ ++ darray_exit(&w->inodes); ++} ++ ++static struct inode_walker inode_walker_init(void) ++{ ++ return (struct inode_walker) { 0, }; ++} ++ ++static int add_inode(struct bch_fs *c, struct inode_walker *w, ++ struct bkey_s_c inode) ++{ ++ struct bch_inode_unpacked u; ++ ++ BUG_ON(bch2_inode_unpack(inode, &u)); ++ ++ return darray_push(&w->inodes, ((struct inode_walker_entry) { ++ .inode = u, ++ .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot), ++ })); ++} ++ ++static int __walk_inode(struct btree_trans *trans, ++ struct inode_walker *w, struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u32 restart_count = trans->restart_count; ++ unsigned i; ++ int ret; ++ ++ pos.snapshot = bch2_snapshot_equiv(c, pos.snapshot); ++ ++ if (pos.inode == w->cur_inum) { ++ w->first_this_inode = false; ++ goto lookup_snapshot; ++ } ++ ++ w->inodes.nr = 0; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode), ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ if (k.k->p.offset != pos.inode) ++ break; ++ ++ if (bkey_is_inode(k.k)) ++ add_inode(c, w, k); ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ if (ret) ++ return ret; ++ ++ w->cur_inum = pos.inode; ++ w->first_this_inode = true; ++ ++ if (trans_was_restarted(trans, restart_count)) ++ return -BCH_ERR_transaction_restart_nested; ++ ++lookup_snapshot: ++ for (i = 0; i < w->inodes.nr; i++) ++ if (bch2_snapshot_is_ancestor(c, pos.snapshot, 
w->inodes.data[i].snapshot)) ++ goto found; ++ return INT_MAX; ++found: ++ BUG_ON(pos.snapshot > w->inodes.data[i].snapshot); ++ ++ if (pos.snapshot != w->inodes.data[i].snapshot) { ++ struct inode_walker_entry e = w->inodes.data[i]; ++ ++ e.snapshot = pos.snapshot; ++ e.count = 0; ++ ++ bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", ++ pos.inode, pos.snapshot, w->inodes.data[i].snapshot); ++ ++ while (i && w->inodes.data[i - 1].snapshot > pos.snapshot) ++ --i; ++ ++ ret = darray_insert_item(&w->inodes, i, e); ++ if (ret) ++ return ret; ++ } ++ ++ return i; ++} ++ ++static int __get_visible_inodes(struct btree_trans *trans, ++ struct inode_walker *w, ++ struct snapshots_seen *s, ++ u64 inum) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ w->inodes.nr = 0; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); ++ ++ if (k.k->p.offset != inum) ++ break; ++ ++ if (!ref_visible(c, s, s->pos.snapshot, equiv)) ++ continue; ++ ++ if (bkey_is_inode(k.k)) ++ add_inode(c, w, k); ++ ++ if (equiv >= s->pos.snapshot) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++static int check_key_has_snapshot(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, ++ "key in missing snapshot: %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ++ ret = bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int hash_redo_key(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct bch_hash_info *hash_info, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ bch_err(trans->c, "hash_redo_key() not implemented yet"); ++ return -EINVAL; ++#if 0 ++ struct bkey_i *delete; ++ struct bkey_i *tmp; ++ ++ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); ++ if (IS_ERR(delete)) ++ return PTR_ERR(delete); ++ ++ tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if (IS_ERR(tmp)) ++ return PTR_ERR(tmp); ++ ++ bkey_reassemble(tmp, k); ++ ++ bkey_init(&delete->k); ++ delete->k.p = k_iter->pos; ++ return bch2_btree_iter_traverse(k_iter) ?: ++ bch2_trans_update(trans, k_iter, delete, 0) ?: ++ bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); ++#endif ++} ++ ++static int hash_check_key(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct bch_hash_info *hash_info, ++ struct btree_iter *k_iter, struct bkey_s_c hash_k) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter = { NULL }; ++ struct printbuf buf = PRINTBUF; ++ struct bkey_s_c k; ++ u64 hash; ++ int ret = 0; ++ ++ if (hash_k.k->type != desc.key_type) ++ return 0; ++ ++ hash = desc.hash_bkey(hash_info, hash_k); ++ ++ if (likely(hash == hash_k.k->p.offset)) ++ return 0; ++ ++ if (hash_k.k->p.offset < hash) ++ goto bad_hash; ++ ++ for_each_btree_key_norestart(trans, iter, desc.btree_id, ++ POS(hash_k.k->p.inode, hash), ++ BTREE_ITER_SLOTS, k, ret) { ++ if (!bkey_cmp(k.k->p, hash_k.k->p)) ++ break; ++ ++ if (fsck_err_on(k.k->type == desc.key_type && ++ !desc.cmp_bkey(k, hash_k), c, ++ "duplicate hash table keys:\n%s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, hash_k), ++ buf.buf))) { ++ ret = 
bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; ++ break; ++ } ++ ++ if (bkey_deleted(k.k)) { ++ bch2_trans_iter_exit(trans, &iter); ++ goto bad_hash; ++ } ++ } ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ret; ++bad_hash: ++ if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, " ++ "hashed to %llu\n%s", ++ bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { ++ ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); ++ if (ret) { ++ bch_err(c, "hash_redo_key err %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ret = -BCH_ERR_transaction_restart_nested; ++ } ++fsck_err: ++ goto out; ++} ++ ++static int check_inode(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct bch_inode_unpacked *prev, ++ struct snapshots_seen *s, ++ bool full) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked u; ++ bool do_update = false; ++ int ret; ++ ++ ret = check_key_has_snapshot(trans, iter, k); ++ if (ret < 0) ++ goto err; ++ if (ret) ++ return 0; ++ ++ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); ++ if (ret) ++ goto err; ++ ++ /* ++ * if snapshot id isn't a leaf node, skip it - deletion in ++ * particular is not atomic, so on the internal snapshot nodes ++ * we can see inodes marked for deletion after a clean shutdown ++ */ ++ if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) ++ return 0; ++ ++ if (!bkey_is_inode(k.k)) ++ return 0; ++ ++ BUG_ON(bch2_inode_unpack(k, &u)); ++ ++ if (!full && ++ !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY| ++ BCH_INODE_I_SECTORS_DIRTY| ++ BCH_INODE_UNLINKED))) ++ return 0; ++ ++ if (prev->bi_inum != u.bi_inum) ++ *prev = u; ++ ++ if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || ++ inode_d_type(prev) != inode_d_type(&u), c, ++ "inodes in different snapshots don't match")) { ++ bch_err(c, "repair not implemented yet"); ++ return -EINVAL; ++ } ++ ++ if (u.bi_flags & BCH_INODE_UNLINKED && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu unlinked", ++ u.bi_inum))) { ++ bch2_trans_unlock(trans); ++ bch2_fs_lazy_rw(c); ++ ++ ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); ++ if (ret) ++ bch_err(c, "error in fsck: error while deleting inode: %s", ++ bch2_err_str(ret)); ++ return ret; ++ } ++ ++ if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", ++ u.bi_inum))) { ++ bch_verbose(c, "truncating inode %llu", u.bi_inum); ++ ++ bch2_trans_unlock(trans); ++ bch2_fs_lazy_rw(c); ++ ++ /* ++ * XXX: need to truncate partial blocks too here - or ideally ++ * just switch units to bytes and that issue goes away ++ */ ++ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, ++ SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, ++ iter->pos.snapshot), ++ POS(u.bi_inum, U64_MAX), ++ 0, NULL); ++ if (ret) { ++ bch_err(c, "error in fsck: error truncating inode: %s", ++ bch2_err_str(ret)); ++ return ret; ++ } ++ ++ /* ++ * We truncated without our normal sector accounting hook, just ++ * make sure we recalculate it: ++ */ ++ u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; ++ ++ u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; ++ do_update = true; ++ } ++ ++ if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", ++ u.bi_inum))) { ++ s64 sectors; ++ 
++ bch_verbose(c, "recounting sectors for inode %llu", ++ u.bi_inum); ++ ++ sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); ++ if (sectors < 0) { ++ bch_err(c, "error in fsck: error recounting inode sectors: %s", ++ bch2_err_str(sectors)); ++ return sectors; ++ } ++ ++ u.bi_sectors = sectors; ++ u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; ++ do_update = true; ++ } ++ ++ if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) { ++ u.bi_dir = 0; ++ u.bi_dir_offset = 0; ++ u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED; ++ do_update = true; ++ } ++ ++ if (do_update) { ++ ret = __write_inode(trans, &u, iter->pos.snapshot); ++ if (ret) ++ bch_err(c, "error in fsck: error updating inode: %s", ++ bch2_err_str(ret)); ++ } ++err: ++fsck_err: ++ if (ret) ++ bch_err(c, "error from check_inode(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++noinline_for_stack ++static int check_inodes(struct bch_fs *c, bool full) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bch_inode_unpacked prev = { 0 }; ++ struct snapshots_seen s; ++ struct bkey_s_c k; ++ int ret; ++ ++ snapshots_seen_init(&s); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, ++ POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_inode(&trans, &iter, k, &prev, &s, full)); ++ ++ bch2_trans_exit(&trans); ++ snapshots_seen_exit(&s); ++ if (ret) ++ bch_err(c, "error from check_inodes(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++/* ++ * Checking for overlapping extents needs to be reimplemented ++ */ ++#if 0 ++static int fix_overlapping_extent(struct btree_trans *trans, ++ struct bkey_s_c k, struct bpos cut_at) ++{ ++ struct btree_iter iter; ++ struct bkey_i *u; ++ int ret; ++ ++ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(u, k); ++ bch2_cut_front(cut_at, u); ++ ++ ++ /* ++ * We don't want to go through the extent_handle_overwrites path: ++ * ++ * XXX: this is going to screw up disk accounting, extent triggers ++ * assume things about extent overwrites - we should be running the ++ * triggers manually here ++ */ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p, ++ BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); ++ ++ BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++#endif ++ ++static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos pos) ++{ ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (!ret && k.k->type != KEY_TYPE_dirent) ++ ret = -ENOENT; ++ if (ret) { ++ bch2_trans_iter_exit(trans, iter); ++ return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }; ++ } ++ ++ return bkey_s_c_to_dirent(k); ++} ++ ++static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, ++ struct bkey_s_c_dirent d) ++{ ++ return inode->bi_dir == d.k->p.inode && ++ inode->bi_dir_offset == d.k->p.offset; ++} ++ ++static bool dirent_points_to_inode(struct bkey_s_c_dirent d, ++ struct bch_inode_unpacked *inode) ++{ ++ return d.v->d_type == DT_SUBVOL ++ ? 
le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol ++ : le64_to_cpu(d.v->d_inum) == inode->bi_inum; ++} ++ ++static int inode_backpointer_exists(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u32 snapshot) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c_dirent d; ++ int ret; ++ ++ d = dirent_get_by_pos(trans, &iter, ++ SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); ++ ret = bkey_err(d.s_c); ++ if (ret) ++ return ret == -ENOENT ? 0 : ret; ++ ++ ret = dirent_points_to_inode(d, inode); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) ++{ ++ struct bch_fs *c = trans->c; ++ struct inode_walker_entry *i; ++ u32 restart_count = trans->restart_count; ++ int ret = 0; ++ s64 count2; ++ ++ darray_for_each(w->inodes, i) { ++ if (i->inode.bi_sectors == i->count) ++ continue; ++ ++ count2 = bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot); ++ ++ if (i->count != count2) { ++ bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu", ++ i->count, count2); ++ i->count = count2; ++ if (i->inode.bi_sectors == i->count) ++ continue; ++ } ++ ++ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, ++ "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", ++ w->cur_inum, i->snapshot, ++ i->inode.bi_sectors, i->count)) { ++ i->inode.bi_sectors = i->count; ++ ret = write_inode(trans, &i->inode, i->snapshot); ++ if (ret) ++ break; ++ } ++ } ++fsck_err: ++ if (ret) { ++ bch_err(c, "error from check_i_sectors(): %s", bch2_err_str(ret)); ++ return ret; ++ } ++ if (trans_was_restarted(trans, restart_count)) ++ return -BCH_ERR_transaction_restart_nested; ++ return 0; ++} ++ ++static int check_extent(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct inode_walker *inode, ++ struct snapshots_seen *s) ++{ ++ struct bch_fs *c = trans->c; ++ struct inode_walker_entry *i; ++ struct printbuf buf = PRINTBUF; ++ struct bpos equiv; ++ int ret = 0; ++ ++ ret = check_key_has_snapshot(trans, iter, k); ++ if (ret) { ++ ret = ret < 0 ? 
ret : 0; ++ goto out; ++ } ++ ++ equiv = k.k->p; ++ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); ++ ++ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); ++ if (ret) ++ goto err; ++ ++ if (k.k->type == KEY_TYPE_whiteout) ++ goto out; ++ ++ if (inode->cur_inum != k.k->p.inode) { ++ ret = check_i_sectors(trans, inode); ++ if (ret) ++ goto err; ++ } ++ ++ BUG_ON(!iter->path->should_be_locked); ++#if 0 ++ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { ++ char buf1[200]; ++ char buf2[200]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); ++ bch2_bkey_val_to_text(&PBUF(buf2), c, k); ++ ++ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { ++ ret = fix_overlapping_extent(trans, k, prev.k->k.p) ++ ?: -BCH_ERR_transaction_restart_nested; ++ goto out; ++ } ++ } ++#endif ++ ret = __walk_inode(trans, inode, equiv); ++ if (ret < 0) ++ goto err; ++ ++ if (fsck_err_on(ret == INT_MAX, c, ++ "extent in missing inode:\n %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ goto out; ++ } ++ ++ if (ret == INT_MAX) { ++ ret = 0; ++ goto out; ++ } ++ ++ i = inode->inodes.data + ret; ++ ret = 0; ++ ++ if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && ++ !S_ISLNK(i->inode.bi_mode), c, ++ "extent in non regular inode mode %o:\n %s", ++ i->inode.bi_mode, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ goto out; ++ } ++ ++ /* ++ * Check inodes in reverse order, from oldest snapshots to newest, so ++ * that we emit the fewest number of whiteouts necessary: ++ */ ++ for (i = inode->inodes.data + inode->inodes.nr - 1; ++ i >= inode->inodes.data; ++ --i) { ++ if (i->snapshot > equiv.snapshot || ++ !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot)) ++ continue; ++ ++ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ k.k->type != KEY_TYPE_reservation && ++ k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, ++ "extent type past end of inode %llu:%u, i_size %llu\n %s", ++ i->inode.bi_inum, i->snapshot, i->inode.bi_size, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ struct btree_iter iter2; ++ ++ bch2_trans_copy_iter(&iter2, iter); ++ bch2_btree_iter_set_snapshot(&iter2, i->snapshot); ++ ret = bch2_btree_iter_traverse(&iter2) ?: ++ bch2_btree_delete_at(trans, &iter2, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ bch2_trans_iter_exit(trans, &iter2); ++ if (ret) ++ goto err; ++ ++ if (i->snapshot != equiv.snapshot) { ++ ret = snapshots_seen_add(c, s, i->snapshot); ++ if (ret) ++ goto err; ++ } ++ } ++ } ++ ++ if (bkey_extent_is_allocation(k.k)) ++ for_each_visible_inode(c, s, inode, equiv.snapshot, i) ++ i->count += k.k->size; ++#if 0 ++ bch2_bkey_buf_reassemble(&prev, c, k); ++#endif ++ ++out: ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from check_extent(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++/* ++ * Walk extents: verify that extents have a corresponding S_ISREG inode, and ++ * that i_size an i_sectors are consistent ++ */ ++noinline_for_stack ++static int check_extents(struct bch_fs *c) ++{ ++ struct inode_walker w = inode_walker_init(); ++ struct snapshots_seen s; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++#if 0 ++ struct bkey_buf 
prev; ++ bch2_bkey_buf_init(&prev); ++ prev.k->k = KEY(0, 0, 0); ++#endif ++ snapshots_seen_init(&s); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ bch_verbose(c, "checking extents"); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, ++ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_extent(&trans, &iter, k, &w, &s)); ++#if 0 ++ bch2_bkey_buf_exit(&prev, c); ++#endif ++ inode_walker_exit(&w); ++ bch2_trans_exit(&trans); ++ snapshots_seen_exit(&s); ++ ++ if (ret) ++ bch_err(c, "error from check_extents(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) ++{ ++ struct bch_fs *c = trans->c; ++ struct inode_walker_entry *i; ++ u32 restart_count = trans->restart_count; ++ int ret = 0; ++ s64 count2; ++ ++ darray_for_each(w->inodes, i) { ++ if (i->inode.bi_nlink == i->count) ++ continue; ++ ++ count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot); ++ if (count2 < 0) ++ return count2; ++ ++ if (i->count != count2) { ++ bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", ++ i->count, count2); ++ i->count = count2; ++ if (i->inode.bi_nlink == i->count) ++ continue; ++ } ++ ++ if (fsck_err_on(i->inode.bi_nlink != i->count, c, ++ "directory %llu:%u with wrong i_nlink: got %u, should be %llu", ++ w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) { ++ i->inode.bi_nlink = i->count; ++ ret = write_inode(trans, &i->inode, i->snapshot); ++ if (ret) ++ break; ++ } ++ } ++fsck_err: ++ if (ret) { ++ bch_err(c, "error from check_subdir_count(): %s", bch2_err_str(ret)); ++ return ret; ++ } ++ if (trans_was_restarted(trans, restart_count)) ++ return -BCH_ERR_transaction_restart_nested; ++ return 0; ++} ++ ++static int check_dirent_target(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c_dirent d, ++ struct bch_inode_unpacked *target, ++ u32 target_snapshot) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_i_dirent *n; ++ bool backpointer_exists = true; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ if (!target->bi_dir && ++ !target->bi_dir_offset) { ++ target->bi_dir = d.k->p.inode; ++ target->bi_dir_offset = d.k->p.offset; ++ ++ ret = __write_inode(trans, target, target_snapshot); ++ if (ret) ++ goto err; ++ } ++ ++ if (!inode_points_to_dirent(target, d)) { ++ ret = inode_backpointer_exists(trans, target, d.k->p.snapshot); ++ if (ret < 0) ++ goto err; ++ ++ backpointer_exists = ret; ++ ret = 0; ++ ++ if (fsck_err_on(S_ISDIR(target->bi_mode) && ++ backpointer_exists, c, ++ "directory %llu with multiple links", ++ target->bi_inum)) { ++ ret = __remove_dirent(trans, d.k->p); ++ goto out; ++ } ++ ++ if (fsck_err_on(backpointer_exists && ++ !target->bi_nlink, c, ++ "inode %llu type %s has multiple links but i_nlink 0", ++ target->bi_inum, bch2_d_types[d.v->d_type])) { ++ target->bi_nlink++; ++ target->bi_flags &= ~BCH_INODE_UNLINKED; ++ ++ ret = __write_inode(trans, target, target_snapshot); ++ if (ret) ++ goto err; ++ } ++ ++ if (fsck_err_on(!backpointer_exists, c, ++ "inode %llu:%u has wrong backpointer:\n" ++ "got %llu:%llu\n" ++ "should be %llu:%llu", ++ target->bi_inum, target_snapshot, ++ target->bi_dir, ++ target->bi_dir_offset, ++ d.k->p.inode, ++ d.k->p.offset)) { ++ target->bi_dir = d.k->p.inode; ++ target->bi_dir_offset = d.k->p.offset; ++ ++ ret = __write_inode(trans, target, target_snapshot); ++ if (ret) ++ goto err; ++ } ++ } 
++ ++ if (fsck_err_on(d.v->d_type != inode_d_type(target), c, ++ "incorrect d_type: got %s, should be %s:\n%s", ++ bch2_d_type_str(d.v->d_type), ++ bch2_d_type_str(inode_d_type(target)), ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { ++ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&n->k_i, d.s_c); ++ n->v.d_type = inode_d_type(target); ++ ++ ret = bch2_trans_update(trans, iter, &n->k_i, 0); ++ if (ret) ++ goto err; ++ ++ d = dirent_i_to_s_c(n); ++ } ++ ++ if (d.v->d_type == DT_SUBVOL && ++ target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) && ++ (c->sb.version < bcachefs_metadata_version_subvol_dirent || ++ fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u", ++ le32_to_cpu(d.v->d_parent_subvol), ++ target->bi_parent_subvol))) { ++ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&n->k_i, d.s_c); ++ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); ++ ++ ret = bch2_trans_update(trans, iter, &n->k_i, 0); ++ if (ret) ++ goto err; ++ ++ d = dirent_i_to_s_c(n); ++ } ++out: ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from check_target(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct bch_hash_info *hash_info, ++ struct inode_walker *dir, ++ struct inode_walker *target, ++ struct snapshots_seen *s) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c_dirent d; ++ struct inode_walker_entry *i; ++ struct printbuf buf = PRINTBUF; ++ struct bpos equiv; ++ int ret = 0; ++ ++ ret = check_key_has_snapshot(trans, iter, k); ++ if (ret) { ++ ret = ret < 0 ? 
ret : 0; ++ goto out; ++ } ++ ++ equiv = k.k->p; ++ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); ++ ++ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); ++ if (ret) ++ goto err; ++ ++ if (k.k->type == KEY_TYPE_whiteout) ++ goto out; ++ ++ if (dir->cur_inum != k.k->p.inode) { ++ ret = check_subdir_count(trans, dir); ++ if (ret) ++ goto err; ++ } ++ ++ BUG_ON(!iter->path->should_be_locked); ++ ++ ret = __walk_inode(trans, dir, equiv); ++ if (ret < 0) ++ goto err; ++ ++ if (fsck_err_on(ret == INT_MAX, c, ++ "dirent in nonexisting directory:\n%s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ goto out; ++ } ++ ++ if (ret == INT_MAX) { ++ ret = 0; ++ goto out; ++ } ++ ++ i = dir->inodes.data + ret; ++ ret = 0; ++ ++ if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, ++ "dirent in non directory inode type %s:\n%s", ++ bch2_d_type_str(inode_d_type(&i->inode)), ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, 0); ++ goto out; ++ } ++ ++ if (dir->first_this_inode) ++ *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); ++ ++ ret = hash_check_key(trans, bch2_dirent_hash_desc, ++ hash_info, iter, k); ++ if (ret < 0) ++ goto err; ++ if (ret) { ++ /* dirent has been deleted */ ++ ret = 0; ++ goto out; ++ } ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ goto out; ++ ++ d = bkey_s_c_to_dirent(k); ++ ++ if (d.v->d_type == DT_SUBVOL) { ++ struct bch_inode_unpacked subvol_root; ++ u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); ++ u32 target_snapshot; ++ u64 target_inum; ++ ++ ret = __subvol_lookup(trans, target_subvol, ++ &target_snapshot, &target_inum); ++ if (ret && ret != -ENOENT) ++ goto err; ++ ++ if (fsck_err_on(ret, c, ++ "dirent points to missing subvolume %llu", ++ le64_to_cpu(d.v->d_child_subvol))) { ++ ret = __remove_dirent(trans, d.k->p); ++ goto err; ++ } ++ ++ ret = __lookup_inode(trans, target_inum, ++ &subvol_root, &target_snapshot); ++ if (ret && ret != -ENOENT) ++ goto err; ++ ++ if (fsck_err_on(ret, c, ++ "subvolume %u points to missing subvolume root %llu", ++ target_subvol, ++ target_inum)) { ++ bch_err(c, "repair not implemented yet"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, ++ "subvol root %llu has wrong bi_subvol field: got %u, should be %u", ++ target_inum, ++ subvol_root.bi_subvol, target_subvol)) { ++ subvol_root.bi_subvol = target_subvol; ++ ret = __write_inode(trans, &subvol_root, target_snapshot); ++ if (ret) ++ goto err; ++ } ++ ++ ret = check_dirent_target(trans, iter, d, &subvol_root, ++ target_snapshot); ++ if (ret) ++ goto err; ++ } else { ++ ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); ++ if (ret) ++ goto err; ++ ++ if (fsck_err_on(!target->inodes.nr, c, ++ "dirent points to missing inode: (equiv %u)\n%s", ++ equiv.snapshot, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), ++ buf.buf))) { ++ ret = __remove_dirent(trans, d.k->p); ++ if (ret) ++ goto err; ++ } ++ ++ darray_for_each(target->inodes, i) { ++ ret = check_dirent_target(trans, iter, d, ++ &i->inode, i->snapshot); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ if (d.v->d_type == DT_DIR) ++ for_each_visible_inode(c, s, dir, equiv.snapshot, i) ++ i->count++; ++ ++out: ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error 
from check_dirent(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++/* ++ * Walk dirents: verify that they all have a corresponding S_ISDIR inode, ++ * validate d_type ++ */ ++noinline_for_stack ++static int check_dirents(struct bch_fs *c) ++{ ++ struct inode_walker dir = inode_walker_init(); ++ struct inode_walker target = inode_walker_init(); ++ struct snapshots_seen s; ++ struct bch_hash_info hash_info; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch_verbose(c, "checking dirents"); ++ ++ snapshots_seen_init(&s); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, ++ k, ++ NULL, NULL, ++ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s)); ++ ++ bch2_trans_exit(&trans); ++ snapshots_seen_exit(&s); ++ inode_walker_exit(&dir); ++ inode_walker_exit(&target); ++ ++ if (ret) ++ bch_err(c, "error from check_dirents(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct bch_hash_info *hash_info, ++ struct inode_walker *inode) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ ret = check_key_has_snapshot(trans, iter, k); ++ if (ret) ++ return ret; ++ ++ ret = __walk_inode(trans, inode, k.k->p); ++ if (ret < 0) ++ return ret; ++ ++ if (fsck_err_on(ret == INT_MAX, c, ++ "xattr for missing inode %llu", ++ k.k->p.inode)) ++ return bch2_btree_delete_at(trans, iter, 0); ++ ++ if (ret == INT_MAX) ++ return 0; ++ ++ ret = 0; ++ ++ if (inode->first_this_inode) ++ *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); ++ ++ ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); ++fsck_err: ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(c, "error from check_xattr(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++/* ++ * Walk xattrs: verify that they all have a corresponding inode ++ */ ++noinline_for_stack ++static int check_xattrs(struct bch_fs *c) ++{ ++ struct inode_walker inode = inode_walker_init(); ++ struct bch_hash_info hash_info; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch_verbose(c, "checking xattrs"); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, ++ k, ++ NULL, NULL, ++ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_xattr(&trans, &iter, k, &hash_info, &inode)); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error from check_xattrs(): %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++static int check_root_trans(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked root_inode; ++ u32 snapshot; ++ u64 inum; ++ int ret; ++ ++ ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ if (mustfix_fsck_err_on(ret, c, "root subvol missing")) { ++ struct bkey_i_subvolume root_subvol; ++ ++ snapshot = U32_MAX; ++ inum = BCACHEFS_ROOT_INO; ++ ++ bkey_subvolume_init(&root_subvol.k_i); ++ root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL; ++ root_subvol.v.flags = 0; ++ root_subvol.v.snapshot = cpu_to_le32(snapshot); ++ root_subvol.v.inode = cpu_to_le64(inum); ++ 
ret = commit_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i)); ++ if (ret) { ++ bch_err(c, "error writing root subvol: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ } ++ ++ ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ if (mustfix_fsck_err_on(ret, c, "root directory missing") || ++ mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c, ++ "root inode not a directory")) { ++ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, ++ 0, NULL); ++ root_inode.bi_inum = inum; ++ ++ ret = __write_inode(trans, &root_inode, snapshot); ++ if (ret) ++ bch_err(c, "error writing root inode: %s", bch2_err_str(ret)); ++ } ++err: ++fsck_err: ++ return ret; ++} ++ ++/* Get root directory, create if it doesn't exist: */ ++noinline_for_stack ++static int check_root(struct bch_fs *c) ++{ ++ bch_verbose(c, "checking root directory"); ++ ++ return bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ check_root_trans(&trans)); ++} ++ ++struct pathbuf_entry { ++ u64 inum; ++ u32 snapshot; ++}; ++ ++typedef DARRAY(struct pathbuf_entry) pathbuf; ++ ++static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) ++{ ++ struct pathbuf_entry *i; ++ ++ darray_for_each(*p, i) ++ if (i->inum == inum && ++ i->snapshot == snapshot) ++ return true; ++ ++ return false; ++} ++ ++static int path_down(struct bch_fs *c, pathbuf *p, ++ u64 inum, u32 snapshot) ++{ ++ int ret = darray_push(p, ((struct pathbuf_entry) { ++ .inum = inum, ++ .snapshot = snapshot, ++ })); ++ ++ if (ret) ++ bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", ++ p->size); ++ return ret; ++} ++ ++/* ++ * Check that a given inode is reachable from the root: ++ * ++ * XXX: we should also be verifying that inodes are in the right subvolumes ++ */ ++static int check_path(struct btree_trans *trans, ++ pathbuf *p, ++ struct bch_inode_unpacked *inode, ++ u32 snapshot) ++{ ++ struct bch_fs *c = trans->c; ++ int ret = 0; ++ ++ snapshot = bch2_snapshot_equiv(c, snapshot); ++ p->nr = 0; ++ ++ while (!(inode->bi_inum == BCACHEFS_ROOT_INO && ++ inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { ++ struct btree_iter dirent_iter; ++ struct bkey_s_c_dirent d; ++ u32 parent_snapshot = snapshot; ++ ++ if (inode->bi_subvol) { ++ u64 inum; ++ ++ ret = subvol_lookup(trans, inode->bi_parent_subvol, ++ &parent_snapshot, &inum); ++ if (ret) ++ break; ++ } ++ ++ ret = lockrestart_do(trans, ++ PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter, ++ SPOS(inode->bi_dir, inode->bi_dir_offset, ++ parent_snapshot))).k)); ++ if (ret && ret != -ENOENT) ++ break; ++ ++ if (!ret && !dirent_points_to_inode(d, inode)) { ++ bch2_trans_iter_exit(trans, &dirent_iter); ++ ret = -ENOENT; ++ } ++ ++ if (ret == -ENOENT) { ++ if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", ++ inode->bi_inum, snapshot, ++ bch2_d_type_str(inode_d_type(inode)), ++ inode->bi_nlink, ++ inode->bi_dir, ++ inode->bi_dir_offset)) ++ ret = reattach_inode(trans, inode, snapshot); ++ break; ++ } ++ ++ bch2_trans_iter_exit(trans, &dirent_iter); ++ ++ if (!S_ISDIR(inode->bi_mode)) ++ break; ++ ++ ret = path_down(c, p, inode->bi_inum, snapshot); ++ if (ret) { ++ bch_err(c, "memory allocation failure"); ++ return ret; ++ } ++ ++ snapshot = parent_snapshot; ++ ++ ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); ++ if (ret) { ++ /* Should have been caught in dirents pass */ ++ bch_err(c, 
"error looking up parent directory: %i", ret); ++ break; ++ } ++ ++ if (path_is_dup(p, inode->bi_inum, snapshot)) { ++ struct pathbuf_entry *i; ++ ++ /* XXX print path */ ++ bch_err(c, "directory structure loop"); ++ ++ darray_for_each(*p, i) ++ pr_err("%llu:%u", i->inum, i->snapshot); ++ pr_err("%llu:%u", inode->bi_inum, snapshot); ++ ++ if (!fsck_err(c, "directory structure loop")) ++ return 0; ++ ++ ret = commit_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ remove_backpointer(trans, inode)); ++ if (ret) { ++ bch_err(c, "error removing dirent: %i", ret); ++ break; ++ } ++ ++ ret = reattach_inode(trans, inode, snapshot); ++ } ++ } ++fsck_err: ++ if (ret) ++ bch_err(c, "%s: err %s", __func__, bch2_err_str(ret)); ++ return ret; ++} ++ ++/* ++ * Check for unreachable inodes, as well as loops in the directory structure: ++ * After check_dirents(), if an inode backpointer doesn't exist that means it's ++ * unreachable: ++ */ ++noinline_for_stack ++static int check_directory_structure(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_inode_unpacked u; ++ pathbuf path = { 0, }; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ if (!bkey_is_inode(k.k)) ++ continue; ++ ++ ret = bch2_inode_unpack(k, &u); ++ if (ret) { ++ /* Should have been caught earlier in fsck: */ ++ bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret); ++ break; ++ } ++ ++ if (u.bi_flags & BCH_INODE_UNLINKED) ++ continue; ++ ++ ret = check_path(&trans, &path, &u, iter.pos.snapshot); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ darray_exit(&path); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++struct nlink_table { ++ size_t nr; ++ size_t size; ++ ++ struct nlink { ++ u64 inum; ++ u32 snapshot; ++ u32 count; ++ } *d; ++}; ++ ++static int add_nlink(struct bch_fs *c, struct nlink_table *t, ++ u64 inum, u32 snapshot) ++{ ++ if (t->nr == t->size) { ++ size_t new_size = max_t(size_t, 128UL, t->size * 2); ++ void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL); ++ if (!d) { ++ bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", ++ new_size); ++ return -ENOMEM; ++ } ++ ++ if (t->d) ++ memcpy(d, t->d, t->size * sizeof(t->d[0])); ++ kvfree(t->d); ++ ++ t->d = d; ++ t->size = new_size; ++ } ++ ++ ++ t->d[t->nr++] = (struct nlink) { ++ .inum = inum, ++ .snapshot = snapshot, ++ }; ++ ++ return 0; ++} ++ ++static int nlink_cmp(const void *_l, const void *_r) ++{ ++ const struct nlink *l = _l; ++ const struct nlink *r = _r; ++ ++ return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); ++} ++ ++static void inc_link(struct bch_fs *c, struct snapshots_seen *s, ++ struct nlink_table *links, ++ u64 range_start, u64 range_end, u64 inum, u32 snapshot) ++{ ++ struct nlink *link, key = { ++ .inum = inum, .snapshot = U32_MAX, ++ }; ++ ++ if (inum < range_start || inum >= range_end) ++ return; ++ ++ link = __inline_bsearch(&key, links->d, links->nr, ++ sizeof(links->d[0]), nlink_cmp); ++ if (!link) ++ return; ++ ++ while (link > links->d && link[0].inum == link[-1].inum) ++ --link; ++ ++ for (; link < links->d + links->nr && link->inum == inum; link++) ++ if (ref_visible(c, s, snapshot, link->snapshot)) { ++ link->count++; ++ if (link->snapshot >= snapshot) ++ break; ++ } ++} ++ ++noinline_for_stack ++static int 
check_nlinks_find_hardlinks(struct bch_fs *c, ++ struct nlink_table *t, ++ u64 start, u64 *end) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_inode_unpacked u; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_inodes, ++ POS(0, start), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ if (!bkey_is_inode(k.k)) ++ continue; ++ ++ /* Should never fail, checked by bch2_inode_invalid: */ ++ BUG_ON(bch2_inode_unpack(k, &u)); ++ ++ /* ++ * Backpointer and directory structure checks are sufficient for ++ * directories, since they can't have hardlinks: ++ */ ++ if (S_ISDIR(le16_to_cpu(u.bi_mode))) ++ continue; ++ ++ if (!u.bi_nlink) ++ continue; ++ ++ ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); ++ if (ret) { ++ *end = k.k->p.offset; ++ ret = 0; ++ break; ++ } ++ ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error in fsck: btree error %i while walking inodes", ret); ++ ++ return ret; ++} ++ ++noinline_for_stack ++static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, ++ u64 range_start, u64 range_end) ++{ ++ struct btree_trans trans; ++ struct snapshots_seen s; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent d; ++ int ret; ++ ++ snapshots_seen_init(&s); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); ++ if (ret) ++ break; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_dirent: ++ d = bkey_s_c_to_dirent(k); ++ ++ if (d.v->d_type != DT_DIR && ++ d.v->d_type != DT_SUBVOL) ++ inc_link(c, &s, links, range_start, range_end, ++ le64_to_cpu(d.v->d_inum), ++ bch2_snapshot_equiv(c, d.k->p.snapshot)); ++ break; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) ++ bch_err(c, "error in fsck: btree error %i while walking dirents", ret); ++ ++ bch2_trans_exit(&trans); ++ snapshots_seen_exit(&s); ++ return ret; ++} ++ ++static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct nlink_table *links, ++ size_t *idx, u64 range_end) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked u; ++ struct nlink *link = &links->d[*idx]; ++ int ret = 0; ++ ++ if (k.k->p.offset >= range_end) ++ return 1; ++ ++ if (!bkey_is_inode(k.k)) ++ return 0; ++ ++ BUG_ON(bch2_inode_unpack(k, &u)); ++ ++ if (S_ISDIR(le16_to_cpu(u.bi_mode))) ++ return 0; ++ ++ if (!u.bi_nlink) ++ return 0; ++ ++ while ((cmp_int(link->inum, k.k->p.offset) ?: ++ cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { ++ BUG_ON(*idx == links->nr); ++ link = &links->d[++*idx]; ++ } ++ ++ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, ++ "inode %llu type %s has wrong i_nlink (%u, should be %u)", ++ u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], ++ bch2_inode_nlink_get(&u), link->count)) { ++ bch2_inode_nlink_set(&u, link->count); ++ ret = __write_inode(trans, &u, k.k->p.snapshot); ++ } ++fsck_err: ++ return ret; ++} ++ ++noinline_for_stack ++static int check_nlinks_update_hardlinks(struct bch_fs *c, ++ struct nlink_table *links, ++ u64 range_start, u64 range_end) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ size_t idx = 0; ++ int ret = 0; ++ ++ 
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, ++ POS(0, range_start), ++ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end)); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret < 0) { ++ bch_err(c, "error in fsck: btree error %i while walking inodes", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++noinline_for_stack ++static int check_nlinks(struct bch_fs *c) ++{ ++ struct nlink_table links = { 0 }; ++ u64 this_iter_range_start, next_iter_range_start = 0; ++ int ret = 0; ++ ++ bch_verbose(c, "checking inode nlinks"); ++ ++ do { ++ this_iter_range_start = next_iter_range_start; ++ next_iter_range_start = U64_MAX; ++ ++ ret = check_nlinks_find_hardlinks(c, &links, ++ this_iter_range_start, ++ &next_iter_range_start); ++ ++ ret = check_nlinks_walk_dirents(c, &links, ++ this_iter_range_start, ++ next_iter_range_start); ++ if (ret) ++ break; ++ ++ ret = check_nlinks_update_hardlinks(c, &links, ++ this_iter_range_start, ++ next_iter_range_start); ++ if (ret) ++ break; ++ ++ links.nr = 0; ++ } while (next_iter_range_start != U64_MAX); ++ ++ kvfree(links.d); ++ ++ return ret; ++} ++ ++static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_p p; ++ struct bkey_i_reflink_p *u; ++ int ret; ++ ++ if (k.k->type != KEY_TYPE_reflink_p) ++ return 0; ++ ++ p = bkey_s_c_to_reflink_p(k); ++ ++ if (!p.v->front_pad && !p.v->back_pad) ++ return 0; ++ ++ u = bch2_trans_kmalloc(trans, sizeof(*u)); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(&u->k_i, k); ++ u->v.front_pad = 0; ++ u->v.back_pad = 0; ++ ++ return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); ++} ++ ++noinline_for_stack ++static int fix_reflink_p(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) ++ return 0; ++ ++ bch_verbose(c, "fixing reflink_p keys"); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_extents, POS_MIN, ++ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ fix_reflink_p_key(&trans, &iter, k)); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++/* ++ * Checks for inconsistencies that shouldn't happen, unless we have a bug. 
++ * Doesn't fix them yet, mainly because they haven't yet been observed: ++ */ ++int bch2_fsck_full(struct bch_fs *c) ++{ ++ int ret; ++again: ++ ret = bch2_fs_check_snapshots(c) ?: ++ bch2_fs_check_subvols(c) ?: ++ bch2_delete_dead_snapshots(c) ?: ++ check_inodes(c, true) ?: ++ check_extents(c) ?: ++ check_dirents(c) ?: ++ check_xattrs(c) ?: ++ check_root(c) ?: ++ check_directory_structure(c) ?: ++ check_nlinks(c) ?: ++ fix_reflink_p(c); ++ ++ if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) { ++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++ goto again; ++ } ++ ++ return ret; ++} ++ ++int bch2_fsck_walk_inodes_only(struct bch_fs *c) ++{ ++ return bch2_fs_check_snapshots(c) ?: ++ bch2_fs_check_subvols(c) ?: ++ bch2_delete_dead_snapshots(c) ?: ++ check_inodes(c, false); ++} +diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h +new file mode 100644 +index 000000000000..264f2706b12d +--- /dev/null ++++ b/fs/bcachefs/fsck.h +@@ -0,0 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FSCK_H ++#define _BCACHEFS_FSCK_H ++ ++int bch2_fsck_full(struct bch_fs *); ++int bch2_fsck_walk_inodes_only(struct bch_fs *); ++ ++#endif /* _BCACHEFS_FSCK_H */ +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +new file mode 100644 +index 000000000000..083106006747 +--- /dev/null ++++ b/fs/bcachefs/inode.c +@@ -0,0 +1,771 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_key_cache.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "error.h" ++#include "extents.h" ++#include "extent_update.h" ++#include "inode.h" ++#include "str_hash.h" ++#include "subvolume.h" ++#include "varint.h" ++ ++#include ++ ++#include ++ ++const char * const bch2_inode_opts[] = { ++#define x(name, ...) #name, ++ BCH_INODE_OPTS() ++#undef x ++ NULL, ++}; ++ ++static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; ++ ++static int inode_decode_field(const u8 *in, const u8 *end, ++ u64 out[2], unsigned *out_bits) ++{ ++ __be64 be[2] = { 0, 0 }; ++ unsigned bytes, shift; ++ u8 *p; ++ ++ if (in >= end) ++ return -1; ++ ++ if (!*in) ++ return -1; ++ ++ /* ++ * position of highest set bit indicates number of bytes: ++ * shift = number of bits to remove in high byte: ++ */ ++ shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ ++ bytes = byte_table[shift - 1]; ++ ++ if (in + bytes > end) ++ return -1; ++ ++ p = (u8 *) be + 16 - bytes; ++ memcpy(p, in, bytes); ++ *p ^= (1 << 8) >> shift; ++ ++ out[0] = be64_to_cpu(be[0]); ++ out[1] = be64_to_cpu(be[1]); ++ *out_bits = out[0] ? 
64 + fls64(out[0]) : fls64(out[1]); ++ ++ return bytes; ++} ++ ++void bch2_inode_pack(struct bch_fs *c, ++ struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) ++{ ++ struct bkey_i_inode_v2 *k = &packed->inode; ++ u8 *out = k->v.fields; ++ u8 *end = (void *) &packed[1]; ++ u8 *last_nonzero_field = out; ++ unsigned nr_fields = 0, last_nonzero_fieldnr = 0; ++ unsigned bytes; ++ int ret; ++ ++ bkey_inode_v2_init(&packed->inode.k_i); ++ packed->inode.k.p.offset = inode->bi_inum; ++ packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); ++ packed->inode.v.bi_hash_seed = inode->bi_hash_seed; ++ packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); ++ packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); ++ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); ++ ++#define x(_name, _bits) \ ++ nr_fields++; \ ++ \ ++ if (inode->_name) { \ ++ ret = bch2_varint_encode_fast(out, inode->_name); \ ++ out += ret; \ ++ \ ++ if (_bits > 64) \ ++ *out++ = 0; \ ++ \ ++ last_nonzero_field = out; \ ++ last_nonzero_fieldnr = nr_fields; \ ++ } else { \ ++ *out++ = 0; \ ++ \ ++ if (_bits > 64) \ ++ *out++ = 0; \ ++ } ++ ++ BCH_INODE_FIELDS() ++#undef x ++ BUG_ON(out > end); ++ ++ out = last_nonzero_field; ++ nr_fields = last_nonzero_fieldnr; ++ ++ bytes = out - (u8 *) &packed->inode.v; ++ set_bkey_val_bytes(&packed->inode.k, bytes); ++ memset_u64s_tail(&packed->inode.v, 0, bytes); ++ ++ SET_INODEv2_NR_FIELDS(&k->v, nr_fields); ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { ++ struct bch_inode_unpacked unpacked; ++ ++ int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), ++ &unpacked); ++ BUG_ON(ret); ++ BUG_ON(unpacked.bi_inum != inode->bi_inum); ++ BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); ++ BUG_ON(unpacked.bi_mode != inode->bi_mode); ++ ++#define x(_name, _bits) if (unpacked._name != inode->_name) \ ++ panic("unpacked %llu should be %llu", \ ++ (u64) unpacked._name, (u64) inode->_name); ++ BCH_INODE_FIELDS() ++#undef x ++ } ++} ++ ++static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, ++ struct bch_inode_unpacked *unpacked) ++{ ++ const u8 *in = inode.v->fields; ++ const u8 *end = bkey_val_end(inode); ++ u64 field[2]; ++ unsigned fieldnr = 0, field_bits; ++ int ret; ++ ++#define x(_name, _bits) \ ++ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ ++ unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ ++ memset((void *) unpacked + offset, 0, \ ++ sizeof(*unpacked) - offset); \ ++ return 0; \ ++ } \ ++ \ ++ ret = inode_decode_field(in, end, field, &field_bits); \ ++ if (ret < 0) \ ++ return ret; \ ++ \ ++ if (field_bits > sizeof(unpacked->_name) * 8) \ ++ return -1; \ ++ \ ++ unpacked->_name = field[1]; \ ++ in += ret; ++ ++ BCH_INODE_FIELDS() ++#undef x ++ ++ /* XXX: signal if there were more fields than expected? 
*/ ++ return 0; ++} ++ ++static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, ++ const u8 *in, const u8 *end, ++ unsigned nr_fields) ++{ ++ unsigned fieldnr = 0; ++ int ret; ++ u64 v[2]; ++ ++#define x(_name, _bits) \ ++ if (fieldnr < nr_fields) { \ ++ ret = bch2_varint_decode_fast(in, end, &v[0]); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ ++ \ ++ if (_bits > 64) { \ ++ ret = bch2_varint_decode_fast(in, end, &v[1]); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ ++ } else { \ ++ v[1] = 0; \ ++ } \ ++ } else { \ ++ v[0] = v[1] = 0; \ ++ } \ ++ \ ++ unpacked->_name = v[0]; \ ++ if (v[1] || v[0] != unpacked->_name) \ ++ return -1; \ ++ fieldnr++; ++ ++ BCH_INODE_FIELDS() ++#undef x ++ ++ /* XXX: signal if there were more fields than expected? */ ++ return 0; ++} ++ ++int bch2_inode_unpack(struct bkey_s_c k, ++ struct bch_inode_unpacked *unpacked) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_inode: { ++ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); ++ ++ unpacked->bi_inum = inode.k->p.offset; ++ unpacked->bi_journal_seq= 0; ++ unpacked->bi_hash_seed = inode.v->bi_hash_seed; ++ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); ++ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); ++ ++ if (INODE_NEW_VARINT(inode.v)) { ++ return bch2_inode_unpack_v2(unpacked, inode.v->fields, ++ bkey_val_end(inode), ++ INODE_NR_FIELDS(inode.v)); ++ } else { ++ return bch2_inode_unpack_v1(inode, unpacked); ++ } ++ break; ++ } ++ case KEY_TYPE_inode_v2: { ++ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); ++ ++ unpacked->bi_inum = inode.k->p.offset; ++ unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); ++ unpacked->bi_hash_seed = inode.v->bi_hash_seed; ++ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); ++ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); ++ ++ return bch2_inode_unpack_v2(unpacked, inode.v->fields, ++ bkey_val_end(inode), ++ INODEv2_NR_FIELDS(inode.v)); ++ } ++ default: ++ BUG(); ++ } ++} ++ ++int bch2_inode_peek(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bch_inode_unpacked *inode, ++ subvol_inum inum, unsigned flags) ++{ ++ struct bkey_s_c k; ++ u32 snapshot; ++ int ret; ++ ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ return ret; ++ ++ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, ++ SPOS(0, inum.inum, snapshot), ++ flags|BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ ret = bkey_is_inode(k.k) ? 
0 : -ENOENT; ++ if (ret) ++ goto err; ++ ++ ret = bch2_inode_unpack(k, inode); ++ if (ret) ++ goto err; ++ ++ return 0; ++err: ++ bch2_trans_iter_exit(trans, iter); ++ return ret; ++} ++ ++int bch2_inode_write(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bch_inode_unpacked *inode) ++{ ++ struct bkey_inode_buf *inode_p; ++ ++ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); ++ if (IS_ERR(inode_p)) ++ return PTR_ERR(inode_p); ++ ++ bch2_inode_pack(trans->c, inode_p, inode); ++ inode_p->inode.k.p.snapshot = iter->snapshot; ++ return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); ++} ++ ++static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) ++{ ++ struct bch_inode_unpacked unpacked; ++ ++ if (k.k->p.inode) { ++ prt_printf(err, "nonzero k.p.inode"); ++ return -EINVAL; ++ } ++ ++ if (k.k->p.offset < BLOCKDEV_INODE_MAX) { ++ prt_printf(err, "fs inode in blockdev range"); ++ return -EINVAL; ++ } ++ ++ if (bch2_inode_unpack(k, &unpacked)){ ++ prt_printf(err, "invalid variable length fields"); ++ return -EINVAL; ++ } ++ ++ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) { ++ prt_printf(err, "invalid data checksum type (%u >= %u", ++ unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); ++ return -EINVAL; ++ } ++ ++ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) { ++ prt_printf(err, "invalid data checksum type (%u >= %u)", ++ unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1); ++ return -EINVAL; ++ } ++ ++ if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && ++ unpacked.bi_nlink != 0) { ++ prt_printf(err, "flagged as unlinked but bi_nlink != 0"); ++ return -EINVAL; ++ } ++ ++ if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) { ++ prt_printf(err, "subvolume root but not a directory"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); ++ ++ if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*inode.v)); ++ return -EINVAL; ++ } ++ ++ if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { ++ prt_printf(err, "invalid str hash type (%llu >= %u)", ++ INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); ++ return -EINVAL; ++ } ++ ++ return __bch2_inode_invalid(k, err); ++} ++ ++int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); ++ ++ if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*inode.v)); ++ return -EINVAL; ++ } ++ ++ if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { ++ prt_printf(err, "invalid str hash type (%llu >= %u)", ++ INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); ++ return -EINVAL; ++ } ++ ++ return __bch2_inode_invalid(k, err); ++} ++ ++static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) ++{ ++ prt_printf(out, "mode %o flags %x journal_seq %llu", ++ inode->bi_mode, inode->bi_flags, ++ inode->bi_journal_seq); ++ ++#define x(_name, _bits) \ ++ prt_printf(out, " "#_name " %llu", (u64) inode->_name); ++ BCH_INODE_FIELDS() ++#undef x ++} ++ ++void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) ++{ ++ prt_printf(out, "inum: %llu ", inode->bi_inum); ++ __bch2_inode_unpacked_to_text(out, inode); ++} ++ ++void bch2_inode_to_text(struct 
printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bch_inode_unpacked inode; ++ ++ if (bch2_inode_unpack(k, &inode)) { ++ prt_printf(out, "(unpack error)"); ++ return; ++ } ++ ++ __bch2_inode_unpacked_to_text(out, &inode); ++} ++ ++int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if (k.k->p.inode) { ++ prt_printf(err, "nonzero k.p.inode"); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) { ++ prt_printf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_inode_generation)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); ++ ++ prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); ++} ++ ++void bch2_inode_init_early(struct bch_fs *c, ++ struct bch_inode_unpacked *inode_u) ++{ ++ enum bch_str_hash_type str_hash = ++ bch2_str_hash_opt_to_type(c, c->opts.str_hash); ++ ++ memset(inode_u, 0, sizeof(*inode_u)); ++ ++ /* ick */ ++ inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; ++ get_random_bytes(&inode_u->bi_hash_seed, ++ sizeof(inode_u->bi_hash_seed)); ++} ++ ++void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct bch_inode_unpacked *parent) ++{ ++ inode_u->bi_mode = mode; ++ inode_u->bi_uid = uid; ++ inode_u->bi_gid = gid; ++ inode_u->bi_dev = rdev; ++ inode_u->bi_atime = now; ++ inode_u->bi_mtime = now; ++ inode_u->bi_ctime = now; ++ inode_u->bi_otime = now; ++ ++ if (parent && parent->bi_mode & S_ISGID) { ++ inode_u->bi_gid = parent->bi_gid; ++ if (S_ISDIR(mode)) ++ inode_u->bi_mode |= S_ISGID; ++ } ++ ++ if (parent) { ++#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name; ++ BCH_INODE_OPTS() ++#undef x ++ } ++} ++ ++void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct bch_inode_unpacked *parent) ++{ ++ bch2_inode_init_early(c, inode_u); ++ bch2_inode_init_late(inode_u, bch2_current_time(c), ++ uid, gid, mode, rdev, parent); ++} ++ ++static inline u32 bkey_generation(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_inode: ++ case KEY_TYPE_inode_v2: ++ BUG(); ++ case KEY_TYPE_inode_generation: ++ return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); ++ default: ++ return 0; ++ } ++} ++ ++/* ++ * This just finds an empty slot: ++ */ ++int bch2_inode_create(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bch_inode_unpacked *inode_u, ++ u32 snapshot, u64 cpu) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ u64 min, max, start, pos, *hint; ++ int ret = 0; ++ unsigned bits = (c->opts.inodes_32bit ? 
31 : 63); ++ ++ if (c->opts.shard_inode_numbers) { ++ bits -= c->inode_shard_bits; ++ ++ min = (cpu << bits); ++ max = (cpu << bits) | ~(ULLONG_MAX << bits); ++ ++ min = max_t(u64, min, BLOCKDEV_INODE_MAX); ++ hint = c->unused_inode_hints + cpu; ++ } else { ++ min = BLOCKDEV_INODE_MAX; ++ max = ~(ULLONG_MAX << bits); ++ hint = c->unused_inode_hints; ++ } ++ ++ start = READ_ONCE(*hint); ++ ++ if (start >= max || start < min) ++ start = min; ++ ++ pos = start; ++ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_INTENT); ++again: ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(k.k->p, POS(0, max)) < 0) { ++ while (pos < iter->pos.offset) { ++ if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) ++ goto found_slot; ++ ++ pos++; ++ } ++ ++ if (k.k->p.snapshot == snapshot && ++ !bkey_is_inode(k.k) && ++ !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) { ++ bch2_btree_iter_advance(iter); ++ continue; ++ } ++ ++ /* ++ * We don't need to iterate over keys in every snapshot once ++ * we've found just one: ++ */ ++ pos = iter->pos.offset + 1; ++ bch2_btree_iter_set_pos(iter, POS(0, pos)); ++ } ++ ++ while (!ret && pos < max) { ++ if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) ++ goto found_slot; ++ ++ pos++; ++ } ++ ++ if (!ret && start == min) ++ ret = -ENOSPC; ++ ++ if (ret) { ++ bch2_trans_iter_exit(trans, iter); ++ return ret; ++ } ++ ++ /* Retry from start */ ++ pos = start = min; ++ bch2_btree_iter_set_pos(iter, POS(0, pos)); ++ goto again; ++found_slot: ++ bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch2_trans_iter_exit(trans, iter); ++ return ret; ++ } ++ ++ /* We may have raced while the iterator wasn't pointing at pos: */ ++ if (bkey_is_inode(k.k) || ++ bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p)) ++ goto again; ++ ++ *hint = k.k->p.offset; ++ inode_u->bi_inum = k.k->p.offset; ++ inode_u->bi_generation = bkey_generation(k); ++ return 0; ++} ++ ++static int bch2_inode_delete_keys(struct btree_trans *trans, ++ subvol_inum inum, enum btree_id id) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i delete; ++ u32 snapshot; ++ int ret = 0; ++ ++ /* ++ * We're never going to be deleting extents, no need to use an extent ++ * iterator: ++ */ ++ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ ++ while (1) { ++ bch2_trans_begin(trans); ++ ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_btree_iter_set_snapshot(&iter, snapshot); ++ ++ k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k) ++ break; ++ ++ bkey_init(&delete.k); ++ delete.k.p = iter.pos; ++ ++ ret = bch2_trans_update(trans, &iter, &delete, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++err: ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ break; ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter = { NULL }; ++ struct bkey_i_inode_generation delete; ++ struct bch_inode_unpacked inode_u; ++ struct bkey_s_c k; ++ u32 snapshot; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 1024); ++ ++ /* ++ * If this was a 
directory, there shouldn't be any real dirents left - ++ * but there could be whiteouts (from hash collisions) that we should ++ * delete: ++ * ++ * XXX: the dirent could ideally would delete whiteouts when they're no ++ * longer needed ++ */ ++ ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?: ++ bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?: ++ bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents); ++ if (ret) ++ goto err; ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, ++ SPOS(0, inum.inum, snapshot), ++ BTREE_ITER_INTENT|BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!bkey_is_inode(k.k)) { ++ bch2_fs_inconsistent(trans.c, ++ "inode %llu not found when deleting", ++ inum.inum); ++ ret = -EIO; ++ goto err; ++ } ++ ++ bch2_inode_unpack(k, &inode_u); ++ ++ /* Subvolume root? */ ++ BUG_ON(inode_u.bi_subvol); ++ ++ bkey_inode_generation_init(&delete.k_i); ++ delete.k.p = iter.pos; ++ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); ++ ++ ret = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?: ++ bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++err: ++ bch2_trans_iter_exit(&trans, &iter); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++int bch2_inode_find_by_inum_trans(struct btree_trans *trans, ++ subvol_inum inum, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter iter; ++ int ret; ++ ++ ret = bch2_inode_peek(trans, &iter, inode, inum, 0); ++ if (!ret) ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, ++ struct bch_inode_unpacked *inode) ++{ ++ return bch2_trans_do(c, NULL, NULL, 0, ++ bch2_inode_find_by_inum_trans(&trans, inum, inode)); ++} ++ ++int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) ++{ ++ if (bi->bi_flags & BCH_INODE_UNLINKED) ++ bi->bi_flags &= ~BCH_INODE_UNLINKED; ++ else { ++ if (bi->bi_nlink == U32_MAX) ++ return -EINVAL; ++ ++ bi->bi_nlink++; ++ } ++ ++ return 0; ++} ++ ++void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi) ++{ ++ if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) { ++ bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero", ++ bi->bi_inum); ++ return; ++ } ++ ++ if (bi->bi_flags & BCH_INODE_UNLINKED) { ++ bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum); ++ return; ++ } ++ ++ if (bi->bi_nlink) ++ bi->bi_nlink--; ++ else ++ bi->bi_flags |= BCH_INODE_UNLINKED; ++} +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +new file mode 100644 +index 000000000000..2ac2fc10513b +--- /dev/null ++++ b/fs/bcachefs/inode.h +@@ -0,0 +1,189 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_INODE_H ++#define _BCACHEFS_INODE_H ++ ++#include "opts.h" ++ ++extern const char * const bch2_inode_opts[]; ++ ++int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_inode (struct bkey_ops) { \ ++ .key_invalid = bch2_inode_invalid, \ ++ .val_to_text = bch2_inode_to_text, \ ++ .trans_trigger = 
bch2_trans_mark_inode, \ ++ .atomic_trigger = bch2_mark_inode, \ ++} ++ ++#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \ ++ .key_invalid = bch2_inode_v2_invalid, \ ++ .val_to_text = bch2_inode_to_text, \ ++ .trans_trigger = bch2_trans_mark_inode, \ ++ .atomic_trigger = bch2_mark_inode, \ ++} ++ ++static inline bool bkey_is_inode(const struct bkey *k) ++{ ++ return k->type == KEY_TYPE_inode || ++ k->type == KEY_TYPE_inode_v2; ++} ++ ++int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); ++void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ ++ .key_invalid = bch2_inode_generation_invalid, \ ++ .val_to_text = bch2_inode_generation_to_text, \ ++} ++ ++#if 0 ++typedef struct { ++ u64 lo; ++ u32 hi; ++} __packed __aligned(4) u96; ++#endif ++typedef u64 u96; ++ ++struct bch_inode_unpacked { ++ u64 bi_inum; ++ u64 bi_journal_seq; ++ __le64 bi_hash_seed; ++ u32 bi_flags; ++ u16 bi_mode; ++ ++#define x(_name, _bits) u##_bits _name; ++ BCH_INODE_FIELDS() ++#undef x ++}; ++ ++struct bkey_inode_buf { ++ struct bkey_i_inode_v2 inode; ++ ++#define x(_name, _bits) + 8 + _bits / 8 ++ u8 _pad[0 + BCH_INODE_FIELDS()]; ++#undef x ++} __attribute__((packed, aligned(8))); ++ ++void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, ++ const struct bch_inode_unpacked *); ++int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); ++ ++void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); ++ ++int bch2_inode_peek(struct btree_trans *, struct btree_iter *, ++ struct bch_inode_unpacked *, subvol_inum, unsigned); ++int bch2_inode_write(struct btree_trans *, struct btree_iter *, ++ struct bch_inode_unpacked *); ++ ++void bch2_inode_init_early(struct bch_fs *, ++ struct bch_inode_unpacked *); ++void bch2_inode_init_late(struct bch_inode_unpacked *, u64, ++ uid_t, gid_t, umode_t, dev_t, ++ struct bch_inode_unpacked *); ++void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, ++ uid_t, gid_t, umode_t, dev_t, ++ struct bch_inode_unpacked *); ++ ++int bch2_inode_create(struct btree_trans *, struct btree_iter *, ++ struct bch_inode_unpacked *, u32, u64); ++ ++int bch2_inode_rm(struct bch_fs *, subvol_inum); ++ ++int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, ++ struct bch_inode_unpacked *); ++int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, ++ struct bch_inode_unpacked *); ++ ++static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) ++{ ++ struct bch_io_opts ret = { 0 }; ++ ++#define x(_name, _bits) \ ++ if (inode->bi_##_name) \ ++ opt_set(ret, _name, inode->bi_##_name - 1); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, ++ enum inode_opt_id id, u64 v) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Inode_opt_##_name: \ ++ inode->bi_##_name = v; \ ++ break; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, ++ enum inode_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) 
\ ++ case Inode_opt_##_name: \ ++ return inode->bi_##_name; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline struct bch_io_opts ++io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode) ++{ ++ struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); ++ ++ bch2_io_opts_apply(&opts, bch2_inode_opts_get(inode)); ++ return opts; ++} ++ ++static inline u8 mode_to_type(umode_t mode) ++{ ++ return (mode >> 12) & 15; ++} ++ ++static inline u8 inode_d_type(struct bch_inode_unpacked *inode) ++{ ++ return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode); ++} ++ ++/* i_nlink: */ ++ ++static inline unsigned nlink_bias(umode_t mode) ++{ ++ return S_ISDIR(mode) ? 2 : 1; ++} ++ ++static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) ++{ ++ return bi->bi_flags & BCH_INODE_UNLINKED ++ ? 0 ++ : bi->bi_nlink + nlink_bias(bi->bi_mode); ++} ++ ++static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, ++ unsigned nlink) ++{ ++ if (nlink) { ++ bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); ++ bi->bi_flags &= ~BCH_INODE_UNLINKED; ++ } else { ++ bi->bi_nlink = 0; ++ bi->bi_flags |= BCH_INODE_UNLINKED; ++ } ++} ++ ++int bch2_inode_nlink_inc(struct bch_inode_unpacked *); ++void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); ++ ++#endif /* _BCACHEFS_INODE_H */ +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +new file mode 100644 +index 000000000000..971f8ba00dbd +--- /dev/null ++++ b/fs/bcachefs/io.c +@@ -0,0 +1,2422 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Some low level IO code, and hacks for various block layer limitations ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "bkey_buf.h" ++#include "bset.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "compress.h" ++#include "clock.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "extent_update.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "rebalance.h" ++#include "subvolume.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++ ++#include ++ ++const char *bch2_blk_status_to_str(blk_status_t status) ++{ ++ if (status == BLK_STS_REMOVED) ++ return "device removed"; ++ return blk_status_to_str(status); ++} ++ ++static bool bch2_target_congested(struct bch_fs *c, u16 target) ++{ ++ const struct bch_devs_mask *devs; ++ unsigned d, nr = 0, total = 0; ++ u64 now = local_clock(), last; ++ s64 congested; ++ struct bch_dev *ca; ++ ++ if (!target) ++ return false; ++ ++ rcu_read_lock(); ++ devs = bch2_target_to_mask(c, target) ?: ++ &c->rw_devs[BCH_DATA_user]; ++ ++ for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { ++ ca = rcu_dereference(c->devs[d]); ++ if (!ca) ++ continue; ++ ++ congested = atomic_read(&ca->congested); ++ last = READ_ONCE(ca->congested_last); ++ if (time_after64(now, last)) ++ congested -= (now - last) >> 12; ++ ++ total += max(congested, 0LL); ++ nr++; ++ } ++ rcu_read_unlock(); ++ ++ return bch2_rand_range(nr * CONGESTED_MAX) < total; ++} ++ ++static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, ++ u64 now, int rw) ++{ ++ u64 latency_capable = ++ ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; ++ /* ideally we'd be taking into account the device's variance here: */ ++ u64 
latency_threshold = latency_capable << (rw == READ ? 2 : 3); ++ s64 latency_over = io_latency - latency_threshold; ++ ++ if (latency_threshold && latency_over > 0) { ++ /* ++ * bump up congested by approximately latency_over * 4 / ++ * latency_threshold - we don't need much accuracy here so don't ++ * bother with the divide: ++ */ ++ if (atomic_read(&ca->congested) < CONGESTED_MAX) ++ atomic_add(latency_over >> ++ max_t(int, ilog2(latency_threshold) - 2, 0), ++ &ca->congested); ++ ++ ca->congested_last = now; ++ } else if (atomic_read(&ca->congested) > 0) { ++ atomic_dec(&ca->congested); ++ } ++} ++ ++void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) ++{ ++ atomic64_t *latency = &ca->cur_latency[rw]; ++ u64 now = local_clock(); ++ u64 io_latency = time_after64(now, submit_time) ++ ? now - submit_time ++ : 0; ++ u64 old, new, v = atomic64_read(latency); ++ ++ do { ++ old = v; ++ ++ /* ++ * If the io latency was reasonably close to the current ++ * latency, skip doing the update and atomic operation - most of ++ * the time: ++ */ ++ if (abs((int) (old - io_latency)) < (old >> 1) && ++ now & ~(~0U << 5)) ++ break; ++ ++ new = ewma_add(old, io_latency, 5); ++ } while ((v = atomic64_cmpxchg(latency, old, new)) != old); ++ ++ bch2_congested_acct(ca, io_latency, now, rw); ++ ++ __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); ++} ++ ++/* Allocate, free from mempool: */ ++ ++void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) ++{ ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ ++ bio_for_each_segment_all(bv, bio, iter) ++ if (bv->bv_page != ZERO_PAGE(0)) ++ mempool_free(bv->bv_page, &c->bio_bounce_pages); ++ bio->bi_vcnt = 0; ++} ++ ++static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) ++{ ++ struct page *page; ++ ++ if (likely(!*using_mempool)) { ++ page = alloc_page(GFP_NOIO); ++ if (unlikely(!page)) { ++ mutex_lock(&c->bio_bounce_pages_lock); ++ *using_mempool = true; ++ goto pool_alloc; ++ ++ } ++ } else { ++pool_alloc: ++ page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); ++ } ++ ++ return page; ++} ++ ++void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, ++ size_t size) ++{ ++ bool using_mempool = false; ++ ++ while (size) { ++ struct page *page = __bio_alloc_page_pool(c, &using_mempool); ++ unsigned len = min_t(size_t, PAGE_SIZE, size); ++ ++ BUG_ON(!bio_add_page(bio, page, len, 0)); ++ size -= len; ++ } ++ ++ if (using_mempool) ++ mutex_unlock(&c->bio_bounce_pages_lock); ++} ++ ++/* Extent update path: */ ++ ++int bch2_sum_sector_overwrites(struct btree_trans *trans, ++ struct btree_iter *extent_iter, ++ struct bkey_i *new, ++ bool *usage_increasing, ++ s64 *i_sectors_delta, ++ s64 *disk_sectors_delta) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c old; ++ unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); ++ bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); ++ int ret = 0; ++ ++ *usage_increasing = false; ++ *i_sectors_delta = 0; ++ *disk_sectors_delta = 0; ++ ++ bch2_trans_copy_iter(&iter, extent_iter); ++ ++ for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) { ++ s64 sectors = min(new->k.p.offset, old.k->p.offset) - ++ max(bkey_start_offset(&new->k), ++ bkey_start_offset(old.k)); ++ ++ *i_sectors_delta += sectors * ++ (bkey_extent_is_allocation(&new->k) - ++ bkey_extent_is_allocation(old.k)); ++ ++ *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); ++ 
*disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot ++ ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old) ++ : 0; ++ ++ if (!*usage_increasing && ++ (new->k.p.snapshot != old.k->p.snapshot || ++ new_replicas > bch2_bkey_replicas(c, old) || ++ (!new_compressed && bch2_bkey_sectors_compressed(old)))) ++ *usage_increasing = true; ++ ++ if (bkey_cmp(old.k->p, new->k.p) >= 0) ++ break; ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_extent_update(struct btree_trans *trans, ++ subvol_inum inum, ++ struct btree_iter *iter, ++ struct bkey_i *k, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, ++ u64 new_i_size, ++ s64 *i_sectors_delta_total, ++ bool check_enospc) ++{ ++ struct btree_iter inode_iter; ++ struct bch_inode_unpacked inode_u; ++ struct bpos next_pos; ++ bool usage_increasing; ++ s64 i_sectors_delta = 0, disk_sectors_delta = 0; ++ int ret; ++ ++ /* ++ * This traverses us the iterator without changing iter->path->pos to ++ * search_key() (which is pos + 1 for extents): we want there to be a ++ * path already traversed at iter->pos because ++ * bch2_trans_extent_update() will use it to attempt extent merging ++ */ ++ ret = __bch2_btree_iter_traverse(iter); ++ if (ret) ++ return ret; ++ ++ ret = bch2_extent_trim_atomic(trans, iter, k); ++ if (ret) ++ return ret; ++ ++ new_i_size = min(k->k.p.offset << 9, new_i_size); ++ next_pos = k->k.p; ++ ++ ret = bch2_sum_sector_overwrites(trans, iter, k, ++ &usage_increasing, ++ &i_sectors_delta, ++ &disk_sectors_delta); ++ if (ret) ++ return ret; ++ ++ if (disk_res && ++ disk_sectors_delta > (s64) disk_res->sectors) { ++ ret = bch2_disk_reservation_add(trans->c, disk_res, ++ disk_sectors_delta - disk_res->sectors, ++ !check_enospc || !usage_increasing ++ ? BCH_DISK_RESERVATION_NOFAIL : 0); ++ if (ret) ++ return ret; ++ } ++ ++ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, ++ BTREE_ITER_INTENT); ++ if (ret) ++ return ret; ++ ++ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ new_i_size > inode_u.bi_size) ++ inode_u.bi_size = new_i_size; ++ ++ inode_u.bi_sectors += i_sectors_delta; ++ ++ ret = bch2_trans_update(trans, iter, k, 0) ?: ++ bch2_inode_write(trans, &inode_iter, &inode_u) ?: ++ bch2_trans_commit(trans, disk_res, journal_seq, ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL); ++ bch2_trans_iter_exit(trans, &inode_iter); ++ ++ if (ret) ++ return ret; ++ ++ if (i_sectors_delta_total) ++ *i_sectors_delta_total += i_sectors_delta; ++ bch2_btree_iter_set_pos(iter, next_pos); ++ ++ return 0; ++} ++ ++/* ++ * Returns -BCH_ERR_transacton_restart if we had to drop locks: ++ */ ++int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, ++ subvol_inum inum, u64 end, ++ s64 *i_sectors_delta) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); ++ struct bpos end_pos = POS(inum.inum, end); ++ struct bkey_s_c k; ++ int ret = 0, ret2 = 0; ++ u32 snapshot; ++ ++ while (!ret || ++ bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(c, 0); ++ struct bkey_i delete; ++ ++ if (ret) ++ ret2 = ret; ++ ++ bch2_trans_begin(trans); ++ ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ continue; ++ ++ bch2_btree_iter_set_snapshot(iter, snapshot); ++ ++ k = bch2_btree_iter_peek(iter); ++ if (bkey_cmp(iter->pos, end_pos) >= 0) { ++ bch2_btree_iter_set_pos(iter, end_pos); ++ break; ++ } ++ ++ ret = bkey_err(k); ++ if (ret) ++ continue; ++ ++ 
bkey_init(&delete.k); ++ delete.k.p = iter->pos; ++ ++ /* create the biggest key we can */ ++ bch2_key_resize(&delete.k, max_sectors); ++ bch2_cut_back(end_pos, &delete); ++ ++ ret = bch2_extent_update(trans, inum, iter, &delete, ++ &disk_res, NULL, ++ 0, i_sectors_delta, false); ++ bch2_disk_reservation_put(c, &disk_res); ++ } ++ ++ return ret ?: ret2; ++} ++ ++int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, ++ s64 *i_sectors_delta) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ POS(inum.inum, start), ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ ret = 0; ++ ++ return ret; ++} ++ ++int bch2_write_index_default(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct bkey_buf sk; ++ struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); ++ struct keylist *keys = &op->insert_keys; ++ struct bkey_i *k = bch2_keylist_front(keys); ++ struct btree_trans trans; ++ struct btree_iter iter; ++ subvol_inum inum = { ++ .subvol = op->subvol, ++ .inum = k->k.p.inode, ++ }; ++ int ret; ++ ++ BUG_ON(!inum.subvol); ++ ++ bch2_bkey_buf_init(&sk); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); ++ ++ do { ++ bch2_trans_begin(&trans); ++ ++ k = bch2_keylist_front(keys); ++ bch2_bkey_buf_copy(&sk, c, k); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, ++ &sk.k->k.p.snapshot); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ break; ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ bkey_start_pos(&sk.k->k), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ ret = bch2_extent_update(&trans, inum, &iter, sk.k, ++ &op->res, op_journal_seq(op), ++ op->new_i_size, &op->i_sectors_delta, ++ op->flags & BCH_WRITE_CHECK_ENOSPC); ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ break; ++ ++ if (ec_ob) ++ bch2_ob_add_backpointer(c, ec_ob, &sk.k->k); ++ ++ if (bkey_cmp(iter.pos, k->k.p) >= 0) ++ bch2_keylist_pop_front(&op->insert_keys); ++ else ++ bch2_cut_front(iter.pos, k); ++ } while (!bch2_keylist_empty(keys)); ++ ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&sk, c); ++ ++ return ret; ++} ++ ++/* Writes */ ++ ++void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, ++ enum bch_data_type type, ++ const struct bkey_i *k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); ++ const struct bch_extent_ptr *ptr; ++ struct bch_write_bio *n; ++ struct bch_dev *ca; ++ ++ BUG_ON(c->opts.nochanges); ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || ++ !c->devs[ptr->dev]); ++ ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (to_entry(ptr + 1) < ptrs.end) { ++ n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, ++ GFP_NOIO, &ca->replica_set)); ++ ++ n->bio.bi_end_io = wbio->bio.bi_end_io; ++ n->bio.bi_private = wbio->bio.bi_private; ++ n->parent = wbio; ++ n->split = true; ++ n->bounce = false; ++ n->put_bio = true; ++ n->bio.bi_opf = wbio->bio.bi_opf; ++ bio_inc_remaining(&wbio->bio); ++ } else { ++ n = wbio; ++ n->split = false; ++ } ++ ++ n->c = c; ++ n->dev = ptr->dev; ++ n->have_ioref = bch2_dev_get_ioref(ca, ++ type == BCH_DATA_btree ? 
READ : WRITE); ++ n->submit_time = local_clock(); ++ n->bio.bi_iter.bi_sector = ptr->offset; ++ ++ if (likely(n->have_ioref)) { ++ this_cpu_add(ca->io_done->sectors[WRITE][type], ++ bio_sectors(&n->bio)); ++ ++ bio_set_dev(&n->bio, ca->disk_sb.bdev); ++ submit_bio(&n->bio); ++ } else { ++ n->bio.bi_status = BLK_STS_REMOVED; ++ bio_endio(&n->bio); ++ } ++ } ++} ++ ++static void __bch2_write(struct closure *); ++ ++static void bch2_write_done(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ ++ if (!op->error && (op->flags & BCH_WRITE_FLUSH)) ++ op->error = bch2_journal_error(&c->journal); ++ ++ bch2_disk_reservation_put(c, &op->res); ++ percpu_ref_put(&c->writes); ++ bch2_keylist_free(&op->insert_keys, op->inline_keys); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); ++ ++ if (op->end_io) { ++ EBUG_ON(cl->parent); ++ closure_debug_destroy(cl); ++ op->end_io(op); ++ } else { ++ closure_return(cl); ++ } ++} ++ ++/** ++ * bch_write_index - after a write, update index to point to new data ++ */ ++static void __bch2_write_index(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct keylist *keys = &op->insert_keys; ++ struct bch_extent_ptr *ptr; ++ struct bkey_i *src, *dst = keys->keys, *n, *k; ++ unsigned dev; ++ int ret; ++ ++ for (src = keys->keys; src != keys->top; src = n) { ++ n = bkey_next(src); ++ ++ if (bkey_extent_is_direct_data(&src->k)) { ++ bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, ++ test_bit(ptr->dev, op->failed.d)); ++ ++ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { ++ ret = -EIO; ++ goto err; ++ } ++ } ++ ++ if (dst != src) ++ memmove_u64s_down(dst, src, src->u64s); ++ dst = bkey_next(dst); ++ } ++ ++ keys->top = dst; ++ ++ /* ++ * probably not the ideal place to hook this in, but I don't ++ * particularly want to plumb io_opts all the way through the btree ++ * update stack right now ++ */ ++ for_each_keylist_key(keys, k) { ++ bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); ++ ++ if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k))) ++ bch2_check_set_feature(op->c, BCH_FEATURE_incompressible); ++ ++ } ++ ++ if (!bch2_keylist_empty(keys)) { ++ u64 sectors_start = keylist_sectors(keys); ++ int ret = op->index_update_fn(op); ++ ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); ++ BUG_ON(keylist_sectors(keys) && !ret); ++ ++ op->written += sectors_start - keylist_sectors(keys); ++ ++ if (ret) { ++ bch_err_inum_ratelimited(c, op->pos.inode, ++ "write error %i from btree update", ret); ++ op->error = ret; ++ } ++ } ++out: ++ /* If some a bucket wasn't written, we can't erasure code it: */ ++ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) ++ bch2_open_bucket_write_error(c, &op->open_buckets, dev); ++ ++ bch2_open_buckets_put(c, &op->open_buckets); ++ return; ++err: ++ keys->top = keys->keys; ++ op->error = ret; ++ goto out; ++} ++ ++static void bch2_write_index(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ ++ __bch2_write_index(op); ++ ++ if (!(op->flags & BCH_WRITE_DONE)) { ++ continue_at(cl, __bch2_write, index_update_wq(op)); ++ } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { ++ bch2_journal_flush_seq_async(&c->journal, ++ *op_journal_seq(op), ++ cl); ++ continue_at(cl, bch2_write_done, index_update_wq(op)); ++ } else { ++ continue_at_nobarrier(cl, bch2_write_done, NULL); ++ } ++} ++ ++static void bch2_write_endio(struct bio *bio) ++{ ++ struct 
closure *cl = bio->bi_private; ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_write_bio *wbio = to_wbio(bio); ++ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; ++ struct bch_fs *c = wbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); ++ ++ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, ++ op->pos.inode, ++ op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */ ++ "data write error: %s", ++ bch2_blk_status_to_str(bio->bi_status))) ++ set_bit(wbio->dev, op->failed.d); ++ ++ if (wbio->have_ioref) { ++ bch2_latency_acct(ca, wbio->submit_time, WRITE); ++ percpu_ref_put(&ca->io_ref); ++ } ++ ++ if (wbio->bounce) ++ bch2_bio_free_pages_pool(c, bio); ++ ++ if (wbio->put_bio) ++ bio_put(bio); ++ ++ if (parent) ++ bio_endio(&parent->bio); ++ else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) ++ closure_put(cl); ++ else ++ continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); ++} ++ ++static void init_append_extent(struct bch_write_op *op, ++ struct write_point *wp, ++ struct bversion version, ++ struct bch_extent_crc_unpacked crc) ++{ ++ struct bch_fs *c = op->c; ++ struct bkey_i_extent *e; ++ ++ op->pos.offset += crc.uncompressed_size; ++ ++ e = bkey_extent_init(op->insert_keys.top); ++ e->k.p = op->pos; ++ e->k.size = crc.uncompressed_size; ++ e->k.version = version; ++ ++ if (crc.csum_type || ++ crc.compression_type || ++ crc.nonce) ++ bch2_extent_crc_append(&e->k_i, crc); ++ ++ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, crc.compressed_size, ++ op->flags & BCH_WRITE_CACHED); ++ ++ bch2_keylist_push(&op->insert_keys); ++} ++ ++static struct bio *bch2_write_bio_alloc(struct bch_fs *c, ++ struct write_point *wp, ++ struct bio *src, ++ bool *page_alloc_failed, ++ void *buf) ++{ ++ struct bch_write_bio *wbio; ++ struct bio *bio; ++ unsigned output_available = ++ min(wp->sectors_free << 9, src->bi_iter.bi_size); ++ unsigned pages = DIV_ROUND_UP(output_available + ++ (buf ++ ? 
((unsigned long) buf & (PAGE_SIZE - 1)) ++ : 0), PAGE_SIZE); ++ ++ pages = min(pages, BIO_MAX_VECS); ++ ++ bio = bio_alloc_bioset(NULL, pages, 0, ++ GFP_NOIO, &c->bio_write); ++ wbio = wbio_init(bio); ++ wbio->put_bio = true; ++ /* copy WRITE_SYNC flag */ ++ wbio->bio.bi_opf = src->bi_opf; ++ ++ if (buf) { ++ bch2_bio_map(bio, buf, output_available); ++ return bio; ++ } ++ ++ wbio->bounce = true; ++ ++ /* ++ * We can't use mempool for more than c->sb.encoded_extent_max ++ * worth of pages, but we'd like to allocate more if we can: ++ */ ++ bch2_bio_alloc_pages_pool(c, bio, ++ min_t(unsigned, output_available, ++ c->opts.encoded_extent_max)); ++ ++ if (bio->bi_iter.bi_size < output_available) ++ *page_alloc_failed = ++ bch2_bio_alloc_pages(bio, ++ output_available - ++ bio->bi_iter.bi_size, ++ GFP_NOFS) != 0; ++ ++ return bio; ++} ++ ++static int bch2_write_rechecksum(struct bch_fs *c, ++ struct bch_write_op *op, ++ unsigned new_csum_type) ++{ ++ struct bio *bio = &op->wbio.bio; ++ struct bch_extent_crc_unpacked new_crc; ++ int ret; ++ ++ /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ ++ ++ if (bch2_csum_type_is_encryption(op->crc.csum_type) != ++ bch2_csum_type_is_encryption(new_csum_type)) ++ new_csum_type = op->crc.csum_type; ++ ++ ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, ++ NULL, &new_crc, ++ op->crc.offset, op->crc.live_size, ++ new_csum_type); ++ if (ret) ++ return ret; ++ ++ bio_advance(bio, op->crc.offset << 9); ++ bio->bi_iter.bi_size = op->crc.live_size << 9; ++ op->crc = new_crc; ++ return 0; ++} ++ ++static int bch2_write_decrypt(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct nonce nonce = extent_nonce(op->version, op->crc); ++ struct bch_csum csum; ++ int ret; ++ ++ if (!bch2_csum_type_is_encryption(op->crc.csum_type)) ++ return 0; ++ ++ /* ++ * If we need to decrypt data in the write path, we'll no longer be able ++ * to verify the existing checksum (poly1305 mac, in this case) after ++ * it's decrypted - this is the last point we'll be able to reverify the ++ * checksum: ++ */ ++ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); ++ if (bch2_crc_cmp(op->crc.csum, csum)) ++ return -EIO; ++ ++ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); ++ op->crc.csum_type = 0; ++ op->crc.csum = (struct bch_csum) { 0, 0 }; ++ return ret; ++} ++ ++static enum prep_encoded_ret { ++ PREP_ENCODED_OK, ++ PREP_ENCODED_ERR, ++ PREP_ENCODED_CHECKSUM_ERR, ++ PREP_ENCODED_DO_WRITE, ++} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) ++{ ++ struct bch_fs *c = op->c; ++ struct bio *bio = &op->wbio.bio; ++ ++ if (!(op->flags & BCH_WRITE_DATA_ENCODED)) ++ return PREP_ENCODED_OK; ++ ++ BUG_ON(bio_sectors(bio) != op->crc.compressed_size); ++ ++ /* Can we just write the entire extent as is? 
*/ ++ if (op->crc.uncompressed_size == op->crc.live_size && ++ op->crc.compressed_size <= wp->sectors_free && ++ (op->crc.compression_type == op->compression_type || ++ op->incompressible)) { ++ if (!crc_is_compressed(op->crc) && ++ op->csum_type != op->crc.csum_type && ++ bch2_write_rechecksum(c, op, op->csum_type)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ return PREP_ENCODED_DO_WRITE; ++ } ++ ++ /* ++ * If the data is compressed and we couldn't write the entire extent as ++ * is, we have to decompress it: ++ */ ++ if (crc_is_compressed(op->crc)) { ++ struct bch_csum csum; ++ ++ if (bch2_write_decrypt(op)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ /* Last point we can still verify checksum: */ ++ csum = bch2_checksum_bio(c, op->crc.csum_type, ++ extent_nonce(op->version, op->crc), ++ bio); ++ if (bch2_crc_cmp(op->crc.csum, csum)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) ++ return PREP_ENCODED_ERR; ++ } ++ ++ /* ++ * No longer have compressed data after this point - data might be ++ * encrypted: ++ */ ++ ++ /* ++ * If the data is checksummed and we're only writing a subset, ++ * rechecksum and adjust bio to point to currently live data: ++ */ ++ if ((op->crc.live_size != op->crc.uncompressed_size || ++ op->crc.csum_type != op->csum_type) && ++ bch2_write_rechecksum(c, op, op->csum_type)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ /* ++ * If we want to compress the data, it has to be decrypted: ++ */ ++ if ((op->compression_type || ++ bch2_csum_type_is_encryption(op->crc.csum_type) != ++ bch2_csum_type_is_encryption(op->csum_type)) && ++ bch2_write_decrypt(op)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ return PREP_ENCODED_OK; ++} ++ ++static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, ++ struct bio **_dst) ++{ ++ struct bch_fs *c = op->c; ++ struct bio *src = &op->wbio.bio, *dst = src; ++ struct bvec_iter saved_iter; ++ void *ec_buf; ++ unsigned total_output = 0, total_input = 0; ++ bool bounce = false; ++ bool page_alloc_failed = false; ++ int ret, more = 0; ++ ++ BUG_ON(!bio_sectors(src)); ++ ++ ec_buf = bch2_writepoint_ec_buf(c, wp); ++ ++ switch (bch2_write_prep_encoded_data(op, wp)) { ++ case PREP_ENCODED_OK: ++ break; ++ case PREP_ENCODED_ERR: ++ ret = -EIO; ++ goto err; ++ case PREP_ENCODED_CHECKSUM_ERR: ++ goto csum_err; ++ case PREP_ENCODED_DO_WRITE: ++ /* XXX look for bug here */ ++ if (ec_buf) { ++ dst = bch2_write_bio_alloc(c, wp, src, ++ &page_alloc_failed, ++ ec_buf); ++ bio_copy_data(dst, src); ++ bounce = true; ++ } ++ init_append_extent(op, wp, op->version, op->crc); ++ goto do_write; ++ } ++ ++ if (ec_buf || ++ op->compression_type || ++ (op->csum_type && ++ !(op->flags & BCH_WRITE_PAGES_STABLE)) || ++ (bch2_csum_type_is_encryption(op->csum_type) && ++ !(op->flags & BCH_WRITE_PAGES_OWNED))) { ++ dst = bch2_write_bio_alloc(c, wp, src, ++ &page_alloc_failed, ++ ec_buf); ++ bounce = true; ++ } ++ ++ saved_iter = dst->bi_iter; ++ ++ do { ++ struct bch_extent_crc_unpacked crc = ++ (struct bch_extent_crc_unpacked) { 0 }; ++ struct bversion version = op->version; ++ size_t dst_len, src_len; ++ ++ if (page_alloc_failed && ++ dst->bi_iter.bi_size < (wp->sectors_free << 9) && ++ dst->bi_iter.bi_size < c->opts.encoded_extent_max) ++ break; ++ ++ BUG_ON(op->compression_type && ++ (op->flags & BCH_WRITE_DATA_ENCODED) && ++ bch2_csum_type_is_encryption(op->crc.csum_type)); ++ BUG_ON(op->compression_type && !bounce); ++ ++ crc.compression_type = op->incompressible ++ ? 
BCH_COMPRESSION_TYPE_incompressible ++ : op->compression_type ++ ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, ++ op->compression_type) ++ : 0; ++ if (!crc_is_compressed(crc)) { ++ dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); ++ dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); ++ ++ if (op->csum_type) ++ dst_len = min_t(unsigned, dst_len, ++ c->opts.encoded_extent_max); ++ ++ if (bounce) { ++ swap(dst->bi_iter.bi_size, dst_len); ++ bio_copy_data(dst, src); ++ swap(dst->bi_iter.bi_size, dst_len); ++ } ++ ++ src_len = dst_len; ++ } ++ ++ BUG_ON(!src_len || !dst_len); ++ ++ if (bch2_csum_type_is_encryption(op->csum_type)) { ++ if (bversion_zero(version)) { ++ version.lo = atomic64_inc_return(&c->key_version); ++ } else { ++ crc.nonce = op->nonce; ++ op->nonce += src_len >> 9; ++ } ++ } ++ ++ if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ !crc_is_compressed(crc) && ++ bch2_csum_type_is_encryption(op->crc.csum_type) == ++ bch2_csum_type_is_encryption(op->csum_type)) { ++ /* ++ * Note: when we're using rechecksum(), we need to be ++ * checksumming @src because it has all the data our ++ * existing checksum covers - if we bounced (because we ++ * were trying to compress), @dst will only have the ++ * part of the data the new checksum will cover. ++ * ++ * But normally we want to be checksumming post bounce, ++ * because part of the reason for bouncing is so the ++ * data can't be modified (by userspace) while it's in ++ * flight. ++ */ ++ if (bch2_rechecksum_bio(c, src, version, op->crc, ++ &crc, &op->crc, ++ src_len >> 9, ++ bio_sectors(src) - (src_len >> 9), ++ op->csum_type)) ++ goto csum_err; ++ } else { ++ if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ bch2_rechecksum_bio(c, src, version, op->crc, ++ NULL, &op->crc, ++ src_len >> 9, ++ bio_sectors(src) - (src_len >> 9), ++ op->crc.csum_type)) ++ goto csum_err; ++ ++ crc.compressed_size = dst_len >> 9; ++ crc.uncompressed_size = src_len >> 9; ++ crc.live_size = src_len >> 9; ++ ++ swap(dst->bi_iter.bi_size, dst_len); ++ ret = bch2_encrypt_bio(c, op->csum_type, ++ extent_nonce(version, crc), dst); ++ if (ret) ++ goto err; ++ ++ crc.csum = bch2_checksum_bio(c, op->csum_type, ++ extent_nonce(version, crc), dst); ++ crc.csum_type = op->csum_type; ++ swap(dst->bi_iter.bi_size, dst_len); ++ } ++ ++ init_append_extent(op, wp, version, crc); ++ ++ if (dst != src) ++ bio_advance(dst, dst_len); ++ bio_advance(src, src_len); ++ total_output += dst_len; ++ total_input += src_len; ++ } while (dst->bi_iter.bi_size && ++ src->bi_iter.bi_size && ++ wp->sectors_free && ++ !bch2_keylist_realloc(&op->insert_keys, ++ op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_EXTENT_U64s_MAX)); ++ ++ more = src->bi_iter.bi_size != 0; ++ ++ dst->bi_iter = saved_iter; ++ ++ if (dst == src && more) { ++ BUG_ON(total_output != total_input); ++ ++ dst = bio_split(src, total_input >> 9, ++ GFP_NOIO, &c->bio_write); ++ wbio_init(dst)->put_bio = true; ++ /* copy WRITE_SYNC flag */ ++ dst->bi_opf = src->bi_opf; ++ } ++ ++ dst->bi_iter.bi_size = total_output; ++do_write: ++ *_dst = dst; ++ return more; ++csum_err: ++ bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); ++ ret = -EIO; ++err: ++ if (to_wbio(dst)->bounce) ++ bch2_bio_free_pages_pool(c, dst); ++ if (to_wbio(dst)->put_bio) ++ bio_put(dst); ++ ++ return ret; ++} ++ ++static void __bch2_write(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ struct write_point 
*wp; ++ struct bio *bio = NULL; ++ bool skip_put = true; ++ unsigned nofs_flags; ++ int ret; ++ ++ nofs_flags = memalloc_nofs_save(); ++again: ++ memset(&op->failed, 0, sizeof(op->failed)); ++ ++ do { ++ struct bkey_i *key_to_write; ++ unsigned key_to_write_offset = op->insert_keys.top_p - ++ op->insert_keys.keys_p; ++ ++ /* +1 for possible cache device: */ ++ if (op->open_buckets.nr + op->nr_replicas + 1 > ++ ARRAY_SIZE(op->open_buckets.v)) ++ goto flush_io; ++ ++ if (bch2_keylist_realloc(&op->insert_keys, ++ op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_EXTENT_U64s_MAX)) ++ goto flush_io; ++ ++ /* ++ * The copygc thread is now global, which means it's no longer ++ * freeing up space on specific disks, which means that ++ * allocations for specific disks may hang arbitrarily long: ++ */ ++ wp = bch2_alloc_sectors_start(c, ++ op->target, ++ op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), ++ op->write_point, ++ &op->devs_have, ++ op->nr_replicas, ++ op->nr_replicas_required, ++ op->alloc_reserve, ++ op->flags, ++ (op->flags & (BCH_WRITE_ALLOC_NOWAIT| ++ BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl); ++ EBUG_ON(!wp); ++ ++ if (unlikely(IS_ERR(wp))) { ++ if (unlikely(PTR_ERR(wp) != -EAGAIN)) { ++ ret = PTR_ERR(wp); ++ goto err; ++ } ++ ++ goto flush_io; ++ } ++ ++ /* ++ * It's possible for the allocator to fail, put us on the ++ * freelist waitlist, and then succeed in one of various retry ++ * paths: if that happens, we need to disable the skip_put ++ * optimization because otherwise there won't necessarily be a ++ * barrier before we free the bch_write_op: ++ */ ++ if (atomic_read(&cl->remaining) & CLOSURE_WAITING) ++ skip_put = false; ++ ++ bch2_open_bucket_get(c, wp, &op->open_buckets); ++ ret = bch2_write_extent(op, wp, &bio); ++ bch2_alloc_sectors_done(c, wp); ++ ++ if (ret < 0) ++ goto err; ++ ++ if (ret) { ++ skip_put = false; ++ } else { ++ /* ++ * for the skip_put optimization this has to be set ++ * before we submit the bio: ++ */ ++ op->flags |= BCH_WRITE_DONE; ++ } ++ ++ bio->bi_end_io = bch2_write_endio; ++ bio->bi_private = &op->cl; ++ bio->bi_opf |= REQ_OP_WRITE; ++ ++ if (!skip_put) ++ closure_get(bio->bi_private); ++ else ++ op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; ++ ++ key_to_write = (void *) (op->insert_keys.keys_p + ++ key_to_write_offset); ++ ++ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, ++ key_to_write); ++ } while (ret); ++ ++ if (!skip_put) ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++out: ++ memalloc_nofs_restore(nofs_flags); ++ return; ++err: ++ op->error = ret; ++ op->flags |= BCH_WRITE_DONE; ++ ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++ goto out; ++flush_io: ++ /* ++ * If the write can't all be submitted at once, we generally want to ++ * block synchronously as that signals backpressure to the caller. 
++ * ++ * However, if we're running out of a workqueue, we can't block here ++ * because we'll be blocking other work items from completing: ++ */ ++ if (current->flags & PF_WQ_WORKER) { ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++ goto out; ++ } ++ ++ closure_sync(cl); ++ ++ if (!bch2_keylist_empty(&op->insert_keys)) { ++ __bch2_write_index(op); ++ ++ if (op->error) { ++ op->flags |= BCH_WRITE_DONE; ++ continue_at_nobarrier(cl, bch2_write_done, NULL); ++ goto out; ++ } ++ } ++ ++ goto again; ++} ++ ++static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) ++{ ++ struct closure *cl = &op->cl; ++ struct bio *bio = &op->wbio.bio; ++ struct bvec_iter iter; ++ struct bkey_i_inline_data *id; ++ unsigned sectors; ++ int ret; ++ ++ bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); ++ ++ ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_U64s + DIV_ROUND_UP(data_len, 8)); ++ if (ret) { ++ op->error = ret; ++ goto err; ++ } ++ ++ sectors = bio_sectors(bio); ++ op->pos.offset += sectors; ++ ++ id = bkey_inline_data_init(op->insert_keys.top); ++ id->k.p = op->pos; ++ id->k.version = op->version; ++ id->k.size = sectors; ++ ++ iter = bio->bi_iter; ++ iter.bi_size = data_len; ++ memcpy_from_bio(id->v.data, bio, iter); ++ ++ while (data_len & 7) ++ id->v.data[data_len++] = '\0'; ++ set_bkey_val_bytes(&id->k, data_len); ++ bch2_keylist_push(&op->insert_keys); ++ ++ op->flags |= BCH_WRITE_WROTE_DATA_INLINE; ++ op->flags |= BCH_WRITE_DONE; ++ ++ continue_at_nobarrier(cl, bch2_write_index, NULL); ++ return; ++err: ++ bch2_write_done(&op->cl); ++} ++ ++/** ++ * bch_write - handle a write to a cache device or flash only volume ++ * ++ * This is the starting point for any data to end up in a cache device; it could ++ * be from a normal write, or a writeback write, or a write to a flash only ++ * volume - it's also used by the moving garbage collector to compact data in ++ * mostly empty buckets. ++ * ++ * It first writes the data to the cache, creating a list of keys to be inserted ++ * (if the data won't fit in a single open bucket, there will be multiple keys); ++ * after the data is written it calls bch_journal, and after the keys have been ++ * added to the next journal write they're inserted into the btree. ++ * ++ * If op->discard is true, instead of inserting the data it invalidates the ++ * region of the cache represented by op->bio and op->inode. 
++ */ ++void bch2_write(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bio *bio = &op->wbio.bio; ++ struct bch_fs *c = op->c; ++ unsigned data_len; ++ ++ BUG_ON(!op->nr_replicas); ++ BUG_ON(!op->write_point.v); ++ BUG_ON(!bkey_cmp(op->pos, POS_MAX)); ++ ++ op->start_time = local_clock(); ++ bch2_keylist_init(&op->insert_keys, op->inline_keys); ++ wbio_init(bio)->put_bio = false; ++ ++ if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { ++ bch_err_inum_ratelimited(c, op->pos.inode, ++ "misaligned write"); ++ op->error = -EIO; ++ goto err; ++ } ++ ++ if (c->opts.nochanges || ++ !percpu_ref_tryget_live(&c->writes)) { ++ op->error = -EROFS; ++ goto err; ++ } ++ ++ this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); ++ bch2_increment_clock(c, bio_sectors(bio), WRITE); ++ ++ data_len = min_t(u64, bio->bi_iter.bi_size, ++ op->new_i_size - (op->pos.offset << 9)); ++ ++ if (c->opts.inline_data && ++ data_len <= min(block_bytes(c) / 2, 1024U)) { ++ bch2_write_data_inline(op, data_len); ++ return; ++ } ++ ++ continue_at_nobarrier(cl, __bch2_write, NULL); ++ return; ++err: ++ bch2_disk_reservation_put(c, &op->res); ++ ++ if (op->end_io) { ++ EBUG_ON(cl->parent); ++ closure_debug_destroy(cl); ++ op->end_io(op); ++ } else { ++ closure_return(cl); ++ } ++} ++ ++/* Cache promotion on read */ ++ ++struct promote_op { ++ struct closure cl; ++ struct rcu_head rcu; ++ u64 start_time; ++ ++ struct rhash_head hash; ++ struct bpos pos; ++ ++ struct data_update write; ++ struct bio_vec bi_inline_vecs[0]; /* must be last */ ++}; ++ ++static const struct rhashtable_params bch_promote_params = { ++ .head_offset = offsetof(struct promote_op, hash), ++ .key_offset = offsetof(struct promote_op, pos), ++ .key_len = sizeof(struct bpos), ++}; ++ ++static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, ++ struct bpos pos, ++ struct bch_io_opts opts, ++ unsigned flags) ++{ ++ if (!(flags & BCH_READ_MAY_PROMOTE)) ++ return false; ++ ++ if (!opts.promote_target) ++ return false; ++ ++ if (bch2_bkey_has_target(c, k, opts.promote_target)) ++ return false; ++ ++ if (bch2_target_congested(c, opts.promote_target)) { ++ /* XXX trace this */ ++ return false; ++ } ++ ++ if (rhashtable_lookup_fast(&c->promote_table, &pos, ++ bch_promote_params)) ++ return false; ++ ++ return true; ++} ++ ++static void promote_free(struct bch_fs *c, struct promote_op *op) ++{ ++ int ret; ++ ++ ret = rhashtable_remove_fast(&c->promote_table, &op->hash, ++ bch_promote_params); ++ BUG_ON(ret); ++ percpu_ref_put(&c->writes); ++ kfree_rcu(op, rcu); ++} ++ ++static void promote_done(struct closure *cl) ++{ ++ struct promote_op *op = ++ container_of(cl, struct promote_op, cl); ++ struct bch_fs *c = op->write.op.c; ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], ++ op->start_time); ++ ++ bch2_data_update_exit(&op->write); ++ promote_free(c, op); ++} ++ ++static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) ++{ ++ struct closure *cl = &op->cl; ++ struct bio *bio = &op->write.op.wbio.bio; ++ ++ trace_promote(&rbio->bio); ++ ++ /* we now own pages: */ ++ BUG_ON(!rbio->bounce); ++ BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); ++ ++ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, ++ sizeof(struct bio_vec) * rbio->bio.bi_vcnt); ++ swap(bio->bi_vcnt, rbio->bio.bi_vcnt); ++ ++ closure_init(cl, NULL); ++ bch2_data_update_read_done(&op->write, rbio->pick.crc, cl); ++ closure_return_with_destructor(cl, promote_done); ++} ++ ++static struct 
promote_op *__promote_alloc(struct bch_fs *c, ++ enum btree_id btree_id, ++ struct bkey_s_c k, ++ struct bpos pos, ++ struct extent_ptr_decoded *pick, ++ struct bch_io_opts opts, ++ unsigned sectors, ++ struct bch_read_bio **rbio) ++{ ++ struct promote_op *op = NULL; ++ struct bio *bio; ++ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); ++ int ret; ++ ++ if (!percpu_ref_tryget_live(&c->writes)) ++ return NULL; ++ ++ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); ++ if (!op) ++ goto err; ++ ++ op->start_time = local_clock(); ++ op->pos = pos; ++ ++ /* ++ * We don't use the mempool here because extents that aren't ++ * checksummed or compressed can be too big for the mempool: ++ */ ++ *rbio = kzalloc(sizeof(struct bch_read_bio) + ++ sizeof(struct bio_vec) * pages, ++ GFP_NOIO); ++ if (!*rbio) ++ goto err; ++ ++ rbio_init(&(*rbio)->bio, opts); ++ bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); ++ ++ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, ++ GFP_NOIO)) ++ goto err; ++ ++ (*rbio)->bounce = true; ++ (*rbio)->split = true; ++ (*rbio)->kmalloc = true; ++ ++ if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, ++ bch_promote_params)) ++ goto err; ++ ++ bio = &op->write.op.wbio.bio; ++ bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); ++ ++ ret = bch2_data_update_init(c, &op->write, ++ writepoint_hashed((unsigned long) current), ++ opts, ++ (struct data_update_opts) { ++ .target = opts.promote_target, ++ .extra_replicas = 1, ++ .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, ++ }, ++ btree_id, k); ++ BUG_ON(ret); ++ ++ return op; ++err: ++ if (*rbio) ++ bio_free_pages(&(*rbio)->bio); ++ kfree(*rbio); ++ *rbio = NULL; ++ kfree(op); ++ percpu_ref_put(&c->writes); ++ return NULL; ++} ++ ++noinline ++static struct promote_op *promote_alloc(struct bch_fs *c, ++ struct bvec_iter iter, ++ struct bkey_s_c k, ++ struct extent_ptr_decoded *pick, ++ struct bch_io_opts opts, ++ unsigned flags, ++ struct bch_read_bio **rbio, ++ bool *bounce, ++ bool *read_full) ++{ ++ bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); ++ /* data might have to be decompressed in the write path: */ ++ unsigned sectors = promote_full ++ ? max(pick->crc.compressed_size, pick->crc.live_size) ++ : bvec_iter_sectors(iter); ++ struct bpos pos = promote_full ++ ? bkey_start_pos(k.k) ++ : POS(k.k->p.inode, iter.bi_sector); ++ struct promote_op *promote; ++ ++ if (!should_promote(c, k, pos, opts, flags)) ++ return NULL; ++ ++ promote = __promote_alloc(c, ++ k.k->type == KEY_TYPE_reflink_v ++ ? BTREE_ID_reflink ++ : BTREE_ID_extents, ++ k, pos, pick, opts, sectors, rbio); ++ if (!promote) ++ return NULL; ++ ++ *bounce = true; ++ *read_full = promote_full; ++ return promote; ++} ++ ++/* Read */ ++ ++#define READ_RETRY_AVOID 1 ++#define READ_RETRY 2 ++#define READ_ERR 3 ++ ++enum rbio_context { ++ RBIO_CONTEXT_NULL, ++ RBIO_CONTEXT_HIGHPRI, ++ RBIO_CONTEXT_UNBOUND, ++}; ++ ++static inline struct bch_read_bio * ++bch2_rbio_parent(struct bch_read_bio *rbio) ++{ ++ return rbio->split ? 
rbio->parent : rbio; ++} ++ ++__always_inline ++static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, ++ enum rbio_context context, ++ struct workqueue_struct *wq) ++{ ++ if (context <= rbio->context) { ++ fn(&rbio->work); ++ } else { ++ rbio->work.func = fn; ++ rbio->context = context; ++ queue_work(wq, &rbio->work); ++ } ++} ++ ++static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) ++{ ++ BUG_ON(rbio->bounce && !rbio->split); ++ ++ if (rbio->promote) ++ promote_free(rbio->c, rbio->promote); ++ rbio->promote = NULL; ++ ++ if (rbio->bounce) ++ bch2_bio_free_pages_pool(rbio->c, &rbio->bio); ++ ++ if (rbio->split) { ++ struct bch_read_bio *parent = rbio->parent; ++ ++ if (rbio->kmalloc) ++ kfree(rbio); ++ else ++ bio_put(&rbio->bio); ++ ++ rbio = parent; ++ } ++ ++ return rbio; ++} ++ ++/* ++ * Only called on a top level bch_read_bio to complete an entire read request, ++ * not a split: ++ */ ++static void bch2_rbio_done(struct bch_read_bio *rbio) ++{ ++ if (rbio->start_time) ++ bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], ++ rbio->start_time); ++ bio_endio(&rbio->bio); ++} ++ ++static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, ++ struct bvec_iter bvec_iter, ++ struct bch_io_failures *failed, ++ unsigned flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_buf sk; ++ struct bkey_s_c k; ++ int ret; ++ ++ flags &= ~BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_MUST_CLONE; ++ ++ bch2_bkey_buf_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ bch2_trans_iter_init(&trans, &iter, rbio->data_btree, ++ rbio->read_pos, BTREE_ITER_SLOTS); ++retry: ++ rbio->bio.bi_status = 0; ++ ++ k = bch2_btree_iter_peek_slot(&iter); ++ if (bkey_err(k)) ++ goto err; ++ ++ bch2_bkey_buf_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ bch2_trans_unlock(&trans); ++ ++ if (!bch2_bkey_matches_ptr(c, k, ++ rbio->pick.ptr, ++ rbio->data_pos.offset - ++ rbio->pick.crc.offset)) { ++ /* extent we wanted to read no longer exists: */ ++ rbio->hole = true; ++ goto out; ++ } ++ ++ ret = __bch2_read_extent(&trans, rbio, bvec_iter, ++ rbio->read_pos, ++ rbio->data_btree, ++ k, 0, failed, flags); ++ if (ret == READ_RETRY) ++ goto retry; ++ if (ret) ++ goto err; ++out: ++ bch2_rbio_done(rbio); ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&sk, c); ++ return; ++err: ++ rbio->bio.bi_status = BLK_STS_IOERR; ++ goto out; ++} ++ ++static void bch2_rbio_retry(struct work_struct *work) ++{ ++ struct bch_read_bio *rbio = ++ container_of(work, struct bch_read_bio, work); ++ struct bch_fs *c = rbio->c; ++ struct bvec_iter iter = rbio->bvec_iter; ++ unsigned flags = rbio->flags; ++ subvol_inum inum = { ++ .subvol = rbio->subvol, ++ .inum = rbio->read_pos.inode, ++ }; ++ struct bch_io_failures failed = { .nr = 0 }; ++ ++ trace_read_retry(&rbio->bio); ++ ++ if (rbio->retry == READ_RETRY_AVOID) ++ bch2_mark_io_failure(&failed, &rbio->pick); ++ ++ rbio->bio.bi_status = 0; ++ ++ rbio = bch2_rbio_free(rbio); ++ ++ flags |= BCH_READ_IN_RETRY; ++ flags &= ~BCH_READ_MAY_PROMOTE; ++ ++ if (flags & BCH_READ_NODECODE) { ++ bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); ++ } else { ++ flags &= ~BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_MUST_CLONE; ++ ++ __bch2_read(c, rbio, iter, inum, &failed, flags); ++ } ++} ++ ++static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, ++ blk_status_t error) ++{ ++ rbio->retry = retry; ++ ++ if (rbio->flags & BCH_READ_IN_RETRY) ++ 
return; ++ ++ if (retry == READ_ERR) { ++ rbio = bch2_rbio_free(rbio); ++ ++ rbio->bio.bi_status = error; ++ bch2_rbio_done(rbio); ++ } else { ++ bch2_rbio_punt(rbio, bch2_rbio_retry, ++ RBIO_CONTEXT_UNBOUND, system_unbound_wq); ++ } ++} ++ ++static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, ++ struct bch_read_bio *rbio) ++{ ++ struct bch_fs *c = rbio->c; ++ u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; ++ struct bch_extent_crc_unpacked new_crc; ++ struct btree_iter iter; ++ struct bkey_i *new; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ if (crc_is_compressed(rbio->pick.crc)) ++ return 0; ++ ++ bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ if ((ret = bkey_err(k))) ++ goto out; ++ ++ if (bversion_cmp(k.k->version, rbio->version) || ++ !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) ++ goto out; ++ ++ /* Extent was merged? */ ++ if (bkey_start_offset(k.k) < data_offset || ++ k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) ++ goto out; ++ ++ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, ++ rbio->pick.crc, NULL, &new_crc, ++ bkey_start_offset(k.k) - data_offset, k.k->size, ++ rbio->pick.crc.csum_type)) { ++ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); ++ ret = 0; ++ goto out; ++ } ++ ++ /* ++ * going to be temporarily appending another checksum entry: ++ */ ++ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + ++ sizeof(struct bch_extent_crc128)); ++ if ((ret = PTR_ERR_OR_ZERO(new))) ++ goto out; ++ ++ bkey_reassemble(new, k); ++ ++ if (!bch2_bkey_narrow_crcs(new, new_crc)) ++ goto out; ++ ++ ret = bch2_trans_update(trans, &iter, new, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) ++{ ++ bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, ++ __bch2_rbio_narrow_crcs(&trans, rbio)); ++} ++ ++/* Inner part that may run in process context */ ++static void __bch2_read_endio(struct work_struct *work) ++{ ++ struct bch_read_bio *rbio = ++ container_of(work, struct bch_read_bio, work); ++ struct bch_fs *c = rbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); ++ struct bio *src = &rbio->bio; ++ struct bio *dst = &bch2_rbio_parent(rbio)->bio; ++ struct bvec_iter dst_iter = rbio->bvec_iter; ++ struct bch_extent_crc_unpacked crc = rbio->pick.crc; ++ struct nonce nonce = extent_nonce(rbio->version, crc); ++ unsigned nofs_flags; ++ struct bch_csum csum; ++ int ret; ++ ++ nofs_flags = memalloc_nofs_save(); ++ ++ /* Reset iterator for checksumming and copying bounced data: */ ++ if (rbio->bounce) { ++ src->bi_iter.bi_size = crc.compressed_size << 9; ++ src->bi_iter.bi_idx = 0; ++ src->bi_iter.bi_bvec_done = 0; ++ } else { ++ src->bi_iter = rbio->bvec_iter; ++ } ++ ++ csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); ++ if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) ++ goto csum_err; ++ ++ /* ++ * XXX ++ * We need to rework the narrow_crcs path to deliver the read completion ++ * first, and then punt to a different workqueue, otherwise we're ++ * holding up reads while doing btree updates which is bad for memory ++ * reclaim. 
++ */ ++ if (unlikely(rbio->narrow_crcs)) ++ bch2_rbio_narrow_crcs(rbio); ++ ++ if (rbio->flags & BCH_READ_NODECODE) ++ goto nodecode; ++ ++ /* Adjust crc to point to subset of data we want: */ ++ crc.offset += rbio->offset_into_extent; ++ crc.live_size = bvec_iter_sectors(rbio->bvec_iter); ++ ++ if (crc_is_compressed(crc)) { ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; ++ ++ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) ++ goto decompression_err; ++ } else { ++ /* don't need to decrypt the entire bio: */ ++ nonce = nonce_add(nonce, crc.offset << 9); ++ bio_advance(src, crc.offset << 9); ++ ++ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); ++ src->bi_iter.bi_size = dst_iter.bi_size; ++ ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; ++ ++ if (rbio->bounce) { ++ struct bvec_iter src_iter = src->bi_iter; ++ bio_copy_data_iter(dst, &dst_iter, src, &src_iter); ++ } ++ } ++ ++ if (rbio->promote) { ++ /* ++ * Re encrypt data we decrypted, so it's consistent with ++ * rbio->crc: ++ */ ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; ++ ++ promote_start(rbio->promote, rbio); ++ rbio->promote = NULL; ++ } ++nodecode: ++ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { ++ rbio = bch2_rbio_free(rbio); ++ bch2_rbio_done(rbio); ++ } ++out: ++ memalloc_nofs_restore(nofs_flags); ++ return; ++csum_err: ++ /* ++ * Checksum error: if the bio wasn't bounced, we may have been ++ * reading into buffers owned by userspace (that userspace can ++ * scribble over) - retry the read, bouncing it this time: ++ */ ++ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { ++ rbio->flags |= BCH_READ_MUST_BOUNCE; ++ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); ++ goto out; ++ } ++ ++ bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector, ++ "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", ++ rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, ++ csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ goto out; ++decompression_err: ++ bch_err_inum_ratelimited(c, rbio->read_pos.inode, ++ "decompression error"); ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); ++ goto out; ++decrypt_err: ++ bch_err_inum_ratelimited(c, rbio->read_pos.inode, ++ "decrypt error"); ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); ++ goto out; ++} ++ ++static void bch2_read_endio(struct bio *bio) ++{ ++ struct bch_read_bio *rbio = ++ container_of(bio, struct bch_read_bio, bio); ++ struct bch_fs *c = rbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); ++ struct workqueue_struct *wq = NULL; ++ enum rbio_context context = RBIO_CONTEXT_NULL; ++ ++ if (rbio->have_ioref) { ++ bch2_latency_acct(ca, rbio->submit_time, READ); ++ percpu_ref_put(&ca->io_ref); ++ } ++ ++ if (!rbio->split) ++ rbio->bio.bi_end_io = rbio->end_io; ++ ++ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, ++ rbio->read_pos.inode, ++ rbio->read_pos.offset, ++ "data read error: %s", ++ bch2_blk_status_to_str(bio->bi_status))) { ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); ++ return; ++ } ++ ++ if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || ++ ptr_stale(ca, &rbio->pick.ptr)) { ++ atomic_long_inc(&c->read_realloc_races); ++ ++ if (rbio->flags & BCH_READ_RETRY_IF_STALE) ++ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); ++ else ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); 
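/*
 * Simplified, standalone model of the "punt only when necessary" pattern
 * used by bch2_rbio_punt() and by the context/workqueue choice made just
 * below in bch2_read_endio(): completion work runs inline when the current
 * context is already heavy enough for it, and is otherwise handed off to a
 * more capable executor.  The types and run_deferred() are hypothetical
 * stand-ins, not bcachefs or kernel APIs.
 */
enum sketch_ctx { SKETCH_CTX_NONE, SKETCH_CTX_HIGHPRI, SKETCH_CTX_UNBOUND };

struct sketch_work {
	void		(*fn)(struct sketch_work *);
	enum sketch_ctx	needs;	/* weakest context the handler tolerates */
};

/* stand-in for queue_work(); a real version would defer to another thread */
static void run_deferred(struct sketch_work *w)
{
	w->fn(w);
}

static void sketch_punt(struct sketch_work *w, enum sketch_ctx have)
{
	if (w->needs <= have)
		w->fn(w);		/* cheap case: finish right here */
	else
		run_deferred(w);	/* heavy case: decrypt/decompress etc. */
}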
++ return; ++ } ++ ++ if (rbio->narrow_crcs || ++ crc_is_compressed(rbio->pick.crc) || ++ bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) ++ context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; ++ else if (rbio->pick.crc.csum_type) ++ context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; ++ ++ bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); ++} ++ ++int __bch2_read_indirect_extent(struct btree_trans *trans, ++ unsigned *offset_into_extent, ++ struct bkey_buf *orig_k) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 reflink_offset; ++ int ret; ++ ++ reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + ++ *offset_into_extent; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, ++ POS(0, reflink_offset), ++ BTREE_ITER_SLOTS); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_reflink_v && ++ k.k->type != KEY_TYPE_indirect_inline_data) { ++ bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode, ++ "%llu len %u points to nonexistent indirect extent %llu", ++ orig_k->k->k.p.offset, ++ orig_k->k->k.size, ++ reflink_offset); ++ bch2_inconsistent_error(trans->c); ++ ret = -EIO; ++ goto err; ++ } ++ ++ *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); ++ bch2_bkey_buf_reassemble(orig_k, trans->c, k); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, ++ struct bkey_s_c k, ++ struct bch_extent_ptr ptr) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); ++ struct btree_iter iter; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ PTR_BUCKET_POS(c, &ptr), ++ BTREE_ITER_CACHED); ++ ++ prt_printf(&buf, "Attempting to read from stale dirty pointer:"); ++ printbuf_indent_add(&buf, 2); ++ prt_newline(&buf); ++ ++ bch2_bkey_val_to_text(&buf, c, k); ++ prt_newline(&buf); ++ ++ prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); ++ ++ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); ++ if (!ret) { ++ prt_newline(&buf); ++ bch2_bkey_val_to_text(&buf, c, k); ++ } ++ ++ bch2_fs_inconsistent(c, "%s", buf.buf); ++ ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++} ++ ++int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, ++ struct bvec_iter iter, struct bpos read_pos, ++ enum btree_id data_btree, struct bkey_s_c k, ++ unsigned offset_into_extent, ++ struct bch_io_failures *failed, unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct extent_ptr_decoded pick; ++ struct bch_read_bio *rbio = NULL; ++ struct bch_dev *ca = NULL; ++ struct promote_op *promote = NULL; ++ bool bounce = false, read_full = false, narrow_crcs = false; ++ struct bpos data_pos = bkey_start_pos(k.k); ++ int pick_ret; ++ ++ if (bkey_extent_is_inline_data(k.k)) { ++ unsigned bytes = min_t(unsigned, iter.bi_size, ++ bkey_inline_data_bytes(k.k)); ++ ++ swap(iter.bi_size, bytes); ++ memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); ++ swap(iter.bi_size, bytes); ++ bio_advance_iter(&orig->bio, &iter, bytes); ++ zero_fill_bio_iter(&orig->bio, iter); ++ goto out_read_done; ++ } ++retry_pick: ++ pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); ++ ++ /* hole or reservation - just zero fill: */ ++ if (!pick_ret) ++ goto hole; ++ ++ if (pick_ret < 0) { ++ bch_err_inum_ratelimited(c, k.k->p.inode, ++ "no device to read 
from"); ++ goto err; ++ } ++ ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ++ /* ++ * Stale dirty pointers are treated as IO errors, but @failed isn't ++ * allocated unless we're in the retry path - so if we're not in the ++ * retry path, don't check here, it'll be caught in bch2_read_endio() ++ * and we'll end up in the retry path: ++ */ ++ if ((flags & BCH_READ_IN_RETRY) && ++ !pick.ptr.cached && ++ unlikely(ptr_stale(ca, &pick.ptr))) { ++ read_from_stale_dirty_pointer(trans, k, pick.ptr); ++ bch2_mark_io_failure(failed, &pick); ++ goto retry_pick; ++ } ++ ++ /* ++ * Unlock the iterator while the btree node's lock is still in ++ * cache, before doing the IO: ++ */ ++ bch2_trans_unlock(trans); ++ ++ if (flags & BCH_READ_NODECODE) { ++ /* ++ * can happen if we retry, and the extent we were going to read ++ * has been merged in the meantime: ++ */ ++ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) ++ goto hole; ++ ++ iter.bi_size = pick.crc.compressed_size << 9; ++ goto get_bio; ++ } ++ ++ if (!(flags & BCH_READ_LAST_FRAGMENT) || ++ bio_flagged(&orig->bio, BIO_CHAIN)) ++ flags |= BCH_READ_MUST_CLONE; ++ ++ narrow_crcs = !(flags & BCH_READ_IN_RETRY) && ++ bch2_can_narrow_extent_crcs(k, pick.crc); ++ ++ if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) ++ flags |= BCH_READ_MUST_BOUNCE; ++ ++ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); ++ ++ if (crc_is_compressed(pick.crc) || ++ (pick.crc.csum_type != BCH_CSUM_none && ++ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || ++ (bch2_csum_type_is_encryption(pick.crc.csum_type) && ++ (flags & BCH_READ_USER_MAPPED)) || ++ (flags & BCH_READ_MUST_BOUNCE)))) { ++ read_full = true; ++ bounce = true; ++ } ++ ++ if (orig->opts.promote_target) ++ promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, ++ &rbio, &bounce, &read_full); ++ ++ if (!read_full) { ++ EBUG_ON(crc_is_compressed(pick.crc)); ++ EBUG_ON(pick.crc.csum_type && ++ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || ++ bvec_iter_sectors(iter) != pick.crc.live_size || ++ pick.crc.offset || ++ offset_into_extent)); ++ ++ data_pos.offset += offset_into_extent; ++ pick.ptr.offset += pick.crc.offset + ++ offset_into_extent; ++ offset_into_extent = 0; ++ pick.crc.compressed_size = bvec_iter_sectors(iter); ++ pick.crc.uncompressed_size = bvec_iter_sectors(iter); ++ pick.crc.offset = 0; ++ pick.crc.live_size = bvec_iter_sectors(iter); ++ offset_into_extent = 0; ++ } ++get_bio: ++ if (rbio) { ++ /* ++ * promote already allocated bounce rbio: ++ * promote needs to allocate a bio big enough for uncompressing ++ * data in the write path, but we're not going to use it all ++ * here: ++ */ ++ EBUG_ON(rbio->bio.bi_iter.bi_size < ++ pick.crc.compressed_size << 9); ++ rbio->bio.bi_iter.bi_size = ++ pick.crc.compressed_size << 9; ++ } else if (bounce) { ++ unsigned sectors = pick.crc.compressed_size; ++ ++ rbio = rbio_init(bio_alloc_bioset(NULL, ++ DIV_ROUND_UP(sectors, PAGE_SECTORS), ++ 0, ++ GFP_NOIO, ++ &c->bio_read_split), ++ orig->opts); ++ ++ bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); ++ rbio->bounce = true; ++ rbio->split = true; ++ } else if (flags & BCH_READ_MUST_CLONE) { ++ /* ++ * Have to clone if there were any splits, due to error ++ * reporting issues (if a split errored, and retrying didn't ++ * work, when it reports the error to its parent (us) we don't ++ * know if the error was from our bio, and we should retry, or ++ * from the whole bio, in which case we don't want to retry and ++ * lose the error) ++ */ ++ rbio 
= rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO, ++ &c->bio_read_split), ++ orig->opts); ++ rbio->bio.bi_iter = iter; ++ rbio->split = true; ++ } else { ++ rbio = orig; ++ rbio->bio.bi_iter = iter; ++ EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); ++ } ++ ++ EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); ++ ++ rbio->c = c; ++ rbio->submit_time = local_clock(); ++ if (rbio->split) ++ rbio->parent = orig; ++ else ++ rbio->end_io = orig->bio.bi_end_io; ++ rbio->bvec_iter = iter; ++ rbio->offset_into_extent= offset_into_extent; ++ rbio->flags = flags; ++ rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); ++ rbio->narrow_crcs = narrow_crcs; ++ rbio->hole = 0; ++ rbio->retry = 0; ++ rbio->context = 0; ++ /* XXX: only initialize this if needed */ ++ rbio->devs_have = bch2_bkey_devs(k); ++ rbio->pick = pick; ++ rbio->subvol = orig->subvol; ++ rbio->read_pos = read_pos; ++ rbio->data_btree = data_btree; ++ rbio->data_pos = data_pos; ++ rbio->version = k.k->version; ++ rbio->promote = promote; ++ INIT_WORK(&rbio->work, NULL); ++ ++ rbio->bio.bi_opf = orig->bio.bi_opf; ++ rbio->bio.bi_iter.bi_sector = pick.ptr.offset; ++ rbio->bio.bi_end_io = bch2_read_endio; ++ ++ if (rbio->bounce) ++ trace_read_bounce(&rbio->bio); ++ ++ this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); ++ bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); ++ ++ /* ++ * If it's being moved internally, we don't want to flag it as a cache ++ * hit: ++ */ ++ if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) ++ bch2_bucket_io_time_reset(trans, pick.ptr.dev, ++ PTR_BUCKET_NR(ca, &pick.ptr), READ); ++ ++ if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { ++ bio_inc_remaining(&orig->bio); ++ trace_read_split(&orig->bio); ++ } ++ ++ if (!rbio->pick.idx) { ++ if (!rbio->have_ioref) { ++ bch_err_inum_ratelimited(c, k.k->p.inode, ++ "no device to read from"); ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ goto out; ++ } ++ ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], ++ bio_sectors(&rbio->bio)); ++ bio_set_dev(&rbio->bio, ca->disk_sb.bdev); ++ ++ if (likely(!(flags & BCH_READ_IN_RETRY))) ++ submit_bio(&rbio->bio); ++ else ++ submit_bio_wait(&rbio->bio); ++ } else { ++ /* Attempting reconstruct read: */ ++ if (bch2_ec_read_extent(c, rbio)) { ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ goto out; ++ } ++ ++ if (likely(!(flags & BCH_READ_IN_RETRY))) ++ bio_endio(&rbio->bio); ++ } ++out: ++ if (likely(!(flags & BCH_READ_IN_RETRY))) { ++ return 0; ++ } else { ++ int ret; ++ ++ rbio->context = RBIO_CONTEXT_UNBOUND; ++ bch2_read_endio(&rbio->bio); ++ ++ ret = rbio->retry; ++ rbio = bch2_rbio_free(rbio); ++ ++ if (ret == READ_RETRY_AVOID) { ++ bch2_mark_io_failure(failed, &pick); ++ ret = READ_RETRY; ++ } ++ ++ if (!ret) ++ goto out_read_done; ++ ++ return ret; ++ } ++ ++err: ++ if (flags & BCH_READ_IN_RETRY) ++ return READ_ERR; ++ ++ orig->bio.bi_status = BLK_STS_IOERR; ++ goto out_read_done; ++ ++hole: ++ /* ++ * won't normally happen in the BCH_READ_NODECODE ++ * (bch2_move_extent()) path, but if we retry and the extent we wanted ++ * to read no longer exists we have to signal that: ++ */ ++ if (flags & BCH_READ_NODECODE) ++ orig->hole = true; ++ ++ zero_fill_bio_iter(&orig->bio, iter); ++out_read_done: ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ bch2_rbio_done(orig); ++ return 0; ++} ++ ++void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, ++ struct bvec_iter bvec_iter, subvol_inum inum, ++ struct bch_io_failures 
*failed, unsigned flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_buf sk; ++ struct bkey_s_c k; ++ u32 snapshot; ++ int ret; ++ ++ BUG_ON(flags & BCH_READ_NODECODE); ++ ++ bch2_bkey_buf_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ iter = (struct btree_iter) { NULL }; ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ SPOS(inum.inum, bvec_iter.bi_sector, snapshot), ++ BTREE_ITER_SLOTS); ++ while (1) { ++ unsigned bytes, sectors, offset_into_extent; ++ enum btree_id data_btree = BTREE_ID_extents; ++ ++ /* ++ * read_extent -> io_time_reset may cause a transaction restart ++ * without returning an error, we need to check for that here: ++ */ ++ ret = bch2_trans_relock(&trans); ++ if (ret) ++ break; ++ ++ bch2_btree_iter_set_pos(&iter, ++ POS(inum.inum, bvec_iter.bi_sector)); ++ ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ offset_into_extent = iter.pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ bch2_bkey_buf_reassemble(&sk, c, k); ++ ++ ret = bch2_read_indirect_extent(&trans, &data_btree, ++ &offset_into_extent, &sk); ++ if (ret) ++ break; ++ ++ k = bkey_i_to_s_c(sk.k); ++ ++ /* ++ * With indirect extents, the amount of data to read is the min ++ * of the original extent and the indirect extent: ++ */ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; ++ swap(bvec_iter.bi_size, bytes); ++ ++ if (bvec_iter.bi_size == bytes) ++ flags |= BCH_READ_LAST_FRAGMENT; ++ ++ ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos, ++ data_btree, k, ++ offset_into_extent, failed, flags); ++ if (ret) ++ break; ++ ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ break; ++ ++ swap(bvec_iter.bi_size, bytes); ++ bio_advance_iter(&rbio->bio, &bvec_iter, bytes); ++ ++ ret = btree_trans_too_many_iters(&trans); ++ if (ret) ++ break; ++ } ++err: ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || ++ ret == READ_RETRY || ++ ret == READ_RETRY_AVOID) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&sk, c); ++ ++ if (ret) { ++ bch_err_inum_ratelimited(c, inum.inum, ++ "read error %i from btree lookup", ret); ++ rbio->bio.bi_status = BLK_STS_IOERR; ++ bch2_rbio_done(rbio); ++ } ++} ++ ++void bch2_fs_io_exit(struct bch_fs *c) ++{ ++ if (c->promote_table.tbl) ++ rhashtable_destroy(&c->promote_table); ++ mempool_exit(&c->bio_bounce_pages); ++ bioset_exit(&c->bio_write); ++ bioset_exit(&c->bio_read_split); ++ bioset_exit(&c->bio_read); ++} ++ ++int bch2_fs_io_init(struct bch_fs *c) ++{ ++ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), ++ BIOSET_NEED_BVECS) || ++ mempool_init_page_pool(&c->bio_bounce_pages, ++ max_t(unsigned, ++ c->opts.btree_node_size, ++ c->opts.encoded_extent_max) / ++ PAGE_SIZE, 0) || ++ rhashtable_init(&c->promote_table, &bch_promote_params)) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +new file mode 100644 +index 000000000000..fb5114518666 +--- /dev/null ++++ b/fs/bcachefs/io.h +@@ -0,0 +1,189 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ 
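/*
 * Standalone illustration of the flag-selected union used by
 * op_journal_seq() further down in this header: depending on
 * BCH_WRITE_JOURNAL_SEQ_PTR, the same storage is either a pointer to a
 * caller-provided journal sequence number or inline storage for one, so
 * the common case needs no extra allocation.  The names below are
 * hypothetical stand-ins, not bcachefs code.
 */
#include <stdbool.h>
#include <stdint.h>

struct seq_holder {
	bool		seq_is_ptr;
	union {
		uint64_t	*seq_p;	/* caller wants the value written here */
		uint64_t	seq;	/* otherwise stash it inline */
	};
};

static inline uint64_t *seq_holder_seq(struct seq_holder *h)
{
	return h->seq_is_ptr ? h->seq_p : &h->seq;
}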
++#ifndef _BCACHEFS_IO_H ++#define _BCACHEFS_IO_H ++ ++#include "checksum.h" ++#include "bkey_buf.h" ++#include "io_types.h" ++ ++#define to_wbio(_bio) \ ++ container_of((_bio), struct bch_write_bio, bio) ++ ++#define to_rbio(_bio) \ ++ container_of((_bio), struct bch_read_bio, bio) ++ ++void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); ++void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); ++ ++void bch2_latency_acct(struct bch_dev *, u64, int); ++ ++void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, ++ enum bch_data_type, const struct bkey_i *); ++ ++#define BLK_STS_REMOVED ((__force blk_status_t)128) ++ ++const char *bch2_blk_status_to_str(blk_status_t); ++ ++enum bch_write_flags { ++ BCH_WRITE_ALLOC_NOWAIT = (1 << 0), ++ BCH_WRITE_CACHED = (1 << 1), ++ BCH_WRITE_FLUSH = (1 << 2), ++ BCH_WRITE_DATA_ENCODED = (1 << 3), ++ BCH_WRITE_PAGES_STABLE = (1 << 4), ++ BCH_WRITE_PAGES_OWNED = (1 << 5), ++ BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), ++ BCH_WRITE_WROTE_DATA_INLINE = (1 << 7), ++ BCH_WRITE_FROM_INTERNAL = (1 << 8), ++ BCH_WRITE_CHECK_ENOSPC = (1 << 9), ++ ++ /* Internal: */ ++ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), ++ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), ++ BCH_WRITE_DONE = (1 << 12), ++}; ++ ++static inline u64 *op_journal_seq(struct bch_write_op *op) ++{ ++ return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) ++ ? op->journal_seq_p : &op->journal_seq; ++} ++ ++static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) ++{ ++ return op->alloc_reserve == RESERVE_movinggc ++ ? op->c->copygc_wq ++ : op->c->btree_update_wq; ++} ++ ++int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, bool *, s64 *, s64 *); ++int bch2_extent_update(struct btree_trans *, subvol_inum, ++ struct btree_iter *, struct bkey_i *, ++ struct disk_reservation *, u64 *, u64, s64 *, bool); ++ ++int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, ++ subvol_inum, u64, s64 *); ++int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); ++ ++int bch2_write_index_default(struct bch_write_op *); ++ ++static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, ++ struct bch_io_opts opts) ++{ ++ op->c = c; ++ op->end_io = NULL; ++ op->flags = 0; ++ op->written = 0; ++ op->error = 0; ++ op->csum_type = bch2_data_checksum_type(c, opts.data_checksum); ++ op->compression_type = bch2_compression_opt_to_type[opts.compression]; ++ op->nr_replicas = 0; ++ op->nr_replicas_required = c->opts.data_replicas_required; ++ op->alloc_reserve = RESERVE_none; ++ op->incompressible = 0; ++ op->open_buckets.nr = 0; ++ op->devs_have.nr = 0; ++ op->target = 0; ++ op->opts = opts; ++ op->subvol = 0; ++ op->pos = POS_MAX; ++ op->version = ZERO_VERSION; ++ op->write_point = (struct write_point_specifier) { 0 }; ++ op->res = (struct disk_reservation) { 0 }; ++ op->journal_seq = 0; ++ op->new_i_size = U64_MAX; ++ op->i_sectors_delta = 0; ++ op->index_update_fn = bch2_write_index_default; ++} ++ ++void bch2_write(struct closure *); ++ ++static inline struct bch_write_bio *wbio_init(struct bio *bio) ++{ ++ struct bch_write_bio *wbio = to_wbio(bio); ++ ++ memset(wbio, 0, offsetof(struct bch_write_bio, bio)); ++ return wbio; ++} ++ ++struct bch_devs_mask; ++struct cache_promote_op; ++struct extent_ptr_decoded; ++ ++int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, ++ struct bkey_buf *); ++ ++static inline int bch2_read_indirect_extent(struct btree_trans *trans, ++ enum btree_id *data_btree, ++ 
unsigned *offset_into_extent, ++ struct bkey_buf *k) ++{ ++ if (k->k->k.type != KEY_TYPE_reflink_p) ++ return 0; ++ ++ *data_btree = BTREE_ID_reflink; ++ return __bch2_read_indirect_extent(trans, offset_into_extent, k); ++} ++ ++enum bch_read_flags { ++ BCH_READ_RETRY_IF_STALE = 1 << 0, ++ BCH_READ_MAY_PROMOTE = 1 << 1, ++ BCH_READ_USER_MAPPED = 1 << 2, ++ BCH_READ_NODECODE = 1 << 3, ++ BCH_READ_LAST_FRAGMENT = 1 << 4, ++ ++ /* internal: */ ++ BCH_READ_MUST_BOUNCE = 1 << 5, ++ BCH_READ_MUST_CLONE = 1 << 6, ++ BCH_READ_IN_RETRY = 1 << 7, ++}; ++ ++int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, ++ struct bvec_iter, struct bpos, enum btree_id, ++ struct bkey_s_c, unsigned, ++ struct bch_io_failures *, unsigned); ++ ++static inline void bch2_read_extent(struct btree_trans *trans, ++ struct bch_read_bio *rbio, struct bpos read_pos, ++ enum btree_id data_btree, struct bkey_s_c k, ++ unsigned offset_into_extent, unsigned flags) ++{ ++ __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, ++ data_btree, k, offset_into_extent, NULL, flags); ++} ++ ++void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, ++ subvol_inum, struct bch_io_failures *, unsigned flags); ++ ++static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, ++ subvol_inum inum) ++{ ++ struct bch_io_failures failed = { .nr = 0 }; ++ ++ BUG_ON(rbio->_state); ++ ++ rbio->c = c; ++ rbio->start_time = local_clock(); ++ rbio->subvol = inum.subvol; ++ ++ __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, ++ BCH_READ_RETRY_IF_STALE| ++ BCH_READ_MAY_PROMOTE| ++ BCH_READ_USER_MAPPED); ++} ++ ++static inline struct bch_read_bio *rbio_init(struct bio *bio, ++ struct bch_io_opts opts) ++{ ++ struct bch_read_bio *rbio = to_rbio(bio); ++ ++ rbio->_state = 0; ++ rbio->promote = NULL; ++ rbio->opts = opts; ++ return rbio; ++} ++ ++void bch2_fs_io_exit(struct bch_fs *); ++int bch2_fs_io_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_IO_H */ +diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h +new file mode 100644 +index 000000000000..78bff13d36f2 +--- /dev/null ++++ b/fs/bcachefs/io_types.h +@@ -0,0 +1,161 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_IO_TYPES_H ++#define _BCACHEFS_IO_TYPES_H ++ ++#include "alloc_types.h" ++#include "btree_types.h" ++#include "buckets_types.h" ++#include "extents_types.h" ++#include "keylist_types.h" ++#include "opts.h" ++#include "super_types.h" ++ ++#include ++#include ++ ++struct bch_read_bio { ++ struct bch_fs *c; ++ u64 start_time; ++ u64 submit_time; ++ ++ /* ++ * Reads will often have to be split, and if the extent being read from ++ * was checksummed or compressed we'll also have to allocate bounce ++ * buffers and copy the data back into the original bio. 
++ * ++ * If we didn't have to split, we have to save and restore the original ++ * bi_end_io - @split below indicates which: ++ */ ++ union { ++ struct bch_read_bio *parent; ++ bio_end_io_t *end_io; ++ }; ++ ++ /* ++ * Saved copy of bio->bi_iter, from submission time - allows us to ++ * resubmit on IO error, and also to copy data back to the original bio ++ * when we're bouncing: ++ */ ++ struct bvec_iter bvec_iter; ++ ++ unsigned offset_into_extent; ++ ++ u16 flags; ++ union { ++ struct { ++ u16 bounce:1, ++ split:1, ++ kmalloc:1, ++ have_ioref:1, ++ narrow_crcs:1, ++ hole:1, ++ retry:2, ++ context:2; ++ }; ++ u16 _state; ++ }; ++ ++ struct bch_devs_list devs_have; ++ ++ struct extent_ptr_decoded pick; ++ ++ /* ++ * pos we read from - different from data_pos for indirect extents: ++ */ ++ u32 subvol; ++ struct bpos read_pos; ++ ++ /* ++ * start pos of data we read (may not be pos of data we want) - for ++ * promote, narrow extents paths: ++ */ ++ enum btree_id data_btree; ++ struct bpos data_pos; ++ struct bversion version; ++ ++ struct promote_op *promote; ++ ++ struct bch_io_opts opts; ++ ++ struct work_struct work; ++ ++ struct bio bio; ++}; ++ ++struct bch_write_bio { ++ struct bch_fs *c; ++ struct bch_write_bio *parent; ++ ++ u64 submit_time; ++ ++ struct bch_devs_list failed; ++ u8 dev; ++ ++ unsigned split:1, ++ bounce:1, ++ put_bio:1, ++ have_ioref:1, ++ used_mempool:1, ++ first_btree_write:1; ++ ++ struct bio bio; ++}; ++ ++struct bch_write_op { ++ struct closure cl; ++ struct bch_fs *c; ++ void (*end_io)(struct bch_write_op *); ++ u64 start_time; ++ ++ unsigned written; /* sectors */ ++ u16 flags; ++ s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ ++ ++ unsigned csum_type:4; ++ unsigned compression_type:4; ++ unsigned nr_replicas:4; ++ unsigned nr_replicas_required:4; ++ unsigned alloc_reserve:3; ++ unsigned incompressible:1; ++ ++ struct bch_devs_list devs_have; ++ u16 target; ++ u16 nonce; ++ struct bch_io_opts opts; ++ ++ u32 subvol; ++ struct bpos pos; ++ struct bversion version; ++ ++ /* For BCH_WRITE_DATA_ENCODED: */ ++ struct bch_extent_crc_unpacked crc; ++ ++ struct write_point_specifier write_point; ++ ++ struct disk_reservation res; ++ ++ struct open_buckets open_buckets; ++ ++ /* ++ * If caller wants to flush but hasn't passed us a journal_seq ptr, we ++ * still need to stash the journal_seq somewhere: ++ */ ++ union { ++ u64 *journal_seq_p; ++ u64 journal_seq; ++ }; ++ u64 new_i_size; ++ s64 i_sectors_delta; ++ ++ int (*index_update_fn)(struct bch_write_op *); ++ ++ struct bch_devs_mask failed; ++ ++ struct keylist insert_keys; ++ u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; ++ ++ /* Must be last: */ ++ struct bch_write_bio wbio; ++}; ++ ++#endif /* _BCACHEFS_IO_TYPES_H */ +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +new file mode 100644 +index 000000000000..937ed1395e46 +--- /dev/null ++++ b/fs/bcachefs/journal.c +@@ -0,0 +1,1429 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcachefs journalling code, for btree insertions ++ * ++ * Copyright 2012 Google, Inc. 
++ */
++
++#include "bcachefs.h"
++#include "alloc_foreground.h"
++#include "bkey_methods.h"
++#include "btree_gc.h"
++#include "btree_update.h"
++#include "buckets.h"
++#include "error.h"
++#include "journal.h"
++#include "journal_io.h"
++#include "journal_reclaim.h"
++#include "journal_sb.h"
++#include "journal_seq_blacklist.h"
++
++#include <trace/events/bcachefs.h>
++
++#define x(n) #n,
++static const char * const bch2_journal_watermarks[] = {
++ JOURNAL_WATERMARKS()
++ NULL
++};
++
++static const char * const bch2_journal_errors[] = {
++ JOURNAL_ERRORS()
++ NULL
++};
++#undef x
++
++static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
++{
++ return seq > j->seq_ondisk;
++}
++
++static bool __journal_entry_is_open(union journal_res_state state)
++{
++ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
++}
++
++static inline unsigned nr_unwritten_journal_entries(struct journal *j)
++{
++ return atomic64_read(&j->seq) - j->seq_ondisk;
++}
++
++static bool journal_entry_is_open(struct journal *j)
++{
++ return __journal_entry_is_open(j->reservations);
++}
++
++static inline struct journal_buf *
++journal_seq_to_buf(struct journal *j, u64 seq)
++{
++ struct journal_buf *buf = NULL;
++
++ EBUG_ON(seq > journal_cur_seq(j));
++
++ if (journal_seq_unwritten(j, seq)) {
++ buf = j->buf + (seq & JOURNAL_BUF_MASK);
++ EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
++ }
++ return buf;
++}
++
++static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
++{
++ INIT_LIST_HEAD(&p->list);
++ INIT_LIST_HEAD(&p->key_cache_list);
++ INIT_LIST_HEAD(&p->flushed);
++ atomic_set(&p->count, count);
++ p->devs.nr = 0;
++}
++
++/* journal entry close/open: */
++
++void __bch2_journal_buf_put(struct journal *j)
++{
++ struct bch_fs *c = container_of(j, struct bch_fs, journal);
++
++ closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
++}
++
++/*
++ * Returns true if journal entry is now closed:
++ *
++ * We don't close a journal_buf until the next journal_buf is finished writing,
++ * and can be opened again - this also initializes the next journal_buf:
++ */
++static void __journal_entry_close(struct journal *j, unsigned closed_val)
++{
++ struct bch_fs *c = container_of(j, struct bch_fs, journal);
++ struct journal_buf *buf = journal_cur_buf(j);
++ union journal_res_state old, new;
++ u64 v = atomic64_read(&j->reservations.counter);
++ unsigned sectors;
++
++ BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
++ closed_val != JOURNAL_ENTRY_ERROR_VAL);
++
++ lockdep_assert_held(&j->lock);
++
++ do {
++ old.v = new.v = v;
++ new.cur_entry_offset = closed_val;
++
++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
++ old.cur_entry_offset == new.cur_entry_offset)
++ return;
++ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
++ old.v, new.v)) != old.v);
++
++ if (!__journal_entry_is_open(old))
++ return;
++
++ /* Close out old buffer: */
++ buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
++
++ sectors = vstruct_blocks_plus(buf->data, c->block_bits,
++ buf->u64s_reserved) << c->block_bits;
++ BUG_ON(sectors > buf->sectors);
++ buf->sectors = sectors;
++
++ /*
++ * We have to set last_seq here, _before_ opening a new journal entry:
++ *
++ * A thread may replace an old pin with a new pin on its current
++ * journal reservation - the expectation being that the journal will
++ * contain either what the old pin protected or what the new pin
++ * protects.
++ *
++ * After the old pin is dropped journal_last_seq() won't include the old
++ * pin, so we can only write the updated last_seq on the entry that
++ * contains whatever the new pin protects.
++ *
++ * Restated, we can _not_ update last_seq for a given entry if there
++ * could be a newer entry open with reservations/pins that have been
++ * taken against it.
++ *
++ * Hence, we want to update/set last_seq on the current journal entry right
++ * before we open a new one:
++ */
++ buf->last_seq = journal_last_seq(j);
++ buf->data->last_seq = cpu_to_le64(buf->last_seq);
++ BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
++
++ __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
++
++ cancel_delayed_work(&j->write_work);
++
++ bch2_journal_space_available(j);
++
++ bch2_journal_buf_put(j, old.idx);
++}
++
++void bch2_journal_halt(struct journal *j)
++{
++ spin_lock(&j->lock);
++ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
++ if (!j->err_seq)
++ j->err_seq = journal_cur_seq(j);
++ spin_unlock(&j->lock);
++}
++
++static bool journal_entry_want_write(struct journal *j)
++{
++ bool ret = !journal_entry_is_open(j) ||
++ journal_cur_seq(j) == journal_last_unwritten_seq(j);
++
++ /* Don't close it yet if we already have a write in flight: */
++ if (ret)
++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
++ else if (nr_unwritten_journal_entries(j)) {
++ struct journal_buf *buf = journal_cur_buf(j);
++
++ if (!buf->flush_time) {
++ buf->flush_time = local_clock() ?: 1;
++ buf->expires = jiffies;
++ }
++ }
++
++ return ret;
++}
++
++static bool journal_entry_close(struct journal *j)
++{
++ bool ret;
++
++ spin_lock(&j->lock);
++ ret = journal_entry_want_write(j);
++ spin_unlock(&j->lock);
++
++ return ret;
++}
++
++/*
++ * should _only_ be called from journal_res_get() - when we actually want a
++ * journal reservation - journal entry is open means journal is dirty:
++ *
++ * returns:
++ * 0: success
++ * -ENOSPC: journal currently full, must invoke reclaim
++ * -EAGAIN: journal blocked, must wait
++ * -EROFS: insufficient rw devices or journal error
++ */
++static int journal_entry_open(struct journal *j)
++{
++ struct bch_fs *c = container_of(j, struct bch_fs, journal);
++ struct journal_buf *buf = j->buf +
++ ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
++ union journal_res_state old, new;
++ int u64s;
++ u64 v;
++
++ lockdep_assert_held(&j->lock);
++ BUG_ON(journal_entry_is_open(j));
++ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
++
++ if (j->blocked)
++ return JOURNAL_ERR_blocked;
++
++ if (j->cur_entry_error)
++ return j->cur_entry_error;
++
++ if (bch2_journal_error(j))
++ return JOURNAL_ERR_insufficient_devices; /* -EROFS */
++
++ if (!fifo_free(&j->pin))
++ return JOURNAL_ERR_journal_pin_full;
++
++ if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) - 1)
++ return JOURNAL_ERR_max_in_flight;
++
++ BUG_ON(!j->cur_entry_sectors);
++
++ buf->expires =
++ (journal_cur_seq(j) == j->flushed_seq_ondisk
++ ?
jiffies ++ : j->last_flush_write) + ++ msecs_to_jiffies(c->opts.journal_flush_delay); ++ ++ buf->u64s_reserved = j->entry_u64s_reserved; ++ buf->disk_sectors = j->cur_entry_sectors; ++ buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); ++ ++ u64s = (int) (buf->sectors << 9) / sizeof(u64) - ++ journal_entry_overhead(j); ++ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); ++ ++ if (u64s <= 0) ++ return JOURNAL_ERR_journal_full; ++ ++ if (fifo_empty(&j->pin) && j->reclaim_thread) ++ wake_up_process(j->reclaim_thread); ++ ++ /* ++ * The fifo_push() needs to happen at the same time as j->seq is ++ * incremented for journal_last_seq() to be calculated correctly ++ */ ++ atomic64_inc(&j->seq); ++ journal_pin_list_init(fifo_push_ref(&j->pin), 1); ++ ++ BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); ++ ++ bkey_extent_init(&buf->key); ++ buf->noflush = false; ++ buf->must_flush = false; ++ buf->separate_flush = false; ++ buf->flush_time = 0; ++ ++ memset(buf->data, 0, sizeof(*buf->data)); ++ buf->data->seq = cpu_to_le64(journal_cur_seq(j)); ++ buf->data->u64s = 0; ++ ++ /* ++ * Must be set before marking the journal entry as open: ++ */ ++ j->cur_entry_u64s = u64s; ++ ++ v = atomic64_read(&j->reservations.counter); ++ do { ++ old.v = new.v = v; ++ ++ BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); ++ ++ new.idx++; ++ BUG_ON(journal_state_count(new, new.idx)); ++ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); ++ ++ journal_state_inc(&new); ++ new.cur_entry_offset = 0; ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ if (j->res_get_blocked_start) ++ bch2_time_stats_update(j->blocked_time, ++ j->res_get_blocked_start); ++ j->res_get_blocked_start = 0; ++ ++ mod_delayed_work(c->io_complete_wq, ++ &j->write_work, ++ msecs_to_jiffies(c->opts.journal_flush_delay)); ++ journal_wake(j); ++ return 0; ++} ++ ++static bool journal_quiesced(struct journal *j) ++{ ++ bool ret = atomic64_read(&j->seq) == j->seq_ondisk; ++ ++ if (!ret) ++ journal_entry_close(j); ++ return ret; ++} ++ ++static void journal_quiesce(struct journal *j) ++{ ++ wait_event(j->wait, journal_quiesced(j)); ++} ++ ++static void journal_write_work(struct work_struct *work) ++{ ++ struct journal *j = container_of(work, struct journal, write_work.work); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ long delta; ++ ++ spin_lock(&j->lock); ++ if (!__journal_entry_is_open(j->reservations)) ++ goto unlock; ++ ++ delta = journal_cur_buf(j)->expires - jiffies; ++ ++ if (delta > 0) ++ mod_delayed_work(c->io_complete_wq, &j->write_work, delta); ++ else ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ++unlock: ++ spin_unlock(&j->lock); ++} ++ ++static int __journal_res_get(struct journal *j, struct journal_res *res, ++ unsigned flags) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *buf; ++ bool can_discard; ++ int ret; ++retry: ++ if (journal_res_get_fast(j, res, flags)) ++ return 0; ++ ++ if (bch2_journal_error(j)) ++ return -EROFS; ++ ++ spin_lock(&j->lock); ++ ++ /* ++ * Recheck after taking the lock, so we don't race with another thread ++ * that just did journal_entry_open() and call journal_entry_close() ++ * unnecessarily ++ */ ++ if (journal_res_get_fast(j, res, flags)) { ++ spin_unlock(&j->lock); ++ return 0; ++ } ++ ++ if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) { ++ /* ++ * Don't want to close current journal entry, just need to ++ * invoke reclaim: ++ */ ++ ret = 
JOURNAL_ERR_journal_full;
++ goto unlock;
++ }
++
++ /*
++ * If we couldn't get a reservation because the current buf filled up,
++ * and we had room for a bigger entry on disk, signal that we want to
++ * realloc the journal bufs:
++ */
++ buf = journal_cur_buf(j);
++ if (journal_entry_is_open(j) &&
++ buf->buf_size >> 9 < buf->disk_sectors &&
++ buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
++ j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
++
++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
++ ret = journal_entry_open(j);
++
++ if (ret == JOURNAL_ERR_max_in_flight)
++ trace_journal_entry_full(c);
++unlock:
++ if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
++ !j->res_get_blocked_start) {
++ j->res_get_blocked_start = local_clock() ?: 1;
++ trace_journal_full(c);
++ }
++
++ can_discard = j->can_discard;
++ spin_unlock(&j->lock);
++
++ if (!ret)
++ goto retry;
++
++ if ((ret == JOURNAL_ERR_journal_full ||
++ ret == JOURNAL_ERR_journal_pin_full) &&
++ !can_discard &&
++ !nr_unwritten_journal_entries(j) &&
++ (flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) {
++ struct printbuf buf = PRINTBUF;
++
++ bch_err(c, "Journal stuck! Have a pre-reservation but journal full (ret %s)",
++ bch2_journal_errors[ret]);
++
++ bch2_journal_debug_to_text(&buf, j);
++ bch_err(c, "%s", buf.buf);
++
++ printbuf_reset(&buf);
++ bch2_journal_pins_to_text(&buf, j);
++ bch_err(c, "Journal pins:\n%s", buf.buf);
++
++ printbuf_exit(&buf);
++ bch2_fatal_error(c);
++ dump_stack();
++ }
++
++ /*
++ * Journal is full - can't rely on reclaim from work item due to
++ * freezing:
++ */
++ if ((ret == JOURNAL_ERR_journal_full ||
++ ret == JOURNAL_ERR_journal_pin_full) &&
++ !(flags & JOURNAL_RES_GET_NONBLOCK)) {
++ if (can_discard) {
++ bch2_journal_do_discards(j);
++ goto retry;
++ }
++
++ if (mutex_trylock(&j->reclaim_lock)) {
++ bch2_journal_reclaim(j);
++ mutex_unlock(&j->reclaim_lock);
++ }
++ }
++
++ return ret == JOURNAL_ERR_insufficient_devices ? -EROFS : -EAGAIN;
++}
++
++/*
++ * Essentially the entry function to the journaling code. When bcachefs is doing
++ * a btree insert, it calls this function to get the current journal write.
++ * Journal write is the structure used to set up journal writes. The calling
++ * function will then add its keys to the structure, queuing them for the next
++ * write.
++ *
++ * To ensure forward progress, the current task must not be holding any
++ * btree node write locks.
++ */ ++int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, ++ unsigned flags) ++{ ++ int ret; ++ ++ closure_wait_event(&j->async_wait, ++ (ret = __journal_res_get(j, res, flags)) != -EAGAIN || ++ (flags & JOURNAL_RES_GET_NONBLOCK)); ++ return ret; ++} ++ ++/* journal_preres: */ ++ ++static bool journal_preres_available(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true); ++ ++ if (!ret && mutex_trylock(&j->reclaim_lock)) { ++ bch2_journal_reclaim(j); ++ mutex_unlock(&j->reclaim_lock); ++ } ++ ++ return ret; ++} ++ ++int __bch2_journal_preres_get(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ int ret; ++ ++ closure_wait_event(&j->preres_wait, ++ (ret = bch2_journal_error(j)) || ++ journal_preres_available(j, res, new_u64s, flags)); ++ return ret; ++} ++ ++/* journal_entry_res: */ ++ ++void bch2_journal_entry_res_resize(struct journal *j, ++ struct journal_entry_res *res, ++ unsigned new_u64s) ++{ ++ union journal_res_state state; ++ int d = new_u64s - res->u64s; ++ ++ spin_lock(&j->lock); ++ ++ j->entry_u64s_reserved += d; ++ if (d <= 0) ++ goto out; ++ ++ j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); ++ smp_mb(); ++ state = READ_ONCE(j->reservations); ++ ++ if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && ++ state.cur_entry_offset > j->cur_entry_u64s) { ++ j->cur_entry_u64s += d; ++ /* ++ * Not enough room in current journal entry, have to flush it: ++ */ ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ++ } else { ++ journal_cur_buf(j)->u64s_reserved += d; ++ } ++out: ++ spin_unlock(&j->lock); ++ res->u64s += d; ++} ++ ++/* journal flushing: */ ++ ++/** ++ * bch2_journal_flush_seq_async - wait for a journal entry to be written ++ * ++ * like bch2_journal_wait_on_seq, except that it triggers a write immediately if ++ * necessary ++ */ ++int bch2_journal_flush_seq_async(struct journal *j, u64 seq, ++ struct closure *parent) ++{ ++ struct journal_buf *buf; ++ int ret = 0; ++ ++ if (seq <= j->flushed_seq_ondisk) ++ return 1; ++ ++ spin_lock(&j->lock); ++ ++ if (WARN_ONCE(seq > journal_cur_seq(j), ++ "requested to flush journal seq %llu, but currently at %llu", ++ seq, journal_cur_seq(j))) ++ goto out; ++ ++ /* Recheck under lock: */ ++ if (j->err_seq && seq >= j->err_seq) { ++ ret = -EIO; ++ goto out; ++ } ++ ++ if (seq <= j->flushed_seq_ondisk) { ++ ret = 1; ++ goto out; ++ } ++ ++ /* if seq was written, but not flushed - flush a newer one instead */ ++ seq = max(seq, journal_last_unwritten_seq(j)); ++ ++recheck_need_open: ++ if (seq > journal_cur_seq(j)) { ++ struct journal_res res = { 0 }; ++ ++ if (journal_entry_is_open(j)) ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ++ ++ spin_unlock(&j->lock); ++ ++ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ if (ret) ++ return ret; ++ ++ seq = res.seq; ++ buf = j->buf + (seq & JOURNAL_BUF_MASK); ++ buf->must_flush = true; ++ ++ if (!buf->flush_time) { ++ buf->flush_time = local_clock() ?: 1; ++ buf->expires = jiffies; ++ } ++ ++ if (parent && !closure_wait(&buf->wait, parent)) ++ BUG(); ++ ++ bch2_journal_res_put(j, &res); ++ ++ spin_lock(&j->lock); ++ goto want_write; ++ } ++ ++ /* ++ * if write was kicked off without a flush, flush the next sequence ++ * number instead ++ */ ++ buf = journal_seq_to_buf(j, seq); ++ if (buf->noflush) { ++ seq++; ++ goto recheck_need_open; ++ } ++ ++ buf->must_flush = true; ++ ++ 
if (parent && !closure_wait(&buf->wait, parent)) ++ BUG(); ++want_write: ++ if (seq == journal_cur_seq(j)) ++ journal_entry_want_write(j); ++out: ++ spin_unlock(&j->lock); ++ return ret; ++} ++ ++int bch2_journal_flush_seq(struct journal *j, u64 seq) ++{ ++ u64 start_time = local_clock(); ++ int ret, ret2; ++ ++ /* ++ * Don't update time_stats when @seq is already flushed: ++ */ ++ if (seq <= j->flushed_seq_ondisk) ++ return 0; ++ ++ ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); ++ ++ if (!ret) ++ bch2_time_stats_update(j->flush_seq_time, start_time); ++ ++ return ret ?: ret2 < 0 ? ret2 : 0; ++} ++ ++/* ++ * bch2_journal_flush_async - if there is an open journal entry, or a journal ++ * still being written, write it and wait for the write to complete ++ */ ++void bch2_journal_flush_async(struct journal *j, struct closure *parent) ++{ ++ bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent); ++} ++ ++int bch2_journal_flush(struct journal *j) ++{ ++ return bch2_journal_flush_seq(j, atomic64_read(&j->seq)); ++} ++ ++/* ++ * bch2_journal_noflush_seq - tell the journal not to issue any flushes before ++ * @seq ++ */ ++bool bch2_journal_noflush_seq(struct journal *j, u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ u64 unwritten_seq; ++ bool ret = false; ++ ++ if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) ++ return false; ++ ++ if (seq <= c->journal.flushed_seq_ondisk) ++ return false; ++ ++ spin_lock(&j->lock); ++ if (seq <= c->journal.flushed_seq_ondisk) ++ goto out; ++ ++ for (unwritten_seq = journal_last_unwritten_seq(j); ++ unwritten_seq < seq; ++ unwritten_seq++) { ++ struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); ++ ++ /* journal write is already in flight, and was a flush write: */ ++ if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush) ++ goto out; ++ ++ buf->noflush = true; ++ } ++ ++ ret = true; ++out: ++ spin_unlock(&j->lock); ++ return ret; ++} ++ ++int bch2_journal_meta(struct journal *j) ++{ ++ struct journal_buf *buf; ++ struct journal_res res; ++ int ret; ++ ++ memset(&res, 0, sizeof(res)); ++ ++ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ if (ret) ++ return ret; ++ ++ buf = j->buf + (res.seq & JOURNAL_BUF_MASK); ++ buf->must_flush = true; ++ ++ if (!buf->flush_time) { ++ buf->flush_time = local_clock() ?: 1; ++ buf->expires = jiffies; ++ } ++ ++ bch2_journal_res_put(j, &res); ++ ++ return bch2_journal_flush_seq(j, res.seq); ++} ++ ++int bch2_journal_log_msg(struct journal *j, const char *fmt, ...) 
++{ ++ struct jset_entry_log *entry; ++ struct journal_res res = { 0 }; ++ unsigned msglen, u64s; ++ va_list args; ++ int ret; ++ ++ va_start(args, fmt); ++ msglen = vsnprintf(NULL, 0, fmt, args) + 1; ++ va_end(args); ++ ++ u64s = jset_u64s(DIV_ROUND_UP(msglen, sizeof(u64))); ++ ++ ret = bch2_journal_res_get(j, &res, u64s, 0); ++ if (ret) ++ return ret; ++ ++ entry = container_of(journal_res_entry(j, &res), ++ struct jset_entry_log, entry);; ++ memset(entry, 0, u64s * sizeof(u64)); ++ entry->entry.type = BCH_JSET_ENTRY_log; ++ entry->entry.u64s = u64s - 1; ++ ++ va_start(args, fmt); ++ vsnprintf(entry->d, INT_MAX, fmt, args); ++ va_end(args); ++ ++ bch2_journal_res_put(j, &res); ++ ++ return bch2_journal_flush_seq(j, res.seq); ++} ++ ++/* block/unlock the journal: */ ++ ++void bch2_journal_unblock(struct journal *j) ++{ ++ spin_lock(&j->lock); ++ j->blocked--; ++ spin_unlock(&j->lock); ++ ++ journal_wake(j); ++} ++ ++void bch2_journal_block(struct journal *j) ++{ ++ spin_lock(&j->lock); ++ j->blocked++; ++ spin_unlock(&j->lock); ++ ++ journal_quiesce(j); ++} ++ ++/* allocate journal on a device: */ ++ ++static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ++ bool new_fs, struct closure *cl) ++{ ++ struct bch_fs *c = ca->fs; ++ struct journal_device *ja = &ca->journal; ++ u64 *new_bucket_seq = NULL, *new_buckets = NULL; ++ struct open_bucket **ob = NULL; ++ long *bu = NULL; ++ unsigned i, nr_got = 0, nr_want = nr - ja->nr; ++ unsigned old_nr = ja->nr; ++ unsigned old_discard_idx = ja->discard_idx; ++ unsigned old_dirty_idx_ondisk = ja->dirty_idx_ondisk; ++ unsigned old_dirty_idx = ja->dirty_idx; ++ unsigned old_cur_idx = ja->cur_idx; ++ int ret = 0; ++ ++ if (c) { ++ bch2_journal_flush_all_pins(&c->journal); ++ bch2_journal_block(&c->journal); ++ } ++ ++ bu = kzalloc(nr_want * sizeof(*bu), GFP_KERNEL); ++ ob = kzalloc(nr_want * sizeof(*ob), GFP_KERNEL); ++ new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); ++ new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); ++ if (!bu || !ob || !new_buckets || !new_bucket_seq) { ++ ret = -ENOMEM; ++ goto err_unblock; ++ } ++ ++ for (nr_got = 0; nr_got < nr_want; nr_got++) { ++ if (new_fs) { ++ bu[nr_got] = bch2_bucket_alloc_new_fs(ca); ++ if (bu[nr_got] < 0) { ++ ret = -ENOSPC; ++ break; ++ } ++ } else { ++ ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, ++ false, cl); ++ if (IS_ERR(ob[nr_got])) { ++ ret = cl ? 
-EAGAIN : -ENOSPC; ++ break; ++ } ++ ++ bu[nr_got] = ob[nr_got]->bucket; ++ } ++ } ++ ++ if (!nr_got) ++ goto err_unblock; ++ ++ /* ++ * We may be called from the device add path, before the new device has ++ * actually been added to the running filesystem: ++ */ ++ if (!new_fs) ++ spin_lock(&c->journal.lock); ++ ++ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); ++ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); ++ swap(new_buckets, ja->buckets); ++ swap(new_bucket_seq, ja->bucket_seq); ++ ++ for (i = 0; i < nr_got; i++) { ++ unsigned pos = ja->discard_idx ?: ja->nr; ++ long b = bu[i]; ++ ++ __array_insert_item(ja->buckets, ja->nr, pos); ++ __array_insert_item(ja->bucket_seq, ja->nr, pos); ++ ja->nr++; ++ ++ ja->buckets[pos] = b; ++ ja->bucket_seq[pos] = 0; ++ ++ if (pos <= ja->discard_idx) ++ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; ++ if (pos <= ja->dirty_idx_ondisk) ++ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; ++ if (pos <= ja->dirty_idx) ++ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; ++ if (pos <= ja->cur_idx) ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ } ++ ++ ret = bch2_journal_buckets_to_sb(c, ca); ++ if (ret) { ++ /* Revert: */ ++ swap(new_buckets, ja->buckets); ++ swap(new_bucket_seq, ja->bucket_seq); ++ ja->nr = old_nr; ++ ja->discard_idx = old_discard_idx; ++ ja->dirty_idx_ondisk = old_dirty_idx_ondisk; ++ ja->dirty_idx = old_dirty_idx; ++ ja->cur_idx = old_cur_idx; ++ } ++ ++ if (!new_fs) ++ spin_unlock(&c->journal.lock); ++ ++ if (c) ++ bch2_journal_unblock(&c->journal); ++ ++ if (ret) ++ goto err; ++ ++ if (!new_fs) { ++ for (i = 0; i < nr_got; i++) { ++ ret = bch2_trans_run(c, ++ bch2_trans_mark_metadata_bucket(&trans, ca, ++ bu[i], BCH_DATA_journal, ++ ca->mi.bucket_size)); ++ if (ret) { ++ bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret); ++ goto err; ++ } ++ } ++ } ++err: ++ if (ob && !new_fs) ++ for (i = 0; i < nr_got; i++) ++ bch2_open_bucket_put(c, ob[i]); ++ ++ kfree(new_bucket_seq); ++ kfree(new_buckets); ++ kfree(ob); ++ kfree(bu); ++ ++ return ret; ++err_unblock: ++ if (c) ++ bch2_journal_unblock(&c->journal); ++ goto err; ++} ++ ++/* ++ * Allocate more journal space at runtime - not currently making use if it, but ++ * the code works: ++ */ ++int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, ++ unsigned nr) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct closure cl; ++ unsigned current_nr; ++ int ret = 0; ++ ++ /* don't handle reducing nr of buckets yet: */ ++ if (nr < ja->nr) ++ return 0; ++ ++ closure_init_stack(&cl); ++ ++ while (ja->nr != nr && (ret == 0 || ret == -EAGAIN)) { ++ struct disk_reservation disk_res = { 0, 0 }; ++ ++ closure_sync(&cl); ++ ++ mutex_lock(&c->sb_lock); ++ current_nr = ja->nr; ++ ++ /* ++ * note: journal buckets aren't really counted as _sectors_ used yet, so ++ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c ++ * when space used goes up without a reservation - but we do need the ++ * reservation to ensure we'll actually be able to allocate: ++ */ ++ ++ if (bch2_disk_reservation_get(c, &disk_res, ++ bucket_to_sector(ca, nr - ja->nr), 1, 0)) { ++ mutex_unlock(&c->sb_lock); ++ return -ENOSPC; ++ } ++ ++ ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ if (ja->nr != current_nr) ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ return ret; ++} ++ ++int bch2_dev_journal_alloc(struct bch_dev *ca) ++{ ++ unsigned nr; ++ int ret; ++ ++ if 
(dynamic_fault("bcachefs:add:journal_alloc")) ++ return -ENOMEM; ++ ++ /* 1/128th of the device by default: */ ++ nr = ca->mi.nbuckets >> 7; ++ ++ /* ++ * clamp journal size to 8192 buckets or 8GB (in sectors), whichever ++ * is smaller: ++ */ ++ nr = clamp_t(unsigned, nr, ++ BCH_JOURNAL_BUCKETS_MIN, ++ min(1 << 13, ++ (1 << 24) / ca->mi.bucket_size)); ++ ++ if (ca->fs) ++ mutex_lock(&ca->fs->sb_lock); ++ ++ ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); ++ ++ if (ca->fs) ++ mutex_unlock(&ca->fs->sb_lock); ++ ++ return ret; ++} ++ ++/* startup/shutdown: */ ++ ++static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) ++{ ++ bool ret = false; ++ u64 seq; ++ ++ spin_lock(&j->lock); ++ for (seq = journal_last_unwritten_seq(j); ++ seq <= journal_cur_seq(j) && !ret; ++ seq++) { ++ struct journal_buf *buf = journal_seq_to_buf(j, seq); ++ ++ if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx)) ++ ret = true; ++ } ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) ++{ ++ wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); ++} ++ ++void bch2_fs_journal_stop(struct journal *j) ++{ ++ bch2_journal_reclaim_stop(j); ++ bch2_journal_flush_all_pins(j); ++ ++ wait_event(j->wait, journal_entry_close(j)); ++ ++ /* ++ * Always write a new journal entry, to make sure the clock hands are up ++ * to date (and match the superblock) ++ */ ++ bch2_journal_meta(j); ++ ++ journal_quiesce(j); ++ ++ BUG_ON(!bch2_journal_error(j) && ++ test_bit(JOURNAL_REPLAY_DONE, &j->flags) && ++ j->last_empty_seq != journal_cur_seq(j)); ++ ++ cancel_delayed_work_sync(&j->write_work); ++} ++ ++int bch2_fs_journal_start(struct journal *j, u64 cur_seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_entry_pin_list *p; ++ struct journal_replay *i, **_i; ++ struct genradix_iter iter; ++ bool had_entries = false; ++ unsigned ptr; ++ u64 last_seq = cur_seq, nr, seq; ++ ++ genradix_for_each_reverse(&c->journal_entries, iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ last_seq = le64_to_cpu(i->j.last_seq); ++ break; ++ } ++ ++ nr = cur_seq - last_seq; ++ ++ if (nr + 1 > j->pin.size) { ++ free_fifo(&j->pin); ++ init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); ++ if (!j->pin.data) { ++ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); ++ return -ENOMEM; ++ } ++ } ++ ++ j->replay_journal_seq = last_seq; ++ j->replay_journal_seq_end = cur_seq; ++ j->last_seq_ondisk = last_seq; ++ j->flushed_seq_ondisk = cur_seq - 1; ++ j->seq_ondisk = cur_seq - 1; ++ j->pin.front = last_seq; ++ j->pin.back = cur_seq; ++ atomic64_set(&j->seq, cur_seq - 1); ++ ++ fifo_for_each_entry_ptr(p, &j->pin, seq) ++ journal_pin_list_init(p, 1); ++ ++ genradix_for_each(&c->journal_entries, iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ seq = le64_to_cpu(i->j.seq); ++ BUG_ON(seq >= cur_seq); ++ ++ if (seq < last_seq) ++ continue; ++ ++ if (journal_entry_empty(&i->j)) ++ j->last_empty_seq = le64_to_cpu(i->j.seq); ++ ++ p = journal_seq_pin(j, seq); ++ ++ p->devs.nr = 0; ++ for (ptr = 0; ptr < i->nr_ptrs; ptr++) ++ bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); ++ ++ had_entries = true; ++ } ++ ++ if (!had_entries) ++ j->last_empty_seq = cur_seq; ++ ++ spin_lock(&j->lock); ++ ++ set_bit(JOURNAL_STARTED, &j->flags); ++ j->last_flush_write = jiffies; ++ ++ j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); ++ 
j->reservations.unwritten_idx++; ++ ++ c->last_bucket_seq_cleanup = journal_cur_seq(j); ++ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ ++ return bch2_journal_reclaim_start(j); ++} ++ ++/* init/exit: */ ++ ++void bch2_dev_journal_exit(struct bch_dev *ca) ++{ ++ kfree(ca->journal.bio); ++ kfree(ca->journal.buckets); ++ kfree(ca->journal.bucket_seq); ++ ++ ca->journal.bio = NULL; ++ ca->journal.buckets = NULL; ++ ca->journal.bucket_seq = NULL; ++} ++ ++int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(sb); ++ struct bch_sb_field_journal_v2 *journal_buckets_v2 = ++ bch2_sb_get_journal_v2(sb); ++ unsigned i; ++ ++ ja->nr = 0; ++ ++ if (journal_buckets_v2) { ++ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); ++ ++ for (i = 0; i < nr; i++) ++ ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); ++ } else if (journal_buckets) { ++ ja->nr = bch2_nr_journal_buckets(journal_buckets); ++ } ++ ++ ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); ++ if (!ja->bucket_seq) ++ return -ENOMEM; ++ ++ ca->journal.bio = bio_kmalloc(GFP_KERNEL, ++ DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE)); ++ if (!ca->journal.bio) ++ return -ENOMEM; ++ ++ ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); ++ if (!ja->buckets) ++ return -ENOMEM; ++ ++ if (journal_buckets_v2) { ++ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); ++ unsigned j, dst = 0; ++ ++ for (i = 0; i < nr; i++) ++ for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) ++ ja->buckets[dst++] = ++ le64_to_cpu(journal_buckets_v2->d[i].start) + j; ++ } else if (journal_buckets) { ++ for (i = 0; i < ja->nr; i++) ++ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); ++ } ++ ++ return 0; ++} ++ ++void bch2_fs_journal_exit(struct journal *j) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(j->buf); i++) ++ kvpfree(j->buf[i].data, j->buf[i].buf_size); ++ free_fifo(&j->pin); ++} ++ ++int bch2_fs_journal_init(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ static struct lock_class_key res_key; ++ unsigned i; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ spin_lock_init(&j->lock); ++ spin_lock_init(&j->err_lock); ++ init_waitqueue_head(&j->wait); ++ INIT_DELAYED_WORK(&j->write_work, journal_write_work); ++ init_waitqueue_head(&j->reclaim_wait); ++ init_waitqueue_head(&j->pin_flush_wait); ++ mutex_init(&j->reclaim_lock); ++ mutex_init(&j->discard_lock); ++ ++ lockdep_init_map(&j->res_map, "journal res", &res_key, 0); ++ ++ atomic64_set(&j->reservations.counter, ++ ((union journal_res_state) ++ { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); ++ ++ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ for (i = 0; i < ARRAY_SIZE(j->buf); i++) { ++ j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; ++ j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); ++ if (!j->buf[i].data) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ } ++ ++ j->pin.front = j->pin.back = 1; ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++/* debug: */ ++ ++void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ union journal_res_state s; ++ struct bch_dev *ca; ++ unsigned long now = jiffies; ++ u64 seq; ++ unsigned i; ++ ++ out->atomic++; ++ out->tabstops[0] = 
24; ++ ++ rcu_read_lock(); ++ s = READ_ONCE(j->reservations); ++ ++ prt_printf(out, "dirty journal entries:\t%llu/%llu\n",fifo_used(&j->pin), j->pin.size); ++ prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); ++ prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); ++ prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); ++ prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); ++ prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); ++ prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); ++ prt_printf(out, "watermark:\t\t%s\n", bch2_journal_watermarks[j->watermark]); ++ prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); ++ prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); ++ prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); ++ prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); ++ prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); ++ prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); ++ prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ++ ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); ++ prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); ++ prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); ++ prt_printf(out, "current entry:\t\t"); ++ ++ switch (s.cur_entry_offset) { ++ case JOURNAL_ENTRY_ERROR_VAL: ++ prt_printf(out, "error"); ++ break; ++ case JOURNAL_ENTRY_CLOSED_VAL: ++ prt_printf(out, "closed"); ++ break; ++ default: ++ prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); ++ break; ++ } ++ ++ prt_newline(out); ++ ++ for (seq = journal_cur_seq(j); ++ seq >= journal_last_unwritten_seq(j); ++ --seq) { ++ i = seq & JOURNAL_BUF_MASK; ++ ++ prt_printf(out, "unwritten entry:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", seq); ++ prt_newline(out); ++ printbuf_indent_add(out, 2); ++ ++ prt_printf(out, "refcount:"); ++ prt_tab(out); ++ prt_printf(out, "%u", journal_state_count(s, i)); ++ prt_newline(out); ++ ++ prt_printf(out, "sectors:"); ++ prt_tab(out); ++ prt_printf(out, "%u", j->buf[i].sectors); ++ prt_newline(out); ++ ++ prt_printf(out, "expires"); ++ prt_tab(out); ++ prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies); ++ prt_newline(out); ++ ++ printbuf_indent_sub(out, 2); ++ } ++ ++ prt_printf(out, ++ "replay done:\t\t%i\n", ++ test_bit(JOURNAL_REPLAY_DONE, &j->flags)); ++ ++ prt_printf(out, "space:\n"); ++ prt_printf(out, "\tdiscarded\t%u:%u\n", ++ j->space[journal_space_discarded].next_entry, ++ j->space[journal_space_discarded].total); ++ prt_printf(out, "\tclean ondisk\t%u:%u\n", ++ j->space[journal_space_clean_ondisk].next_entry, ++ j->space[journal_space_clean_ondisk].total); ++ prt_printf(out, "\tclean\t\t%u:%u\n", ++ j->space[journal_space_clean].next_entry, ++ j->space[journal_space_clean].total); ++ prt_printf(out, "\ttotal\t\t%u:%u\n", ++ j->space[journal_space_total].next_entry, ++ j->space[journal_space_total].total); ++ ++ for_each_member_device_rcu(ca, c, i, ++ &c->rw_devs[BCH_DATA_journal]) { ++ struct journal_device *ja = &ca->journal; ++ ++ if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) ++ continue; ++ ++ if (!ja->nr) ++ continue; ++ ++ prt_printf(out, "dev %u:\n", i); ++ prt_printf(out, "\tnr\t\t%u\n", ja->nr); ++ prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); ++ prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, 
ja, journal_space_discarded), ja->sectors_free); ++ prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); ++ prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); ++ prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); ++ prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); ++ } ++ ++ rcu_read_unlock(); ++ ++ --out->atomic; ++} ++ ++void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) ++{ ++ spin_lock(&j->lock); ++ __bch2_journal_debug_to_text(out, j); ++ spin_unlock(&j->lock); ++} ++ ++bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) ++{ ++ struct journal_entry_pin_list *pin_list; ++ struct journal_entry_pin *pin; ++ ++ spin_lock(&j->lock); ++ *seq = max(*seq, j->pin.front); ++ ++ if (*seq >= j->pin.back) { ++ spin_unlock(&j->lock); ++ return true; ++ } ++ ++ out->atomic++; ++ ++ pin_list = journal_seq_pin(j, *seq); ++ ++ prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count)); ++ prt_newline(out); ++ printbuf_indent_add(out, 2); ++ ++ list_for_each_entry(pin, &pin_list->list, list) { ++ prt_printf(out, "\t%px %ps", pin, pin->flush); ++ prt_newline(out); ++ } ++ ++ list_for_each_entry(pin, &pin_list->key_cache_list, list) { ++ prt_printf(out, "\t%px %ps", pin, pin->flush); ++ prt_newline(out); ++ } ++ ++ if (!list_empty(&pin_list->flushed)) { ++ prt_printf(out, "flushed:"); ++ prt_newline(out); ++ } ++ ++ list_for_each_entry(pin, &pin_list->flushed, list) { ++ prt_printf(out, "\t%px %ps", pin, pin->flush); ++ prt_newline(out); ++ } ++ ++ printbuf_indent_sub(out, 2); ++ ++ --out->atomic; ++ spin_unlock(&j->lock); ++ ++ return false; ++} ++ ++void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) ++{ ++ u64 seq = 0; ++ ++ while (!bch2_journal_seq_pins_to_text(out, j, &seq)) ++ seq++; ++} +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +new file mode 100644 +index 000000000000..d3caa7ea7ce9 +--- /dev/null ++++ b/fs/bcachefs/journal.h +@@ -0,0 +1,521 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_H ++#define _BCACHEFS_JOURNAL_H ++ ++/* ++ * THE JOURNAL: ++ * ++ * The primary purpose of the journal is to log updates (insertions) to the ++ * b-tree, to avoid having to do synchronous updates to the b-tree on disk. ++ * ++ * Without the journal, the b-tree is always internally consistent on ++ * disk - and in fact, in the earliest incarnations bcache didn't have a journal ++ * but did handle unclean shutdowns by doing all index updates synchronously ++ * (with coalescing). ++ * ++ * Updates to interior nodes still happen synchronously and without the journal ++ * (for simplicity) - this may change eventually but updates to interior nodes ++ * are rare enough it's not a huge priority. ++ * ++ * This means the journal is relatively separate from the b-tree; it consists of ++ * just a list of keys and journal replay consists of just redoing those ++ * insertions in same order that they appear in the journal. ++ * ++ * PERSISTENCE: ++ * ++ * For synchronous updates (where we're waiting on the index update to hit ++ * disk), the journal entry will be written out immediately (or as soon as ++ * possible, if the write for the previous journal entry was still in flight). 
++ *
++ * Synchronous updates are specified by passing a closure (@flush_cl) to
++ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
++ * down to the journalling code. That closure will wait on the journal
++ * write to complete (via closure_wait()).
++ *
++ * If the index update wasn't synchronous, the journal entry will be
++ * written out after 10 ms have elapsed, by default (the delay_ms field
++ * in struct journal).
++ *
++ * JOURNAL ENTRIES:
++ *
++ * A journal entry is variable size (struct jset); it's got a fixed length
++ * header and then a variable number of struct jset_entry entries.
++ *
++ * Journal entries are identified by monotonically increasing 64 bit sequence
++ * numbers - jset->seq; other places in the code refer to this sequence number.
++ *
++ * A jset_entry entry contains one or more bkeys (which is what gets inserted
++ * into the b-tree). We need a container to indicate which b-tree the key is
++ * for; also, the roots of the various b-trees are stored in jset_entry entries
++ * (one for each b-tree) - this lets us add new b-tree types without changing
++ * the on disk format.
++ *
++ * We also keep some things in the journal header that are logically part of the
++ * superblock - all the things that are frequently updated. This is for future
++ * bcache on raw flash support; the superblock (which will become another
++ * journal) can't be moved or wear leveled, so it contains just enough
++ * information to find the main journal, and the superblock only has to be
++ * rewritten when we want to move/wear level the main journal.
++ *
++ * JOURNAL LAYOUT ON DISK:
++ *
++ * The journal is written to a ringbuffer of buckets (which is kept in the
++ * superblock); the individual buckets are not necessarily contiguous on disk
++ * which means that journal entries are not allowed to span buckets, but also
++ * that we can resize the journal at runtime if desired (unimplemented).
++ *
++ * The journal buckets exist in the same pool as all the other buckets that are
++ * managed by the allocator and garbage collection - garbage collection marks
++ * the journal buckets as metadata buckets.
++ *
++ * OPEN/DIRTY JOURNAL ENTRIES:
++ *
++ * Open/dirty journal entries are journal entries that contain b-tree updates
++ * that have not yet been written out to the b-tree on disk. We have to track
++ * which journal entries are dirty, and we also have to avoid wrapping around
++ * the journal and overwriting old but still dirty journal entries with new
++ * journal entries.
++ *
++ * On disk, this is represented with the "last_seq" field of struct jset;
++ * last_seq is the first sequence number that journal replay has to replay.
++ *
++ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
++ * journal_device->seq) of, for each journal bucket, the highest sequence number
++ * of any journal entry it contains. Then, by comparing that against last_seq we
++ * can determine whether that journal bucket contains dirty journal entries or
++ * not.
++ *
++ * To track which journal entries are dirty, we maintain a fifo of refcounts
++ * (where each entry corresponds to a specific sequence number) - when a ref
++ * goes to 0, that journal entry is no longer dirty.
++ *
++ * Journalling of index updates is done at the same time as the b-tree itself is
++ * being modified (see btree_insert_key()); when we add the key to the journal
++ * the pending b-tree write takes a ref on the journal entry the key was added
++ * to.
If a pending b-tree write would need to take refs on multiple dirty ++ * journal entries, it only keeps the ref on the oldest one (since a newer ++ * journal entry will still be replayed if an older entry was dirty). ++ * ++ * JOURNAL FILLING UP: ++ * ++ * There are two ways the journal could fill up; either we could run out of ++ * space to write to, or we could have too many open journal entries and run out ++ * of room in the fifo of refcounts. Since those refcounts are decremented ++ * without any locking we can't safely resize that fifo, so we handle it the ++ * same way. ++ * ++ * If the journal fills up, we start flushing dirty btree nodes until we can ++ * allocate space for a journal write again - preferentially flushing btree ++ * nodes that are pinning the oldest journal entries first. ++ */ ++ ++#include ++ ++#include "journal_types.h" ++ ++struct bch_fs; ++ ++static inline void journal_wake(struct journal *j) ++{ ++ wake_up(&j->wait); ++ closure_wake_up(&j->async_wait); ++ closure_wake_up(&j->preres_wait); ++} ++ ++static inline struct journal_buf *journal_cur_buf(struct journal *j) ++{ ++ return j->buf + j->reservations.idx; ++} ++ ++/* Sequence number of oldest dirty journal entry */ ++ ++static inline u64 journal_last_seq(struct journal *j) ++{ ++ return j->pin.front; ++} ++ ++static inline u64 journal_cur_seq(struct journal *j) ++{ ++ EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); ++ ++ return j->pin.back - 1; ++} ++ ++static inline u64 journal_last_unwritten_seq(struct journal *j) ++{ ++ return j->seq_ondisk + 1; ++} ++ ++static inline int journal_state_count(union journal_res_state s, int idx) ++{ ++ switch (idx) { ++ case 0: return s.buf0_count; ++ case 1: return s.buf1_count; ++ case 2: return s.buf2_count; ++ case 3: return s.buf3_count; ++ } ++ BUG(); ++} ++ ++static inline void journal_state_inc(union journal_res_state *s) ++{ ++ s->buf0_count += s->idx == 0; ++ s->buf1_count += s->idx == 1; ++ s->buf2_count += s->idx == 2; ++ s->buf3_count += s->idx == 3; ++} ++ ++/* ++ * Amount of space that will be taken up by some keys in the journal (i.e. 
++ * including the jset header) ++ */ ++static inline unsigned jset_u64s(unsigned u64s) ++{ ++ return u64s + sizeof(struct jset_entry) / sizeof(u64); ++} ++ ++static inline int journal_entry_overhead(struct journal *j) ++{ ++ return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; ++} ++ ++static inline struct jset_entry * ++bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) ++{ ++ struct jset *jset = buf->data; ++ struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); ++ ++ memset(entry, 0, sizeof(*entry)); ++ entry->u64s = cpu_to_le16(u64s); ++ ++ le32_add_cpu(&jset->u64s, jset_u64s(u64s)); ++ ++ return entry; ++} ++ ++static inline struct jset_entry * ++journal_res_entry(struct journal *j, struct journal_res *res) ++{ ++ return vstruct_idx(j->buf[res->idx].data, res->offset); ++} ++ ++static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, ++ enum btree_id id, unsigned level, ++ unsigned u64s) ++{ ++ entry->u64s = cpu_to_le16(u64s); ++ entry->btree_id = id; ++ entry->level = level; ++ entry->type = type; ++ entry->pad[0] = 0; ++ entry->pad[1] = 0; ++ entry->pad[2] = 0; ++ return jset_u64s(u64s); ++} ++ ++static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, ++ enum btree_id id, unsigned level, ++ const void *data, unsigned u64s) ++{ ++ unsigned ret = journal_entry_init(entry, type, id, level, u64s); ++ ++ memcpy_u64s_small(entry->_data, data, u64s); ++ return ret; ++} ++ ++static inline struct jset_entry * ++bch2_journal_add_entry(struct journal *j, struct journal_res *res, ++ unsigned type, enum btree_id id, ++ unsigned level, unsigned u64s) ++{ ++ struct jset_entry *entry = journal_res_entry(j, res); ++ unsigned actual = journal_entry_init(entry, type, id, level, u64s); ++ ++ EBUG_ON(!res->ref); ++ EBUG_ON(actual > res->u64s); ++ ++ res->offset += actual; ++ res->u64s -= actual; ++ return entry; ++} ++ ++static inline bool journal_entry_empty(struct jset *j) ++{ ++ struct jset_entry *i; ++ ++ if (j->seq != j->last_seq) ++ return false; ++ ++ vstruct_for_each(j, i) ++ if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) ++ return false; ++ return true; ++} ++ ++void __bch2_journal_buf_put(struct journal *); ++ ++static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) ++{ ++ union journal_res_state s; ++ ++ s.v = atomic64_sub_return(((union journal_res_state) { ++ .buf0_count = idx == 0, ++ .buf1_count = idx == 1, ++ .buf2_count = idx == 2, ++ .buf3_count = idx == 3, ++ }).v, &j->reservations.counter); ++ ++ if (!journal_state_count(s, idx) && idx == s.unwritten_idx) ++ __bch2_journal_buf_put(j); ++} ++ ++/* ++ * This function releases the journal write structure so other threads can ++ * then proceed to add their keys as well. 
++ */ ++static inline void bch2_journal_res_put(struct journal *j, ++ struct journal_res *res) ++{ ++ if (!res->ref) ++ return; ++ ++ lock_release(&j->res_map, _THIS_IP_); ++ ++ while (res->u64s) ++ bch2_journal_add_entry(j, res, ++ BCH_JSET_ENTRY_btree_keys, ++ 0, 0, 0); ++ ++ bch2_journal_buf_put(j, res->idx); ++ ++ res->ref = 0; ++} ++ ++int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, ++ unsigned); ++ ++/* First two bits for JOURNAL_WATERMARK: */ ++#define JOURNAL_RES_GET_NONBLOCK (1 << 2) ++#define JOURNAL_RES_GET_CHECK (1 << 3) ++ ++static inline int journal_res_get_fast(struct journal *j, ++ struct journal_res *res, ++ unsigned flags) ++{ ++ union journal_res_state old, new; ++ u64 v = atomic64_read(&j->reservations.counter); ++ ++ do { ++ old.v = new.v = v; ++ ++ /* ++ * Check if there is still room in the current journal ++ * entry: ++ */ ++ if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) ++ return 0; ++ ++ EBUG_ON(!journal_state_count(new, new.idx)); ++ ++ if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) ++ return 0; ++ ++ new.cur_entry_offset += res->u64s; ++ journal_state_inc(&new); ++ ++ /* ++ * If the refcount would overflow, we have to wait: ++ * XXX - tracepoint this: ++ */ ++ if (!journal_state_count(new, new.idx)) ++ return 0; ++ ++ if (flags & JOURNAL_RES_GET_CHECK) ++ return 1; ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ res->ref = true; ++ res->idx = old.idx; ++ res->offset = old.cur_entry_offset; ++ res->seq = le64_to_cpu(j->buf[old.idx].data->seq); ++ return 1; ++} ++ ++static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, ++ unsigned u64s, unsigned flags) ++{ ++ int ret; ++ ++ EBUG_ON(res->ref); ++ EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); ++ ++ res->u64s = u64s; ++ ++ if (journal_res_get_fast(j, res, flags)) ++ goto out; ++ ++ ret = bch2_journal_res_get_slowpath(j, res, flags); ++ if (ret) ++ return ret; ++out: ++ if (!(flags & JOURNAL_RES_GET_CHECK)) { ++ lock_acquire_shared(&j->res_map, 0, ++ (flags & JOURNAL_RES_GET_NONBLOCK) != 0, ++ NULL, _THIS_IP_); ++ EBUG_ON(!res->ref); ++ } ++ return 0; ++} ++ ++/* journal_preres: */ ++ ++static inline void journal_set_watermark(struct journal *j) ++{ ++ union journal_preres_state s = READ_ONCE(j->prereserved); ++ unsigned watermark = JOURNAL_WATERMARK_any; ++ ++ if (fifo_free(&j->pin) < j->pin.size / 4) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); ++ if (fifo_free(&j->pin) < j->pin.size / 8) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); ++ ++ if (s.reserved > s.remaining) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); ++ if (!s.remaining) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); ++ ++ if (watermark == j->watermark) ++ return; ++ ++ swap(watermark, j->watermark); ++ if (watermark > j->watermark) ++ journal_wake(j); ++} ++ ++static inline void bch2_journal_preres_put(struct journal *j, ++ struct journal_preres *res) ++{ ++ union journal_preres_state s = { .reserved = res->u64s }; ++ ++ if (!res->u64s) ++ return; ++ ++ s.v = atomic64_sub_return(s.v, &j->prereserved.counter); ++ res->u64s = 0; ++ ++ if (unlikely(s.waiting)) { ++ clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)), ++ (unsigned long *) &j->prereserved.v); ++ closure_wake_up(&j->preres_wait); ++ } ++ ++ if (s.reserved <= s.remaining && j->watermark) ++ journal_set_watermark(j); ++} ++ ++int __bch2_journal_preres_get(struct 
journal *, ++ struct journal_preres *, unsigned, unsigned); ++ ++static inline int bch2_journal_preres_get_fast(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags, ++ bool set_waiting) ++{ ++ int d = new_u64s - res->u64s; ++ union journal_preres_state old, new; ++ u64 v = atomic64_read(&j->prereserved.counter); ++ int ret; ++ ++ do { ++ old.v = new.v = v; ++ ret = 0; ++ ++ if ((flags & JOURNAL_WATERMARK_reserved) || ++ new.reserved + d < new.remaining) { ++ new.reserved += d; ++ ret = 1; ++ } else if (set_waiting && !new.waiting) ++ new.waiting = true; ++ else ++ return 0; ++ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, ++ old.v, new.v)) != old.v); ++ ++ if (ret) ++ res->u64s += d; ++ return ret; ++} ++ ++static inline int bch2_journal_preres_get(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ if (new_u64s <= res->u64s) ++ return 0; ++ ++ if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false)) ++ return 0; ++ ++ if (flags & JOURNAL_RES_GET_NONBLOCK) ++ return -EAGAIN; ++ ++ return __bch2_journal_preres_get(j, res, new_u64s, flags); ++} ++ ++/* journal_entry_res: */ ++ ++void bch2_journal_entry_res_resize(struct journal *, ++ struct journal_entry_res *, ++ unsigned); ++ ++int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); ++void bch2_journal_flush_async(struct journal *, struct closure *); ++ ++int bch2_journal_flush_seq(struct journal *, u64); ++int bch2_journal_flush(struct journal *); ++bool bch2_journal_noflush_seq(struct journal *, u64); ++int bch2_journal_meta(struct journal *); ++int bch2_journal_log_msg(struct journal *, const char *, ...); ++ ++void bch2_journal_halt(struct journal *); ++ ++static inline int bch2_journal_error(struct journal *j) ++{ ++ return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ++ ? 
-EIO : 0; ++} ++ ++struct bch_dev; ++ ++static inline void bch2_journal_set_replay_done(struct journal *j) ++{ ++ BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); ++ set_bit(JOURNAL_REPLAY_DONE, &j->flags); ++} ++ ++void bch2_journal_unblock(struct journal *); ++void bch2_journal_block(struct journal *); ++ ++void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); ++void bch2_journal_debug_to_text(struct printbuf *, struct journal *); ++void bch2_journal_pins_to_text(struct printbuf *, struct journal *); ++bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); ++ ++int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, ++ unsigned nr); ++int bch2_dev_journal_alloc(struct bch_dev *); ++ ++void bch2_dev_journal_stop(struct journal *, struct bch_dev *); ++ ++void bch2_fs_journal_stop(struct journal *); ++int bch2_fs_journal_start(struct journal *, u64); ++ ++void bch2_dev_journal_exit(struct bch_dev *); ++int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); ++void bch2_fs_journal_exit(struct journal *); ++int bch2_fs_journal_init(struct journal *); ++ ++#endif /* _BCACHEFS_JOURNAL_H */ +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +new file mode 100644 +index 000000000000..6fa2c54c1af4 +--- /dev/null ++++ b/fs/bcachefs/journal_io.c +@@ -0,0 +1,1735 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "btree_io.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "disk_groups.h" ++#include "error.h" ++#include "io.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "replicas.h" ++ ++#include ++ ++static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) ++{ ++ return (seq - c->journal_entries_base_seq) & (~0U >> 1); ++} ++ ++static void __journal_replay_free(struct bch_fs *c, ++ struct journal_replay *i) ++{ ++ struct journal_replay **p = ++ genradix_ptr(&c->journal_entries, ++ journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); ++ ++ BUG_ON(*p != i); ++ *p = NULL; ++ kvpfree(i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&i->j)); ++} ++ ++static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) ++{ ++ i->ignore = true; ++ ++ if (!c->opts.read_entire_journal) ++ __journal_replay_free(c, i); ++} ++ ++struct journal_list { ++ struct closure cl; ++ u64 last_seq; ++ struct mutex lock; ++ int ret; ++}; ++ ++#define JOURNAL_ENTRY_ADD_OK 0 ++#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 ++ ++/* ++ * Given a journal entry we just read, add it to the list of journal entries to ++ * be replayed: ++ */ ++static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, ++ struct journal_ptr entry_ptr, ++ struct journal_list *jlist, struct jset *j, ++ bool bad) ++{ ++ struct genradix_iter iter; ++ struct journal_replay **_i, *i, *dup; ++ struct journal_ptr *ptr; ++ size_t bytes = vstruct_bytes(j); ++ u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; ++ int ret = JOURNAL_ENTRY_ADD_OK; ++ ++ /* Is this entry older than the range we need? 
*/ ++ if (!c->opts.read_entire_journal && ++ le64_to_cpu(j->seq) < jlist->last_seq) ++ return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; ++ ++ /* ++ * genradixes are indexed by a ulong, not a u64, so we can't index them ++ * by sequence number directly: Assume instead that they will all fall ++ * within the range of +-2 billion of the first one we find. ++ */ ++ if (!c->journal_entries_base_seq) ++ c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); ++ ++ /* Drop entries we don't need anymore */ ++ if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { ++ genradix_for_each_from(&c->journal_entries, iter, _i, ++ journal_entry_radix_idx(c, jlist->last_seq)) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ if (le64_to_cpu(i->j.seq) >= last_seq) ++ break; ++ journal_replay_free(c, i); ++ } ++ } ++ ++ jlist->last_seq = max(jlist->last_seq, last_seq); ++ ++ _i = genradix_ptr_alloc(&c->journal_entries, ++ journal_entry_radix_idx(c, le64_to_cpu(j->seq)), ++ GFP_KERNEL); ++ if (!_i) ++ return -ENOMEM; ++ ++ /* ++ * Duplicate journal entries? If so we want the one that didn't have a ++ * checksum error: ++ */ ++ dup = *_i; ++ if (dup) { ++ if (dup->bad) { ++ /* we'll replace @dup: */ ++ } else if (bad) { ++ i = dup; ++ goto found; ++ } else { ++ fsck_err_on(bytes != vstruct_bytes(&dup->j) || ++ memcmp(j, &dup->j, bytes), c, ++ "found duplicate but non identical journal entries (seq %llu)", ++ le64_to_cpu(j->seq)); ++ i = dup; ++ goto found; ++ } ++ } ++ ++ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); ++ if (!i) ++ return -ENOMEM; ++ ++ i->nr_ptrs = 0; ++ i->bad = bad; ++ i->ignore = false; ++ memcpy(&i->j, j, bytes); ++ ++ if (dup) { ++ i->nr_ptrs = dup->nr_ptrs; ++ memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs)); ++ __journal_replay_free(c, dup); ++ } ++ ++ ++ *_i = i; ++found: ++ for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { ++ if (ptr->dev == ca->dev_idx) { ++ bch_err(c, "duplicate journal entry %llu on same device", ++ le64_to_cpu(i->j.seq)); ++ goto out; ++ } ++ } ++ ++ if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { ++ bch_err(c, "found too many copies of journal entry %llu", ++ le64_to_cpu(i->j.seq)); ++ goto out; ++ } ++ ++ i->ptrs[i->nr_ptrs++] = entry_ptr; ++out: ++fsck_err: ++ return ret; ++} ++ ++static struct nonce journal_nonce(const struct jset *jset) ++{ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = ((__le32 *) &jset->seq)[0], ++ [2] = ((__le32 *) &jset->seq)[1], ++ [3] = BCH_NONCE_JOURNAL, ++ }}; ++} ++ ++/* this fills in a range with empty jset_entries: */ ++static void journal_entry_null_range(void *start, void *end) ++{ ++ struct jset_entry *entry; ++ ++ for (entry = start; entry != end; entry = vstruct_next(entry)) ++ memset(entry, 0, sizeof(*entry)); ++} ++ ++#define JOURNAL_ENTRY_REREAD 5 ++#define JOURNAL_ENTRY_NONE 6 ++#define JOURNAL_ENTRY_BAD 7 ++ ++#define journal_entry_err(c, msg, ...) \ ++({ \ ++ switch (write) { \ ++ case READ: \ ++ mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ ++ break; \ ++ case WRITE: \ ++ bch_err(c, "corrupt metadata before write:\n" \ ++ msg, ##__VA_ARGS__); \ ++ if (bch2_fs_inconsistent(c)) { \ ++ ret = -BCH_ERR_fsck_errors_not_fixed; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ } \ ++ true; \ ++}) ++ ++#define journal_entry_err_on(cond, c, msg, ...) \ ++ ((cond) ? 
journal_entry_err(c, msg, ##__VA_ARGS__) : false) ++ ++#define FSCK_DELETED_KEY 5 ++ ++static int journal_validate_key(struct bch_fs *c, const char *where, ++ struct jset_entry *entry, ++ unsigned level, enum btree_id btree_id, ++ struct bkey_i *k, ++ unsigned version, int big_endian, int write) ++{ ++ void *next = vstruct_next(entry); ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ if (journal_entry_err_on(!k->k.u64s, c, ++ "invalid key in %s at %s offset %zi/%u: k->u64s 0", ++ bch2_jset_entry_types[entry->type], where, ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s))) { ++ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return FSCK_DELETED_KEY; ++ } ++ ++ if (journal_entry_err_on((void *) bkey_next(k) > ++ (void *) vstruct_next(entry), c, ++ "invalid key in %s at %s offset %zi/%u: extends past end of journal entry", ++ bch2_jset_entry_types[entry->type], where, ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s))) { ++ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return FSCK_DELETED_KEY; ++ } ++ ++ if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, ++ "invalid key in %s at %s offset %zi/%u: bad format %u", ++ bch2_jset_entry_types[entry->type], where, ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s), ++ k->k.format)) { ++ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); ++ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return FSCK_DELETED_KEY; ++ } ++ ++ if (!write) ++ bch2_bkey_compat(level, btree_id, version, big_endian, ++ write, NULL, bkey_to_packed(k)); ++ ++ if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), ++ __btree_node_type(level, btree_id), write, &buf)) { ++ printbuf_reset(&buf); ++ prt_printf(&buf, "invalid key in %s at %s offset %zi/%u:", ++ bch2_jset_entry_types[entry->type], where, ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s)); ++ prt_newline(&buf); ++ printbuf_indent_add(&buf, 2); ++ ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); ++ prt_newline(&buf); ++ bch2_bkey_invalid(c, bkey_i_to_s_c(k), ++ __btree_node_type(level, btree_id), write, &buf); ++ ++ mustfix_fsck_err(c, "%s", buf.buf); ++ ++ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); ++ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); ++ journal_entry_null_range(vstruct_next(entry), next); ++ ++ printbuf_exit(&buf); ++ return FSCK_DELETED_KEY; ++ } ++ ++ if (write) ++ bch2_bkey_compat(level, btree_id, version, big_endian, ++ write, NULL, bkey_to_packed(k)); ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int journal_entry_btree_keys_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ struct bkey_i *k = entry->start; ++ ++ while (k != vstruct_last(entry)) { ++ int ret = journal_validate_key(c, where, entry, ++ entry->level, ++ entry->btree_id, ++ k, version, big_endian, write); ++ if (ret == FSCK_DELETED_KEY) ++ continue; ++ ++ k = bkey_next(k); ++ } ++ ++ return 0; ++} ++ ++static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct bkey_i *k; ++ bool first = true; ++ ++ vstruct_for_each(entry, k) { ++ if (!first) { ++ prt_newline(out); ++ prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); ++ } ++ prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); ++ 
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); ++ first = false; ++ } ++} ++ ++static int journal_entry_btree_root_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ struct bkey_i *k = entry->start; ++ int ret = 0; ++ ++ if (journal_entry_err_on(!entry->u64s || ++ le16_to_cpu(entry->u64s) != k->k.u64s, c, ++ "invalid btree root journal entry: wrong number of keys")) { ++ void *next = vstruct_next(entry); ++ /* ++ * we don't want to null out this jset_entry, ++ * just the contents, so that later we can tell ++ * we were _supposed_ to have a btree root ++ */ ++ entry->u64s = 0; ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ return journal_validate_key(c, where, entry, 1, entry->btree_id, k, ++ version, big_endian, write); ++fsck_err: ++ return ret; ++} ++ ++static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ journal_entry_btree_keys_to_text(out, c, entry); ++} ++ ++static int journal_entry_prio_ptrs_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ /* obsolete, don't care: */ ++ return 0; ++} ++ ++static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++} ++ ++static int journal_entry_blacklist_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ int ret = 0; ++ ++ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, ++ "invalid journal seq blacklist entry: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ } ++fsck_err: ++ return ret; ++} ++ ++static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_blacklist *bl = ++ container_of(entry, struct jset_entry_blacklist, entry); ++ ++ prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); ++} ++ ++static int journal_entry_blacklist_v2_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ struct jset_entry_blacklist_v2 *bl_entry; ++ int ret = 0; ++ ++ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, ++ "invalid journal seq blacklist entry: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ goto out; ++ } ++ ++ bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > ++ le64_to_cpu(bl_entry->end), c, ++ "invalid journal seq blacklist entry: start > end")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ } ++out: ++fsck_err: ++ return ret; ++} ++ ++static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_blacklist_v2 *bl = ++ container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ prt_printf(out, "start=%llu end=%llu", ++ le64_to_cpu(bl->start), ++ le64_to_cpu(bl->end)); ++} ++ ++static int journal_entry_usage_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes < 
sizeof(*u), ++ c, ++ "invalid journal entry usage: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ ++static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ prt_printf(out, "type=%s v=%llu", ++ bch2_fs_usage_types[u->entry.btree_id], ++ le64_to_cpu(u->v)); ++} ++ ++static int journal_entry_data_usage_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes < sizeof(*u) || ++ bytes < sizeof(*u) + u->r.nr_devs, ++ c, ++ "invalid journal entry usage: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ ++static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ ++ bch2_replicas_entry_to_text(out, &u->r); ++ prt_printf(out, "=%llu", le64_to_cpu(u->v)); ++} ++ ++static int journal_entry_clock_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ struct jset_entry_clock *clock = ++ container_of(entry, struct jset_entry_clock, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes != sizeof(*clock), ++ c, "invalid journal entry clock: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++ if (journal_entry_err_on(clock->rw > 1, ++ c, "invalid journal entry clock: bad rw")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ ++static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_clock *clock = ++ container_of(entry, struct jset_entry_clock, entry); ++ ++ prt_printf(out, "%s=%llu", clock->rw ? 
"write" : "read", le64_to_cpu(clock->time)); ++} ++ ++static int journal_entry_dev_usage_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ struct jset_entry_dev_usage *u = ++ container_of(entry, struct jset_entry_dev_usage, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ unsigned expected = sizeof(*u); ++ unsigned dev; ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes < expected, ++ c, "invalid journal entry dev usage: bad size (%u < %u)", ++ bytes, expected)) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++ dev = le32_to_cpu(u->dev); ++ ++ if (journal_entry_err_on(!bch2_dev_exists2(c, dev), ++ c, "invalid journal entry dev usage: bad dev")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++ if (journal_entry_err_on(u->pad, ++ c, "invalid journal entry dev usage: bad pad")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ ++static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_dev_usage *u = ++ container_of(entry, struct jset_entry_dev_usage, entry); ++ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); ++ ++ prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); ++ ++ for (i = 0; i < nr_types; i++) { ++ if (i < BCH_DATA_NR) ++ prt_printf(out, " %s", bch2_data_types[i]); ++ else ++ prt_printf(out, " (unknown data type %u)", i); ++ prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", ++ le64_to_cpu(u->d[i].buckets), ++ le64_to_cpu(u->d[i].sectors), ++ le64_to_cpu(u->d[i].fragmented)); ++ } ++ ++ prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec)); ++} ++ ++static int journal_entry_log_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ return 0; ++} ++ ++static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); ++ unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); ++ ++ prt_printf(out, "%.*s", bytes, l->d); ++} ++ ++static int journal_entry_overwrite_validate(struct bch_fs *c, const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ return journal_entry_btree_keys_validate(c, where, entry, version, big_endian, write); ++} ++ ++static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ journal_entry_btree_keys_to_text(out, c, entry); ++} ++ ++struct jset_entry_ops { ++ int (*validate)(struct bch_fs *, const char *, ++ struct jset_entry *, unsigned, int, int); ++ void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); ++}; ++ ++static const struct jset_entry_ops bch2_jset_entry_ops[] = { ++#define x(f, nr) \ ++ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ ++ .validate = journal_entry_##f##_validate, \ ++ .to_text = journal_entry_##f##_to_text, \ ++ }, ++ BCH_JSET_ENTRY_TYPES() ++#undef x ++}; ++ ++int bch2_journal_entry_validate(struct bch_fs *c, const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ return entry->type < BCH_JSET_ENTRY_NR ++ ? 
bch2_jset_entry_ops[entry->type].validate(c, where, entry, ++ version, big_endian, write) ++ : 0; ++} ++ ++void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ if (entry->type < BCH_JSET_ENTRY_NR) { ++ prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); ++ bch2_jset_entry_ops[entry->type].to_text(out, c, entry); ++ } else { ++ prt_printf(out, "(unknown type %u)", entry->type); ++ } ++} ++ ++static int jset_validate_entries(struct bch_fs *c, struct jset *jset, ++ int write) ++{ ++ char buf[100]; ++ struct jset_entry *entry; ++ int ret = 0; ++ ++ vstruct_for_each(jset, entry) { ++ scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u", ++ le64_to_cpu(jset->seq), ++ (u64 *) entry - jset->_data, ++ le32_to_cpu(jset->u64s)); ++ ++ if (journal_entry_err_on(vstruct_next(entry) > ++ vstruct_last(jset), c, ++ "journal entry extends past end of jset")) { ++ jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); ++ break; ++ } ++ ++ ret = bch2_journal_entry_validate(c, buf, entry, ++ le32_to_cpu(jset->version), ++ JSET_BIG_ENDIAN(jset), write); ++ if (ret) ++ break; ++ } ++fsck_err: ++ return ret; ++} ++ ++static int jset_validate(struct bch_fs *c, ++ struct bch_dev *ca, ++ struct jset *jset, u64 sector, ++ unsigned bucket_sectors_left, ++ unsigned sectors_read, ++ int write) ++{ ++ size_t bytes = vstruct_bytes(jset); ++ struct bch_csum csum; ++ unsigned version; ++ int ret = 0; ++ ++ if (le64_to_cpu(jset->magic) != jset_magic(c)) ++ return JOURNAL_ENTRY_NONE; ++ ++ version = le32_to_cpu(jset->version); ++ if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && ++ version < bcachefs_metadata_version_min) || ++ version >= bcachefs_metadata_version_max, c, ++ "%s sector %llu seq %llu: unknown journal entry version %u", ++ ca ? ca->name : c->name, ++ sector, le64_to_cpu(jset->seq), ++ version)) { ++ /* don't try to continue: */ ++ return EINVAL; ++ } ++ ++ if (bytes > (sectors_read << 9) && ++ sectors_read < bucket_sectors_left) ++ return JOURNAL_ENTRY_REREAD; ++ ++ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, ++ "%s sector %llu seq %llu: journal entry too big (%zu bytes)", ++ ca ? ca->name : c->name, ++ sector, le64_to_cpu(jset->seq), bytes)) { ++ ret = JOURNAL_ENTRY_BAD; ++ le32_add_cpu(&jset->u64s, ++ -((bytes - (bucket_sectors_left << 9)) / 8)); ++ } ++ ++ if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, ++ "%s sector %llu seq %llu: journal entry with unknown csum type %llu", ++ ca ? ca->name : c->name, ++ sector, le64_to_cpu(jset->seq), ++ JSET_CSUM_TYPE(jset))) { ++ ret = JOURNAL_ENTRY_BAD; ++ goto csum_done; ++ } ++ ++ if (write) ++ goto csum_done; ++ ++ csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); ++ if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, ++ "%s sector %llu seq %llu: journal checksum bad", ++ ca ? 
ca->name : c->name, ++ sector, le64_to_cpu(jset->seq))) ++ ret = JOURNAL_ENTRY_BAD; ++ ++ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ jset->encrypted_start, ++ vstruct_end(jset) - (void *) jset->encrypted_start); ++ bch2_fs_fatal_err_on(ret, c, ++ "error decrypting journal entry: %i", ret); ++csum_done: ++ /* last_seq is ignored when JSET_NO_FLUSH is true */ ++ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && ++ le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, ++ "invalid journal entry: last_seq > seq (%llu > %llu)", ++ le64_to_cpu(jset->last_seq), ++ le64_to_cpu(jset->seq))) { ++ jset->last_seq = jset->seq; ++ return JOURNAL_ENTRY_BAD; ++ } ++fsck_err: ++ return ret; ++} ++ ++static int jset_validate_for_write(struct bch_fs *c, struct jset *jset) ++{ ++ unsigned sectors = vstruct_sectors(jset, c->block_bits); ++ ++ return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?: ++ jset_validate_entries(c, jset, WRITE); ++} ++ ++struct journal_read_buf { ++ void *data; ++ size_t size; ++}; ++ ++static int journal_read_buf_realloc(struct journal_read_buf *b, ++ size_t new_size) ++{ ++ void *n; ++ ++ /* the bios are sized for this many pages, max: */ ++ if (new_size > JOURNAL_ENTRY_SIZE_MAX) ++ return -ENOMEM; ++ ++ new_size = roundup_pow_of_two(new_size); ++ n = kvpmalloc(new_size, GFP_KERNEL); ++ if (!n) ++ return -ENOMEM; ++ ++ kvpfree(b->data, b->size); ++ b->data = n; ++ b->size = new_size; ++ return 0; ++} ++ ++static int journal_read_bucket(struct bch_dev *ca, ++ struct journal_read_buf *buf, ++ struct journal_list *jlist, ++ unsigned bucket) ++{ ++ struct bch_fs *c = ca->fs; ++ struct journal_device *ja = &ca->journal; ++ struct jset *j = NULL; ++ unsigned sectors, sectors_read = 0; ++ u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), ++ end = offset + ca->mi.bucket_size; ++ bool saw_bad = false; ++ int ret = 0; ++ ++ pr_debug("reading %u", bucket); ++ ++ while (offset < end) { ++ if (!sectors_read) { ++ struct bio *bio; ++reread: ++ sectors_read = min_t(unsigned, ++ end - offset, buf->size >> 9); ++ ++ bio = bio_kmalloc(GFP_KERNEL, ++ buf_pages(buf->data, ++ sectors_read << 9)); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = offset; ++ bio_set_op_attrs(bio, REQ_OP_READ, 0); ++ bch2_bio_map(bio, buf->data, sectors_read << 9); ++ ++ ret = submit_bio_wait(bio); ++ bio_put(bio); ++ ++ if (bch2_dev_io_err_on(ret, ca, ++ "journal read error: sector %llu", ++ offset) || ++ bch2_meta_read_fault("journal")) { ++ /* ++ * We don't error out of the recovery process ++ * here, since the relevant journal entry may be ++ * found on a different device, and missing or ++ * no journal entries will be handled later ++ */ ++ return 0; ++ } ++ ++ j = buf->data; ++ } ++ ++ ret = jset_validate(c, ca, j, offset, ++ end - offset, sectors_read, ++ READ); ++ switch (ret) { ++ case 0: ++ sectors = vstruct_sectors(j, c->block_bits); ++ break; ++ case JOURNAL_ENTRY_REREAD: ++ if (vstruct_bytes(j) > buf->size) { ++ ret = journal_read_buf_realloc(buf, ++ vstruct_bytes(j)); ++ if (ret) ++ return ret; ++ } ++ goto reread; ++ case JOURNAL_ENTRY_NONE: ++ if (!saw_bad) ++ return 0; ++ sectors = block_sectors(c); ++ goto next_block; ++ case JOURNAL_ENTRY_BAD: ++ saw_bad = true; ++ /* ++ * On checksum error we don't really trust the size ++ * field of the journal entry we read, so try reading ++ * again at next block boundary: ++ */ ++ sectors = block_sectors(c); ++ break; ++ default: ++ return ret; ++ } ++ ++ /* ++ * This happens sometimes if we don't have 
discards on - ++ * when we've partially overwritten a bucket with new ++ * journal entries. We don't need the rest of the ++ * bucket: ++ */ ++ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) ++ return 0; ++ ++ ja->bucket_seq[bucket] = le64_to_cpu(j->seq); ++ ++ mutex_lock(&jlist->lock); ++ ret = journal_entry_add(c, ca, (struct journal_ptr) { ++ .dev = ca->dev_idx, ++ .bucket = bucket, ++ .bucket_offset = offset - ++ bucket_to_sector(ca, ja->buckets[bucket]), ++ .sector = offset, ++ }, jlist, j, ret != 0); ++ mutex_unlock(&jlist->lock); ++ ++ switch (ret) { ++ case JOURNAL_ENTRY_ADD_OK: ++ break; ++ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: ++ break; ++ default: ++ return ret; ++ } ++next_block: ++ pr_debug("next"); ++ offset += sectors; ++ sectors_read -= sectors; ++ j = ((void *) j) + (sectors << 9); ++ } ++ ++ return 0; ++} ++ ++static void bch2_journal_read_device(struct closure *cl) ++{ ++ struct journal_device *ja = ++ container_of(cl, struct journal_device, read); ++ struct bch_dev *ca = container_of(ja, struct bch_dev, journal); ++ struct bch_fs *c = ca->fs; ++ struct journal_list *jlist = ++ container_of(cl->parent, struct journal_list, cl); ++ struct journal_replay *r, **_r; ++ struct genradix_iter iter; ++ struct journal_read_buf buf = { NULL, 0 }; ++ u64 min_seq = U64_MAX; ++ unsigned i; ++ int ret = 0; ++ ++ if (!ja->nr) ++ goto out; ++ ++ ret = journal_read_buf_realloc(&buf, PAGE_SIZE); ++ if (ret) ++ goto err; ++ ++ pr_debug("%u journal buckets", ja->nr); ++ ++ for (i = 0; i < ja->nr; i++) { ++ ret = journal_read_bucket(ca, &buf, jlist, i); ++ if (ret) ++ goto err; ++ } ++ ++ /* Find the journal bucket with the highest sequence number: */ ++ for (i = 0; i < ja->nr; i++) { ++ if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx]) ++ ja->cur_idx = i; ++ ++ min_seq = min(ja->bucket_seq[i], min_seq); ++ } ++ ++ /* ++ * If there's duplicate journal entries in multiple buckets (which ++ * definitely isn't supposed to happen, but...) 
- make sure to start ++ * cur_idx at the last of those buckets, so we don't deadlock trying to ++ * allocate ++ */ ++ while (ja->bucket_seq[ja->cur_idx] > min_seq && ++ ja->bucket_seq[ja->cur_idx] == ++ ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ++ ja->sectors_free = ca->mi.bucket_size; ++ ++ mutex_lock(&jlist->lock); ++ genradix_for_each(&c->journal_entries, iter, _r) { ++ r = *_r; ++ ++ if (!r) ++ continue; ++ ++ for (i = 0; i < r->nr_ptrs; i++) { ++ if (r->ptrs[i].dev == ca->dev_idx && ++ sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) { ++ unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + ++ vstruct_sectors(&r->j, c->block_bits); ++ ++ ja->sectors_free = min(ja->sectors_free, ++ ca->mi.bucket_size - wrote); ++ } ++ } ++ } ++ mutex_unlock(&jlist->lock); ++ ++ if (ja->bucket_seq[ja->cur_idx] && ++ ja->sectors_free == ca->mi.bucket_size) { ++ bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); ++ bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); ++ for (i = 0; i < 3; i++) { ++ unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; ++ bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); ++ } ++ ja->sectors_free = 0; ++ } ++ ++ /* ++ * Set dirty_idx to indicate the entire journal is full and needs to be ++ * reclaimed - journal reclaim will immediately reclaim whatever isn't ++ * pinned when it first runs: ++ */ ++ ja->discard_idx = ja->dirty_idx_ondisk = ++ ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; ++out: ++ bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); ++ kvpfree(buf.data, buf.size); ++ percpu_ref_put(&ca->io_ref); ++ closure_return(cl); ++ return; ++err: ++ mutex_lock(&jlist->lock); ++ jlist->ret = ret; ++ mutex_unlock(&jlist->lock); ++ goto out; ++} ++ ++void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct journal_replay *j) ++{ ++ unsigned i; ++ ++ for (i = 0; i < j->nr_ptrs; i++) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); ++ u64 offset; ++ ++ div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); ++ ++ if (i) ++ prt_printf(out, " "); ++ prt_printf(out, "%u:%u:%u (sector %llu)", ++ j->ptrs[i].dev, ++ j->ptrs[i].bucket, ++ j->ptrs[i].bucket_offset, ++ j->ptrs[i].sector); ++ } ++} ++ ++int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) ++{ ++ struct journal_list jlist; ++ struct journal_replay *i, **_i, *prev = NULL; ++ struct genradix_iter radix_iter; ++ struct bch_dev *ca; ++ unsigned iter; ++ struct printbuf buf = PRINTBUF; ++ size_t keys = 0, entries = 0; ++ bool degraded = false; ++ u64 seq, last_seq = 0; ++ int ret = 0; ++ ++ closure_init_stack(&jlist.cl); ++ mutex_init(&jlist.lock); ++ jlist.last_seq = 0; ++ jlist.ret = 0; ++ ++ for_each_member_device(ca, c, iter) { ++ if (!c->opts.fsck && ++ !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) ++ continue; ++ ++ if ((ca->mi.state == BCH_MEMBER_STATE_rw || ++ ca->mi.state == BCH_MEMBER_STATE_ro) && ++ percpu_ref_tryget(&ca->io_ref)) ++ closure_call(&ca->journal.read, ++ bch2_journal_read_device, ++ system_unbound_wq, ++ &jlist.cl); ++ else ++ degraded = true; ++ } ++ ++ closure_sync(&jlist.cl); ++ ++ if (jlist.ret) ++ return jlist.ret; ++ ++ *start_seq = 0; ++ ++ /* ++ * Find most recent flush entry, and ignore newer non flush entries - ++ * those entries will be blacklisted: ++ */ ++ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ if (!*start_seq) ++ 
*start_seq = le64_to_cpu(i->j.seq) + 1; ++ ++ if (!JSET_NO_FLUSH(&i->j)) { ++ last_seq = le64_to_cpu(i->j.last_seq); ++ *blacklist_seq = le64_to_cpu(i->j.seq) + 1; ++ break; ++ } ++ ++ journal_replay_free(c, i); ++ } ++ ++ if (!*start_seq) { ++ bch_info(c, "journal read done, but no entries found"); ++ return 0; ++ } ++ ++ if (!last_seq) { ++ fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); ++ ret = -1; ++ goto err; ++ } ++ ++ /* Drop blacklisted entries and entries older than last_seq: */ ++ genradix_for_each(&c->journal_entries, radix_iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ seq = le64_to_cpu(i->j.seq); ++ if (seq < last_seq) { ++ journal_replay_free(c, i); ++ continue; ++ } ++ ++ if (bch2_journal_seq_is_blacklisted(c, seq, true)) { ++ fsck_err_on(!JSET_NO_FLUSH(&i->j), c, ++ "found blacklisted journal entry %llu", seq); ++ ++ journal_replay_free(c, i); ++ } ++ } ++ ++ /* Check for missing entries: */ ++ seq = last_seq; ++ genradix_for_each(&c->journal_entries, radix_iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ BUG_ON(seq > le64_to_cpu(i->j.seq)); ++ ++ while (seq < le64_to_cpu(i->j.seq)) { ++ u64 missing_start, missing_end; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; ++ ++ while (seq < le64_to_cpu(i->j.seq) && ++ bch2_journal_seq_is_blacklisted(c, seq, false)) ++ seq++; ++ ++ if (seq == le64_to_cpu(i->j.seq)) ++ break; ++ ++ missing_start = seq; ++ ++ while (seq < le64_to_cpu(i->j.seq) && ++ !bch2_journal_seq_is_blacklisted(c, seq, false)) ++ seq++; ++ ++ if (prev) { ++ bch2_journal_ptrs_to_text(&buf1, c, prev); ++ prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); ++ } else ++ prt_printf(&buf1, "(none)"); ++ bch2_journal_ptrs_to_text(&buf2, c, i); ++ ++ missing_end = seq - 1; ++ fsck_err(c, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n" ++ " prev at %s\n" ++ " next at %s", ++ missing_start, missing_end, ++ last_seq, *blacklist_seq - 1, ++ buf1.buf, buf2.buf); ++ ++ printbuf_exit(&buf1); ++ printbuf_exit(&buf2); ++ } ++ ++ prev = i; ++ seq++; ++ } ++ ++ genradix_for_each(&c->journal_entries, radix_iter, _i) { ++ struct jset_entry *entry; ++ struct bkey_i *k, *_n; ++ struct bch_replicas_padded replicas = { ++ .e.data_type = BCH_DATA_journal, ++ .e.nr_required = 1, ++ }; ++ unsigned ptr; ++ ++ i = *_i; ++ if (!i || i->ignore) ++ continue; ++ ++ ret = jset_validate_entries(c, &i->j, READ); ++ if (ret) ++ goto err; ++ ++ for (ptr = 0; ptr < i->nr_ptrs; ptr++) ++ replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; ++ ++ bch2_replicas_entry_sort(&replicas.e); ++ ++ /* ++ * If we're mounting in degraded mode - if we didn't read all ++ * the devices - this is wrong: ++ */ ++ ++ printbuf_reset(&buf); ++ bch2_replicas_entry_to_text(&buf, &replicas.e); ++ ++ if (!degraded && ++ fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, ++ "superblock not marked as containing replicas %s", ++ buf.buf)) { ++ ret = bch2_mark_replicas(c, &replicas.e); ++ if (ret) ++ goto err; ++ } ++ ++ for_each_jset_key(k, _n, entry, &i->j) ++ keys++; ++ entries++; ++ } ++ ++ bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", ++ keys, entries, *start_seq); ++ ++ if (*start_seq != *blacklist_seq) ++ bch_info(c, "dropped unflushed entries %llu-%llu", ++ *blacklist_seq, *start_seq - 1); ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++/* journal write: */ ++ ++static void __journal_write_alloc(struct journal *j, ++ struct journal_buf *w, ++ struct dev_alloc_list *devs_sorted, ++ unsigned sectors, ++ unsigned *replicas, ++ unsigned replicas_want) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_device *ja; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (*replicas >= replicas_want) ++ return; ++ ++ for (i = 0; i < devs_sorted->nr; i++) { ++ ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); ++ if (!ca) ++ continue; ++ ++ ja = &ca->journal; ++ ++ /* ++ * Check that we can use this device, and aren't already using ++ * it: ++ */ ++ if (!ca->mi.durability || ++ ca->mi.state != BCH_MEMBER_STATE_rw || ++ !ja->nr || ++ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), ++ ca->dev_idx) || ++ sectors > ja->sectors_free) ++ continue; ++ ++ bch2_dev_stripe_increment(ca, &j->wp.stripe); ++ ++ bch2_bkey_append_ptr(&w->key, ++ (struct bch_extent_ptr) { ++ .offset = bucket_to_sector(ca, ++ ja->buckets[ja->cur_idx]) + ++ ca->mi.bucket_size - ++ ja->sectors_free, ++ .dev = ca->dev_idx, ++ }); ++ ++ ja->sectors_free -= sectors; ++ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); ++ ++ *replicas += ca->mi.durability; ++ ++ if (*replicas >= replicas_want) ++ break; ++ } ++} ++ ++/** ++ * journal_next_bucket - move on to the next journal bucket if possible ++ */ ++static int journal_write_alloc(struct journal *j, struct journal_buf *w, ++ unsigned sectors) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_devs_mask devs; ++ struct journal_device *ja; ++ struct bch_dev *ca; ++ struct dev_alloc_list devs_sorted; ++ unsigned target = c->opts.metadata_target ?: ++ c->opts.foreground_target; ++ unsigned i, replicas = 0, replicas_want = ++ READ_ONCE(c->opts.metadata_replicas); ++ ++ rcu_read_lock(); ++retry: ++ devs = target_rw_devs(c, BCH_DATA_journal, target); ++ ++ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); ++ ++ 
__journal_write_alloc(j, w, &devs_sorted, ++ sectors, &replicas, replicas_want); ++ ++ if (replicas >= replicas_want) ++ goto done; ++ ++ for (i = 0; i < devs_sorted.nr; i++) { ++ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); ++ if (!ca) ++ continue; ++ ++ ja = &ca->journal; ++ ++ if (sectors > ja->sectors_free && ++ sectors <= ca->mi.bucket_size && ++ bch2_journal_dev_buckets_available(j, ja, ++ journal_space_discarded)) { ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ja->sectors_free = ca->mi.bucket_size; ++ ++ /* ++ * ja->bucket_seq[ja->cur_idx] must always have ++ * something sensible: ++ */ ++ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); ++ } ++ } ++ ++ __journal_write_alloc(j, w, &devs_sorted, ++ sectors, &replicas, replicas_want); ++ ++ if (replicas < replicas_want && target) { ++ /* Retry from all devices: */ ++ target = 0; ++ goto retry; ++ } ++done: ++ rcu_read_unlock(); ++ ++ BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); ++ ++ return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; ++} ++ ++static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) ++{ ++ /* we aren't holding j->lock: */ ++ unsigned new_size = READ_ONCE(j->buf_size_want); ++ void *new_buf; ++ ++ if (buf->buf_size >= new_size) ++ return; ++ ++ new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); ++ if (!new_buf) ++ return; ++ ++ memcpy(new_buf, buf->data, buf->buf_size); ++ ++ spin_lock(&j->lock); ++ swap(buf->data, new_buf); ++ swap(buf->buf_size, new_size); ++ spin_unlock(&j->lock); ++ ++ kvpfree(new_buf, new_size); ++} ++ ++static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) ++{ ++ return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); ++} ++ ++static void journal_write_done(struct closure *cl) ++{ ++ struct journal *j = container_of(cl, struct journal, io); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *w = journal_last_unwritten_buf(j); ++ struct bch_replicas_padded replicas; ++ union journal_res_state old, new; ++ u64 v, seq; ++ int err = 0; ++ ++ bch2_time_stats_update(!JSET_NO_FLUSH(w->data) ++ ? 
j->flush_write_time ++ : j->noflush_write_time, j->write_start_time); ++ ++ if (!w->devs_written.nr) { ++ bch_err(c, "unable to write journal to sufficient devices"); ++ err = -EIO; ++ } else { ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, ++ w->devs_written); ++ if (bch2_mark_replicas(c, &replicas.e)) ++ err = -EIO; ++ } ++ ++ if (err) ++ bch2_fatal_error(c); ++ ++ spin_lock(&j->lock); ++ seq = le64_to_cpu(w->data->seq); ++ ++ if (seq >= j->pin.front) ++ journal_seq_pin(j, seq)->devs = w->devs_written; ++ ++ if (!err) { ++ if (!JSET_NO_FLUSH(w->data)) { ++ j->flushed_seq_ondisk = seq; ++ j->last_seq_ondisk = w->last_seq; ++ ++ bch2_do_discards(c); ++ closure_wake_up(&c->freelist_wait); ++ } ++ } else if (!j->err_seq || seq < j->err_seq) ++ j->err_seq = seq; ++ ++ j->seq_ondisk = seq; ++ ++ /* ++ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard ++ * more buckets: ++ * ++ * Must come before signaling write completion, for ++ * bch2_fs_journal_stop(): ++ */ ++ if (j->watermark) ++ journal_reclaim_kick(&c->journal); ++ ++ /* also must come before signalling write completion: */ ++ closure_debug_destroy(cl); ++ ++ v = atomic64_read(&j->reservations.counter); ++ do { ++ old.v = new.v = v; ++ BUG_ON(journal_state_count(new, new.unwritten_idx)); ++ ++ new.unwritten_idx++; ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ bch2_journal_space_available(j); ++ ++ closure_wake_up(&w->wait); ++ journal_wake(j); ++ ++ if (!journal_state_count(new, new.unwritten_idx) && ++ journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { ++ closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); ++ } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && ++ new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { ++ struct journal_buf *buf = journal_cur_buf(j); ++ long delta = buf->expires - jiffies; ++ ++ /* ++ * We don't close a journal entry to write it while there's ++ * previous entries still in flight - the current journal entry ++ * might want to be written now: ++ */ ++ ++ mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); ++ } ++ ++ spin_unlock(&j->lock); ++} ++ ++static void journal_write_endio(struct bio *bio) ++{ ++ struct bch_dev *ca = bio->bi_private; ++ struct journal *j = &ca->fs->journal; ++ struct journal_buf *w = journal_last_unwritten_buf(j); ++ unsigned long flags; ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s", ++ le64_to_cpu(w->data->seq), ++ bch2_blk_status_to_str(bio->bi_status)) || ++ bch2_meta_write_fault("journal")) { ++ spin_lock_irqsave(&j->err_lock, flags); ++ bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); ++ spin_unlock_irqrestore(&j->err_lock, flags); ++ } ++ ++ closure_put(&j->io); ++ percpu_ref_put(&ca->io_ref); ++} ++ ++static void do_journal_write(struct closure *cl) ++{ ++ struct journal *j = container_of(cl, struct journal, io); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ struct journal_buf *w = journal_last_unwritten_buf(j); ++ struct bch_extent_ptr *ptr; ++ struct bio *bio; ++ unsigned sectors = vstruct_sectors(w->data, c->block_bits); ++ ++ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ if (!percpu_ref_tryget(&ca->io_ref)) { ++ /* XXX: fix this */ ++ bch_err(c, "missing device for journal write\n"); ++ continue; ++ } ++ ++ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], ++ sectors); ++ ++ bio = 
ca->journal.bio; ++ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); ++ bio->bi_iter.bi_sector = ptr->offset; ++ bio->bi_end_io = journal_write_endio; ++ bio->bi_private = ca; ++ ++ BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); ++ ca->prev_journal_sector = bio->bi_iter.bi_sector; ++ ++ if (!JSET_NO_FLUSH(w->data)) ++ bio->bi_opf |= REQ_FUA; ++ if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) ++ bio->bi_opf |= REQ_PREFLUSH; ++ ++ bch2_bio_map(bio, w->data, sectors << 9); ++ ++ trace_journal_write(bio); ++ closure_bio_submit(bio, cl); ++ ++ ca->journal.bucket_seq[ca->journal.cur_idx] = ++ le64_to_cpu(w->data->seq); ++ } ++ ++ continue_at(cl, journal_write_done, c->io_complete_wq); ++ return; ++} ++ ++void bch2_journal_write(struct closure *cl) ++{ ++ struct journal *j = container_of(cl, struct journal, io); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ struct journal_buf *w = journal_last_unwritten_buf(j); ++ struct jset_entry *start, *end; ++ struct jset *jset; ++ struct bio *bio; ++ struct printbuf journal_debug_buf = PRINTBUF; ++ bool validate_before_checksum = false; ++ unsigned i, sectors, bytes, u64s, nr_rw_members = 0; ++ int ret; ++ ++ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); ++ ++ journal_buf_realloc(j, w); ++ jset = w->data; ++ ++ j->write_start_time = local_clock(); ++ ++ spin_lock(&j->lock); ++ if (bch2_journal_error(j) || ++ w->noflush || ++ (!w->must_flush && ++ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && ++ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { ++ w->noflush = true; ++ SET_JSET_NO_FLUSH(jset, true); ++ jset->last_seq = 0; ++ w->last_seq = 0; ++ ++ j->nr_noflush_writes++; ++ } else { ++ j->last_flush_write = jiffies; ++ j->nr_flush_writes++; ++ } ++ spin_unlock(&j->lock); ++ ++ /* ++ * New btree roots are set by journalling them; when the journal entry ++ * gets written we have to propagate them to c->btree_roots ++ * ++ * But, every journal entry we write has to contain all the btree roots ++ * (at least for now); so after we copy btree roots to c->btree_roots we ++ * have to get any missing btree roots and add them to this journal ++ * entry: ++ */ ++ ++ bch2_journal_entries_to_btree_roots(c, jset); ++ ++ start = end = vstruct_last(jset); ++ ++ end = bch2_btree_roots_to_journal_entries(c, jset->start, end); ++ ++ bch2_journal_super_entries_add_common(c, &end, ++ le64_to_cpu(jset->seq)); ++ u64s = (u64 *) end - (u64 *) start; ++ BUG_ON(u64s > j->entry_u64s_reserved); ++ ++ le32_add_cpu(&jset->u64s, u64s); ++ BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); ++ ++ jset->magic = cpu_to_le64(jset_magic(c)); ++ jset->version = c->sb.version < bcachefs_metadata_version_bkey_renumber ++ ? 
cpu_to_le32(BCH_JSET_VERSION_OLD) ++ : cpu_to_le32(c->sb.version); ++ ++ SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); ++ SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); ++ ++ if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) ++ j->last_empty_seq = le64_to_cpu(jset->seq); ++ ++ if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) ++ validate_before_checksum = true; ++ ++ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) ++ validate_before_checksum = true; ++ ++ if (validate_before_checksum && ++ jset_validate_for_write(c, jset)) ++ goto err; ++ ++ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ jset->encrypted_start, ++ vstruct_end(jset) - (void *) jset->encrypted_start); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error decrypting journal entry: %i", ret)) ++ goto err; ++ ++ jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), ++ journal_nonce(jset), jset); ++ ++ if (!validate_before_checksum && ++ jset_validate_for_write(c, jset)) ++ goto err; ++ ++ sectors = vstruct_sectors(jset, c->block_bits); ++ BUG_ON(sectors > w->sectors); ++ ++ bytes = vstruct_bytes(jset); ++ memset((void *) jset + bytes, 0, (sectors << 9) - bytes); ++ ++retry_alloc: ++ spin_lock(&j->lock); ++ ret = journal_write_alloc(j, w, sectors); ++ ++ if (ret && j->can_discard) { ++ spin_unlock(&j->lock); ++ bch2_journal_do_discards(j); ++ goto retry_alloc; ++ } ++ ++ if (ret) ++ __bch2_journal_debug_to_text(&journal_debug_buf, j); ++ ++ /* ++ * write is allocated, no longer need to account for it in ++ * bch2_journal_space_available(): ++ */ ++ w->sectors = 0; ++ ++ /* ++ * journal entry has been compacted and allocated, recalculate space ++ * available: ++ */ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ ++ if (ret) { ++ bch_err(c, "Unable to allocate journal write:\n%s", ++ journal_debug_buf.buf); ++ printbuf_exit(&journal_debug_buf); ++ bch2_fatal_error(c); ++ continue_at(cl, journal_write_done, c->io_complete_wq); ++ return; ++ } ++ ++ w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); ++ ++ if (c->opts.nochanges) ++ goto no_io; ++ ++ for_each_rw_member(ca, c, i) ++ nr_rw_members++; ++ ++ if (nr_rw_members > 1) ++ w->separate_flush = true; ++ ++ if (!JSET_NO_FLUSH(jset) && w->separate_flush) { ++ for_each_rw_member(ca, c, i) { ++ percpu_ref_get(&ca->io_ref); ++ ++ bio = ca->journal.bio; ++ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); ++ bio->bi_end_io = journal_write_endio; ++ bio->bi_private = ca; ++ closure_bio_submit(bio, cl); ++ } ++ } ++ ++ continue_at(cl, do_journal_write, c->io_complete_wq); ++ return; ++no_io: ++ continue_at(cl, journal_write_done, c->io_complete_wq); ++ return; ++err: ++ bch2_fatal_error(c); ++ continue_at(cl, journal_write_done, c->io_complete_wq); ++} +diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h +new file mode 100644 +index 000000000000..30e995c81fc4 +--- /dev/null ++++ b/fs/bcachefs/journal_io.h +@@ -0,0 +1,59 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_IO_H ++#define _BCACHEFS_JOURNAL_IO_H ++ ++/* ++ * Only used for holding the journal entries we read in btree_journal_read() ++ * during cache_registration ++ */ ++struct journal_replay { ++ struct journal_ptr { ++ u8 dev; ++ u32 bucket; ++ u32 bucket_offset; ++ u64 sector; ++ } ptrs[BCH_REPLICAS_MAX]; ++ unsigned nr_ptrs; ++ ++ /* checksum error, but we may want to try using it anyways: */ ++ bool bad; ++ bool ignore; ++ /* must be last: */ ++ struct jset j; ++}; ++ ++static inline struct jset_entry 
*__jset_entry_type_next(struct jset *jset, ++ struct jset_entry *entry, unsigned type) ++{ ++ while (entry < vstruct_last(jset)) { ++ if (entry->type == type) ++ return entry; ++ ++ entry = vstruct_next(entry); ++ } ++ ++ return NULL; ++} ++ ++#define for_each_jset_entry_type(entry, jset, type) \ ++ for (entry = (jset)->start; \ ++ (entry = __jset_entry_type_next(jset, entry, type)); \ ++ entry = vstruct_next(entry)) ++ ++#define for_each_jset_key(k, _n, entry, jset) \ ++ for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ ++ vstruct_for_each_safe(entry, k, _n) ++ ++int bch2_journal_entry_validate(struct bch_fs *, const char *, ++ struct jset_entry *, unsigned, int, int); ++void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, ++ struct jset_entry *); ++ ++void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, ++ struct journal_replay *); ++ ++int bch2_journal_read(struct bch_fs *, u64 *, u64 *); ++ ++void bch2_journal_write(struct closure *); ++ ++#endif /* _BCACHEFS_JOURNAL_IO_H */ +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +new file mode 100644 +index 000000000000..6f0ab411c98e +--- /dev/null ++++ b/fs/bcachefs/journal_reclaim.c +@@ -0,0 +1,852 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_key_cache.h" ++#include "errcode.h" ++#include "error.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "replicas.h" ++#include "super.h" ++ ++#include ++#include ++#include ++ ++/* Free space calculations: */ ++ ++static unsigned journal_space_from(struct journal_device *ja, ++ enum journal_space_from from) ++{ ++ switch (from) { ++ case journal_space_discarded: ++ return ja->discard_idx; ++ case journal_space_clean_ondisk: ++ return ja->dirty_idx_ondisk; ++ case journal_space_clean: ++ return ja->dirty_idx; ++ default: ++ BUG(); ++ } ++} ++ ++unsigned bch2_journal_dev_buckets_available(struct journal *j, ++ struct journal_device *ja, ++ enum journal_space_from from) ++{ ++ unsigned available = (journal_space_from(ja, from) - ++ ja->cur_idx - 1 + ja->nr) % ja->nr; ++ ++ /* ++ * Don't use the last bucket unless writing the new last_seq ++ * will make another bucket available: ++ */ ++ if (available && ja->dirty_idx_ondisk == ja->dirty_idx) ++ --available; ++ ++ return available; ++} ++ ++static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) ++{ ++ union journal_preres_state old, new; ++ u64 v = atomic64_read(&j->prereserved.counter); ++ ++ do { ++ old.v = new.v = v; ++ new.remaining = u64s_remaining; ++ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, ++ old.v, new.v)) != old.v); ++} ++ ++static struct journal_space ++journal_dev_space_available(struct journal *j, struct bch_dev *ca, ++ enum journal_space_from from) ++{ ++ struct journal_device *ja = &ca->journal; ++ unsigned sectors, buckets, unwritten; ++ u64 seq; ++ ++ if (from == journal_space_total) ++ return (struct journal_space) { ++ .next_entry = ca->mi.bucket_size, ++ .total = ca->mi.bucket_size * ja->nr, ++ }; ++ ++ buckets = bch2_journal_dev_buckets_available(j, ja, from); ++ sectors = ja->sectors_free; ++ ++ /* ++ * Note that we don't allocate the space for a journal entry ++ * until we write it out - thus, account for it here: ++ */ ++ for (seq = journal_last_unwritten_seq(j); ++ seq <= journal_cur_seq(j); ++ seq++) { ++ unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors; ++ ++ if (!unwritten) ++ continue; ++ ++ /* entry won't fit on this device, skip: 
*/ ++ if (unwritten > ca->mi.bucket_size) ++ continue; ++ ++ if (unwritten >= sectors) { ++ if (!buckets) { ++ sectors = 0; ++ break; ++ } ++ ++ buckets--; ++ sectors = ca->mi.bucket_size; ++ } ++ ++ sectors -= unwritten; ++ } ++ ++ if (sectors < ca->mi.bucket_size && buckets) { ++ buckets--; ++ sectors = ca->mi.bucket_size; ++ } ++ ++ return (struct journal_space) { ++ .next_entry = sectors, ++ .total = sectors + buckets * ca->mi.bucket_size, ++ }; ++} ++ ++static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want, ++ enum journal_space_from from) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned i, pos, nr_devs = 0; ++ struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; ++ ++ BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, ++ &c->rw_devs[BCH_DATA_journal]) { ++ if (!ca->journal.nr) ++ continue; ++ ++ space = journal_dev_space_available(j, ca, from); ++ if (!space.next_entry) ++ continue; ++ ++ for (pos = 0; pos < nr_devs; pos++) ++ if (space.total > dev_space[pos].total) ++ break; ++ ++ array_insert_item(dev_space, nr_devs, pos, space); ++ } ++ rcu_read_unlock(); ++ ++ if (nr_devs < nr_devs_want) ++ return (struct journal_space) { 0, 0 }; ++ ++ /* ++ * We sorted largest to smallest, and we want the smallest out of the ++ * @nr_devs_want largest devices: ++ */ ++ return dev_space[nr_devs_want - 1]; ++} ++ ++void bch2_journal_space_available(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned clean, clean_ondisk, total; ++ s64 u64s_remaining = 0; ++ unsigned max_entry_size = min(j->buf[0].buf_size >> 9, ++ j->buf[1].buf_size >> 9); ++ unsigned i, nr_online = 0, nr_devs_want; ++ bool can_discard = false; ++ int ret = 0; ++ ++ lockdep_assert_held(&j->lock); ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, ++ &c->rw_devs[BCH_DATA_journal]) { ++ struct journal_device *ja = &ca->journal; ++ ++ if (!ja->nr) ++ continue; ++ ++ while (ja->dirty_idx != ja->cur_idx && ++ ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) ++ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; ++ ++ while (ja->dirty_idx_ondisk != ja->dirty_idx && ++ ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) ++ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; ++ ++ if (ja->discard_idx != ja->dirty_idx_ondisk) ++ can_discard = true; ++ ++ max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); ++ nr_online++; ++ } ++ rcu_read_unlock(); ++ ++ j->can_discard = can_discard; ++ ++ if (nr_online < c->opts.metadata_replicas_required) { ++ ret = JOURNAL_ERR_insufficient_devices; ++ goto out; ++ } ++ ++ nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); ++ ++ for (i = 0; i < journal_space_nr; i++) ++ j->space[i] = __journal_space_available(j, nr_devs_want, i); ++ ++ clean_ondisk = j->space[journal_space_clean_ondisk].total; ++ clean = j->space[journal_space_clean].total; ++ total = j->space[journal_space_total].total; ++ ++ if (!clean_ondisk && ++ journal_cur_seq(j) == j->seq_ondisk) { ++ struct printbuf buf = PRINTBUF; ++ ++ __bch2_journal_debug_to_text(&buf, j); ++ bch_err(c, "journal stuck\n%s", buf.buf); ++ printbuf_exit(&buf); ++ ++ /* ++ * Hack: bch2_fatal_error() calls bch2_journal_halt() which ++ * takes journal lock: ++ */ ++ spin_unlock(&j->lock); ++ bch2_fatal_error(c); ++ spin_lock(&j->lock); ++ ++ ret = JOURNAL_ERR_journal_stuck; ++ } else 
if (!j->space[journal_space_discarded].next_entry) ++ ret = JOURNAL_ERR_journal_full; ++ ++ if ((j->space[journal_space_clean_ondisk].next_entry < ++ j->space[journal_space_clean_ondisk].total) && ++ (clean - clean_ondisk <= total / 8) && ++ (clean_ondisk * 2 > clean )) ++ set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); ++ else ++ clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); ++ ++ u64s_remaining = (u64) clean << 6; ++ u64s_remaining -= (u64) total << 3; ++ u64s_remaining = max(0LL, u64s_remaining); ++ u64s_remaining /= 4; ++ u64s_remaining = min_t(u64, u64s_remaining, U32_MAX); ++out: ++ j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; ++ j->cur_entry_error = ret; ++ journal_set_remaining(j, u64s_remaining); ++ journal_set_watermark(j); ++ ++ if (!ret) ++ journal_wake(j); ++} ++ ++/* Discards - last part of journal reclaim: */ ++ ++static bool should_discard_bucket(struct journal *j, struct journal_device *ja) ++{ ++ bool ret; ++ ++ spin_lock(&j->lock); ++ ret = ja->discard_idx != ja->dirty_idx_ondisk; ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++/* ++ * Advance ja->discard_idx as long as it points to buckets that are no longer ++ * dirty, issuing discards if necessary: ++ */ ++void bch2_journal_do_discards(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned iter; ++ ++ mutex_lock(&j->discard_lock); ++ ++ for_each_rw_member(ca, c, iter) { ++ struct journal_device *ja = &ca->journal; ++ ++ while (should_discard_bucket(j, ja)) { ++ if (!c->opts.nochanges && ++ ca->mi.discard && ++ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) ++ blkdev_issue_discard(ca->disk_sb.bdev, ++ bucket_to_sector(ca, ++ ja->buckets[ja->discard_idx]), ++ ca->mi.bucket_size, GFP_NOIO, 0); ++ ++ spin_lock(&j->lock); ++ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; ++ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ } ++ } ++ ++ mutex_unlock(&j->discard_lock); ++} ++ ++/* ++ * Journal entry pinning - machinery for holding a reference on a given journal ++ * entry, holding it open to ensure it gets replayed during recovery: ++ */ ++ ++static void bch2_journal_reclaim_fast(struct journal *j) ++{ ++ struct journal_entry_pin_list temp; ++ bool popped = false; ++ ++ lockdep_assert_held(&j->lock); ++ ++ /* ++ * Unpin journal entries whose reference counts reached zero, meaning ++ * all btree nodes got written out ++ */ ++ while (!fifo_empty(&j->pin) && ++ !atomic_read(&fifo_peek_front(&j->pin).count)) { ++ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); ++ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed)); ++ BUG_ON(!fifo_pop(&j->pin, temp)); ++ popped = true; ++ } ++ ++ if (popped) ++ bch2_journal_space_available(j); ++} ++ ++void __bch2_journal_pin_put(struct journal *j, u64 seq) ++{ ++ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); ++ ++ if (atomic_dec_and_test(&pin_list->count)) ++ bch2_journal_reclaim_fast(j); ++} ++ ++void bch2_journal_pin_put(struct journal *j, u64 seq) ++{ ++ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); ++ ++ if (atomic_dec_and_test(&pin_list->count)) { ++ spin_lock(&j->lock); ++ bch2_journal_reclaim_fast(j); ++ spin_unlock(&j->lock); ++ } ++} ++ ++static inline void __journal_pin_drop(struct journal *j, ++ struct journal_entry_pin *pin) ++{ ++ struct journal_entry_pin_list *pin_list; ++ ++ if (!journal_pin_active(pin)) ++ return; ++ ++ if (j->flush_in_progress == pin) ++ j->flush_in_progress_dropped = true; ++ ++ pin_list 
= journal_seq_pin(j, pin->seq); ++ pin->seq = 0; ++ list_del_init(&pin->list); ++ ++ /* ++ * Unpinning a journal entry make make journal_next_bucket() succeed, if ++ * writing a new last_seq will now make another bucket available: ++ */ ++ if (atomic_dec_and_test(&pin_list->count) && ++ pin_list == &fifo_peek_front(&j->pin)) ++ bch2_journal_reclaim_fast(j); ++} ++ ++void bch2_journal_pin_drop(struct journal *j, ++ struct journal_entry_pin *pin) ++{ ++ spin_lock(&j->lock); ++ __journal_pin_drop(j, pin); ++ spin_unlock(&j->lock); ++} ++ ++void bch2_journal_pin_set(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ struct journal_entry_pin_list *pin_list; ++ ++ spin_lock(&j->lock); ++ ++ if (seq < journal_last_seq(j)) { ++ /* ++ * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on ++ * the src pin - with the pin dropped, the entry to pin might no ++ * longer to exist, but that means there's no longer anything to ++ * copy and we can bail out here: ++ */ ++ spin_unlock(&j->lock); ++ return; ++ } ++ ++ pin_list = journal_seq_pin(j, seq); ++ ++ __journal_pin_drop(j, pin); ++ ++ atomic_inc(&pin_list->count); ++ pin->seq = seq; ++ pin->flush = flush_fn; ++ ++ if (flush_fn == bch2_btree_key_cache_journal_flush) ++ list_add(&pin->list, &pin_list->key_cache_list); ++ else if (flush_fn) ++ list_add(&pin->list, &pin_list->list); ++ else ++ list_add(&pin->list, &pin_list->flushed); ++ spin_unlock(&j->lock); ++ ++ /* ++ * If the journal is currently full, we might want to call flush_fn ++ * immediately: ++ */ ++ journal_wake(j); ++} ++ ++/** ++ * bch2_journal_pin_flush: ensure journal pin callback is no longer running ++ */ ++void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) ++{ ++ BUG_ON(journal_pin_active(pin)); ++ ++ wait_event(j->pin_flush_wait, j->flush_in_progress != pin); ++} ++ ++/* ++ * Journal reclaim: flush references to open journal entries to reclaim space in ++ * the journal ++ * ++ * May be done by the journal code in the background as needed to free up space ++ * for more journal entries, or as part of doing a clean shutdown, or to migrate ++ * data off of a specific device: ++ */ ++ ++static struct journal_entry_pin * ++journal_get_next_pin(struct journal *j, ++ bool get_any, ++ bool get_key_cache, ++ u64 max_seq, u64 *seq) ++{ ++ struct journal_entry_pin_list *pin_list; ++ struct journal_entry_pin *ret = NULL; ++ ++ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { ++ if (*seq > max_seq && !get_any && !get_key_cache) ++ break; ++ ++ if (*seq <= max_seq || get_any) { ++ ret = list_first_entry_or_null(&pin_list->list, ++ struct journal_entry_pin, list); ++ if (ret) ++ return ret; ++ } ++ ++ if (*seq <= max_seq || get_any || get_key_cache) { ++ ret = list_first_entry_or_null(&pin_list->key_cache_list, ++ struct journal_entry_pin, list); ++ if (ret) ++ return ret; ++ } ++ } ++ ++ return NULL; ++} ++ ++/* returns true if we did work */ ++static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, ++ unsigned min_any, ++ unsigned min_key_cache) ++{ ++ struct journal_entry_pin *pin; ++ size_t nr_flushed = 0; ++ journal_pin_flush_fn flush_fn; ++ u64 seq; ++ int err; ++ ++ lockdep_assert_held(&j->reclaim_lock); ++ ++ while (1) { ++ cond_resched(); ++ ++ j->last_flushed = jiffies; ++ ++ spin_lock(&j->lock); ++ pin = journal_get_next_pin(j, ++ min_any != 0, ++ min_key_cache != 0, ++ seq_to_flush, &seq); ++ if (pin) { ++ BUG_ON(j->flush_in_progress); ++ j->flush_in_progress = pin; ++ 
j->flush_in_progress_dropped = false; ++ flush_fn = pin->flush; ++ } ++ spin_unlock(&j->lock); ++ ++ if (!pin) ++ break; ++ ++ if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush) ++ min_key_cache--; ++ ++ if (min_any) ++ min_any--; ++ ++ err = flush_fn(j, pin, seq); ++ ++ spin_lock(&j->lock); ++ /* Pin might have been dropped or rearmed: */ ++ if (likely(!err && !j->flush_in_progress_dropped)) ++ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); ++ j->flush_in_progress = NULL; ++ j->flush_in_progress_dropped = false; ++ spin_unlock(&j->lock); ++ ++ wake_up(&j->pin_flush_wait); ++ ++ if (err) ++ break; ++ ++ nr_flushed++; ++ } ++ ++ return nr_flushed; ++} ++ ++static u64 journal_seq_to_flush(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ u64 seq_to_flush = 0; ++ unsigned iter; ++ ++ spin_lock(&j->lock); ++ ++ for_each_rw_member(ca, c, iter) { ++ struct journal_device *ja = &ca->journal; ++ unsigned nr_buckets, bucket_to_flush; ++ ++ if (!ja->nr) ++ continue; ++ ++ /* Try to keep the journal at most half full: */ ++ nr_buckets = ja->nr / 2; ++ ++ /* And include pre-reservations: */ ++ nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, ++ (ca->mi.bucket_size << 6) - ++ journal_entry_overhead(j)); ++ ++ nr_buckets = min(nr_buckets, ja->nr); ++ ++ bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; ++ seq_to_flush = max(seq_to_flush, ++ ja->bucket_seq[bucket_to_flush]); ++ } ++ ++ /* Also flush if the pin fifo is more than half full */ ++ seq_to_flush = max_t(s64, seq_to_flush, ++ (s64) journal_cur_seq(j) - ++ (j->pin.size >> 1)); ++ spin_unlock(&j->lock); ++ ++ return seq_to_flush; ++} ++ ++/** ++ * bch2_journal_reclaim - free up journal buckets ++ * ++ * Background journal reclaim writes out btree nodes. It should be run ++ * early enough so that we never completely run out of journal buckets. ++ * ++ * High watermarks for triggering background reclaim: ++ * - FIFO has fewer than 512 entries left ++ * - fewer than 25% journal buckets free ++ * ++ * Background reclaim runs until low watermarks are reached: ++ * - FIFO has more than 1024 entries left ++ * - more than 50% journal buckets free ++ * ++ * As long as a reclaim can complete in the time it takes to fill up ++ * 512 journal entries or 25% of all journal buckets, then ++ * journal_next_bucket() should not stall. 
++ */ ++static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ u64 seq_to_flush; ++ size_t min_nr, min_key_cache, nr_flushed; ++ unsigned flags; ++ int ret = 0; ++ ++ /* ++ * We can't invoke memory reclaim while holding the reclaim_lock - ++ * journal reclaim is required to make progress for memory reclaim ++ * (cleaning the caches), so we can't get stuck in memory reclaim while ++ * we're holding the reclaim lock: ++ */ ++ lockdep_assert_held(&j->reclaim_lock); ++ flags = memalloc_noreclaim_save(); ++ ++ do { ++ if (kthread && kthread_should_stop()) ++ break; ++ ++ if (bch2_journal_error(j)) { ++ ret = -EIO; ++ break; ++ } ++ ++ bch2_journal_do_discards(j); ++ ++ seq_to_flush = journal_seq_to_flush(j); ++ min_nr = 0; ++ ++ /* ++ * If it's been longer than j->reclaim_delay_ms since we last flushed, ++ * make sure to flush at least one journal pin: ++ */ ++ if (time_after(jiffies, j->last_flushed + ++ msecs_to_jiffies(c->opts.journal_reclaim_delay))) ++ min_nr = 1; ++ ++ if (j->prereserved.reserved * 4 > j->prereserved.remaining) ++ min_nr = 1; ++ ++ if (fifo_free(&j->pin) <= 32) ++ min_nr = 1; ++ ++ if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) ++ min_nr = 1; ++ ++ min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); ++ ++ trace_journal_reclaim_start(c, direct, kicked, ++ min_nr, min_key_cache, ++ j->prereserved.reserved, ++ j->prereserved.remaining, ++ atomic_read(&c->btree_cache.dirty), ++ c->btree_cache.used, ++ atomic_long_read(&c->btree_key_cache.nr_dirty), ++ atomic_long_read(&c->btree_key_cache.nr_keys)); ++ ++ nr_flushed = journal_flush_pins(j, seq_to_flush, ++ min_nr, min_key_cache); ++ ++ if (direct) ++ j->nr_direct_reclaim += nr_flushed; ++ else ++ j->nr_background_reclaim += nr_flushed; ++ trace_journal_reclaim_finish(c, nr_flushed); ++ ++ if (nr_flushed) ++ wake_up(&j->reclaim_wait); ++ } while ((min_nr || min_key_cache) && nr_flushed && !direct); ++ ++ memalloc_noreclaim_restore(flags); ++ ++ return ret; ++} ++ ++int bch2_journal_reclaim(struct journal *j) ++{ ++ return __bch2_journal_reclaim(j, true, true); ++} ++ ++static int bch2_journal_reclaim_thread(void *arg) ++{ ++ struct journal *j = arg; ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ unsigned long delay, now; ++ bool journal_empty; ++ int ret = 0; ++ ++ set_freezable(); ++ ++ j->last_flushed = jiffies; ++ ++ while (!ret && !kthread_should_stop()) { ++ bool kicked = j->reclaim_kicked; ++ ++ j->reclaim_kicked = false; ++ ++ mutex_lock(&j->reclaim_lock); ++ ret = __bch2_journal_reclaim(j, false, kicked); ++ mutex_unlock(&j->reclaim_lock); ++ ++ now = jiffies; ++ delay = msecs_to_jiffies(c->opts.journal_reclaim_delay); ++ j->next_reclaim = j->last_flushed + delay; ++ ++ if (!time_in_range(j->next_reclaim, now, now + delay)) ++ j->next_reclaim = now + delay; ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (kthread_should_stop()) ++ break; ++ if (j->reclaim_kicked) ++ break; ++ ++ spin_lock(&j->lock); ++ journal_empty = fifo_empty(&j->pin); ++ spin_unlock(&j->lock); ++ ++ if (journal_empty) ++ freezable_schedule(); ++ else if (time_after(j->next_reclaim, jiffies)) ++ freezable_schedule_timeout(j->next_reclaim - jiffies); ++ else ++ break; ++ } ++ __set_current_state(TASK_RUNNING); ++ } ++ ++ return 0; ++} ++ ++void bch2_journal_reclaim_stop(struct journal *j) ++{ ++ struct task_struct *p = 
j->reclaim_thread; ++ ++ j->reclaim_thread = NULL; ++ ++ if (p) { ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++int bch2_journal_reclaim_start(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct task_struct *p; ++ int ret; ++ ++ if (j->reclaim_thread) ++ return 0; ++ ++ p = kthread_create(bch2_journal_reclaim_thread, j, ++ "bch-reclaim/%s", c->name); ++ ret = PTR_ERR_OR_ZERO(p); ++ if (ret) { ++ bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ++ get_task_struct(p); ++ j->reclaim_thread = p; ++ wake_up_process(p); ++ return 0; ++} ++ ++static int journal_flush_done(struct journal *j, u64 seq_to_flush, ++ bool *did_work) ++{ ++ int ret; ++ ++ ret = bch2_journal_error(j); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&j->reclaim_lock); ++ ++ if (journal_flush_pins(j, seq_to_flush, 0, 0)) ++ *did_work = true; ++ ++ spin_lock(&j->lock); ++ /* ++ * If journal replay hasn't completed, the unreplayed journal entries ++ * hold refs on their corresponding sequence numbers ++ */ ++ ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || ++ journal_last_seq(j) > seq_to_flush || ++ !fifo_used(&j->pin); ++ ++ spin_unlock(&j->lock); ++ mutex_unlock(&j->reclaim_lock); ++ ++ return ret; ++} ++ ++bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) ++{ ++ bool did_work = false; ++ ++ if (!test_bit(JOURNAL_STARTED, &j->flags)) ++ return false; ++ ++ closure_wait_event(&j->async_wait, ++ journal_flush_done(j, seq_to_flush, &did_work)); ++ ++ return did_work; ++} ++ ++int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_entry_pin_list *p; ++ u64 iter, seq = 0; ++ int ret = 0; ++ ++ spin_lock(&j->lock); ++ fifo_for_each_entry_ptr(p, &j->pin, iter) ++ if (dev_idx >= 0 ++ ? 
bch2_dev_list_has_dev(p->devs, dev_idx) ++ : p->devs.nr < c->opts.metadata_replicas) ++ seq = iter; ++ spin_unlock(&j->lock); ++ ++ bch2_journal_flush_pins(j, seq); ++ ++ ret = bch2_journal_error(j); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&c->replicas_gc_lock); ++ bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); ++ ++ seq = 0; ++ ++ spin_lock(&j->lock); ++ while (!ret) { ++ struct bch_replicas_padded replicas; ++ ++ seq = max(seq, journal_last_seq(j)); ++ if (seq >= j->pin.back) ++ break; ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, ++ journal_seq_pin(j, seq)->devs); ++ seq++; ++ ++ spin_unlock(&j->lock); ++ ret = bch2_mark_replicas(c, &replicas.e); ++ spin_lock(&j->lock); ++ } ++ spin_unlock(&j->lock); ++ ++ ret = bch2_replicas_gc_end(c, ret); ++ mutex_unlock(&c->replicas_gc_lock); ++ ++ return ret; ++} +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +new file mode 100644 +index 000000000000..0fd1af120db5 +--- /dev/null ++++ b/fs/bcachefs/journal_reclaim.h +@@ -0,0 +1,86 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_RECLAIM_H ++#define _BCACHEFS_JOURNAL_RECLAIM_H ++ ++#define JOURNAL_PIN (32 * 1024) ++ ++static inline void journal_reclaim_kick(struct journal *j) ++{ ++ struct task_struct *p = READ_ONCE(j->reclaim_thread); ++ ++ j->reclaim_kicked = true; ++ if (p) ++ wake_up_process(p); ++} ++ ++unsigned bch2_journal_dev_buckets_available(struct journal *, ++ struct journal_device *, ++ enum journal_space_from); ++void bch2_journal_space_available(struct journal *); ++ ++static inline bool journal_pin_active(struct journal_entry_pin *pin) ++{ ++ return pin->seq != 0; ++} ++ ++static inline struct journal_entry_pin_list * ++journal_seq_pin(struct journal *j, u64 seq) ++{ ++ EBUG_ON(seq < j->pin.front || seq >= j->pin.back); ++ ++ return &j->pin.data[seq & j->pin.mask]; ++} ++ ++void __bch2_journal_pin_put(struct journal *, u64); ++void bch2_journal_pin_put(struct journal *, u64); ++void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); ++ ++void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *, ++ journal_pin_flush_fn); ++ ++static inline void bch2_journal_pin_add(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) ++ bch2_journal_pin_set(j, seq, pin, flush_fn); ++} ++ ++static inline void bch2_journal_pin_copy(struct journal *j, ++ struct journal_entry_pin *dst, ++ struct journal_entry_pin *src, ++ journal_pin_flush_fn flush_fn) ++{ ++ /* Guard against racing with journal_pin_drop(src): */ ++ u64 seq = READ_ONCE(src->seq); ++ ++ if (seq) ++ bch2_journal_pin_add(j, seq, dst, flush_fn); ++} ++ ++static inline void bch2_journal_pin_update(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ if (unlikely(!journal_pin_active(pin) || pin->seq < seq)) ++ bch2_journal_pin_set(j, seq, pin, flush_fn); ++} ++ ++void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); ++ ++void bch2_journal_do_discards(struct journal *); ++int bch2_journal_reclaim(struct journal *); ++ ++void bch2_journal_reclaim_stop(struct journal *); ++int bch2_journal_reclaim_start(struct journal *); ++ ++bool bch2_journal_flush_pins(struct journal *, u64); ++ ++static inline bool bch2_journal_flush_all_pins(struct journal *j) ++{ ++ return bch2_journal_flush_pins(j, U64_MAX); ++} ++ ++int bch2_journal_flush_device_pins(struct journal *, int); 
++ ++#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ +diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c +new file mode 100644 +index 000000000000..001cecec1291 +--- /dev/null ++++ b/fs/bcachefs/journal_sb.c +@@ -0,0 +1,220 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "journal_sb.h" ++#include "darray.h" ++ ++#include ++ ++/* BCH_SB_FIELD_journal: */ ++ ++static int u64_cmp(const void *_l, const void *_r) ++{ ++ const u64 *l = _l; ++ const u64 *r = _r; ++ ++ return cmp_int(*l, *r); ++} ++ ++static int bch2_sb_journal_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_journal *journal = field_to_type(f, journal); ++ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; ++ int ret = -EINVAL; ++ unsigned nr; ++ unsigned i; ++ u64 *b; ++ ++ nr = bch2_nr_journal_buckets(journal); ++ if (!nr) ++ return 0; ++ ++ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); ++ if (!b) ++ return -ENOMEM; ++ ++ for (i = 0; i < nr; i++) ++ b[i] = le64_to_cpu(journal->buckets[i]); ++ ++ sort(b, nr, sizeof(u64), u64_cmp, NULL); ++ ++ if (!b[0]) { ++ prt_printf(err, "journal bucket at sector 0"); ++ goto err; ++ } ++ ++ if (b[0] < le16_to_cpu(m->first_bucket)) { ++ prt_printf(err, "journal bucket %llu before first bucket %u", ++ b[0], le16_to_cpu(m->first_bucket)); ++ goto err; ++ } ++ ++ if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { ++ prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", ++ b[nr - 1], le64_to_cpu(m->nbuckets)); ++ goto err; ++ } ++ ++ for (i = 0; i + 1 < nr; i++) ++ if (b[i] == b[i + 1]) { ++ prt_printf(err, "duplicate journal buckets %llu", b[i]); ++ goto err; ++ } ++ ++ ret = 0; ++err: ++ kfree(b); ++ return ret; ++} ++ ++static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal *journal = field_to_type(f, journal); ++ unsigned i, nr = bch2_nr_journal_buckets(journal); ++ ++ prt_printf(out, "Buckets: "); ++ for (i = 0; i < nr; i++) ++ prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i])); ++ prt_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_journal = { ++ .validate = bch2_sb_journal_validate, ++ .to_text = bch2_sb_journal_to_text, ++}; ++ ++struct u64_range { ++ u64 start; ++ u64 end; ++}; ++ ++static int u64_range_cmp(const void *_l, const void *_r) ++{ ++ const struct u64_range *l = _l; ++ const struct u64_range *r = _r; ++ ++ return cmp_int(l->start, r->start); ++} ++ ++static int bch2_sb_journal_v2_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); ++ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; ++ int ret = -EINVAL; ++ unsigned nr; ++ unsigned i; ++ struct u64_range *b; ++ ++ nr = bch2_sb_field_journal_v2_nr_entries(journal); ++ if (!nr) ++ return 0; ++ ++ b = kmalloc_array(sizeof(*b), nr, GFP_KERNEL); ++ if (!b) ++ return -ENOMEM; ++ ++ for (i = 0; i < nr; i++) { ++ b[i].start = le64_to_cpu(journal->d[i].start); ++ b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); ++ } ++ ++ sort(b, nr, sizeof(*b), u64_range_cmp, NULL); ++ ++ if (!b[0].start) { ++ prt_printf(err, "journal bucket at sector 0"); ++ goto err; ++ } ++ ++ if (b[0].start < le16_to_cpu(m->first_bucket)) { ++ prt_printf(err, "journal bucket %llu before first bucket %u", ++ b[0].start, le16_to_cpu(m->first_bucket)); ++ goto err; ++ } ++ ++ if (b[nr - 1].end 
> le64_to_cpu(m->nbuckets)) { ++ prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", ++ b[nr - 1].end - 1, le64_to_cpu(m->nbuckets)); ++ goto err; ++ } ++ ++ for (i = 0; i + 1 < nr; i++) { ++ if (b[i].end > b[i + 1].start) { ++ prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", ++ b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); ++ goto err; ++ } ++ } ++ ++ ret = 0; ++err: ++ kfree(b); ++ return ret; ++} ++ ++static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); ++ unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal); ++ ++ prt_printf(out, "Buckets: "); ++ for (i = 0; i < nr; i++) ++ prt_printf(out, " %llu-%llu", ++ le64_to_cpu(journal->d[i].start), ++ le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr)); ++ prt_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { ++ .validate = bch2_sb_journal_v2_validate, ++ .to_text = bch2_sb_journal_v2_to_text, ++}; ++ ++int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct bch_sb_field_journal_v2 *j; ++ unsigned i, dst = 0, nr = 1; ++ ++ if (c) ++ lockdep_assert_held(&c->sb_lock); ++ ++ if (!ja->nr) { ++ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); ++ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); ++ return 0; ++ } ++ ++ for (i = 0; i + 1 < ja->nr; i++) ++ if (ja->buckets[i] + 1 != ja->buckets[i + 1]) ++ nr++; ++ ++ j = bch2_sb_resize_journal_v2(&ca->disk_sb, ++ (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64)); ++ if (!j) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); ++ ++ j->d[dst].start = le64_to_cpu(ja->buckets[0]); ++ j->d[dst].nr = le64_to_cpu(1); ++ ++ for (i = 1; i < ja->nr; i++) { ++ if (ja->buckets[i] == ja->buckets[i - 1] + 1) { ++ le64_add_cpu(&j->d[dst].nr, 1); ++ } else { ++ dst++; ++ j->d[dst].start = le64_to_cpu(ja->buckets[i]); ++ j->d[dst].nr = le64_to_cpu(1); ++ } ++ } ++ ++ BUG_ON(dst + 1 != nr); ++ ++ return 0; ++} +diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h +new file mode 100644 +index 000000000000..a39192e9f6f4 +--- /dev/null ++++ b/fs/bcachefs/journal_sb.h +@@ -0,0 +1,24 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#include "super-io.h" ++#include "vstructs.h" ++ ++static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) ++{ ++ return j ++ ? 
(__le64 *) vstruct_end(&j->field) - j->buckets ++ : 0; ++} ++ ++static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j) ++{ ++ if (!j) ++ return 0; ++ ++ return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0]; ++} ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_journal; ++extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; ++ ++int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *); +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +new file mode 100644 +index 000000000000..5c555b3703c0 +--- /dev/null ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -0,0 +1,322 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_iter.h" ++#include "eytzinger.h" ++#include "journal_seq_blacklist.h" ++#include "super-io.h" ++ ++/* ++ * journal_seq_blacklist machinery: ++ * ++ * To guarantee order of btree updates after a crash, we need to detect when a ++ * btree node entry (bset) is newer than the newest journal entry that was ++ * successfully written, and ignore it - effectively ignoring any btree updates ++ * that didn't make it into the journal. ++ * ++ * If we didn't do this, we might have two btree nodes, a and b, both with ++ * updates that weren't written to the journal yet: if b was updated after a, ++ * but b was flushed and not a - oops; on recovery we'll find that the updates ++ * to b happened, but not the updates to a that happened before it. ++ * ++ * Ignoring bsets that are newer than the newest journal entry is always safe, ++ * because everything they contain will also have been journalled - and must ++ * still be present in the journal on disk until a journal entry has been ++ * written _after_ that bset was written. ++ * ++ * To accomplish this, bsets record the newest journal sequence number they ++ * contain updates for; then, on startup, the btree code queries the journal ++ * code to ask "Is this sequence number newer than the newest journal entry? If ++ * so, ignore it." ++ * ++ * When this happens, we must blacklist that journal sequence number: the ++ * journal must not write any entries with that sequence number, and it must ++ * record that it was blacklisted so that a) on recovery we don't think we have ++ * missing journal entries and b) so that the btree code continues to ignore ++ * that bset, until that btree node is rewritten. 
++ */ ++ ++static unsigned sb_blacklist_u64s(unsigned nr) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ ++ return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); ++} ++ ++static struct bch_sb_field_journal_seq_blacklist * ++blacklist_entry_try_merge(struct bch_fs *c, ++ struct bch_sb_field_journal_seq_blacklist *bl, ++ unsigned i) ++{ ++ unsigned nr = blacklist_nr_entries(bl); ++ ++ if (le64_to_cpu(bl->start[i].end) >= ++ le64_to_cpu(bl->start[i + 1].start)) { ++ bl->start[i].end = bl->start[i + 1].end; ++ --nr; ++ memmove(&bl->start[i], ++ &bl->start[i + 1], ++ sizeof(bl->start[0]) * (nr - i)); ++ ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ sb_blacklist_u64s(nr)); ++ BUG_ON(!bl); ++ } ++ ++ return bl; ++} ++ ++static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, ++ u64 start, u64 end) ++{ ++ return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); ++} ++ ++int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ unsigned i, nr; ++ int ret = 0; ++ ++ mutex_lock(&c->sb_lock); ++ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ nr = blacklist_nr_entries(bl); ++ ++ for (i = 0; i < nr; i++) { ++ struct journal_seq_blacklist_entry *e = ++ bl->start + i; ++ ++ if (bl_entry_contig_or_overlaps(e, start, end)) { ++ e->start = cpu_to_le64(min(start, le64_to_cpu(e->start))); ++ e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); ++ ++ if (i + 1 < nr) ++ bl = blacklist_entry_try_merge(c, ++ bl, i); ++ if (i) ++ bl = blacklist_entry_try_merge(c, ++ bl, i - 1); ++ goto out_write_sb; ++ } ++ } ++ ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ sb_blacklist_u64s(nr + 1)); ++ if (!bl) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ bl->start[nr].start = cpu_to_le64(start); ++ bl->start[nr].end = cpu_to_le64(end); ++out_write_sb: ++ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); ++ ++ ret = bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ return ret ?: bch2_blacklist_table_initialize(c); ++} ++ ++static int journal_seq_blacklist_table_cmp(const void *_l, ++ const void *_r, size_t size) ++{ ++ const struct journal_seq_blacklist_table_entry *l = _l; ++ const struct journal_seq_blacklist_table_entry *r = _r; ++ ++ return cmp_int(l->start, r->start); ++} ++ ++bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, ++ bool dirty) ++{ ++ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; ++ struct journal_seq_blacklist_table_entry search = { .start = seq }; ++ int idx; ++ ++ if (!t) ++ return false; ++ ++ idx = eytzinger0_find_le(t->entries, t->nr, ++ sizeof(t->entries[0]), ++ journal_seq_blacklist_table_cmp, ++ &search); ++ if (idx < 0) ++ return false; ++ ++ BUG_ON(t->entries[idx].start > seq); ++ ++ if (seq >= t->entries[idx].end) ++ return false; ++ ++ if (dirty) ++ t->entries[idx].dirty = true; ++ return true; ++} ++ ++int bch2_blacklist_table_initialize(struct bch_fs *c) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ struct journal_seq_blacklist_table *t; ++ unsigned i, nr = blacklist_nr_entries(bl); ++ ++ if (!bl) ++ return 0; ++ ++ t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, ++ GFP_KERNEL); ++ if (!t) ++ return -ENOMEM; ++ ++ t->nr = nr; ++ ++ for (i = 0; i < nr; i++) { ++ t->entries[i].start = le64_to_cpu(bl->start[i].start); ++ t->entries[i].end = le64_to_cpu(bl->start[i].end); ++ } 
++ ++ eytzinger0_sort(t->entries, ++ t->nr, ++ sizeof(t->entries[0]), ++ journal_seq_blacklist_table_cmp, ++ NULL); ++ ++ kfree(c->journal_seq_blacklist_table); ++ c->journal_seq_blacklist_table = t; ++ return 0; ++} ++ ++static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ field_to_type(f, journal_seq_blacklist); ++ unsigned i, nr = blacklist_nr_entries(bl); ++ ++ for (i = 0; i < nr; i++) { ++ struct journal_seq_blacklist_entry *e = bl->start + i; ++ ++ if (le64_to_cpu(e->start) >= ++ le64_to_cpu(e->end)) { ++ prt_printf(err, "entry %u start >= end (%llu >= %llu)", ++ i, le64_to_cpu(e->start), le64_to_cpu(e->end)); ++ return -EINVAL; ++ } ++ ++ if (i + 1 < nr && ++ le64_to_cpu(e[0].end) > ++ le64_to_cpu(e[1].start)) { ++ prt_printf(err, "entry %u out of order with next entry (%llu > %llu)", ++ i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); ++ return -EINVAL; ++ } ++ } ++ ++ return 0; ++} ++ ++static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ field_to_type(f, journal_seq_blacklist); ++ struct journal_seq_blacklist_entry *i; ++ unsigned nr = blacklist_nr_entries(bl); ++ ++ for (i = bl->start; i < bl->start + nr; i++) { ++ if (i != bl->start) ++ prt_printf(out, " "); ++ ++ prt_printf(out, "%llu-%llu", ++ le64_to_cpu(i->start), ++ le64_to_cpu(i->end)); ++ } ++ prt_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { ++ .validate = bch2_sb_journal_seq_blacklist_validate, ++ .to_text = bch2_sb_journal_seq_blacklist_to_text ++}; ++ ++void bch2_blacklist_entries_gc(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, ++ journal_seq_blacklist_gc_work); ++ struct journal_seq_blacklist_table *t; ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ struct journal_seq_blacklist_entry *src, *dst; ++ struct btree_trans trans; ++ unsigned i, nr, new_nr; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct btree_iter iter; ++ struct btree *b; ++ ++ bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, ++ 0, 0, BTREE_ITER_PREFETCH); ++retry: ++ bch2_trans_begin(&trans); ++ ++ b = bch2_btree_iter_peek_node(&iter); ++ ++ while (!(ret = PTR_ERR_OR_ZERO(b)) && ++ b && ++ !test_bit(BCH_FS_STOPPING, &c->flags)) ++ b = bch2_btree_iter_next_node(&iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ } ++ ++ bch2_trans_exit(&trans); ++ if (ret) ++ return; ++ ++ mutex_lock(&c->sb_lock); ++ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ if (!bl) ++ goto out; ++ ++ nr = blacklist_nr_entries(bl); ++ dst = bl->start; ++ ++ t = c->journal_seq_blacklist_table; ++ BUG_ON(nr != t->nr); ++ ++ for (src = bl->start, i = eytzinger0_first(t->nr); ++ src < bl->start + nr; ++ src++, i = eytzinger0_next(i, nr)) { ++ BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); ++ BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); ++ ++ if (t->entries[i].dirty) ++ *dst++ = *src; ++ } ++ ++ new_nr = dst - bl->start; ++ ++ bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); ++ ++ if (new_nr != nr) { ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ new_nr ? 
sb_blacklist_u64s(new_nr) : 0); ++ BUG_ON(new_nr && !bl); ++ ++ if (!new_nr) ++ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); ++ ++ bch2_write_super(c); ++ } ++out: ++ mutex_unlock(&c->sb_lock); ++} +diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h +new file mode 100644 +index 000000000000..afb886ec8e25 +--- /dev/null ++++ b/fs/bcachefs/journal_seq_blacklist.h +@@ -0,0 +1,22 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H ++#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H ++ ++static inline unsigned ++blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) ++{ ++ return bl ++ ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) / ++ sizeof(struct journal_seq_blacklist_entry)) ++ : 0; ++} ++ ++bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); ++int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); ++int bch2_blacklist_table_initialize(struct bch_fs *); ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; ++ ++void bch2_blacklist_entries_gc(struct work_struct *); ++ ++#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +new file mode 100644 +index 000000000000..a6cdb885ad41 +--- /dev/null ++++ b/fs/bcachefs/journal_types.h +@@ -0,0 +1,340 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_TYPES_H ++#define _BCACHEFS_JOURNAL_TYPES_H ++ ++#include ++#include ++ ++#include "alloc_types.h" ++#include "super_types.h" ++#include "fifo.h" ++ ++#define JOURNAL_BUF_BITS 2 ++#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) ++#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) ++ ++/* ++ * We put JOURNAL_BUF_NR of these in struct journal; we used them for writes to ++ * the journal that are being staged or in flight. ++ */ ++struct journal_buf { ++ struct jset *data; ++ ++ __BKEY_PADDED(key, BCH_REPLICAS_MAX); ++ struct bch_devs_list devs_written; ++ ++ struct closure_waitlist wait; ++ u64 last_seq; /* copy of data->last_seq */ ++ long expires; ++ u64 flush_time; ++ ++ unsigned buf_size; /* size in bytes of @data */ ++ unsigned sectors; /* maximum size for current entry */ ++ unsigned disk_sectors; /* maximum size entry could have been, if ++ buf_size was bigger */ ++ unsigned u64s_reserved; ++ bool noflush; /* write has already been kicked off, and was noflush */ ++ bool must_flush; /* something wants a flush */ ++ bool separate_flush; ++}; ++ ++/* ++ * Something that makes a journal entry dirty - i.e. 
a btree node that has to be ++ * flushed: ++ */ ++ ++struct journal_entry_pin_list { ++ struct list_head list; ++ struct list_head key_cache_list; ++ struct list_head flushed; ++ atomic_t count; ++ struct bch_devs_list devs; ++}; ++ ++struct journal; ++struct journal_entry_pin; ++typedef int (*journal_pin_flush_fn)(struct journal *j, ++ struct journal_entry_pin *, u64); ++ ++struct journal_entry_pin { ++ struct list_head list; ++ journal_pin_flush_fn flush; ++ u64 seq; ++}; ++ ++struct journal_res { ++ bool ref; ++ u8 idx; ++ u16 u64s; ++ u32 offset; ++ u64 seq; ++}; ++ ++/* ++ * For reserving space in the journal prior to getting a reservation on a ++ * particular journal entry: ++ */ ++struct journal_preres { ++ unsigned u64s; ++}; ++ ++union journal_res_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ u64 cur_entry_offset:20, ++ idx:2, ++ unwritten_idx:2, ++ buf0_count:10, ++ buf1_count:10, ++ buf2_count:10, ++ buf3_count:10; ++ }; ++}; ++ ++union journal_preres_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ u64 waiting:1, ++ reserved:31, ++ remaining:32; ++ }; ++}; ++ ++/* bytes: */ ++#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ ++#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ ++ ++/* ++ * We stash some journal state as sentinal values in cur_entry_offset: ++ * note - cur_entry_offset is in units of u64s ++ */ ++#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) ++ ++#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) ++#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) ++ ++struct journal_space { ++ /* Units of 512 bytes sectors: */ ++ unsigned next_entry; /* How big the next journal entry can be */ ++ unsigned total; ++}; ++ ++enum journal_space_from { ++ journal_space_discarded, ++ journal_space_clean_ondisk, ++ journal_space_clean, ++ journal_space_total, ++ journal_space_nr, ++}; ++ ++enum { ++ JOURNAL_REPLAY_DONE, ++ JOURNAL_STARTED, ++ JOURNAL_MAY_SKIP_FLUSH, ++}; ++ ++#define JOURNAL_WATERMARKS() \ ++ x(any) \ ++ x(copygc) \ ++ x(reserved) ++ ++enum journal_watermark { ++#define x(n) JOURNAL_WATERMARK_##n, ++ JOURNAL_WATERMARKS() ++#undef x ++}; ++ ++#define JOURNAL_WATERMARK_MASK 3 ++ ++/* Reasons we may fail to get a journal reservation: */ ++#define JOURNAL_ERRORS() \ ++ x(ok) \ ++ x(blocked) \ ++ x(max_in_flight) \ ++ x(journal_full) \ ++ x(journal_pin_full) \ ++ x(journal_stuck) \ ++ x(insufficient_devices) ++ ++enum journal_errors { ++#define x(n) JOURNAL_ERR_##n, ++ JOURNAL_ERRORS() ++#undef x ++}; ++ ++/* Embedded in struct bch_fs */ ++struct journal { ++ /* Fastpath stuff up front: */ ++ ++ unsigned long flags; ++ ++ union journal_res_state reservations; ++ enum journal_watermark watermark; ++ ++ /* Max size of current journal entry */ ++ unsigned cur_entry_u64s; ++ unsigned cur_entry_sectors; ++ ++ /* ++ * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if ++ * insufficient devices: ++ */ ++ enum journal_errors cur_entry_error; ++ ++ union journal_preres_state prereserved; ++ ++ /* Reserved space in journal entry to be used just prior to write */ ++ unsigned entry_u64s_reserved; ++ ++ unsigned buf_size_want; ++ ++ /* ++ * Two journal entries -- one is currently open for new entries, the ++ * other is possibly being written out. 
++ */ ++ struct journal_buf buf[JOURNAL_BUF_NR]; ++ ++ spinlock_t lock; ++ ++ /* if nonzero, we may not open a new journal entry: */ ++ unsigned blocked; ++ ++ /* Used when waiting because the journal was full */ ++ wait_queue_head_t wait; ++ struct closure_waitlist async_wait; ++ struct closure_waitlist preres_wait; ++ ++ struct closure io; ++ struct delayed_work write_work; ++ ++ /* Sequence number of most recent journal entry (last entry in @pin) */ ++ atomic64_t seq; ++ ++ /* seq, last_seq from the most recent journal entry successfully written */ ++ u64 seq_ondisk; ++ u64 flushed_seq_ondisk; ++ u64 last_seq_ondisk; ++ u64 err_seq; ++ u64 last_empty_seq; ++ ++ /* ++ * FIFO of journal entries whose btree updates have not yet been ++ * written out. ++ * ++ * Each entry is a reference count. The position in the FIFO is the ++ * entry's sequence number relative to @seq. ++ * ++ * The journal entry itself holds a reference count, put when the ++ * journal entry is written out. Each btree node modified by the journal ++ * entry also holds a reference count, put when the btree node is ++ * written. ++ * ++ * When a reference count reaches zero, the journal entry is no longer ++ * needed. When all journal entries in the oldest journal bucket are no ++ * longer needed, the bucket can be discarded and reused. ++ */ ++ struct { ++ u64 front, back, size, mask; ++ struct journal_entry_pin_list *data; ++ } pin; ++ ++ struct journal_space space[journal_space_nr]; ++ ++ u64 replay_journal_seq; ++ u64 replay_journal_seq_end; ++ ++ struct write_point wp; ++ spinlock_t err_lock; ++ ++ struct mutex reclaim_lock; ++ /* ++ * Used for waiting until journal reclaim has freed up space in the ++ * journal: ++ */ ++ wait_queue_head_t reclaim_wait; ++ struct task_struct *reclaim_thread; ++ bool reclaim_kicked; ++ unsigned long next_reclaim; ++ u64 nr_direct_reclaim; ++ u64 nr_background_reclaim; ++ ++ unsigned long last_flushed; ++ struct journal_entry_pin *flush_in_progress; ++ bool flush_in_progress_dropped; ++ wait_queue_head_t pin_flush_wait; ++ ++ /* protects advancing ja->discard_idx: */ ++ struct mutex discard_lock; ++ bool can_discard; ++ ++ unsigned long last_flush_write; ++ ++ u64 res_get_blocked_start; ++ u64 write_start_time; ++ ++ u64 nr_flush_writes; ++ u64 nr_noflush_writes; ++ ++ struct time_stats *flush_write_time; ++ struct time_stats *noflush_write_time; ++ struct time_stats *blocked_time; ++ struct time_stats *flush_seq_time; ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map res_map; ++#endif ++}; ++ ++/* ++ * Embedded in struct bch_dev. First three fields refer to the array of journal ++ * buckets, in bch_sb. ++ */ ++struct journal_device { ++ /* ++ * For each journal bucket, contains the max sequence number of the ++ * journal writes it contains - so we know when a bucket can be reused. 
++ */ ++ u64 *bucket_seq; ++ ++ unsigned sectors_free; ++ ++ /* ++ * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: ++ */ ++ unsigned discard_idx; /* Next bucket to discard */ ++ unsigned dirty_idx_ondisk; ++ unsigned dirty_idx; ++ unsigned cur_idx; /* Journal bucket we're currently writing to */ ++ unsigned nr; ++ ++ u64 *buckets; ++ ++ /* Bio for journal reads/writes to this device */ ++ struct bio *bio; ++ ++ /* for bch_journal_read_device */ ++ struct closure read; ++}; ++ ++/* ++ * journal_entry_res - reserve space in every journal entry: ++ */ ++struct journal_entry_res { ++ unsigned u64s; ++}; ++ ++#endif /* _BCACHEFS_JOURNAL_TYPES_H */ +diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c +new file mode 100644 +index 000000000000..cda77835b9ea +--- /dev/null ++++ b/fs/bcachefs/keylist.c +@@ -0,0 +1,67 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "keylist.h" ++ ++int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, ++ size_t nr_inline_u64s, size_t new_u64s) ++{ ++ size_t oldsize = bch2_keylist_u64s(l); ++ size_t newsize = oldsize + new_u64s; ++ u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p; ++ u64 *new_keys; ++ ++ newsize = roundup_pow_of_two(newsize); ++ ++ if (newsize <= nr_inline_u64s || ++ (old_buf && roundup_pow_of_two(oldsize) == newsize)) ++ return 0; ++ ++ new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO); ++ if (!new_keys) ++ return -ENOMEM; ++ ++ if (!old_buf) ++ memcpy_u64s(new_keys, inline_u64s, oldsize); ++ ++ l->keys_p = new_keys; ++ l->top_p = new_keys + oldsize; ++ ++ return 0; ++} ++ ++void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) ++{ ++ struct bkey_i *where; ++ ++ for_each_keylist_key(l, where) ++ if (bkey_cmp(insert->k.p, where->k.p) < 0) ++ break; ++ ++ memmove_u64s_up((u64 *) where + insert->k.u64s, ++ where, ++ ((u64 *) l->top) - ((u64 *) where)); ++ ++ l->top_p += insert->k.u64s; ++ bkey_copy(where, insert); ++} ++ ++void bch2_keylist_pop_front(struct keylist *l) ++{ ++ l->top_p -= bch2_keylist_front(l)->k.u64s; ++ ++ memmove_u64s_down(l->keys, ++ bkey_next(l->keys), ++ bch2_keylist_u64s(l)); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_verify_keylist_sorted(struct keylist *l) ++{ ++ struct bkey_i *k; ++ ++ for_each_keylist_key(l, k) ++ BUG_ON(bkey_next(k) != l->top && ++ bpos_cmp(k->k.p, bkey_next(k)->k.p) >= 0); ++} ++#endif +diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h +new file mode 100644 +index 000000000000..195799bb20bc +--- /dev/null ++++ b/fs/bcachefs/keylist.h +@@ -0,0 +1,76 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_KEYLIST_H ++#define _BCACHEFS_KEYLIST_H ++ ++#include "keylist_types.h" ++ ++int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); ++void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *); ++void bch2_keylist_pop_front(struct keylist *); ++ ++static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) ++{ ++ l->top_p = l->keys_p = inline_keys; ++} ++ ++static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) ++{ ++ if (l->keys_p != inline_keys) ++ kfree(l->keys_p); ++ bch2_keylist_init(l, inline_keys); ++} ++ ++static inline void bch2_keylist_push(struct keylist *l) ++{ ++ l->top = bkey_next(l->top); ++} ++ ++static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) ++{ ++ bkey_copy(l->top, k); ++ bch2_keylist_push(l); ++} ++ ++static inline bool bch2_keylist_empty(struct keylist *l) ++{ ++ return l->top == 
l->keys; ++} ++ ++static inline size_t bch2_keylist_u64s(struct keylist *l) ++{ ++ return l->top_p - l->keys_p; ++} ++ ++static inline size_t bch2_keylist_bytes(struct keylist *l) ++{ ++ return bch2_keylist_u64s(l) * sizeof(u64); ++} ++ ++static inline struct bkey_i *bch2_keylist_front(struct keylist *l) ++{ ++ return l->keys; ++} ++ ++#define for_each_keylist_key(_keylist, _k) \ ++ for (_k = (_keylist)->keys; \ ++ _k != (_keylist)->top; \ ++ _k = bkey_next(_k)) ++ ++static inline u64 keylist_sectors(struct keylist *keys) ++{ ++ struct bkey_i *k; ++ u64 ret = 0; ++ ++ for_each_keylist_key(keys, k) ++ ret += k->k.size; ++ ++ return ret; ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_verify_keylist_sorted(struct keylist *); ++#else ++static inline void bch2_verify_keylist_sorted(struct keylist *l) {} ++#endif ++ ++#endif /* _BCACHEFS_KEYLIST_H */ +diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h +new file mode 100644 +index 000000000000..4b3ff7d8a875 +--- /dev/null ++++ b/fs/bcachefs/keylist_types.h +@@ -0,0 +1,16 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_KEYLIST_TYPES_H ++#define _BCACHEFS_KEYLIST_TYPES_H ++ ++struct keylist { ++ union { ++ struct bkey_i *keys; ++ u64 *keys_p; ++ }; ++ union { ++ struct bkey_i *top; ++ u64 *top_p; ++ }; ++}; ++ ++#endif /* _BCACHEFS_KEYLIST_TYPES_H */ +diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c +new file mode 100644 +index 000000000000..53e607d72274 +--- /dev/null ++++ b/fs/bcachefs/lru.c +@@ -0,0 +1,206 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "error.h" ++#include "lru.h" ++#include "recovery.h" ++ ++int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ const struct bch_lru *lru = bkey_s_c_to_lru(k).v; ++ ++ if (bkey_val_bytes(k.k) < sizeof(*lru)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*lru)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct bch_lru *lru = bkey_s_c_to_lru(k).v; ++ ++ prt_printf(out, "idx %llu", le64_to_cpu(lru->idx)); ++} ++ ++int bch2_lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time, ++ struct bkey_s_c orig_k) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 existing_idx; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ if (!time) ++ return 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, ++ POS(id, time), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_lru) { ++ bch2_bkey_val_to_text(&buf, trans->c, orig_k); ++ bch2_trans_inconsistent(trans, ++ "pointer to nonexistent lru %llu:%llu\n%s", ++ id, time, buf.buf); ++ ret = -EIO; ++ goto err; ++ } ++ ++ existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); ++ if (existing_idx != idx) { ++ bch2_bkey_val_to_text(&buf, trans->c, orig_k); ++ bch2_trans_inconsistent(trans, ++ "lru %llu:%llu with wrong backpointer: got %llu, should be %llu\n%s", ++ id, time, existing_idx, idx, buf.buf); ++ ret = -EIO; ++ goto err; ++ } ++ ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time) ++{ ++ struct btree_iter 
iter; ++ struct bkey_s_c k; ++ struct bkey_i_lru *lru; ++ int ret = 0; ++ ++ if (!*time) ++ return 0; ++ ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_lru, ++ POS(lru_id, *time), ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES, k, ret) ++ if (bkey_deleted(k.k)) ++ break; ++ ++ if (ret) ++ goto err; ++ ++ BUG_ON(iter.pos.inode != lru_id); ++ *time = iter.pos.offset; ++ ++ lru = bch2_trans_kmalloc(trans, sizeof(*lru)); ++ ret = PTR_ERR_OR_ZERO(lru); ++ if (ret) ++ goto err; ++ ++ bkey_lru_init(&lru->k_i); ++ lru->k.p = iter.pos; ++ lru->v.idx = cpu_to_le64(idx); ++ ++ ret = bch2_trans_update(trans, &iter, &lru->k_i, 0); ++ if (ret) ++ goto err; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, ++ u64 old_time, u64 *new_time, ++ struct bkey_s_c k) ++{ ++ if (old_time == *new_time) ++ return 0; ++ ++ return bch2_lru_delete(trans, id, idx, old_time, k) ?: ++ bch2_lru_set(trans, id, idx, new_time); ++} ++ ++static int bch2_check_lru_key(struct btree_trans *trans, ++ struct btree_iter *lru_iter, ++ struct bkey_s_c lru_k) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_alloc_v4 a; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ struct bpos alloc_pos; ++ int ret; ++ ++ alloc_pos = POS(lru_k.k->p.inode, ++ le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx)); ++ ++ if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c, ++ "lru key points to nonexistent device:bucket %llu:%llu", ++ alloc_pos.inode, alloc_pos.offset)) ++ return bch2_btree_delete_at(trans, lru_iter, 0); ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (fsck_err_on(a.data_type != BCH_DATA_cached || ++ a.io_time[READ] != lru_k.k->p.offset, c, ++ "incorrect lru entry %s\n" ++ " for %s", ++ (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), ++ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { ++ struct bkey_i *update = ++ bch2_trans_kmalloc(trans, sizeof(*update)); ++ ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.p = lru_iter->pos; ++ ++ ret = bch2_trans_update(trans, lru_iter, update, 0); ++ if (ret) ++ goto err; ++ } ++err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++int bch2_check_lrus(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ bch2_check_lru_key(&trans, &iter, k)); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++ ++} +diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h +new file mode 100644 +index 000000000000..3decb7b1dde2 +--- /dev/null ++++ b/fs/bcachefs/lru.h +@@ -0,0 +1,19 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_LRU_H ++#define _BCACHEFS_LRU_H ++ ++int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_lru (struct bkey_ops) { \ ++ .key_invalid = bch2_lru_invalid, \ ++ .val_to_text = bch2_lru_to_text, \ ++} ++ ++int bch2_lru_delete(struct 
btree_trans *, u64, u64, u64, struct bkey_s_c); ++int bch2_lru_set(struct btree_trans *, u64, u64, u64 *); ++int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *, struct bkey_s_c); ++ ++int bch2_check_lrus(struct bch_fs *); ++ ++#endif /* _BCACHEFS_LRU_H */ +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +new file mode 100644 +index 000000000000..8b258d966d04 +--- /dev/null ++++ b/fs/bcachefs/migrate.c +@@ -0,0 +1,186 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for moving data off a device. ++ */ ++ ++#include "bcachefs.h" ++#include "bkey_buf.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "errcode.h" ++#include "extents.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "migrate.h" ++#include "move.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, ++ unsigned dev_idx, int flags, bool metadata) ++{ ++ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; ++ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; ++ unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; ++ unsigned nr_good; ++ ++ bch2_bkey_drop_device(k, dev_idx); ++ ++ nr_good = bch2_bkey_durability(c, k.s_c); ++ if ((!nr_good && !(flags & lost)) || ++ (nr_good < replicas && !(flags & degraded))) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ unsigned dev_idx, ++ int flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_i *n; ++ int ret; ++ ++ if (!bch2_bkey_has_device(k, dev_idx)) ++ return 0; ++ ++ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(n, k); ++ ++ ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false); ++ if (ret) ++ return ret; ++ ++ /* ++ * If the new extent no longer has any pointers, bch2_extent_normalize() ++ * will do the appropriate thing with it (turning it into a ++ * KEY_TYPE_error key, or just a discard if it was a cached extent) ++ */ ++ bch2_extent_normalize(c, bkey_i_to_s(n)); ++ ++ /* ++ * Since we're not inserting through an extent iterator ++ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), ++ * we aren't using the extent overwrite path to delete, we're ++ * just using the normal key deletion path: ++ */ ++ if (bkey_deleted(&n->k)) ++ n->k.size = 0; ++ ++ return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++} ++ ++static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ enum btree_id id; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ if (!btree_type_has_ptrs(id)) ++ continue; ++ ++ ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags)); ++ if (ret) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct closure cl; ++ struct btree *b; ++ struct bkey_buf k; ++ unsigned id; ++ int ret; ++ ++ /* 
don't handle this yet: */ ++ if (flags & BCH_FORCE_IF_METADATA_LOST) ++ return -EINVAL; ++ ++ bch2_bkey_buf_init(&k); ++ bch2_trans_init(&trans, c, 0, 0); ++ closure_init_stack(&cl); ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, ++ BTREE_ITER_PREFETCH); ++retry: ++ ret = 0; ++ while (bch2_trans_begin(&trans), ++ (b = bch2_btree_iter_peek_node(&iter)) && ++ !(ret = PTR_ERR_OR_ZERO(b))) { ++ if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), ++ dev_idx)) ++ goto next; ++ ++ bch2_bkey_buf_copy(&k, c, &b->key); ++ ++ ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), ++ dev_idx, flags, true); ++ if (ret) { ++ bch_err(c, "Cannot drop device without losing data"); ++ break; ++ } ++ ++ ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ++ ret = 0; ++ continue; ++ } ++ ++ if (ret) { ++ bch_err(c, "Error updating btree node key: %s", ++ bch2_err_str(ret)); ++ break; ++ } ++next: ++ bch2_btree_iter_next_node(&iter); ++ } ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) ++ goto err; ++ } ++ ++ bch2_btree_interior_updates_flush(c); ++ ret = 0; ++err: ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&k, c); ++ ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); ++ ++ return ret; ++} ++ ++int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: ++ bch2_dev_metadata_drop(c, dev_idx, flags); ++} +diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h +new file mode 100644 +index 000000000000..027efaa0d575 +--- /dev/null ++++ b/fs/bcachefs/migrate.h +@@ -0,0 +1,7 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MIGRATE_H ++#define _BCACHEFS_MIGRATE_H ++ ++int bch2_dev_data_drop(struct bch_fs *, unsigned, int); ++ ++#endif /* _BCACHEFS_MIGRATE_H */ +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +new file mode 100644 +index 000000000000..2fc247451390 +--- /dev/null ++++ b/fs/bcachefs/move.c +@@ -0,0 +1,952 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "backpointers.h" ++#include "bkey_buf.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "errcode.h" ++#include "error.h" ++#include "inode.h" ++#include "io.h" ++#include "journal_reclaim.h" ++#include "move.h" ++#include "replicas.h" ++#include "super-io.h" ++#include "keylist.h" ++ ++#include ++#include ++ ++#include ++ ++static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats) ++{ ++ mutex_lock(&c->data_progress_lock); ++ list_add(&stats->list, &c->data_progress_list); ++ mutex_unlock(&c->data_progress_lock); ++} ++ ++static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats) ++{ ++ mutex_lock(&c->data_progress_lock); ++ list_del(&stats->list); ++ mutex_unlock(&c->data_progress_lock); ++} ++ ++struct moving_io { ++ struct list_head list; ++ struct closure cl; ++ bool read_completed; ++ ++ unsigned read_sectors; ++ unsigned write_sectors; ++ ++ struct bch_read_bio rbio; ++ ++ struct data_update write; ++ /* Must be last since it is variable size */ ++ struct bio_vec bi_inline_vecs[0]; ++}; ++ ++static void move_free(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ struct moving_context *ctxt = 
io->write.ctxt; ++ struct bch_fs *c = ctxt->c; ++ ++ bch2_data_update_exit(&io->write); ++ wake_up(&ctxt->wait); ++ percpu_ref_put(&c->writes); ++ kfree(io); ++} ++ ++static void move_write_done(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ struct moving_context *ctxt = io->write.ctxt; ++ ++ if (io->write.op.error) ++ ctxt->write_error = true; ++ ++ atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); ++ closure_return_with_destructor(cl, move_free); ++} ++ ++static void move_write(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ ++ if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { ++ closure_return_with_destructor(cl, move_free); ++ return; ++ } ++ ++ atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); ++ ++ bch2_data_update_read_done(&io->write, io->rbio.pick.crc, cl); ++ continue_at(cl, move_write_done, NULL); ++} ++ ++static inline struct moving_io *next_pending_write(struct moving_context *ctxt) ++{ ++ struct moving_io *io = ++ list_first_entry_or_null(&ctxt->reads, struct moving_io, list); ++ ++ return io && io->read_completed ? io : NULL; ++} ++ ++static void move_read_endio(struct bio *bio) ++{ ++ struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); ++ struct moving_context *ctxt = io->write.ctxt; ++ ++ atomic_sub(io->read_sectors, &ctxt->read_sectors); ++ io->read_completed = true; ++ ++ wake_up(&ctxt->wait); ++ closure_put(&ctxt->cl); ++} ++ ++static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans) ++{ ++ struct moving_io *io; ++ ++ if (trans) ++ bch2_trans_unlock(trans); ++ ++ while ((io = next_pending_write(ctxt))) { ++ list_del(&io->list); ++ closure_call(&io->cl, move_write, NULL, &ctxt->cl); ++ } ++} ++ ++#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ ++do { \ ++ do_pending_writes(_ctxt, _trans); \ ++ \ ++ if (_cond) \ ++ break; \ ++ __wait_event((_ctxt)->wait, \ ++ next_pending_write(_ctxt) || (_cond)); \ ++} while (1) ++ ++static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, ++ struct btree_trans *trans) ++{ ++ unsigned sectors_pending = atomic_read(&ctxt->write_sectors); ++ ++ move_ctxt_wait_event(ctxt, trans, ++ !atomic_read(&ctxt->write_sectors) || ++ atomic_read(&ctxt->write_sectors) != sectors_pending); ++} ++ ++void bch2_moving_ctxt_exit(struct moving_context *ctxt) ++{ ++ move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); ++ closure_sync(&ctxt->cl); ++ EBUG_ON(atomic_read(&ctxt->write_sectors)); ++ ++ if (ctxt->stats) { ++ progress_list_del(ctxt->c, ctxt->stats); ++ ++ trace_move_data(ctxt->c, ++ atomic64_read(&ctxt->stats->sectors_moved), ++ atomic64_read(&ctxt->stats->keys_moved)); ++ } ++} ++ ++void bch2_moving_ctxt_init(struct moving_context *ctxt, ++ struct bch_fs *c, ++ struct bch_ratelimit *rate, ++ struct bch_move_stats *stats, ++ struct write_point_specifier wp, ++ bool wait_on_copygc) ++{ ++ memset(ctxt, 0, sizeof(*ctxt)); ++ ++ ctxt->c = c; ++ ctxt->rate = rate; ++ ctxt->stats = stats; ++ ctxt->wp = wp; ++ ctxt->wait_on_copygc = wait_on_copygc; ++ ++ closure_init_stack(&ctxt->cl); ++ INIT_LIST_HEAD(&ctxt->reads); ++ init_waitqueue_head(&ctxt->wait); ++ ++ if (stats) { ++ progress_list_add(c, stats); ++ stats->data_type = BCH_DATA_user; ++ } ++} ++ ++void bch_move_stats_init(struct bch_move_stats *stats, char *name) ++{ ++ memset(stats, 0, sizeof(*stats)); ++ scnprintf(stats->name, sizeof(stats->name), "%s", name); ++} ++ ++static int bch2_move_extent(struct 
btree_trans *trans, ++ struct moving_context *ctxt, ++ struct bch_io_opts io_opts, ++ enum btree_id btree_id, ++ struct bkey_s_c k, ++ struct data_update_opts data_opts) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct moving_io *io; ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned sectors = k.k->size, pages; ++ int ret = -ENOMEM; ++ ++ if (!percpu_ref_tryget_live(&c->writes)) ++ return -EROFS; ++ ++ /* write path might have to decompress data: */ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); ++ ++ pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); ++ io = kzalloc(sizeof(struct moving_io) + ++ sizeof(struct bio_vec) * pages, GFP_KERNEL); ++ if (!io) ++ goto err; ++ ++ io->write.ctxt = ctxt; ++ io->read_sectors = k.k->size; ++ io->write_sectors = k.k->size; ++ ++ bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); ++ bio_set_prio(&io->write.op.wbio.bio, ++ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); ++ ++ if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, ++ GFP_KERNEL)) ++ goto err_free; ++ ++ io->rbio.c = c; ++ io->rbio.opts = io_opts; ++ bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); ++ io->rbio.bio.bi_vcnt = pages; ++ bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); ++ io->rbio.bio.bi_iter.bi_size = sectors << 9; ++ ++ bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); ++ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); ++ io->rbio.bio.bi_end_io = move_read_endio; ++ ++ ret = bch2_data_update_init(c, &io->write, ctxt->wp, io_opts, ++ data_opts, btree_id, k); ++ if (ret) ++ goto err_free_pages; ++ ++ io->write.ctxt = ctxt; ++ ++ atomic64_inc(&ctxt->stats->keys_moved); ++ atomic64_add(k.k->size, &ctxt->stats->sectors_moved); ++ this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); ++ ++ trace_move_extent(k.k); ++ ++ atomic_add(io->read_sectors, &ctxt->read_sectors); ++ list_add_tail(&io->list, &ctxt->reads); ++ ++ /* ++ * dropped by move_read_endio() - guards against use after free of ++ * ctxt when doing wakeup ++ */ ++ closure_get(&ctxt->cl); ++ bch2_read_extent(trans, &io->rbio, ++ bkey_start_pos(k.k), ++ btree_id, k, 0, ++ BCH_READ_NODECODE| ++ BCH_READ_LAST_FRAGMENT); ++ return 0; ++err_free_pages: ++ bio_free_pages(&io->write.op.wbio.bio); ++err_free: ++ kfree(io); ++err: ++ percpu_ref_put(&c->writes); ++ trace_move_alloc_mem_fail(k.k); ++ return ret; ++} ++ ++static int lookup_inode(struct btree_trans *trans, struct bpos pos, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, ++ BTREE_ITER_ALL_SNAPSHOTS); ++ k = bch2_btree_iter_peek(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k || bkey_cmp(k.k->p, pos)) { ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ ret = bkey_is_inode(k.k) ? 0 : -EIO; ++ if (ret) ++ goto err; ++ ++ ret = bch2_inode_unpack(k, inode); ++ if (ret) ++ goto err; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int move_ratelimit(struct btree_trans *trans, ++ struct moving_context *ctxt) ++{ ++ struct bch_fs *c = trans->c; ++ u64 delay; ++ ++ if (ctxt->wait_on_copygc) { ++ bch2_trans_unlock(trans); ++ wait_event_killable(c->copygc_running_wq, ++ !c->copygc_running || ++ kthread_should_stop()); ++ } ++ ++ do { ++ delay = ctxt->rate ? 
bch2_ratelimit_delay(ctxt->rate) : 0; ++ ++ if (delay) { ++ bch2_trans_unlock(trans); ++ set_current_state(TASK_INTERRUPTIBLE); ++ } ++ ++ if ((current->flags & PF_KTHREAD) && kthread_should_stop()) { ++ __set_current_state(TASK_RUNNING); ++ return 1; ++ } ++ ++ if (delay) ++ schedule_timeout(delay); ++ ++ if (unlikely(freezing(current))) { ++ move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads)); ++ try_to_freeze(); ++ } ++ } while (delay); ++ ++ move_ctxt_wait_event(ctxt, trans, ++ atomic_read(&ctxt->write_sectors) < ++ c->opts.move_bytes_in_flight >> 9); ++ ++ move_ctxt_wait_event(ctxt, trans, ++ atomic_read(&ctxt->read_sectors) < ++ c->opts.move_bytes_in_flight >> 9); ++ ++ return 0; ++} ++ ++static int move_get_io_opts(struct btree_trans *trans, ++ struct bch_io_opts *io_opts, ++ struct bkey_s_c k, u64 *cur_inum) ++{ ++ struct bch_inode_unpacked inode; ++ int ret; ++ ++ if (*cur_inum == k.k->p.inode) ++ return 0; ++ ++ *io_opts = bch2_opts_to_inode_opts(trans->c->opts); ++ ++ ret = lookup_inode(trans, ++ SPOS(0, k.k->p.inode, k.k->p.snapshot), ++ &inode); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ return ret; ++ ++ if (!ret) ++ bch2_io_opts_apply(io_opts, bch2_inode_opts_get(&inode)); ++ ++ *cur_inum = k.k->p.inode; ++ return 0; ++} ++ ++static int __bch2_move_data(struct moving_context *ctxt, ++ struct bpos start, ++ struct bpos end, ++ move_pred_fn pred, void *arg, ++ enum btree_id btree_id) ++{ ++ struct bch_fs *c = ctxt->c; ++ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); ++ struct bkey_buf sk; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct data_update_opts data_opts; ++ u64 cur_inum = U64_MAX; ++ int ret = 0, ret2; ++ ++ bch2_bkey_buf_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ctxt->stats->data_type = BCH_DATA_user; ++ ctxt->stats->btree_id = btree_id; ++ ctxt->stats->pos = start; ++ ++ bch2_trans_iter_init(&trans, &iter, btree_id, start, ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ ++ if (ctxt->rate) ++ bch2_ratelimit_reset(ctxt->rate); ++ ++ while (!move_ratelimit(&trans, ctxt)) { ++ bch2_trans_begin(&trans); ++ ++ k = bch2_btree_iter_peek(&iter); ++ if (!k.k) ++ break; ++ ++ ret = bkey_err(k); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ break; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; ++ ++ ctxt->stats->pos = iter.pos; ++ ++ if (!bkey_extent_is_direct_data(k.k)) ++ goto next_nondata; ++ ++ ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); ++ if (ret) ++ continue; ++ ++ memset(&data_opts, 0, sizeof(data_opts)); ++ if (!pred(c, arg, k, &io_opts, &data_opts)) ++ goto next; ++ ++ /* ++ * The iterator gets unlocked by __bch2_read_extent - need to ++ * save a copy of @k elsewhere: ++ */ ++ bch2_bkey_buf_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ ++ ret2 = bch2_move_extent(&trans, ctxt, io_opts, ++ btree_id, k, data_opts); ++ if (ret2) { ++ if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) ++ continue; ++ ++ if (ret2 == -ENOMEM) { ++ /* memory allocation failure, wait for some IO to finish */ ++ bch2_move_ctxt_wait_for_io(ctxt, &trans); ++ continue; ++ } ++ ++ /* XXX signal failure */ ++ goto next; ++ } ++ ++ if (ctxt->rate) ++ bch2_ratelimit_increment(ctxt->rate, k.k->size); ++next: ++ atomic64_add(k.k->size, &ctxt->stats->sectors_seen); ++next_nondata: ++ bch2_btree_iter_advance(&iter); ++ } ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&sk, c); 
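++	/*
++	 * Note that ret does not reflect failures to move individual extents
++	 * (ret2 above) - those are currently dropped, per the XXX note above.
++	 */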
++ ++ return ret; ++} ++ ++int bch2_move_data(struct bch_fs *c, ++ enum btree_id start_btree_id, struct bpos start_pos, ++ enum btree_id end_btree_id, struct bpos end_pos, ++ struct bch_ratelimit *rate, ++ struct bch_move_stats *stats, ++ struct write_point_specifier wp, ++ bool wait_on_copygc, ++ move_pred_fn pred, void *arg) ++{ ++ struct moving_context ctxt; ++ enum btree_id id; ++ int ret; ++ ++ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); ++ ++ for (id = start_btree_id; ++ id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); ++ id++) { ++ stats->btree_id = id; ++ ++ if (id != BTREE_ID_extents && ++ id != BTREE_ID_reflink) ++ continue; ++ ++ ret = __bch2_move_data(&ctxt, ++ id == start_btree_id ? start_pos : POS_MIN, ++ id == end_btree_id ? end_pos : POS_MAX, ++ pred, arg, id); ++ if (ret) ++ break; ++ } ++ ++ bch2_moving_ctxt_exit(&ctxt); ++ ++ return ret; ++} ++ ++static int verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ bucket, BTREE_ITER_CACHED); ++again: ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ ++ if (!ret && k.k->type == KEY_TYPE_alloc_v4) { ++ struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); ++ ++ if (a.v->gen == gen && ++ a.v->dirty_sectors) { ++ struct printbuf buf = PRINTBUF; ++ ++ if (a.v->data_type == BCH_DATA_btree) { ++ bch2_trans_unlock(trans); ++ if (bch2_btree_interior_updates_flush(c)) ++ goto again; ++ } ++ ++ prt_str(&buf, "failed to evacuate bucket "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ ++ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ printbuf_exit(&buf); ++ } ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int __bch2_evacuate_bucket(struct moving_context *ctxt, ++ struct bpos bucket, int gen, ++ struct data_update_opts _data_opts) ++{ ++ struct bch_fs *c = ctxt->c; ++ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_buf sk; ++ struct bch_backpointer bp; ++ struct data_update_opts data_opts; ++ u64 bp_offset = 0, cur_inum = U64_MAX; ++ int ret = 0; ++ ++ bch2_bkey_buf_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ while (!(ret = move_ratelimit(&trans, ctxt))) { ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_get_next_backpointer(&trans, bucket, gen, ++ &bp_offset, &bp); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ goto err; ++ if (bp_offset == U64_MAX) ++ break; ++ ++ if (!bp.level) { ++ const struct bch_extent_ptr *ptr; ++ struct bkey_s_c k; ++ unsigned i = 0; ++ ++ k = bch2_backpointer_get_key(&trans, &iter, ++ bucket, bp_offset, bp); ++ ret = bkey_err(k); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ goto err; ++ if (!k.k) ++ continue; ++ ++ bch2_bkey_buf_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); ++ if (ret) ++ continue; ++ ++ data_opts = _data_opts; ++ data_opts.target = io_opts.background_target; ++ data_opts.rewrite_ptrs = 0; ++ ++ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { ++ if (ptr->dev == bucket.inode) ++ data_opts.rewrite_ptrs |= 1U << i; ++ i++; ++ } ++ ++ ret = bch2_move_extent(&trans, ctxt, io_opts, ++ bp.btree_id, k, data_opts); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; 
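++			/*
++			 * Note: a transaction restart loops back around without
++			 * advancing bp_offset, so the same backpointer is simply
++			 * retried.
++			 */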
++ if (ret == -ENOMEM) { ++ /* memory allocation failure, wait for some IO to finish */ ++ bch2_move_ctxt_wait_for_io(ctxt, &trans); ++ continue; ++ } ++ if (ret) ++ goto err; ++ ++ if (ctxt->rate) ++ bch2_ratelimit_increment(ctxt->rate, k.k->size); ++ atomic64_add(k.k->size, &ctxt->stats->sectors_seen); ++ } else { ++ struct btree *b; ++ ++ b = bch2_backpointer_get_node(&trans, &iter, ++ bucket, bp_offset, bp); ++ ret = PTR_ERR_OR_ZERO(b); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ goto err; ++ if (!b) ++ continue; ++ ++ ret = bch2_btree_node_rewrite(&trans, &iter, b, 0); ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ goto err; ++ ++ if (ctxt->rate) ++ bch2_ratelimit_increment(ctxt->rate, ++ c->opts.btree_node_size >> 9); ++ atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen); ++ atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved); ++ } ++ ++ bp_offset++; ++ } ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) { ++ bch2_trans_unlock(&trans); ++ move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); ++ closure_sync(&ctxt->cl); ++ if (!ctxt->write_error) ++ lockrestart_do(&trans, verify_bucket_evacuated(&trans, bucket, gen)); ++ } ++err: ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&sk, c); ++ return ret; ++} ++ ++int bch2_evacuate_bucket(struct bch_fs *c, ++ struct bpos bucket, int gen, ++ struct data_update_opts data_opts, ++ struct bch_ratelimit *rate, ++ struct bch_move_stats *stats, ++ struct write_point_specifier wp, ++ bool wait_on_copygc) ++{ ++ struct moving_context ctxt; ++ int ret; ++ ++ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); ++ ret = __bch2_evacuate_bucket(&ctxt, bucket, gen, data_opts); ++ bch2_moving_ctxt_exit(&ctxt); ++ ++ return ret; ++} ++ ++typedef bool (*move_btree_pred)(struct bch_fs *, void *, ++ struct btree *, struct bch_io_opts *, ++ struct data_update_opts *); ++ ++static int bch2_move_btree(struct bch_fs *c, ++ enum btree_id start_btree_id, struct bpos start_pos, ++ enum btree_id end_btree_id, struct bpos end_pos, ++ move_btree_pred pred, void *arg, ++ struct bch_move_stats *stats) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct btree *b; ++ enum btree_id id; ++ struct data_update_opts data_opts; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ progress_list_add(c, stats); ++ ++ stats->data_type = BCH_DATA_btree; ++ ++ for (id = start_btree_id; ++ id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); ++ id++) { ++ stats->btree_id = id; ++ ++ bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, ++ BTREE_ITER_PREFETCH); ++retry: ++ ret = 0; ++ while (bch2_trans_begin(&trans), ++ (b = bch2_btree_iter_peek_node(&iter)) && ++ !(ret = PTR_ERR_OR_ZERO(b))) { ++ if (kthread && kthread_should_stop()) ++ break; ++ ++ if ((cmp_int(id, end_btree_id) ?: ++ bpos_cmp(b->key.k.p, end_pos)) > 0) ++ break; ++ ++ stats->pos = iter.pos; ++ ++ if (!pred(c, arg, b, &io_opts, &data_opts)) ++ goto next; ++ ++ ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret; ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ break; ++next: ++ bch2_btree_iter_next_node(&iter); ++ } ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ 
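++		/*
++		 * If the caller is a kthread that's been asked to stop, bail
++		 * out here instead of walking the remaining btrees.
++		 */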
++ if (kthread && kthread_should_stop()) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); ++ ++ bch2_btree_interior_updates_flush(c); ++ ++ progress_list_del(c, stats); ++ return ret; ++} ++ ++static bool rereplicate_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_update_opts *data_opts) ++{ ++ unsigned nr_good = bch2_bkey_durability(c, k); ++ unsigned replicas = bkey_is_btree_ptr(k.k) ++ ? c->opts.metadata_replicas ++ : io_opts->data_replicas; ++ ++ if (!nr_good || nr_good >= replicas) ++ return false; ++ ++ data_opts->target = 0; ++ data_opts->extra_replicas = replicas - nr_good; ++ data_opts->btree_insert_flags = 0; ++ return true; ++} ++ ++static bool migrate_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_update_opts *data_opts) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ struct bch_ioctl_data *op = arg; ++ unsigned i = 0; ++ ++ data_opts->rewrite_ptrs = 0; ++ data_opts->target = 0; ++ data_opts->extra_replicas = 0; ++ data_opts->btree_insert_flags = 0; ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ if (ptr->dev == op->migrate.dev) ++ data_opts->rewrite_ptrs |= 1U << i; ++ i++; ++ } ++ ++ return data_opts->rewrite_ptrs != 0;; ++} ++ ++static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, ++ struct btree *b, ++ struct bch_io_opts *io_opts, ++ struct data_update_opts *data_opts) ++{ ++ return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); ++} ++ ++static bool migrate_btree_pred(struct bch_fs *c, void *arg, ++ struct btree *b, ++ struct bch_io_opts *io_opts, ++ struct data_update_opts *data_opts) ++{ ++ return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); ++} ++ ++static bool bformat_needs_redo(struct bkey_format *f) ++{ ++ unsigned i; ++ ++ for (i = 0; i < f->nr_fields; i++) { ++ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; ++ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); ++ u64 field_offset = le64_to_cpu(f->field_offset[i]); ++ ++ if (f->bits_per_field[i] > unpacked_bits) ++ return true; ++ ++ if ((f->bits_per_field[i] == unpacked_bits) && field_offset) ++ return true; ++ ++ if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & ++ unpacked_mask) < ++ field_offset) ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg, ++ struct btree *b, ++ struct bch_io_opts *io_opts, ++ struct data_update_opts *data_opts) ++{ ++ if (b->version_ondisk != c->sb.version || ++ btree_node_need_rewrite(b) || ++ bformat_needs_redo(&b->format)) { ++ data_opts->target = 0; ++ data_opts->extra_replicas = 0; ++ data_opts->btree_insert_flags = 0; ++ return true; ++ } ++ ++ return false; ++} ++ ++int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) ++{ ++ int ret; ++ ++ ret = bch2_move_btree(c, ++ 0, POS_MIN, ++ BTREE_ID_NR, SPOS_MAX, ++ rewrite_old_nodes_pred, c, stats); ++ if (!ret) { ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); ++ c->disk_sb.sb->version_min = c->disk_sb.sb->version; ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ return ret; ++} ++ ++int bch2_data_job(struct bch_fs *c, ++ struct bch_move_stats *stats, ++ 
struct bch_ioctl_data op) ++{ ++ int ret = 0; ++ ++ switch (op.op) { ++ case BCH_DATA_OP_REREPLICATE: ++ bch_move_stats_init(stats, "rereplicate"); ++ stats->data_type = BCH_DATA_journal; ++ ret = bch2_journal_flush_device_pins(&c->journal, -1); ++ ++ ret = bch2_move_btree(c, ++ op.start_btree, op.start_pos, ++ op.end_btree, op.end_pos, ++ rereplicate_btree_pred, c, stats) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ ++ ret = bch2_move_data(c, ++ op.start_btree, op.start_pos, ++ op.end_btree, op.end_pos, ++ NULL, ++ stats, ++ writepoint_hashed((unsigned long) current), ++ true, ++ rereplicate_pred, c) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ break; ++ case BCH_DATA_OP_MIGRATE: ++ if (op.migrate.dev >= c->sb.nr_devices) ++ return -EINVAL; ++ ++ bch_move_stats_init(stats, "migrate"); ++ stats->data_type = BCH_DATA_journal; ++ ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); ++ ++ ret = bch2_move_btree(c, ++ op.start_btree, op.start_pos, ++ op.end_btree, op.end_pos, ++ migrate_btree_pred, &op, stats) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ ++ ret = bch2_move_data(c, ++ op.start_btree, op.start_pos, ++ op.end_btree, op.end_pos, ++ NULL, ++ stats, ++ writepoint_hashed((unsigned long) current), ++ true, ++ migrate_pred, &op) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ break; ++ case BCH_DATA_OP_REWRITE_OLD_NODES: ++ bch_move_stats_init(stats, "rewrite_old_nodes"); ++ ret = bch2_scan_old_btree_nodes(c, stats); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} +diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h +new file mode 100644 +index 000000000000..c0fec69bbb6a +--- /dev/null ++++ b/fs/bcachefs/move.h +@@ -0,0 +1,67 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVE_H ++#define _BCACHEFS_MOVE_H ++ ++#include "btree_iter.h" ++#include "buckets.h" ++#include "data_update.h" ++#include "move_types.h" ++ ++struct bch_read_bio; ++ ++struct moving_context { ++ struct bch_fs *c; ++ struct bch_ratelimit *rate; ++ struct bch_move_stats *stats; ++ struct write_point_specifier wp; ++ bool wait_on_copygc; ++ bool write_error; ++ ++ /* For waiting on outstanding reads and writes: */ ++ struct closure cl; ++ struct list_head reads; ++ ++ /* in flight sectors: */ ++ atomic_t read_sectors; ++ atomic_t write_sectors; ++ ++ wait_queue_head_t wait; ++}; ++ ++typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, ++ struct bch_io_opts *, struct data_update_opts *); ++ ++void bch2_moving_ctxt_exit(struct moving_context *); ++void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, ++ struct bch_ratelimit *, struct bch_move_stats *, ++ struct write_point_specifier, bool); ++ ++int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); ++ ++int bch2_move_data(struct bch_fs *, ++ enum btree_id, struct bpos, ++ enum btree_id, struct bpos, ++ struct bch_ratelimit *, ++ struct bch_move_stats *, ++ struct write_point_specifier, ++ bool, ++ move_pred_fn, void *); ++ ++int __bch2_evacuate_bucket(struct moving_context *, ++ struct bpos, int, ++ struct data_update_opts); ++int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int, ++ struct data_update_opts, ++ struct bch_ratelimit *, ++ struct bch_move_stats *, ++ struct write_point_specifier, ++ bool); ++int bch2_data_job(struct bch_fs *, ++ struct bch_move_stats *, ++ struct bch_ioctl_data); ++ ++inline void bch_move_stats_init(struct bch_move_stats *stats, ++ char *name); ++ ++ ++#endif /* _BCACHEFS_MOVE_H */ +diff --git a/fs/bcachefs/move_types.h 
b/fs/bcachefs/move_types.h +new file mode 100644 +index 000000000000..9df6d18137a5 +--- /dev/null ++++ b/fs/bcachefs/move_types.h +@@ -0,0 +1,19 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVE_TYPES_H ++#define _BCACHEFS_MOVE_TYPES_H ++ ++struct bch_move_stats { ++ enum bch_data_type data_type; ++ enum btree_id btree_id; ++ struct bpos pos; ++ struct list_head list; ++ char name[32]; ++ ++ atomic64_t keys_moved; ++ atomic64_t keys_raced; ++ atomic64_t sectors_moved; ++ atomic64_t sectors_seen; ++ atomic64_t sectors_raced; ++}; ++ ++#endif /* _BCACHEFS_MOVE_TYPES_H */ +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +new file mode 100644 +index 000000000000..f913864eaa4f +--- /dev/null ++++ b/fs/bcachefs/movinggc.c +@@ -0,0 +1,285 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Moving/copying garbage collector ++ * ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "errcode.h" ++#include "error.h" ++#include "extents.h" ++#include "eytzinger.h" ++#include "io.h" ++#include "keylist.h" ++#include "move.h" ++#include "movinggc.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static inline int fragmentation_cmp(copygc_heap *heap, ++ struct copygc_heap_entry l, ++ struct copygc_heap_entry r) ++{ ++ return cmp_int(l.fragmentation, r.fragmentation); ++} ++ ++static int find_buckets_to_copygc(struct bch_fs *c) ++{ ++ copygc_heap *h = &c->copygc_heap; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_alloc_v4 a; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ /* ++ * Find buckets with lowest sector counts, skipping completely ++ * empty buckets, by building a maxheap sorted by sector count, ++ * and repeatedly replacing the maximum element until all ++ * buckets have been visited. 
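++	 *
++	 * (The replacement is done by heap_add_or_replace() below; what's left
++	 * in the heap afterwards is the set of emptiest non-empty buckets,
++	 * i.e. the cheapest ones to evacuate.)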
++ */ ++ h->used = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); ++ struct copygc_heap_entry e; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if ((a.data_type != BCH_DATA_btree && ++ a.data_type != BCH_DATA_user) || ++ a.dirty_sectors >= ca->mi.bucket_size || ++ bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) ++ continue; ++ ++ e = (struct copygc_heap_entry) { ++ .dev = iter.pos.inode, ++ .gen = a.gen, ++ .replicas = 1 + a.stripe_redundancy, ++ .fragmentation = div_u64((u64) a.dirty_sectors * (1ULL << 31), ++ ca->mi.bucket_size), ++ .sectors = a.dirty_sectors, ++ .bucket = iter.pos.offset, ++ }; ++ heap_add_or_replace(h, e, -fragmentation_cmp, NULL); ++ ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int bch2_copygc(struct bch_fs *c) ++{ ++ copygc_heap *h = &c->copygc_heap; ++ struct copygc_heap_entry e; ++ struct bch_move_stats move_stats; ++ struct bch_dev *ca; ++ unsigned dev_idx; ++ size_t heap_size = 0; ++ struct moving_context ctxt; ++ struct data_update_opts data_opts = { ++ .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc, ++ }; ++ int ret = 0; ++ ++ bch_move_stats_init(&move_stats, "copygc"); ++ ++ for_each_rw_member(ca, c, dev_idx) ++ heap_size += ca->mi.nbuckets >> 7; ++ ++ if (h->size < heap_size) { ++ free_heap(&c->copygc_heap); ++ if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) { ++ bch_err(c, "error allocating copygc heap"); ++ return 0; ++ } ++ } ++ ++ ret = find_buckets_to_copygc(c); ++ if (ret) { ++ bch2_fs_fatal_error(c, "error walking buckets to copygc!"); ++ return ret; ++ } ++ ++ if (!h->used) { ++ s64 wait = S64_MAX, dev_wait; ++ u64 dev_min_wait_fragmented = 0; ++ u64 dev_min_wait_allowed = 0; ++ int dev_min_wait = -1; ++ ++ for_each_rw_member(ca, c, dev_idx) { ++ struct bch_dev_usage usage = bch2_dev_usage_read(ca); ++ s64 allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * ++ ca->mi.bucket_size) >> 1); ++ s64 fragmented = usage.d[BCH_DATA_user].fragmented; ++ ++ dev_wait = max(0LL, allowed - fragmented); ++ ++ if (dev_min_wait < 0 || dev_wait < wait) { ++ dev_min_wait = dev_idx; ++ dev_min_wait_fragmented = fragmented; ++ dev_min_wait_allowed = allowed; ++ } ++ } ++ ++ bch_err_ratelimited(c, "copygc requested to run but found no buckets to move! dev %u fragmented %llu allowed %llu", ++ dev_min_wait, dev_min_wait_fragmented, dev_min_wait_allowed); ++ return 0; ++ } ++ ++ heap_resort(h, fragmentation_cmp, NULL); ++ ++ bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, ++ writepoint_ptr(&c->copygc_write_point), ++ false); ++ ++ /* not correct w.r.t. device removal */ ++ while (h->used && !ret) { ++ BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); ++ ret = __bch2_evacuate_bucket(&ctxt, POS(e.dev, e.bucket), e.gen, ++ data_opts); ++ } ++ ++ bch2_moving_ctxt_exit(&ctxt); ++ ++ if (ret < 0) ++ bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); ++ ++ trace_copygc(c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0); ++ return ret; ++} ++ ++/* ++ * Copygc runs when the amount of fragmented data is above some arbitrary ++ * threshold: ++ * ++ * The threshold at the limit - when the device is full - is the amount of space ++ * we reserved in bch2_recalc_capacity; we can't have more than that amount of ++ * disk space stranded due to fragmentation and store everything we have ++ * promised to store. 
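++ *
++ * (For a concrete feel, with made-up numbers: a device that still has ~100GB
++ * of buckets available gets a fragmented_allowed of ~50GB in
++ * bch2_copygc_wait_amount() below, so copygc stays idle until more than that
++ * much space is stranded in partially empty buckets.)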
++ * ++ * But we don't want to be running copygc unnecessarily when the device still ++ * has plenty of free space - rather, we want copygc to smoothly run every so ++ * often and continually reduce the amount of fragmented space as the device ++ * fills up. So, we increase the threshold by half the current free space. ++ */ ++unsigned long bch2_copygc_wait_amount(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned dev_idx; ++ s64 wait = S64_MAX, fragmented_allowed, fragmented; ++ ++ for_each_rw_member(ca, c, dev_idx) { ++ struct bch_dev_usage usage = bch2_dev_usage_read(ca); ++ ++ fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * ++ ca->mi.bucket_size) >> 1); ++ fragmented = usage.d[BCH_DATA_user].fragmented; ++ ++ wait = min(wait, max(0LL, fragmented_allowed - fragmented)); ++ } ++ ++ return wait; ++} ++ ++static int bch2_copygc_thread(void *arg) ++{ ++ struct bch_fs *c = arg; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ u64 last, wait; ++ int ret = 0; ++ ++ set_freezable(); ++ ++ while (!ret && !kthread_should_stop()) { ++ cond_resched(); ++ ++ if (kthread_wait_freezable(c->copy_gc_enabled)) ++ break; ++ ++ last = atomic64_read(&clock->now); ++ wait = bch2_copygc_wait_amount(c); ++ ++ if (wait > clock->max_slop) { ++ trace_copygc_wait(c, wait, last + wait); ++ c->copygc_wait = last + wait; ++ bch2_kthread_io_clock_wait(clock, last + wait, ++ MAX_SCHEDULE_TIMEOUT); ++ continue; ++ } ++ ++ c->copygc_wait = 0; ++ ++ c->copygc_running = true; ++ ret = bch2_copygc(c); ++ c->copygc_running = false; ++ ++ wake_up(&c->copygc_running_wq); ++ } ++ ++ return 0; ++} ++ ++void bch2_copygc_stop(struct bch_fs *c) ++{ ++ if (c->copygc_thread) { ++ kthread_stop(c->copygc_thread); ++ put_task_struct(c->copygc_thread); ++ } ++ c->copygc_thread = NULL; ++} ++ ++int bch2_copygc_start(struct bch_fs *c) ++{ ++ struct task_struct *t; ++ int ret; ++ ++ if (c->copygc_thread) ++ return 0; ++ ++ if (c->opts.nochanges) ++ return 0; ++ ++ if (bch2_fs_init_fault("copygc_start")) ++ return -ENOMEM; ++ ++ t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); ++ ret = PTR_ERR_OR_ZERO(t); ++ if (ret) { ++ bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ++ get_task_struct(t); ++ ++ c->copygc_thread = t; ++ wake_up_process(c->copygc_thread); ++ ++ return 0; ++} ++ ++void bch2_fs_copygc_init(struct bch_fs *c) ++{ ++ init_waitqueue_head(&c->copygc_running_wq); ++ c->copygc_running = false; ++} +diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h +new file mode 100644 +index 000000000000..e85c8136a46e +--- /dev/null ++++ b/fs/bcachefs/movinggc.h +@@ -0,0 +1,10 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVINGGC_H ++#define _BCACHEFS_MOVINGGC_H ++ ++unsigned long bch2_copygc_wait_amount(struct bch_fs *); ++void bch2_copygc_stop(struct bch_fs *); ++int bch2_copygc_start(struct bch_fs *); ++void bch2_fs_copygc_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_MOVINGGC_H */ +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +new file mode 100644 +index 000000000000..407b221e8f6c +--- /dev/null ++++ b/fs/bcachefs/opts.c +@@ -0,0 +1,578 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++ ++#include "bcachefs.h" ++#include "compress.h" ++#include "disk_groups.h" ++#include "opts.h" ++#include "super-io.h" ++#include "util.h" ++ ++#include ++ ++#define x(t, n) [n] = #t, ++ ++const char * const bch2_metadata_versions[] = { ++ BCH_METADATA_VERSIONS() ++ NULL ++}; ++ ++const char * const bch2_error_actions[] 
= { ++ BCH_ERROR_ACTIONS() ++ NULL ++}; ++ ++const char * const bch2_sb_features[] = { ++ BCH_SB_FEATURES() ++ NULL ++}; ++ ++const char * const bch2_sb_compat[] = { ++ BCH_SB_COMPAT() ++ NULL ++}; ++ ++const char * const bch2_btree_ids[] = { ++ BCH_BTREE_IDS() ++ "interior btree node", ++ NULL ++}; ++ ++const char * const bch2_csum_types[] = { ++ BCH_CSUM_TYPES() ++ NULL ++}; ++ ++const char * const bch2_csum_opts[] = { ++ BCH_CSUM_OPTS() ++ NULL ++}; ++ ++const char * const bch2_compression_types[] = { ++ BCH_COMPRESSION_TYPES() ++ NULL ++}; ++ ++const char * const bch2_compression_opts[] = { ++ BCH_COMPRESSION_OPTS() ++ NULL ++}; ++ ++const char * const bch2_str_hash_types[] = { ++ BCH_STR_HASH_TYPES() ++ NULL ++}; ++ ++const char * const bch2_str_hash_opts[] = { ++ BCH_STR_HASH_OPTS() ++ NULL ++}; ++ ++const char * const bch2_data_types[] = { ++ BCH_DATA_TYPES() ++ NULL ++}; ++ ++const char * const bch2_member_states[] = { ++ BCH_MEMBER_STATES() ++ NULL ++}; ++ ++const char * const bch2_jset_entry_types[] = { ++ BCH_JSET_ENTRY_TYPES() ++ NULL ++}; ++ ++const char * const bch2_fs_usage_types[] = { ++ BCH_FS_USAGE_TYPES() ++ NULL ++}; ++ ++#undef x ++ ++const char * const bch2_d_types[BCH_DT_MAX] = { ++ [DT_UNKNOWN] = "unknown", ++ [DT_FIFO] = "fifo", ++ [DT_CHR] = "chr", ++ [DT_DIR] = "dir", ++ [DT_BLK] = "blk", ++ [DT_REG] = "reg", ++ [DT_LNK] = "lnk", ++ [DT_SOCK] = "sock", ++ [DT_WHT] = "whiteout", ++ [DT_SUBVOL] = "subvol", ++}; ++ ++u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) ++{ ++ BUG(); ++} ++ ++void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) ++{ ++ BUG(); ++} ++ ++void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) ++{ ++#define x(_name, ...) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(*dst, _name, src._name); ++ ++ BCH_OPTS() ++#undef x ++} ++ ++bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ return opt_defined(*opts, _name); ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ return opts->_name; ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ opt_set(*opts, _name, v); \ ++ break; ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++const struct bch_option bch2_opt_table[] = { ++#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 ++#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ ++ .min = _min, .max = _max ++#define OPT_STR(_choices) .type = BCH_OPT_STR, \ ++ .min = 0, .max = ARRAY_SIZE(_choices),\ ++ .choices = _choices ++#define OPT_FN(_fn) .type = BCH_OPT_FN, \ ++ .parse = _fn##_parse, \ ++ .to_text = _fn##_to_text ++ ++#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ ++ [Opt_##_name] = { \ ++ .attr = { \ ++ .name = #_name, \ ++ .mode = (_flags) & OPT_RUNTIME ? 
0644 : 0444, \ ++ }, \ ++ .flags = _flags, \ ++ .hint = _hint, \ ++ .help = _help, \ ++ .get_sb = _sb_opt, \ ++ .set_sb = SET_##_sb_opt, \ ++ _type \ ++ }, ++ ++ BCH_OPTS() ++#undef x ++}; ++ ++int bch2_opt_lookup(const char *name) ++{ ++ const struct bch_option *i; ++ ++ for (i = bch2_opt_table; ++ i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); ++ i++) ++ if (!strcmp(name, i->attr.name)) ++ return i - bch2_opt_table; ++ ++ return -1; ++} ++ ++struct synonym { ++ const char *s1, *s2; ++}; ++ ++static const struct synonym bch_opt_synonyms[] = { ++ { "quota", "usrquota" }, ++}; ++ ++static int bch2_mount_opt_lookup(const char *name) ++{ ++ const struct synonym *i; ++ ++ for (i = bch_opt_synonyms; ++ i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); ++ i++) ++ if (!strcmp(name, i->s1)) ++ name = i->s2; ++ ++ return bch2_opt_lookup(name); ++} ++ ++int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) ++{ ++ if (v < opt->min) { ++ if (err) ++ prt_printf(err, "%s: too small (min %llu)", ++ opt->attr.name, opt->min); ++ return -ERANGE; ++ } ++ ++ if (opt->max && v >= opt->max) { ++ if (err) ++ prt_printf(err, "%s: too big (max %llu)", ++ opt->attr.name, opt->max); ++ return -ERANGE; ++ } ++ ++ if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { ++ if (err) ++ prt_printf(err, "%s: not a multiple of 512", ++ opt->attr.name); ++ return -EINVAL; ++ } ++ ++ if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { ++ if (err) ++ prt_printf(err, "%s: must be a power of two", ++ opt->attr.name); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_opt_parse(struct bch_fs *c, ++ const struct bch_option *opt, ++ const char *val, u64 *res, ++ struct printbuf *err) ++{ ++ ssize_t ret; ++ ++ switch (opt->type) { ++ case BCH_OPT_BOOL: ++ ret = kstrtou64(val, 10, res); ++ if (ret < 0 || (*res != 0 && *res != 1)) { ++ prt_printf(err, "%s: must be bool", ++ opt->attr.name); ++ return ret; ++ } ++ break; ++ case BCH_OPT_UINT: ++ ret = opt->flags & OPT_HUMAN_READABLE ++ ? bch2_strtou64_h(val, res) ++ : kstrtou64(val, 10, res); ++ if (ret < 0) { ++ if (err) ++ prt_printf(err, "%s: must be a number", ++ opt->attr.name); ++ return ret; ++ } ++ break; ++ case BCH_OPT_STR: ++ ret = match_string(opt->choices, -1, val); ++ if (ret < 0) { ++ if (err) ++ prt_printf(err, "%s: invalid selection", ++ opt->attr.name); ++ return ret; ++ } ++ ++ *res = ret; ++ break; ++ case BCH_OPT_FN: ++ if (!c) ++ return 0; ++ ++ ret = opt->parse(c, val, res); ++ if (ret < 0) { ++ if (err) ++ prt_printf(err, "%s: parse error", ++ opt->attr.name); ++ return ret; ++ } ++ } ++ ++ return bch2_opt_validate(opt, *res, err); ++} ++ ++void bch2_opt_to_text(struct printbuf *out, ++ struct bch_fs *c, struct bch_sb *sb, ++ const struct bch_option *opt, u64 v, ++ unsigned flags) ++{ ++ if (flags & OPT_SHOW_MOUNT_STYLE) { ++ if (opt->type == BCH_OPT_BOOL) { ++ prt_printf(out, "%s%s", ++ v ? 
"" : "no", ++ opt->attr.name); ++ return; ++ } ++ ++ prt_printf(out, "%s=", opt->attr.name); ++ } ++ ++ switch (opt->type) { ++ case BCH_OPT_BOOL: ++ case BCH_OPT_UINT: ++ if (opt->flags & OPT_HUMAN_READABLE) ++ prt_human_readable_u64(out, v); ++ else ++ prt_printf(out, "%lli", v); ++ break; ++ case BCH_OPT_STR: ++ if (flags & OPT_SHOW_FULL_LIST) ++ prt_string_option(out, opt->choices, v); ++ else ++ prt_printf(out, "%s", opt->choices[v]); ++ break; ++ case BCH_OPT_FN: ++ opt->to_text(out, c, sb, v); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) ++{ ++ int ret = 0; ++ ++ switch (id) { ++ case Opt_compression: ++ case Opt_background_compression: ++ ret = bch2_check_set_has_compressed_data(c, v); ++ break; ++ case Opt_erasure_code: ++ if (v) ++ bch2_check_set_feature(c, BCH_FEATURE_ec); ++ break; ++ } ++ ++ return ret; ++} ++ ++int bch2_opts_check_may_set(struct bch_fs *c) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ ret = bch2_opt_check_may_set(c, i, ++ bch2_opt_get_by_id(&c->opts, i)); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, ++ char *options) ++{ ++ char *copied_opts, *copied_opts_start; ++ char *opt, *name, *val; ++ int ret, id; ++ struct printbuf err = PRINTBUF; ++ u64 v; ++ ++ if (!options) ++ return 0; ++ ++ copied_opts = kstrdup(options, GFP_KERNEL); ++ if (!copied_opts) ++ return -1; ++ copied_opts_start = copied_opts; ++ ++ while ((opt = strsep(&copied_opts, ",")) != NULL) { ++ name = strsep(&opt, "="); ++ val = opt; ++ ++ if (val) { ++ id = bch2_mount_opt_lookup(name); ++ if (id < 0) ++ goto bad_opt; ++ ++ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); ++ if (ret < 0) ++ goto bad_val; ++ } else { ++ id = bch2_mount_opt_lookup(name); ++ v = 1; ++ ++ if (id < 0 && ++ !strncmp("no", name, 2)) { ++ id = bch2_mount_opt_lookup(name + 2); ++ v = 0; ++ } ++ ++ if (id < 0) ++ goto bad_opt; ++ ++ if (bch2_opt_table[id].type != BCH_OPT_BOOL) ++ goto no_val; ++ } ++ ++ if (!(bch2_opt_table[id].flags & OPT_MOUNT)) ++ goto bad_opt; ++ ++ if (id == Opt_acl && ++ !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) ++ goto bad_opt; ++ ++ if ((id == Opt_usrquota || ++ id == Opt_grpquota) && ++ !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) ++ goto bad_opt; ++ ++ bch2_opt_set_by_id(opts, id, v); ++ } ++ ++ ret = 0; ++ goto out; ++ ++bad_opt: ++ pr_err("Bad mount option %s", name); ++ ret = -1; ++ goto out; ++bad_val: ++ pr_err("Invalid mount option %s", err.buf); ++ ret = -1; ++ goto out; ++no_val: ++ pr_err("Mount option %s requires a value", name); ++ ret = -1; ++ goto out; ++out: ++ kfree(copied_opts_start); ++ printbuf_exit(&err); ++ return ret; ++} ++ ++u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) ++{ ++ const struct bch_option *opt = bch2_opt_table + id; ++ u64 v; ++ ++ v = opt->get_sb(sb); ++ ++ if (opt->flags & OPT_SB_FIELD_ILOG2) ++ v = 1ULL << v; ++ ++ if (opt->flags & OPT_SB_FIELD_SECTORS) ++ v <<= 9; ++ ++ return v; ++} ++ ++/* ++ * Initial options from superblock - here we don't want any options undefined, ++ * any options the superblock doesn't specify are set to 0: ++ */ ++int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) ++{ ++ unsigned id; ++ ++ for (id = 0; id < bch2_opts_nr; id++) { ++ const struct bch_option *opt = bch2_opt_table + id; ++ ++ if (opt->get_sb == BCH2_NO_SB_OPT) ++ continue; ++ ++ bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); ++ } ++ ++ return 0; ++} ++ ++void 
__bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) ++{ ++ if (opt->set_sb == SET_BCH2_NO_SB_OPT) ++ return; ++ ++ if (opt->flags & OPT_SB_FIELD_SECTORS) ++ v >>= 9; ++ ++ if (opt->flags & OPT_SB_FIELD_ILOG2) ++ v = ilog2(v); ++ ++ opt->set_sb(sb, v); ++} ++ ++void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) ++{ ++ if (opt->set_sb == SET_BCH2_NO_SB_OPT) ++ return; ++ ++ mutex_lock(&c->sb_lock); ++ __bch2_opt_set_sb(c->disk_sb.sb, opt, v); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++} ++ ++/* io opts: */ ++ ++struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) ++{ ++ struct bch_io_opts ret = { 0 }; ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(ret, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src) ++{ ++ struct bch_opts ret = { 0 }; ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(ret, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src) ++{ ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(*dst, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++} ++ ++bool bch2_opt_is_inode_opt(enum bch_opt_id id) ++{ ++ static const enum bch_opt_id inode_opt_list[] = { ++#define x(_name, _bits) Opt_##_name, ++ BCH_INODE_OPTS() ++#undef x ++ }; ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) ++ if (inode_opt_list[i] == id) ++ return true; ++ ++ return false; ++} +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +new file mode 100644 +index 000000000000..5b8586ecb374 +--- /dev/null ++++ b/fs/bcachefs/opts.h +@@ -0,0 +1,509 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_OPTS_H ++#define _BCACHEFS_OPTS_H ++ ++#include ++#include ++#include ++#include ++#include "bcachefs_format.h" ++ ++extern const char * const bch2_metadata_versions[]; ++extern const char * const bch2_error_actions[]; ++extern const char * const bch2_sb_features[]; ++extern const char * const bch2_sb_compat[]; ++extern const char * const bch2_btree_ids[]; ++extern const char * const bch2_csum_types[]; ++extern const char * const bch2_csum_opts[]; ++extern const char * const bch2_compression_types[]; ++extern const char * const bch2_compression_opts[]; ++extern const char * const bch2_str_hash_types[]; ++extern const char * const bch2_str_hash_opts[]; ++extern const char * const bch2_data_types[]; ++extern const char * const bch2_member_states[]; ++extern const char * const bch2_jset_entry_types[]; ++extern const char * const bch2_fs_usage_types[]; ++extern const char * const bch2_d_types[]; ++ ++static inline const char *bch2_d_type_str(unsigned d_type) ++{ ++ return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; ++} ++ ++/* ++ * Mount options; we also store defaults in the superblock. ++ * ++ * Also exposed via sysfs: if an option is writeable, and it's also stored in ++ * the superblock, changing it via sysfs (currently? might change this) also ++ * updates the superblock. ++ * ++ * We store options as signed integers, where -1 means undefined. This means we ++ * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only ++ * apply the options from that struct that are defined. 
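++ *
++ * For example (illustrative only), a caller can do:
++ *
++ *	struct bch_opts opts = bch2_opts_empty();
++ *
++ *	opt_set(opts, read_only, true);
++ *	bch2_opts_apply(&c->opts, opts);
++ *
++ * and only read_only is copied across, since only its _defined bit is set.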
++ */ ++ ++/* dummy option, for options that aren't stored in the superblock */ ++u64 BCH2_NO_SB_OPT(const struct bch_sb *); ++void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); ++ ++/* When can be set: */ ++enum opt_flags { ++ OPT_FS = (1 << 0), /* Filesystem option */ ++ OPT_DEVICE = (1 << 1), /* Device option */ ++ OPT_INODE = (1 << 2), /* Inode option */ ++ OPT_FORMAT = (1 << 3), /* May be specified at format time */ ++ OPT_MOUNT = (1 << 4), /* May be specified at mount time */ ++ OPT_RUNTIME = (1 << 5), /* May be specified at runtime */ ++ OPT_HUMAN_READABLE = (1 << 6), ++ OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */ ++ OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */ ++ OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */ ++}; ++ ++enum opt_type { ++ BCH_OPT_BOOL, ++ BCH_OPT_UINT, ++ BCH_OPT_STR, ++ BCH_OPT_FN, ++}; ++ ++/** ++ * x(name, shortopt, type, in mem type, mode, sb_opt) ++ * ++ * @name - name of mount option, sysfs attribute, and struct bch_opts ++ * member ++ * ++ * @mode - when opt may be set ++ * ++ * @sb_option - name of corresponding superblock option ++ * ++ * @type - one of OPT_BOOL, OPT_UINT, OPT_STR ++ */ ++ ++/* ++ * XXX: add fields for ++ * - default value ++ * - helptext ++ */ ++ ++#ifdef __KERNEL__ ++#define RATELIMIT_ERRORS_DEFAULT true ++#else ++#define RATELIMIT_ERRORS_DEFAULT false ++#endif ++ ++#define BCH_OPTS() \ ++ x(block_size, u16, \ ++ OPT_FS|OPT_FORMAT| \ ++ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ ++ OPT_UINT(512, 1U << 16), \ ++ BCH_SB_BLOCK_SIZE, 8, \ ++ "size", NULL) \ ++ x(btree_node_size, u32, \ ++ OPT_FS|OPT_FORMAT| \ ++ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ ++ OPT_UINT(512, 1U << 20), \ ++ BCH_SB_BTREE_NODE_SIZE, 512, \ ++ "size", "Btree node size, default 256k") \ ++ x(errors, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_error_actions), \ ++ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ ++ NULL, "Action to take on filesystem error") \ ++ x(metadata_replicas, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_META_REPLICAS_WANT, 1, \ ++ "#", "Number of metadata replicas") \ ++ x(data_replicas, u8, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_DATA_REPLICAS_WANT, 1, \ ++ "#", "Number of data replicas") \ ++ x(metadata_replicas_required, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_META_REPLICAS_REQ, 1, \ ++ "#", NULL) \ ++ x(data_replicas_required, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_DATA_REPLICAS_REQ, 1, \ ++ "#", NULL) \ ++ x(encoded_extent_max, u32, \ ++ OPT_FS|OPT_FORMAT| \ ++ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\ ++ OPT_UINT(4096, 2U << 20), \ ++ BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \ ++ "size", "Maximum size of checksummed/compressed extents")\ ++ x(metadata_checksum, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_csum_opts), \ ++ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ ++ NULL, NULL) \ ++ x(data_checksum, u8, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_csum_opts), \ ++ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ ++ NULL, NULL) \ ++ x(compression, u8, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_compression_opts), \ ++ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ ++ NULL, NULL) \ ++ 
x(background_compression, u8, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_compression_opts), \ ++ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ ++ NULL, NULL) \ ++ x(str_hash, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_str_hash_opts), \ ++ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ ++ NULL, "Hash function for directory entries and xattrs")\ ++ x(metadata_target, u16, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_METADATA_TARGET, 0, \ ++ "(target)", "Device or label for metadata writes") \ ++ x(foreground_target, u16, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_FOREGROUND_TARGET, 0, \ ++ "(target)", "Device or label for foreground writes") \ ++ x(background_target, u16, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_BACKGROUND_TARGET, 0, \ ++ "(target)", "Device or label to move data to in the background")\ ++ x(promote_target, u16, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_PROMOTE_TARGET, 0, \ ++ "(target)", "Device or label to promote data to on read") \ ++ x(erasure_code, u16, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_ERASURE_CODE, false, \ ++ NULL, "Enable erasure coding (DO NOT USE YET)") \ ++ x(inodes_32bit, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_INODE_32BIT, true, \ ++ NULL, "Constrain inode numbers to 32 bits") \ ++ x(shard_inode_numbers, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_SHARD_INUMS, true, \ ++ NULL, "Shard new inode numbers by CPU id") \ ++ x(inodes_use_key_cache, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_INODES_USE_KEY_CACHE, true, \ ++ NULL, "Use the btree key cache for the inodes btree") \ ++ x(btree_node_mem_ptr_optimization, u8, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, true, \ ++ NULL, "Stash pointer to in memory btree node in btree ptr")\ ++ x(gc_reserve_percent, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(5, 21), \ ++ BCH_SB_GC_RESERVE, 8, \ ++ "%", "Percentage of disk space to reserve for copygc")\ ++ x(gc_reserve_bytes, u64, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \ ++ OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ ++ OPT_UINT(0, U64_MAX), \ ++ BCH_SB_GC_RESERVE_BYTES, 0, \ ++ "%", "Amount of disk space to reserve for copygc\n" \ ++ "Takes precedence over gc_reserve_percent if set")\ ++ x(root_reserve_percent, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(0, 100), \ ++ BCH_SB_ROOT_RESERVE, 0, \ ++ "%", "Percentage of disk space to reserve for superuser")\ ++ x(wide_macs, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_128_BIT_MACS, false, \ ++ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ ++ x(inline_data, u8, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, true, \ ++ NULL, "Enable inline data extents") \ ++ x(acl, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_POSIX_ACL, true, \ ++ NULL, "Enable POSIX acls") \ ++ x(usrquota, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_USRQUOTA, false, \ ++ NULL, "Enable user quotas") \ ++ x(grpquota, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_GRPQUOTA, false, \ ++ NULL, "Enable group quotas") \ ++ 
x(prjquota, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_PRJQUOTA, false, \ ++ NULL, "Enable project quotas") \ ++ x(degraded, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Allow mounting in degraded mode") \ ++ x(very_degraded, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Allow mounting in when data will be missing") \ ++ x(discard, u8, \ ++ OPT_FS|OPT_MOUNT|OPT_DEVICE, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, true, \ ++ NULL, "Enable discard/TRIM support") \ ++ x(verbose, u8, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Extra debugging information during mount/recovery")\ ++ x(journal_flush_delay, u32, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(1, U32_MAX), \ ++ BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ ++ NULL, "Delay in milliseconds before automatic journal commits")\ ++ x(journal_flush_disabled, u8, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_JOURNAL_FLUSH_DISABLED,false, \ ++ NULL, "Disable journal flush on sync/fsync\n" \ ++ "If enabled, writes can be lost, but only since the\n"\ ++ "last journal write (default 1 second)") \ ++ x(journal_reclaim_delay, u32, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(0, U32_MAX), \ ++ BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ ++ NULL, "Delay in milliseconds before automatic journal reclaim")\ ++ x(move_bytes_in_flight, u32, \ ++ OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(1024, U32_MAX), \ ++ BCH2_NO_SB_OPT, 1U << 20, \ ++ NULL, "Amount of IO in flight to keep in flight by the move path")\ ++ x(fsck, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Run fsck on mount") \ ++ x(fix_errors, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Fix errors during fsck without asking") \ ++ x(ratelimit_errors, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ ++ NULL, "Ratelimit error messages during fsck") \ ++ x(nochanges, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Super read only mode - no writes at all will be issued,\n"\ ++ "even if we have to replay the journal") \ ++ x(norecovery, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Don't replay the journal") \ ++ x(keep_journal, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Don't free journal entries/keys after startup")\ ++ x(read_entire_journal, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Read all journal entries, not just dirty ones")\ ++ x(read_journal_only, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Only read the journal, skip the rest of recovery")\ ++ x(noexcl, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Don't open device in exclusive mode") \ ++ x(direct_io, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, true, \ ++ NULL, "Use O_DIRECT (userspace only)") \ ++ x(sb, u64, \ ++ OPT_MOUNT, \ ++ OPT_UINT(0, S64_MAX), \ ++ BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ ++ "offset", "Sector offset of superblock") \ ++ x(read_only, u8, \ ++ OPT_FS, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, NULL) \ ++ x(nostart, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Don\'t start filesystem, only open devices") \ ++ x(reconstruct_alloc, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, 
false, \ ++ NULL, "Reconstruct alloc btree") \ ++ x(version_upgrade, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Set superblock to latest version,\n" \ ++ "allowing any new features to be used") \ ++ x(buckets_nouse, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Allocate the buckets_nouse bitmap") \ ++ x(project, u8, \ ++ OPT_INODE, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, NULL) \ ++ x(fs_size, u64, \ ++ OPT_DEVICE, \ ++ OPT_UINT(0, S64_MAX), \ ++ BCH2_NO_SB_OPT, 0, \ ++ "size", "Size of filesystem on device") \ ++ x(bucket, u32, \ ++ OPT_DEVICE, \ ++ OPT_UINT(0, S64_MAX), \ ++ BCH2_NO_SB_OPT, 0, \ ++ "size", "Size of filesystem on device") \ ++ x(durability, u8, \ ++ OPT_DEVICE, \ ++ OPT_UINT(0, BCH_REPLICAS_MAX), \ ++ BCH2_NO_SB_OPT, 1, \ ++ "n", "Data written to this device will be considered\n"\ ++ "to have already been replicated n times") ++ ++struct bch_opts { ++#define x(_name, _bits, ...) unsigned _name##_defined:1; ++ BCH_OPTS() ++#undef x ++ ++#define x(_name, _bits, ...) _bits _name; ++ BCH_OPTS() ++#undef x ++}; ++ ++static const struct bch_opts bch2_opts_default = { ++#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ ++ ._name##_defined = true, \ ++ ._name = _default, \ ++ ++ BCH_OPTS() ++#undef x ++}; ++ ++#define opt_defined(_opts, _name) ((_opts)._name##_defined) ++ ++#define opt_get(_opts, _name) \ ++ (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name) ++ ++#define opt_set(_opts, _name, _v) \ ++do { \ ++ (_opts)._name##_defined = true; \ ++ (_opts)._name = _v; \ ++} while (0) ++ ++static inline struct bch_opts bch2_opts_empty(void) ++{ ++ return (struct bch_opts) { 0 }; ++} ++ ++void bch2_opts_apply(struct bch_opts *, struct bch_opts); ++ ++enum bch_opt_id { ++#define x(_name, ...) 
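The struct bch_opts, the bch2_opts_default table and the opt_defined()/opt_get()/opt_set() accessors just above, as well as the enum of option ids that continues below, are all expanded from the single BCH_OPTS() x-macro table: each consumer re-defines x() and re-expands the same list. A minimal standalone sketch of that pattern, with invented names (MY_OPTS, my_opts, verbose, flush_delay) rather than the options in the patch:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* One row per option: name, C type, default value, help text. */
    #define MY_OPTS()                                                   \
        x(verbose,     uint8_t,  false, "Extra debug output")           \
        x(flush_delay, uint32_t, 1000,  "Journal flush delay (ms)")

    /* Expansion 1: a struct with a <name>_defined bit plus the value itself. */
    struct my_opts {
    #define x(_name, _type, _default, _help) unsigned _name##_defined:1;
        MY_OPTS()
    #undef x
    #define x(_name, _type, _default, _help) _type _name;
        MY_OPTS()
    #undef x
    };

    /* Expansion 2: the defaults, kept in one place. */
    static const struct my_opts my_opts_default = {
    #define x(_name, _type, _default, _help) ._name##_defined = 1, ._name = _default,
        MY_OPTS()
    #undef x
    };

    /* Expansion 3: an enum of option ids, in table order. */
    enum my_opt_id {
    #define x(_name, ...) Opt_##_name,
        MY_OPTS()
    #undef x
        my_opts_nr
    };

    #define opt_get(_opts, _name)                                       \
        ((_opts)._name##_defined ? (_opts)._name : my_opts_default._name)

    int main(void)
    {
        struct my_opts o = { 0 };       /* nothing set explicitly */

        /* Falls back to the default of 1000 from the table: */
        printf("flush_delay = %u\n", (unsigned) opt_get(o, flush_delay));
        printf("known options: %d\n", (int) my_opts_nr);
        return 0;
    }

Keeping every per-option fact (type, flags, superblock field, default, help text) in one table means a new option only has to be added in a single place.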
Opt_##_name, ++ BCH_OPTS() ++#undef x ++ bch2_opts_nr ++}; ++ ++struct bch_fs; ++struct printbuf; ++ ++struct bch_option { ++ struct attribute attr; ++ u64 (*get_sb)(const struct bch_sb *); ++ void (*set_sb)(struct bch_sb *, u64); ++ enum opt_type type; ++ enum opt_flags flags; ++ u64 min, max; ++ ++ const char * const *choices; ++ int (*parse)(struct bch_fs *, const char *, u64 *); ++ void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); ++ ++ const char *hint; ++ const char *help; ++ ++}; ++ ++extern const struct bch_option bch2_opt_table[]; ++ ++bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); ++u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); ++void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); ++ ++u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); ++int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); ++void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); ++void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); ++ ++int bch2_opt_lookup(const char *); ++int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); ++int bch2_opt_parse(struct bch_fs *, const struct bch_option *, ++ const char *, u64 *, struct printbuf *); ++ ++#define OPT_SHOW_FULL_LIST (1 << 0) ++#define OPT_SHOW_MOUNT_STYLE (1 << 1) ++ ++void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, ++ const struct bch_option *, u64, unsigned); ++ ++int bch2_opt_check_may_set(struct bch_fs *, int, u64); ++int bch2_opts_check_may_set(struct bch_fs *); ++int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *); ++ ++/* inode opts: */ ++ ++struct bch_io_opts { ++#define x(_name, _bits) unsigned _name##_defined:1; ++ BCH_INODE_OPTS() ++#undef x ++ ++#define x(_name, _bits) u##_bits _name; ++ BCH_INODE_OPTS() ++#undef x ++}; ++ ++struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); ++struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts); ++void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts); ++bool bch2_opt_is_inode_opt(enum bch_opt_id); ++ ++#endif /* _BCACHEFS_OPTS_H */ +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +new file mode 100644 +index 000000000000..454c76e03be9 +--- /dev/null ++++ b/fs/bcachefs/quota.c +@@ -0,0 +1,823 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "errcode.h" ++#include "inode.h" ++#include "quota.h" ++#include "subvolume.h" ++#include "super-io.h" ++ ++static const char * const bch2_quota_types[] = { ++ "user", ++ "group", ++ "project", ++}; ++ ++static const char * const bch2_quota_counters[] = { ++ "space", ++ "inodes", ++}; ++ ++static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_quota *q = field_to_type(f, quota); ++ ++ if (vstruct_bytes(&q->field) < sizeof(*q)) { ++ prt_printf(err, "wrong size (got %zu should be %zu)", ++ vstruct_bytes(&q->field), sizeof(*q)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_quota *q = field_to_type(f, quota); ++ unsigned qtyp, counter; ++ ++ for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) { ++ prt_printf(out, "%s: flags %llx", ++ bch2_quota_types[qtyp], ++ le64_to_cpu(q->q[qtyp].flags)); ++ ++ for (counter = 0; counter < Q_COUNTERS; counter++) ++ prt_printf(out, " %s timelimit %u warnlimit %u", ++ 
bch2_quota_counters[counter], ++ le32_to_cpu(q->q[qtyp].c[counter].timelimit), ++ le32_to_cpu(q->q[qtyp].c[counter].warnlimit)); ++ ++ prt_newline(out); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_quota = { ++ .validate = bch2_sb_quota_validate, ++ .to_text = bch2_sb_quota_to_text, ++}; ++ ++int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if (k.k->p.inode >= QTYP_NR) { ++ prt_printf(err, "invalid quota type (%llu >= %u)", ++ k.k->p.inode, QTYP_NR); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) { ++ prt_printf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_quota)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); ++ unsigned i; ++ ++ for (i = 0; i < Q_COUNTERS; i++) ++ prt_printf(out, "%s hardlimit %llu softlimit %llu", ++ bch2_quota_counters[i], ++ le64_to_cpu(dq.v->c[i].hardlimit), ++ le64_to_cpu(dq.v->c[i].softlimit)); ++} ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++#include ++#include ++#include ++ ++static inline unsigned __next_qtype(unsigned i, unsigned qtypes) ++{ ++ qtypes >>= i; ++ return qtypes ? i + __ffs(qtypes) : QTYP_NR; ++} ++ ++#define for_each_set_qtype(_c, _i, _q, _qtypes) \ ++ for (_i = 0; \ ++ (_i = __next_qtype(_i, _qtypes), \ ++ _q = &(_c)->quotas[_i], \ ++ _i < QTYP_NR); \ ++ _i++) ++ ++static bool ignore_hardlimit(struct bch_memquota_type *q) ++{ ++ if (capable(CAP_SYS_RESOURCE)) ++ return true; ++#if 0 ++ struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; ++ ++ return capable(CAP_SYS_RESOURCE) && ++ (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || ++ !(info->dqi_flags & DQF_ROOT_SQUASH)); ++#endif ++ return false; ++} ++ ++enum quota_msg { ++ SOFTWARN, /* Softlimit reached */ ++ SOFTLONGWARN, /* Grace time expired */ ++ HARDWARN, /* Hardlimit reached */ ++ ++ HARDBELOW, /* Usage got below inode hardlimit */ ++ SOFTBELOW, /* Usage got below inode softlimit */ ++}; ++ ++static int quota_nl[][Q_COUNTERS] = { ++ [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, ++ [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, ++ [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, ++ [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, ++ [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, ++ ++ [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, ++ [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, ++ [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, ++ [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, ++ [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, ++}; ++ ++struct quota_msgs { ++ u8 nr; ++ struct { ++ u8 qtype; ++ u8 msg; ++ } m[QTYP_NR * Q_COUNTERS]; ++}; ++ ++static void prepare_msg(unsigned qtype, ++ enum quota_counters counter, ++ struct quota_msgs *msgs, ++ enum quota_msg msg_type) ++{ ++ BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); ++ ++ msgs->m[msgs->nr].qtype = qtype; ++ msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; ++ msgs->nr++; ++} ++ ++static void prepare_warning(struct memquota_counter *qc, ++ unsigned qtype, ++ enum quota_counters counter, ++ struct quota_msgs *msgs, ++ enum quota_msg msg_type) ++{ ++ if (qc->warning_issued & (1 << msg_type)) ++ return; ++ ++ prepare_msg(qtype, counter, msgs, msg_type); ++} ++ ++static void flush_warnings(struct bch_qid qid, ++ struct super_block *sb, ++ struct quota_msgs *msgs) ++{ ++ unsigned i; ++ ++ for (i = 0; i < msgs->nr; i++) ++ quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, 
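The __next_qtype()/for_each_set_qtype() helpers above walk only the quota types whose bit is set in a mask, using find-first-set on the remaining bits. A userspace sketch of that bit-iteration idea, using POSIX ffs() in place of the kernel's __ffs() and invented names, without the per-type mutexes:

    #include <stdio.h>
    #include <strings.h>    /* ffs() */

    enum { QTYP_USR, QTYP_GRP, QTYP_PRJ, QTYP_NR };

    /* Next quota type >= i whose bit is set in mask, or QTYP_NR if none. */
    static unsigned next_set_type(unsigned i, unsigned mask)
    {
        mask >>= i;
        return mask ? i + (unsigned) (ffs((int) mask) - 1) : QTYP_NR;
    }

    int main(void)
    {
        static const char * const names[] = { "user", "group", "project" };
        unsigned mask = (1U << QTYP_USR) | (1U << QTYP_PRJ);
        unsigned i;

        for (i = 0; (i = next_set_type(i, mask)) < QTYP_NR; i++)
            printf("quota type enabled: %s\n", names[i]);

        return 0;
    }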
qid.q[i]), ++ sb->s_dev, msgs->m[i].msg); ++} ++ ++static int bch2_quota_check_limit(struct bch_fs *c, ++ unsigned qtype, ++ struct bch_memquota *mq, ++ struct quota_msgs *msgs, ++ enum quota_counters counter, ++ s64 v, ++ enum quota_acct_mode mode) ++{ ++ struct bch_memquota_type *q = &c->quotas[qtype]; ++ struct memquota_counter *qc = &mq->c[counter]; ++ u64 n = qc->v + v; ++ ++ BUG_ON((s64) n < 0); ++ ++ if (mode == KEY_TYPE_QUOTA_NOCHECK) ++ return 0; ++ ++ if (v <= 0) { ++ if (n < qc->hardlimit && ++ (qc->warning_issued & (1 << HARDWARN))) { ++ qc->warning_issued &= ~(1 << HARDWARN); ++ prepare_msg(qtype, counter, msgs, HARDBELOW); ++ } ++ ++ if (n < qc->softlimit && ++ (qc->warning_issued & (1 << SOFTWARN))) { ++ qc->warning_issued &= ~(1 << SOFTWARN); ++ prepare_msg(qtype, counter, msgs, SOFTBELOW); ++ } ++ ++ qc->warning_issued = 0; ++ return 0; ++ } ++ ++ if (qc->hardlimit && ++ qc->hardlimit < n && ++ !ignore_hardlimit(q)) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, msgs, HARDWARN); ++ } ++ ++ if (qc->softlimit && ++ qc->softlimit < n && ++ qc->timer && ++ ktime_get_real_seconds() >= qc->timer && ++ !ignore_hardlimit(q)) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); ++ } ++ ++ if (qc->softlimit && ++ qc->softlimit < n && ++ qc->timer == 0) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, msgs, SOFTWARN); ++ ++ /* XXX is this the right one? */ ++ qc->timer = ktime_get_real_seconds() + ++ q->limits[counter].warnlimit; ++ } ++ ++ return 0; ++} ++ ++int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, ++ enum quota_counters counter, s64 v, ++ enum quota_acct_mode mode) ++{ ++ unsigned qtypes = enabled_qtypes(c); ++ struct bch_memquota_type *q; ++ struct bch_memquota *mq[QTYP_NR]; ++ struct quota_msgs msgs; ++ unsigned i; ++ int ret = 0; ++ ++ memset(&msgs, 0, sizeof(msgs)); ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_lock_nested(&q->lock, i); ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS); ++ if (!mq[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); ++ if (ret) ++ goto err; ++ } ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mq[i]->c[counter].v += v; ++err: ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_unlock(&q->lock); ++ ++ flush_warnings(qid, c->vfs_sb, &msgs); ++ ++ return ret; ++} ++ ++static void __bch2_quota_transfer(struct bch_memquota *src_q, ++ struct bch_memquota *dst_q, ++ enum quota_counters counter, s64 v) ++{ ++ BUG_ON(v > src_q->c[counter].v); ++ BUG_ON(v + dst_q->c[counter].v < v); ++ ++ src_q->c[counter].v -= v; ++ dst_q->c[counter].v += v; ++} ++ ++int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, ++ struct bch_qid dst, ++ struct bch_qid src, u64 space, ++ enum quota_acct_mode mode) ++{ ++ struct bch_memquota_type *q; ++ struct bch_memquota *src_q[3], *dst_q[3]; ++ struct quota_msgs msgs; ++ unsigned i; ++ int ret = 0; ++ ++ qtypes &= enabled_qtypes(c); ++ ++ memset(&msgs, 0, sizeof(msgs)); ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_lock_nested(&q->lock, i); ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS); ++ dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS); ++ ++ if (!src_q[i] || !dst_q[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = 
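bch2_quota_check_limit() above distinguishes hard limits, which are refused outright for preallocation-style callers, from soft limits, which only start failing once a grace timer has expired; freeing space clears any outstanding warnings. A much-simplified userspace sketch of that decision, with made-up names and without the warning-message plumbing:

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    struct toy_counter {
        uint64_t v, softlimit, hardlimit;
        int64_t  grace_expires;     /* 0 = grace period not running */
    };

    static int toy_check_limit(struct toy_counter *qc, int64_t delta,
                               int enforce, unsigned grace_seconds)
    {
        uint64_t n = qc->v + delta;
        int64_t now = (int64_t) time(NULL);

        if (delta <= 0)             /* freeing space never fails */
            return 0;

        if (qc->hardlimit && n > qc->hardlimit)
            return enforce ? -EDQUOT : 0;

        if (qc->softlimit && n > qc->softlimit) {
            if (!qc->grace_expires)                 /* start grace period */
                qc->grace_expires = now + grace_seconds;
            else if (now >= qc->grace_expires)      /* grace period over */
                return enforce ? -EDQUOT : 0;
        }

        return 0;
    }

    int main(void)
    {
        struct toy_counter qc = { .v = 90, .softlimit = 100, .hardlimit = 120 };

        printf("+5:  %d\n", toy_check_limit(&qc, 5, 1, 7 * 24 * 3600));  /* 0 */
        printf("+40: %d\n", toy_check_limit(&qc, 40, 1, 7 * 24 * 3600)); /* -EDQUOT */
        return 0;
    }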
bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, ++ dst_q[i]->c[Q_SPC].v + space, ++ mode); ++ if (ret) ++ goto err; ++ ++ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, ++ dst_q[i]->c[Q_INO].v + 1, ++ mode); ++ if (ret) ++ goto err; ++ } ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); ++ __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); ++ } ++ ++err: ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_unlock(&q->lock); ++ ++ flush_warnings(dst, c->vfs_sb, &msgs); ++ ++ return ret; ++} ++ ++static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_quota dq; ++ struct bch_memquota_type *q; ++ struct bch_memquota *mq; ++ unsigned i; ++ ++ BUG_ON(k.k->p.inode >= QTYP_NR); ++ ++ if (!((1U << k.k->p.inode) & enabled_qtypes(c))) ++ return 0; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_quota: ++ dq = bkey_s_c_to_quota(k); ++ q = &c->quotas[k.k->p.inode]; ++ ++ mutex_lock(&q->lock); ++ mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); ++ if (!mq) { ++ mutex_unlock(&q->lock); ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < Q_COUNTERS; i++) { ++ mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); ++ mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); ++ } ++ ++ mutex_unlock(&q->lock); ++ } ++ ++ return 0; ++} ++ ++void bch2_fs_quota_exit(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) ++ genradix_free(&c->quotas[i].table); ++} ++ ++void bch2_fs_quota_init(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) ++ mutex_init(&c->quotas[i].lock); ++} ++ ++static void bch2_sb_quota_read(struct bch_fs *c) ++{ ++ struct bch_sb_field_quota *sb_quota; ++ unsigned i, j; ++ ++ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); ++ if (!sb_quota) ++ return; ++ ++ for (i = 0; i < QTYP_NR; i++) { ++ struct bch_memquota_type *q = &c->quotas[i]; ++ ++ for (j = 0; j < Q_COUNTERS; j++) { ++ q->limits[j].timelimit = ++ le32_to_cpu(sb_quota->q[i].c[j].timelimit); ++ q->limits[j].warnlimit = ++ le32_to_cpu(sb_quota->q[i].c[j].warnlimit); ++ } ++ } ++} ++ ++static int bch2_fs_quota_read_inode(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked u; ++ struct bch_subvolume subvolume; ++ int ret; ++ ++ ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume); ++ if (ret) ++ return ret; ++ ++ /* ++ * We don't do quota accounting in snapshots: ++ */ ++ if (BCH_SUBVOLUME_SNAP(&subvolume)) ++ goto advance; ++ ++ if (!bkey_is_inode(k.k)) ++ goto advance; ++ ++ ret = bch2_inode_unpack(k, &u); ++ if (ret) ++ return ret; ++ ++ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, ++ KEY_TYPE_QUOTA_NOCHECK); ++ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, ++ KEY_TYPE_QUOTA_NOCHECK); ++advance: ++ bch2_btree_iter_set_pos(iter, POS(iter->pos.inode, iter->pos.offset + 1)); ++ return 0; ++} ++ ++int bch2_fs_quota_read(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ bch2_sb_quota_read(c); ++ mutex_unlock(&c->sb_lock); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas, ++ POS_MIN, BTREE_ITER_PREFETCH, k, ++ __bch2_quota_set(c, k)) ?: ++ for_each_btree_key2(&trans, iter, BTREE_ID_inodes, ++ POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ bch2_fs_quota_read_inode(&trans, &iter, k)); ++ if (ret) ++ 
bch_err(c, "err in quota_read: %s", bch2_err_str(ret)); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++/* Enable/disable/delete quotas for an entire filesystem: */ ++ ++static int bch2_quota_enable(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ /* Accounting must be enabled at mount time: */ ++ if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) ++ return -EINVAL; ++ ++ /* Can't enable enforcement without accounting: */ ++ if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) ++ return -EINVAL; ++ ++ if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) ++ return -EINVAL; ++ ++ if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) ++ return -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ if (uflags & FS_QUOTA_UDQ_ENFD) ++ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); ++ ++ if (uflags & FS_QUOTA_GDQ_ENFD) ++ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); ++ ++ if (uflags & FS_QUOTA_PDQ_ENFD) ++ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++static int bch2_quota_disable(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ mutex_lock(&c->sb_lock); ++ if (uflags & FS_QUOTA_UDQ_ENFD) ++ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); ++ ++ if (uflags & FS_QUOTA_GDQ_ENFD) ++ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); ++ ++ if (uflags & FS_QUOTA_PDQ_ENFD) ++ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ int ret; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ if (uflags & FS_USER_QUOTA) { ++ if (c->opts.usrquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_quotas, ++ POS(QTYP_USR, 0), ++ POS(QTYP_USR + 1, 0), ++ 0, NULL); ++ if (ret) ++ return ret; ++ } ++ ++ if (uflags & FS_GROUP_QUOTA) { ++ if (c->opts.grpquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_quotas, ++ POS(QTYP_GRP, 0), ++ POS(QTYP_GRP + 1, 0), ++ 0, NULL); ++ if (ret) ++ return ret; ++ } ++ ++ if (uflags & FS_PROJ_QUOTA) { ++ if (c->opts.prjquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_quotas, ++ POS(QTYP_PRJ, 0), ++ POS(QTYP_PRJ + 1, 0), ++ 0, NULL); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Return quota status information, such as enforcements, quota file inode ++ * numbers etc. 
++ */ ++static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ unsigned qtypes = enabled_qtypes(c); ++ unsigned i; ++ ++ memset(state, 0, sizeof(*state)); ++ ++ for (i = 0; i < QTYP_NR; i++) { ++ state->s_state[i].flags |= QCI_SYSFILE; ++ ++ if (!(qtypes & (1 << i))) ++ continue; ++ ++ state->s_state[i].flags |= QCI_ACCT_ENABLED; ++ ++ state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; ++ state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; ++ ++ state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; ++ state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Adjust quota timers & warnings ++ */ ++static int bch2_quota_set_info(struct super_block *sb, int type, ++ struct qc_info *info) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_sb_field_quota *sb_quota; ++ struct bch_memquota_type *q; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ if (type >= QTYP_NR) ++ return -EINVAL; ++ ++ if (!((1 << type) & enabled_qtypes(c))) ++ return -ESRCH; ++ ++ if (info->i_fieldmask & ++ ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) ++ return -EINVAL; ++ ++ q = &c->quotas[type]; ++ ++ mutex_lock(&c->sb_lock); ++ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); ++ if (!sb_quota) { ++ sb_quota = bch2_sb_resize_quota(&c->disk_sb, ++ sizeof(*sb_quota) / sizeof(u64)); ++ if (!sb_quota) ++ return -ENOSPC; ++ } ++ ++ if (info->i_fieldmask & QC_SPC_TIMER) ++ sb_quota->q[type].c[Q_SPC].timelimit = ++ cpu_to_le32(info->i_spc_timelimit); ++ ++ if (info->i_fieldmask & QC_SPC_WARNS) ++ sb_quota->q[type].c[Q_SPC].warnlimit = ++ cpu_to_le32(info->i_spc_warnlimit); ++ ++ if (info->i_fieldmask & QC_INO_TIMER) ++ sb_quota->q[type].c[Q_INO].timelimit = ++ cpu_to_le32(info->i_ino_timelimit); ++ ++ if (info->i_fieldmask & QC_INO_WARNS) ++ sb_quota->q[type].c[Q_INO].warnlimit = ++ cpu_to_le32(info->i_ino_warnlimit); ++ ++ bch2_sb_quota_read(c); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++/* Get/set individual quotas: */ ++ ++static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) ++{ ++ dst->d_space = src->c[Q_SPC].v << 9; ++ dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; ++ dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; ++ dst->d_spc_timer = src->c[Q_SPC].timer; ++ dst->d_spc_warns = src->c[Q_SPC].warns; ++ ++ dst->d_ino_count = src->c[Q_INO].v; ++ dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; ++ dst->d_ino_softlimit = src->c[Q_INO].softlimit; ++ dst->d_ino_timer = src->c[Q_INO].timer; ++ dst->d_ino_warns = src->c[Q_INO].warns; ++} ++ ++static int bch2_get_quota(struct super_block *sb, struct kqid kqid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_memquota_type *q = &c->quotas[kqid.type]; ++ qid_t qid = from_kqid(&init_user_ns, kqid); ++ struct bch_memquota *mq; ++ ++ memset(qdq, 0, sizeof(*qdq)); ++ ++ mutex_lock(&q->lock); ++ mq = genradix_ptr(&q->table, qid); ++ if (mq) ++ __bch2_quota_get(qdq, mq); ++ mutex_unlock(&q->lock); ++ ++ return 0; ++} ++ ++static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_memquota_type *q = &c->quotas[kqid->type]; ++ qid_t qid = from_kqid(&init_user_ns, *kqid); ++ struct genradix_iter iter; ++ struct bch_memquota *mq; ++ int ret = 0; ++ ++ mutex_lock(&q->lock); ++ ++ 
genradix_for_each_from(&q->table, iter, mq, qid) ++ if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { ++ __bch2_quota_get(qdq, mq); ++ *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); ++ goto found; ++ } ++ ++ ret = -ENOENT; ++found: ++ mutex_unlock(&q->lock); ++ return ret; ++} ++ ++static int bch2_set_quota_trans(struct btree_trans *trans, ++ struct bkey_i_quota *new_quota, ++ struct qc_dqblk *qdq) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_quotas, new_quota->k.p, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ++ ret = bkey_err(k); ++ if (unlikely(ret)) ++ return ret; ++ ++ if (k.k->type == KEY_TYPE_quota) ++ new_quota->v = *bkey_s_c_to_quota(k).v; ++ ++ if (qdq->d_fieldmask & QC_SPC_SOFT) ++ new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); ++ if (qdq->d_fieldmask & QC_SPC_HARD) ++ new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); ++ ++ if (qdq->d_fieldmask & QC_INO_SOFT) ++ new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); ++ if (qdq->d_fieldmask & QC_INO_HARD) ++ new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); ++ ++ ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int bch2_set_quota(struct super_block *sb, struct kqid qid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bkey_i_quota new_quota; ++ int ret; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ bkey_quota_init(&new_quota.k_i); ++ new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); ++ ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: ++ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); ++ ++ return ret; ++} ++ ++const struct quotactl_ops bch2_quotactl_operations = { ++ .quota_enable = bch2_quota_enable, ++ .quota_disable = bch2_quota_disable, ++ .rm_xquota = bch2_quota_remove, ++ ++ .get_state = bch2_quota_get_state, ++ .set_info = bch2_quota_set_info, ++ ++ .get_dqblk = bch2_get_quota, ++ .get_nextdqblk = bch2_get_next_quota, ++ .set_dqblk = bch2_set_quota, ++}; ++ ++#endif /* CONFIG_BCACHEFS_QUOTA */ +diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h +new file mode 100644 +index 000000000000..8c67ae1da7c7 +--- /dev/null ++++ b/fs/bcachefs/quota.h +@@ -0,0 +1,71 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_QUOTA_H ++#define _BCACHEFS_QUOTA_H ++ ++#include "inode.h" ++#include "quota_types.h" ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_quota; ++ ++int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_quota (struct bkey_ops) { \ ++ .key_invalid = bch2_quota_invalid, \ ++ .val_to_text = bch2_quota_to_text, \ ++} ++ ++static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) ++{ ++ return (struct bch_qid) { ++ .q[QTYP_USR] = u->bi_uid, ++ .q[QTYP_GRP] = u->bi_gid, ++ .q[QTYP_PRJ] = u->bi_project ? 
u->bi_project - 1 : 0, ++ }; ++} ++ ++static inline unsigned enabled_qtypes(struct bch_fs *c) ++{ ++ return ((c->opts.usrquota << QTYP_USR)| ++ (c->opts.grpquota << QTYP_GRP)| ++ (c->opts.prjquota << QTYP_PRJ)); ++} ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, ++ s64, enum quota_acct_mode); ++ ++int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, ++ struct bch_qid, u64, enum quota_acct_mode); ++ ++void bch2_fs_quota_exit(struct bch_fs *); ++void bch2_fs_quota_init(struct bch_fs *); ++int bch2_fs_quota_read(struct bch_fs *); ++ ++extern const struct quotactl_ops bch2_quotactl_operations; ++ ++#else ++ ++static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, ++ enum quota_counters counter, s64 v, ++ enum quota_acct_mode mode) ++{ ++ return 0; ++} ++ ++static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, ++ struct bch_qid dst, ++ struct bch_qid src, u64 space, ++ enum quota_acct_mode mode) ++{ ++ return 0; ++} ++ ++static inline void bch2_fs_quota_exit(struct bch_fs *c) {} ++static inline void bch2_fs_quota_init(struct bch_fs *c) {} ++static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; } ++ ++#endif ++ ++#endif /* _BCACHEFS_QUOTA_H */ +diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h +new file mode 100644 +index 000000000000..6a136083d389 +--- /dev/null ++++ b/fs/bcachefs/quota_types.h +@@ -0,0 +1,43 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_QUOTA_TYPES_H ++#define _BCACHEFS_QUOTA_TYPES_H ++ ++#include ++ ++struct bch_qid { ++ u32 q[QTYP_NR]; ++}; ++ ++enum quota_acct_mode { ++ KEY_TYPE_QUOTA_PREALLOC, ++ KEY_TYPE_QUOTA_WARN, ++ KEY_TYPE_QUOTA_NOCHECK, ++}; ++ ++struct memquota_counter { ++ u64 v; ++ u64 hardlimit; ++ u64 softlimit; ++ s64 timer; ++ int warns; ++ int warning_issued; ++}; ++ ++struct bch_memquota { ++ struct memquota_counter c[Q_COUNTERS]; ++}; ++ ++typedef GENRADIX(struct bch_memquota) bch_memquota_table; ++ ++struct quota_limit { ++ u32 timelimit; ++ u32 warnlimit; ++}; ++ ++struct bch_memquota_type { ++ struct quota_limit limits[Q_COUNTERS]; ++ bch_memquota_table table; ++ struct mutex lock; ++}; ++ ++#endif /* _BCACHEFS_QUOTA_TYPES_H */ +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +new file mode 100644 +index 000000000000..ecc64dd92b05 +--- /dev/null ++++ b/fs/bcachefs/rebalance.c +@@ -0,0 +1,361 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "errcode.h" ++#include "extents.h" ++#include "io.h" ++#include "move.h" ++#include "rebalance.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * Check if an extent should be moved: ++ * returns -1 if it should not be moved, or ++ * device of pointer that should be moved, if known, or INT_MAX if unknown ++ */ ++static bool rebalance_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_update_opts *data_opts) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ unsigned i; ++ ++ data_opts->rewrite_ptrs = 0; ++ data_opts->target = io_opts->background_target; ++ data_opts->extra_replicas = 0; ++ data_opts->btree_insert_flags = 0; ++ ++ if (io_opts->background_compression && ++ !bch2_bkey_is_incompressible(k)) { ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ i = 0; ++ 
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ if (!p.ptr.cached && ++ p.crc.compression_type != ++ bch2_compression_opt_to_type[io_opts->background_compression]) ++ data_opts->rewrite_ptrs |= 1U << i; ++ i++; ++ } ++ } ++ ++ if (io_opts->background_target) { ++ const struct bch_extent_ptr *ptr; ++ ++ i = 0; ++ bkey_for_each_ptr(ptrs, ptr) { ++ if (!ptr->cached && ++ !bch2_dev_in_target(c, ptr->dev, io_opts->background_target)) ++ data_opts->rewrite_ptrs |= 1U << i; ++ i++; ++ } ++ } ++ ++ return data_opts->rewrite_ptrs != 0; ++} ++ ++void bch2_rebalance_add_key(struct bch_fs *c, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts) ++{ ++ struct data_update_opts update_opts = { 0 }; ++ struct bkey_ptrs_c ptrs; ++ const struct bch_extent_ptr *ptr; ++ unsigned i; ++ ++ if (!rebalance_pred(c, NULL, k, io_opts, &update_opts)) ++ return; ++ ++ i = 0; ++ ptrs = bch2_bkey_ptrs_c(k); ++ bkey_for_each_ptr(ptrs, ptr) { ++ if ((1U << i) && update_opts.rewrite_ptrs) ++ if (atomic64_add_return(k.k->size, ++ &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) == ++ k.k->size) ++ rebalance_wakeup(c); ++ i++; ++ } ++} ++ ++void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) ++{ ++ if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == ++ sectors) ++ rebalance_wakeup(c); ++} ++ ++struct rebalance_work { ++ int dev_most_full_idx; ++ unsigned dev_most_full_percent; ++ u64 dev_most_full_work; ++ u64 dev_most_full_capacity; ++ u64 total_work; ++}; ++ ++static void rebalance_work_accumulate(struct rebalance_work *w, ++ u64 dev_work, u64 unknown_dev, u64 capacity, int idx) ++{ ++ unsigned percent_full; ++ u64 work = dev_work + unknown_dev; ++ ++ if (work < dev_work || work < unknown_dev) ++ work = U64_MAX; ++ work = min(work, capacity); ++ ++ percent_full = div64_u64(work * 100, capacity); ++ ++ if (percent_full >= w->dev_most_full_percent) { ++ w->dev_most_full_idx = idx; ++ w->dev_most_full_percent = percent_full; ++ w->dev_most_full_work = work; ++ w->dev_most_full_capacity = capacity; ++ } ++ ++ if (w->total_work + dev_work >= w->total_work && ++ w->total_work + dev_work >= dev_work) ++ w->total_work += dev_work; ++} ++ ++static struct rebalance_work rebalance_work(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct rebalance_work ret = { .dev_most_full_idx = -1 }; ++ u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) ++ rebalance_work_accumulate(&ret, ++ atomic64_read(&ca->rebalance_work), ++ unknown_dev, ++ bucket_to_sector(ca, ca->mi.nbuckets - ++ ca->mi.first_bucket), ++ i); ++ ++ rebalance_work_accumulate(&ret, ++ unknown_dev, 0, c->capacity, -1); ++ ++ return ret; ++} ++ ++static void rebalance_work_reset(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) ++ atomic64_set(&ca->rebalance_work, 0); ++ ++ atomic64_set(&c->rebalance.work_unknown_dev, 0); ++} ++ ++static unsigned long curr_cputime(void) ++{ ++ u64 utime, stime; ++ ++ task_cputime_adjusted(current, &utime, &stime); ++ return nsecs_to_jiffies(utime + stime); ++} ++ ++static int bch2_rebalance_thread(void *arg) ++{ ++ struct bch_fs *c = arg; ++ struct bch_fs_rebalance *r = &c->rebalance; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ struct rebalance_work w, p; ++ struct bch_move_stats move_stats; ++ unsigned long start, prev_start; ++ unsigned long prev_run_time, prev_run_cputime; ++ unsigned long cputime, prev_cputime; ++ u64 io_start; ++ long throttle; ++ ++ set_freezable(); ++ ++ io_start = 
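rebalance_pred() above builds a per-replica bitmask: bit i is set when pointer i of the extent is stored with the wrong compression type or on a device outside the background target, and the extent is only rewritten if the mask ends up non-zero. A simplified userspace sketch of that decision, with invented structures (toy_ptr, rewrite_mask) and a plain device bitmask standing in for bch2_dev_in_target():

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_ptr { unsigned dev; unsigned compression; bool cached; };

    static unsigned rewrite_mask(const struct toy_ptr *ptrs, unsigned nr,
                                 unsigned want_compression,
                                 unsigned target_devs /* bitmask of devices */)
    {
        unsigned i, mask = 0;

        for (i = 0; i < nr; i++) {
            if (ptrs[i].cached)
                continue;       /* cached copies are never rewritten */

            if (want_compression &&
                ptrs[i].compression != want_compression)
                mask |= 1U << i;

            if (target_devs &&
                !(target_devs & (1U << ptrs[i].dev)))
                mask |= 1U << i;
        }

        return mask;
    }

    int main(void)
    {
        struct toy_ptr ptrs[] = {
            { .dev = 0, .compression = 0 },     /* wrong compression */
            { .dev = 3, .compression = 2 },     /* not on a target device */
            { .dev = 1, .compression = 2 },     /* fine as-is */
        };

        unsigned mask = rewrite_mask(ptrs, 3, /* want */ 2,
                                     (1U << 0) | (1U << 1));
        printf("rewrite_ptrs = 0x%x\n", mask);  /* prints 0x3 */
        return 0;
    }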
atomic64_read(&clock->now); ++ p = rebalance_work(c); ++ prev_start = jiffies; ++ prev_cputime = curr_cputime(); ++ ++ bch_move_stats_init(&move_stats, "rebalance"); ++ while (!kthread_wait_freezable(r->enabled)) { ++ cond_resched(); ++ ++ start = jiffies; ++ cputime = curr_cputime(); ++ ++ prev_run_time = start - prev_start; ++ prev_run_cputime = cputime - prev_cputime; ++ ++ w = rebalance_work(c); ++ BUG_ON(!w.dev_most_full_capacity); ++ ++ if (!w.total_work) { ++ r->state = REBALANCE_WAITING; ++ kthread_wait_freezable(rebalance_work(c).total_work); ++ continue; ++ } ++ ++ /* ++ * If there isn't much work to do, throttle cpu usage: ++ */ ++ throttle = prev_run_cputime * 100 / ++ max(1U, w.dev_most_full_percent) - ++ prev_run_time; ++ ++ if (w.dev_most_full_percent < 20 && throttle > 0) { ++ r->throttled_until_iotime = io_start + ++ div_u64(w.dev_most_full_capacity * ++ (20 - w.dev_most_full_percent), ++ 50); ++ ++ if (atomic64_read(&clock->now) + clock->max_slop < ++ r->throttled_until_iotime) { ++ r->throttled_until_cputime = start + throttle; ++ r->state = REBALANCE_THROTTLED; ++ ++ bch2_kthread_io_clock_wait(clock, ++ r->throttled_until_iotime, ++ throttle); ++ continue; ++ } ++ } ++ ++ /* minimum 1 mb/sec: */ ++ r->pd.rate.rate = ++ max_t(u64, 1 << 11, ++ r->pd.rate.rate * ++ max(p.dev_most_full_percent, 1U) / ++ max(w.dev_most_full_percent, 1U)); ++ ++ io_start = atomic64_read(&clock->now); ++ p = w; ++ prev_start = start; ++ prev_cputime = cputime; ++ ++ r->state = REBALANCE_RUNNING; ++ memset(&move_stats, 0, sizeof(move_stats)); ++ rebalance_work_reset(c); ++ ++ bch2_move_data(c, ++ 0, POS_MIN, ++ BTREE_ID_NR, POS_MAX, ++ /* ratelimiting disabled for now */ ++ NULL, /* &r->pd.rate, */ ++ &move_stats, ++ writepoint_ptr(&c->rebalance_write_point), ++ true, ++ rebalance_pred, NULL); ++ } ++ ++ return 0; ++} ++ ++void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct bch_fs_rebalance *r = &c->rebalance; ++ struct rebalance_work w = rebalance_work(c); ++ ++ out->tabstops[0] = 20; ++ ++ prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx); ++ prt_tab(out); ++ ++ prt_human_readable_u64(out, w.dev_most_full_work << 9); ++ prt_printf(out, "/"); ++ prt_human_readable_u64(out, w.dev_most_full_capacity << 9); ++ prt_newline(out); ++ ++ prt_printf(out, "total work:"); ++ prt_tab(out); ++ ++ prt_human_readable_u64(out, w.total_work << 9); ++ prt_printf(out, "/"); ++ prt_human_readable_u64(out, c->capacity << 9); ++ prt_newline(out); ++ ++ prt_printf(out, "rate:"); ++ prt_tab(out); ++ prt_printf(out, "%u", r->pd.rate.rate); ++ prt_newline(out); ++ ++ switch (r->state) { ++ case REBALANCE_WAITING: ++ prt_printf(out, "waiting"); ++ break; ++ case REBALANCE_THROTTLED: ++ prt_printf(out, "throttled for %lu sec or ", ++ (r->throttled_until_cputime - jiffies) / HZ); ++ prt_human_readable_u64(out, ++ (r->throttled_until_iotime - ++ atomic64_read(&c->io_clock[WRITE].now)) << 9); ++ prt_printf(out, " io"); ++ break; ++ case REBALANCE_RUNNING: ++ prt_printf(out, "running"); ++ break; ++ } ++ prt_newline(out); ++} ++ ++void bch2_rebalance_stop(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ c->rebalance.pd.rate.rate = UINT_MAX; ++ bch2_ratelimit_reset(&c->rebalance.pd.rate); ++ ++ p = rcu_dereference_protected(c->rebalance.thread, 1); ++ c->rebalance.thread = NULL; ++ ++ if (p) { ++ /* for sychronizing with rebalance_wakeup() */ ++ synchronize_rcu(); ++ ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++int bch2_rebalance_start(struct bch_fs *c) ++{ ++ 
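The throttling logic in bch2_rebalance_thread() above ties CPU usage to how full the fullest device is: if that device is only p% full, the thread aims to spend roughly p% of its wall-clock time doing work and sleeps off the excess. A small sketch of just that arithmetic (throttle_for() and the tick units are illustrative; the real code works in jiffies and io-clock sectors):

    #include <stdio.h>

    static long throttle_for(unsigned long run_cputime,
                             unsigned long run_time,
                             unsigned percent_full)
    {
        if (percent_full < 1)
            percent_full = 1;

        return (long) (run_cputime * 100 / percent_full) - (long) run_time;
    }

    int main(void)
    {
        /* 10 ticks of CPU over 100 ticks while the disk is 10% full:
         * 10 * 100 / 10 - 100 = 0, already at the target, no sleep owed. */
        printf("%ld\n", throttle_for(10, 100, 10));

        /* 50 ticks of CPU over 100 ticks at 10% full:
         * 50 * 100 / 10 - 100 = 400 ticks of sleep owed. */
        printf("%ld\n", throttle_for(50, 100, 10));
        return 0;
    }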
struct task_struct *p; ++ int ret; ++ ++ if (c->rebalance.thread) ++ return 0; ++ ++ if (c->opts.nochanges) ++ return 0; ++ ++ p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); ++ ret = PTR_ERR_OR_ZERO(p); ++ if (ret) { ++ bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ++ get_task_struct(p); ++ rcu_assign_pointer(c->rebalance.thread, p); ++ wake_up_process(p); ++ return 0; ++} ++ ++void bch2_fs_rebalance_init(struct bch_fs *c) ++{ ++ bch2_pd_controller_init(&c->rebalance.pd); ++ ++ atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); ++} +diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h +new file mode 100644 +index 000000000000..7ade0bb81cce +--- /dev/null ++++ b/fs/bcachefs/rebalance.h +@@ -0,0 +1,28 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REBALANCE_H ++#define _BCACHEFS_REBALANCE_H ++ ++#include "rebalance_types.h" ++ ++static inline void rebalance_wakeup(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ rcu_read_lock(); ++ p = rcu_dereference(c->rebalance.thread); ++ if (p) ++ wake_up_process(p); ++ rcu_read_unlock(); ++} ++ ++void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, ++ struct bch_io_opts *); ++void bch2_rebalance_add_work(struct bch_fs *, u64); ++ ++void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *); ++ ++void bch2_rebalance_stop(struct bch_fs *); ++int bch2_rebalance_start(struct bch_fs *); ++void bch2_fs_rebalance_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_REBALANCE_H */ +diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h +new file mode 100644 +index 000000000000..7462a92e9598 +--- /dev/null ++++ b/fs/bcachefs/rebalance_types.h +@@ -0,0 +1,26 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REBALANCE_TYPES_H ++#define _BCACHEFS_REBALANCE_TYPES_H ++ ++#include "move_types.h" ++ ++enum rebalance_state { ++ REBALANCE_WAITING, ++ REBALANCE_THROTTLED, ++ REBALANCE_RUNNING, ++}; ++ ++struct bch_fs_rebalance { ++ struct task_struct __rcu *thread; ++ struct bch_pd_controller pd; ++ ++ atomic64_t work_unknown_dev; ++ ++ enum rebalance_state state; ++ u64 throttled_until_iotime; ++ unsigned long throttled_until_cputime; ++ ++ unsigned enabled:1; ++}; ++ ++#endif /* _BCACHEFS_REBALANCE_TYPES_H */ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +new file mode 100644 +index 000000000000..b070bdf01500 +--- /dev/null ++++ b/fs/bcachefs/recovery.c +@@ -0,0 +1,1597 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "backpointers.h" ++#include "bkey_buf.h" ++#include "alloc_background.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "buckets.h" ++#include "dirent.h" ++#include "ec.h" ++#include "errcode.h" ++#include "error.h" ++#include "fs-common.h" ++#include "fsck.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "lru.h" ++#include "move.h" ++#include "quota.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "subvolume.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++#define QSTR(n) { { { .len = strlen(n) } }, .name = n } ++ ++/* for -o reconstruct_alloc: */ ++static void drop_alloc_keys(struct journal_keys *keys) ++{ ++ size_t src, dst; ++ ++ for (src = 0, dst = 0; src < keys->nr; src++) ++ if (keys->d[src].btree_id != BTREE_ID_alloc) ++ keys->d[dst++] = keys->d[src]; ++ ++ keys->nr = dst; ++} ++ ++/* ++ * 
Btree node pointers have a field to stack a pointer to the in memory btree ++ * node; we need to zero out this field when reading in btree nodes, or when ++ * reading in keys from the journal: ++ */ ++static void zero_out_btree_mem_ptr(struct journal_keys *keys) ++{ ++ struct journal_key *i; ++ ++ for (i = keys->d; i < keys->d + keys->nr; i++) ++ if (i->k->k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; ++} ++ ++/* iterate over keys read from the journal: */ ++ ++static int __journal_key_cmp(enum btree_id l_btree_id, ++ unsigned l_level, ++ struct bpos l_pos, ++ const struct journal_key *r) ++{ ++ return (cmp_int(l_btree_id, r->btree_id) ?: ++ cmp_int(l_level, r->level) ?: ++ bpos_cmp(l_pos, r->k->k.p)); ++} ++ ++static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) ++{ ++ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); ++} ++ ++static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) ++{ ++ size_t gap_size = keys->size - keys->nr; ++ ++ if (idx >= keys->gap) ++ idx += gap_size; ++ return idx; ++} ++ ++static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) ++{ ++ return keys->d + idx_to_pos(keys, idx); ++} ++ ++static size_t __bch2_journal_key_search(struct journal_keys *keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ size_t l = 0, r = keys->nr, m; ++ ++ while (l < r) { ++ m = l + ((r - l) >> 1); ++ if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) ++ l = m + 1; ++ else ++ r = m; ++ } ++ ++ BUG_ON(l < keys->nr && ++ __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); ++ ++ BUG_ON(l && ++ __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); ++ ++ return l; ++} ++ ++static size_t bch2_journal_key_search(struct journal_keys *keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); ++} ++ ++struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, ++ unsigned level, struct bpos pos, ++ struct bpos end_pos, size_t *idx) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ unsigned iters = 0; ++ struct journal_key *k; ++search: ++ if (!*idx) ++ *idx = __bch2_journal_key_search(keys, btree_id, level, pos); ++ ++ while (*idx < keys->nr && ++ (k = idx_to_key(keys, *idx), ++ k->btree_id == btree_id && ++ k->level == level && ++ bpos_cmp(k->k->k.p, end_pos) <= 0)) { ++ if (bpos_cmp(k->k->k.p, pos) >= 0 && ++ !k->overwritten) ++ return k->k; ++ ++ (*idx)++; ++ iters++; ++ if (iters == 10) { ++ *idx = 0; ++ goto search; ++ } ++ } ++ ++ return NULL; ++} ++ ++struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, ++ unsigned level, struct bpos pos) ++{ ++ size_t idx = 0; ++ ++ return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); ++} ++ ++static void journal_iters_fix(struct bch_fs *c) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ /* The key we just inserted is immediately before the gap: */ ++ size_t gap_end = keys->gap + (keys->size - keys->nr); ++ struct btree_and_journal_iter *iter; ++ ++ /* ++ * If an iterator points one after the key we just inserted, decrement ++ * the iterator so it points at the key we just inserted - if the ++ * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will ++ * handle that: ++ */ ++ list_for_each_entry(iter, &c->journal_iters, journal.list) ++ if (iter->journal.idx == gap_end) ++ iter->journal.idx = 
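journal_keys is kept as a gap buffer: a sorted array with one free region (the gap) somewhere in the middle so that repeated insertions near the same position stay cheap. idx_to_pos() translates logical indexes to physical array positions by skipping the gap, and __bch2_journal_key_search() binary-searches over the logical indexes. A toy version of the same layout and search, using ints in place of journal keys:

    #include <stddef.h>
    #include <stdio.h>

    struct gapbuf {
        int    *d;
        size_t  nr, size, gap;  /* gap is a logical index; gap size = size - nr */
    };

    static size_t idx_to_pos(const struct gapbuf *b, size_t idx)
    {
        return idx >= b->gap ? idx + (b->size - b->nr) : idx;
    }

    /* First logical index whose element is >= v (lower bound). */
    static size_t gapbuf_search(const struct gapbuf *b, int v)
    {
        size_t l = 0, r = b->nr, m;

        while (l < r) {
            m = l + ((r - l) >> 1);
            if (b->d[idx_to_pos(b, m)] < v)
                l = m + 1;
            else
                r = m;
        }
        return l;
    }

    int main(void)
    {
        /* 5 elements, capacity 8, gap starting at logical index 3: */
        int d[8] = { 10, 20, 30, 0, 0, 0, 40, 50 };
        struct gapbuf b = { .d = d, .nr = 5, .size = 8, .gap = 3 };

        printf("%zu\n", gapbuf_search(&b, 40)); /* logical index 3 */
        printf("%zu\n", gapbuf_search(&b, 15)); /* logical index 1 */
        return 0;
    }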
keys->gap - 1; ++} ++ ++static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ struct journal_iter *iter; ++ size_t gap_size = keys->size - keys->nr; ++ ++ list_for_each_entry(iter, &c->journal_iters, list) { ++ if (iter->idx > old_gap) ++ iter->idx -= gap_size; ++ if (iter->idx >= new_gap) ++ iter->idx += gap_size; ++ } ++} ++ ++int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) ++{ ++ struct journal_key n = { ++ .btree_id = id, ++ .level = level, ++ .k = k, ++ .allocated = true, ++ /* ++ * Ensure these keys are done last by journal replay, to unblock ++ * journal reclaim: ++ */ ++ .journal_seq = U32_MAX, ++ }; ++ struct journal_keys *keys = &c->journal_keys; ++ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); ++ ++ BUG_ON(test_bit(BCH_FS_RW, &c->flags)); ++ ++ if (idx < keys->size && ++ journal_key_cmp(&n, &keys->d[idx]) == 0) { ++ if (keys->d[idx].allocated) ++ kfree(keys->d[idx].k); ++ keys->d[idx] = n; ++ return 0; ++ } ++ ++ if (idx > keys->gap) ++ idx -= keys->size - keys->nr; ++ ++ if (keys->nr == keys->size) { ++ struct journal_keys new_keys = { ++ .nr = keys->nr, ++ .size = max_t(size_t, keys->size, 8) * 2, ++ .journal_seq_base = keys->journal_seq_base, ++ }; ++ ++ new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL); ++ if (!new_keys.d) { ++ bch_err(c, "%s: error allocating new key array (size %zu)", ++ __func__, new_keys.size); ++ return -ENOMEM; ++ } ++ ++ /* Since @keys was full, there was no gap: */ ++ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); ++ kvfree(keys->d); ++ *keys = new_keys; ++ ++ /* And now the gap is at the end: */ ++ keys->gap = keys->nr; ++ } ++ ++ journal_iters_move_gap(c, keys->gap, idx); ++ ++ move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); ++ keys->gap = idx; ++ ++ keys->nr++; ++ keys->d[keys->gap++] = n; ++ ++ journal_iters_fix(c); ++ ++ return 0; ++} ++ ++/* ++ * Can only be used from the recovery thread while we're still RO - can't be ++ * used once we've got RW, as journal_keys is at that point used by multiple ++ * threads: ++ */ ++int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) ++{ ++ struct bkey_i *n; ++ int ret; ++ ++ n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); ++ if (!n) ++ return -ENOMEM; ++ ++ bkey_copy(n, k); ++ ret = bch2_journal_key_insert_take(c, id, level, n); ++ if (ret) ++ kfree(n); ++ return ret; ++} ++ ++int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bpos pos) ++{ ++ struct bkey_i whiteout; ++ ++ bkey_init(&whiteout.k); ++ whiteout.k.p = pos; ++ ++ return bch2_journal_key_insert(c, id, level, &whiteout); ++} ++ ++void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, ++ unsigned level, struct bpos pos) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ size_t idx = bch2_journal_key_search(keys, btree, level, pos); ++ ++ if (idx < keys->size && ++ keys->d[idx].btree_id == btree && ++ keys->d[idx].level == level && ++ !bpos_cmp(keys->d[idx].k->k.p, pos)) ++ keys->d[idx].overwritten = true; ++} ++ ++static void bch2_journal_iter_advance(struct journal_iter *iter) ++{ ++ if (iter->idx < iter->keys->size) { ++ iter->idx++; ++ if (iter->idx == iter->keys->gap) ++ iter->idx += iter->keys->size - iter->keys->nr; ++ } ++} ++ ++struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) ++{ ++ struct journal_key *k = 
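bch2_journal_key_insert_take() above first slides the gap to the insertion point (journal_iters_move_gap() plus move_gap()) and then stores the new key in the first free slot, reallocating only when the gap is exhausted. A companion sketch to the search example, again over ints, with an invented move_gap() that only illustrates the data movement:

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    struct gapbuf {
        int    *d;
        size_t  nr, size, gap;
    };

    static void move_gap(struct gapbuf *b, size_t new_gap)
    {
        size_t gap_size = b->size - b->nr;

        if (new_gap < b->gap)       /* shift elements right, past the gap */
            memmove(b->d + new_gap + gap_size, b->d + new_gap,
                    (b->gap - new_gap) * sizeof(b->d[0]));
        else if (new_gap > b->gap)  /* shift elements left, into the gap */
            memmove(b->d + b->gap, b->d + b->gap + gap_size,
                    (new_gap - b->gap) * sizeof(b->d[0]));
        b->gap = new_gap;
    }

    static void gapbuf_insert(struct gapbuf *b, size_t idx, int v)
    {
        move_gap(b, idx);
        b->d[b->gap++] = v;     /* first free slot is right at the gap */
        b->nr++;
    }

    int main(void)
    {
        int d[8] = { 10, 20, 30, 0, 0, 0, 40, 50 };
        struct gapbuf b = { .d = d, .nr = 5, .size = 8, .gap = 3 };
        size_t i;

        gapbuf_insert(&b, 1, 15);   /* logical order: 10 15 20 30 40 50 */

        for (i = 0; i < b.nr; i++)
            printf("%d ", b.d[i >= b.gap ? i + (b.size - b.nr) : i]);
        printf("\n");
        return 0;
    }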
iter->keys->d + iter->idx; ++ ++ while (k < iter->keys->d + iter->keys->size && ++ k->btree_id == iter->btree_id && ++ k->level == iter->level) { ++ if (!k->overwritten) ++ return bkey_i_to_s_c(k->k); ++ ++ bch2_journal_iter_advance(iter); ++ k = iter->keys->d + iter->idx; ++ } ++ ++ return bkey_s_c_null; ++} ++ ++static void bch2_journal_iter_exit(struct journal_iter *iter) ++{ ++ list_del(&iter->list); ++} ++ ++static void bch2_journal_iter_init(struct bch_fs *c, ++ struct journal_iter *iter, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ iter->btree_id = id; ++ iter->level = level; ++ iter->keys = &c->journal_keys; ++ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); ++} ++ ++static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) ++{ ++ return bch2_btree_node_iter_peek_unpack(&iter->node_iter, ++ iter->b, &iter->unpacked); ++} ++ ++static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) ++{ ++ bch2_btree_node_iter_advance(&iter->node_iter, iter->b); ++} ++ ++void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) ++{ ++ if (!bpos_cmp(iter->pos, SPOS_MAX)) ++ iter->at_end = true; ++ else ++ iter->pos = bpos_successor(iter->pos); ++} ++ ++struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) ++{ ++ struct bkey_s_c btree_k, journal_k, ret; ++again: ++ if (iter->at_end) ++ return bkey_s_c_null; ++ ++ while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && ++ bpos_cmp(btree_k.k->p, iter->pos) < 0) ++ bch2_journal_iter_advance_btree(iter); ++ ++ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && ++ bpos_cmp(journal_k.k->p, iter->pos) < 0) ++ bch2_journal_iter_advance(&iter->journal); ++ ++ ret = journal_k.k && ++ (!btree_k.k || bpos_cmp(journal_k.k->p, btree_k.k->p) <= 0) ++ ? 
journal_k ++ : btree_k; ++ ++ if (ret.k && iter->b && bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) ++ ret = bkey_s_c_null; ++ ++ if (ret.k) { ++ iter->pos = ret.k->p; ++ if (bkey_deleted(ret.k)) { ++ bch2_btree_and_journal_iter_advance(iter); ++ goto again; ++ } ++ } else { ++ iter->pos = SPOS_MAX; ++ iter->at_end = true; ++ } ++ ++ return ret; ++} ++ ++void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) ++{ ++ bch2_journal_iter_exit(&iter->journal); ++} ++ ++void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct bch_fs *c, ++ struct btree *b, ++ struct btree_node_iter node_iter, ++ struct bpos pos) ++{ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->b = b; ++ iter->node_iter = node_iter; ++ bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); ++ INIT_LIST_HEAD(&iter->journal.list); ++ iter->pos = b->data->min_key; ++ iter->at_end = false; ++} ++ ++/* ++ * this version is used by btree_gc before filesystem has gone RW and ++ * multithreaded, so uses the journal_iters list: ++ */ ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct bch_fs *c, ++ struct btree *b) ++{ ++ struct btree_node_iter node_iter; ++ ++ bch2_btree_node_iter_init_from_start(&node_iter, b); ++ __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); ++ list_add(&iter->journal.list, &c->journal_iters); ++} ++ ++/* sort and dedup all keys in the journal: */ ++ ++void bch2_journal_entries_free(struct bch_fs *c) ++{ ++ struct journal_replay **i; ++ struct genradix_iter iter; ++ ++ genradix_for_each(&c->journal_entries, iter, i) ++ if (*i) ++ kvpfree(*i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&(*i)->j)); ++ genradix_free(&c->journal_entries); ++} ++ ++/* ++ * When keys compare equal, oldest compares first: ++ */ ++static int journal_sort_key_cmp(const void *_l, const void *_r) ++{ ++ const struct journal_key *l = _l; ++ const struct journal_key *r = _r; ++ ++ return journal_key_cmp(l, r) ?: ++ cmp_int(l->journal_seq, r->journal_seq) ?: ++ cmp_int(l->journal_offset, r->journal_offset); ++} ++ ++void bch2_journal_keys_free(struct journal_keys *keys) ++{ ++ struct journal_key *i; ++ ++ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); ++ keys->gap = keys->nr; ++ ++ for (i = keys->d; i < keys->d + keys->nr; i++) ++ if (i->allocated) ++ kfree(i->k); ++ ++ kvfree(keys->d); ++ keys->d = NULL; ++ keys->nr = keys->gap = keys->size = 0; ++} ++ ++static int journal_keys_sort(struct bch_fs *c) ++{ ++ struct genradix_iter iter; ++ struct journal_replay *i, **_i; ++ struct jset_entry *entry; ++ struct bkey_i *k, *_n; ++ struct journal_keys *keys = &c->journal_keys; ++ struct journal_key *src, *dst; ++ size_t nr_keys = 0; ++ ++ genradix_for_each(&c->journal_entries, iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ if (!keys->journal_seq_base) ++ keys->journal_seq_base = le64_to_cpu(i->j.seq); ++ ++ for_each_jset_key(k, _n, entry, &i->j) ++ nr_keys++; ++ } ++ ++ if (!nr_keys) ++ return 0; ++ ++ keys->size = roundup_pow_of_two(nr_keys); ++ ++ keys->d = kvmalloc(sizeof(keys->d[0]) * keys->size, GFP_KERNEL); ++ if (!keys->d) ++ return -ENOMEM; ++ ++ genradix_for_each(&c->journal_entries, iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ BUG_ON(le64_to_cpu(i->j.seq) - keys->journal_seq_base > U32_MAX); ++ ++ for_each_jset_key(k, _n, entry, &i->j) ++ keys->d[keys->nr++] = (struct journal_key) { ++ .btree_id = 
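bch2_btree_and_journal_iter_peek() above merges two sorted streams, the keys already written to the btree node and the newer keys still sitting only in the journal, and lets the journal key win whenever both streams have an entry at the same position. A standalone sketch of that merge over plain ints (stream and merged_next are invented names):

    #include <stddef.h>
    #include <stdio.h>

    struct stream { const int *v; size_t nr, idx; };

    static const int *merged_next(struct stream *btree, struct stream *journal)
    {
        const int *b = btree->idx   < btree->nr   ? &btree->v[btree->idx]     : NULL;
        const int *j = journal->idx < journal->nr ? &journal->v[journal->idx] : NULL;

        if (!b && !j)
            return NULL;

        if (j && (!b || *j <= *b)) {
            if (b && *b == *j)
                btree->idx++;   /* journal entry overrides the btree key */
            journal->idx++;
            return j;
        }

        btree->idx++;
        return b;
    }

    int main(void)
    {
        static const int btree_keys[]   = { 1, 3, 5, 7 };
        static const int journal_keys[] = { 3, 6 };
        struct stream b = { btree_keys, 4, 0 };
        struct stream j = { journal_keys, 2, 0 };
        const int *k;

        while ((k = merged_next(&b, &j)))
            printf("%d ", *k);  /* 1 3(journal) 5 6 7 */
        printf("\n");
        return 0;
    }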
entry->btree_id, ++ .level = entry->level, ++ .k = k, ++ .journal_seq = le64_to_cpu(i->j.seq) - ++ keys->journal_seq_base, ++ .journal_offset = k->_data - i->j._data, ++ }; ++ } ++ ++ sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); ++ ++ src = dst = keys->d; ++ while (src < keys->d + keys->nr) { ++ while (src + 1 < keys->d + keys->nr && ++ src[0].btree_id == src[1].btree_id && ++ src[0].level == src[1].level && ++ !bpos_cmp(src[0].k->k.p, src[1].k->k.p)) ++ src++; ++ ++ *dst++ = *src++; ++ } ++ ++ keys->nr = dst - keys->d; ++ keys->gap = keys->nr; ++ return 0; ++} ++ ++/* journal replay: */ ++ ++static void replay_now_at(struct journal *j, u64 seq) ++{ ++ BUG_ON(seq < j->replay_journal_seq); ++ ++ seq = min(seq, j->replay_journal_seq_end); ++ ++ while (j->replay_journal_seq < seq) ++ bch2_journal_pin_put(j, j->replay_journal_seq++); ++} ++ ++static int bch2_journal_replay_key(struct btree_trans *trans, ++ struct journal_key *k) ++{ ++ struct btree_iter iter; ++ unsigned iter_flags = ++ BTREE_ITER_INTENT| ++ BTREE_ITER_NOT_EXTENTS; ++ int ret; ++ ++ if (!k->level && k->btree_id == BTREE_ID_alloc) ++ iter_flags |= BTREE_ITER_CACHED; ++ ++ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, ++ BTREE_MAX_DEPTH, k->level, ++ iter_flags); ++ ret = bch2_btree_iter_traverse(&iter); ++ if (ret) ++ goto out; ++ ++ /* Must be checked with btree locked: */ ++ if (k->overwritten) ++ goto out; ++ ++ ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int journal_sort_seq_cmp(const void *_l, const void *_r) ++{ ++ const struct journal_key *l = *((const struct journal_key **)_l); ++ const struct journal_key *r = *((const struct journal_key **)_r); ++ ++ return cmp_int(l->journal_seq, r->journal_seq); ++} ++ ++static int bch2_journal_replay(struct bch_fs *c) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ struct journal_key **keys_sorted, *k; ++ struct journal *j = &c->journal; ++ size_t i; ++ int ret; ++ ++ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); ++ keys->gap = keys->nr; ++ ++ keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); ++ if (!keys_sorted) ++ return -ENOMEM; ++ ++ for (i = 0; i < keys->nr; i++) ++ keys_sorted[i] = &keys->d[i]; ++ ++ sort(keys_sorted, keys->nr, ++ sizeof(keys_sorted[0]), ++ journal_sort_seq_cmp, NULL); ++ ++ if (keys->nr) ++ replay_now_at(j, keys->journal_seq_base); ++ ++ for (i = 0; i < keys->nr; i++) { ++ k = keys_sorted[i]; ++ ++ cond_resched(); ++ ++ replay_now_at(j, keys->journal_seq_base + k->journal_seq); ++ ++ ret = bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL| ++ (!k->allocated ++ ? 
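journal_keys_sort() above sorts everything read from the journal so that entries for the same position are adjacent and ordered oldest first, then collapses each run of duplicates down to its last element, i.e. the newest version of that key. A compact userspace sketch of the same sort-and-dedup, where toy_key and its (pos, seq) pair stand in for the real journal_key:

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct toy_key { int pos; int seq; };

    /* Order by position; within a position, oldest (lowest seq) first. */
    static int cmp_key(const void *_l, const void *_r)
    {
        const struct toy_key *l = _l, *r = _r;

        return l->pos != r->pos ? (l->pos < r->pos ? -1 : 1)
             : l->seq != r->seq ? (l->seq < r->seq ? -1 : 1)
             : 0;
    }

    static size_t sort_and_dedup(struct toy_key *d, size_t nr)
    {
        struct toy_key *src = d, *dst = d, *end = d + nr;

        qsort(d, nr, sizeof(d[0]), cmp_key);

        while (src < end) {
            while (src + 1 < end && src[0].pos == src[1].pos)
                src++;          /* skip older versions of this key */
            *dst++ = *src++;
        }
        return dst - d;
    }

    int main(void)
    {
        struct toy_key keys[] = {
            { .pos = 7, .seq = 1 },
            { .pos = 3, .seq = 2 },
            { .pos = 7, .seq = 5 },     /* newer update to pos 7 */
        };
        size_t i, nr = sort_and_dedup(keys, 3);

        for (i = 0; i < nr; i++)
            printf("pos %d seq %d\n", keys[i].pos, keys[i].seq);
        /* pos 3 seq 2, then pos 7 seq 5 */
        return 0;
    }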
BTREE_INSERT_JOURNAL_REPLAY|JOURNAL_WATERMARK_reserved ++ : 0), ++ bch2_journal_replay_key(&trans, k)); ++ if (ret) { ++ bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", ++ ret, bch2_btree_ids[k->btree_id], k->level); ++ goto err; ++ } ++ } ++ ++ replay_now_at(j, j->replay_journal_seq_end); ++ j->replay_journal_seq = 0; ++ ++ bch2_journal_set_replay_done(j); ++ bch2_journal_flush_all_pins(j); ++ ret = bch2_journal_error(j); ++ ++ if (keys->nr && !ret) ++ bch2_journal_log_msg(&c->journal, "journal replay finished"); ++err: ++ kvfree(keys_sorted); ++ return ret; ++} ++ ++/* journal replay early: */ ++ ++static int journal_replay_entry_early(struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ int ret = 0; ++ ++ switch (entry->type) { ++ case BCH_JSET_ENTRY_btree_root: { ++ struct btree_root *r; ++ ++ if (entry->btree_id >= BTREE_ID_NR) { ++ bch_err(c, "filesystem has unknown btree type %u", ++ entry->btree_id); ++ return -EINVAL; ++ } ++ ++ r = &c->btree_roots[entry->btree_id]; ++ ++ if (entry->u64s) { ++ r->level = entry->level; ++ bkey_copy(&r->key, &entry->start[0]); ++ r->error = 0; ++ } else { ++ r->error = -EIO; ++ } ++ r->alive = true; ++ break; ++ } ++ case BCH_JSET_ENTRY_usage: { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ switch (entry->btree_id) { ++ case BCH_FS_USAGE_reserved: ++ if (entry->level < BCH_REPLICAS_MAX) ++ c->usage_base->persistent_reserved[entry->level] = ++ le64_to_cpu(u->v); ++ break; ++ case BCH_FS_USAGE_inodes: ++ c->usage_base->nr_inodes = le64_to_cpu(u->v); ++ break; ++ case BCH_FS_USAGE_key_version: ++ atomic64_set(&c->key_version, ++ le64_to_cpu(u->v)); ++ break; ++ } ++ ++ break; ++ } ++ case BCH_JSET_ENTRY_data_usage: { ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ ++ ret = bch2_replicas_set_usage(c, &u->r, ++ le64_to_cpu(u->v)); ++ break; ++ } ++ case BCH_JSET_ENTRY_dev_usage: { ++ struct jset_entry_dev_usage *u = ++ container_of(entry, struct jset_entry_dev_usage, entry); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); ++ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); ++ ++ ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); ++ ++ for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { ++ ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); ++ ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); ++ ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); ++ } ++ ++ break; ++ } ++ case BCH_JSET_ENTRY_blacklist: { ++ struct jset_entry_blacklist *bl_entry = ++ container_of(entry, struct jset_entry_blacklist, entry); ++ ++ ret = bch2_journal_seq_blacklist_add(c, ++ le64_to_cpu(bl_entry->seq), ++ le64_to_cpu(bl_entry->seq) + 1); ++ break; ++ } ++ case BCH_JSET_ENTRY_blacklist_v2: { ++ struct jset_entry_blacklist_v2 *bl_entry = ++ container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ ret = bch2_journal_seq_blacklist_add(c, ++ le64_to_cpu(bl_entry->start), ++ le64_to_cpu(bl_entry->end) + 1); ++ break; ++ } ++ case BCH_JSET_ENTRY_clock: { ++ struct jset_entry_clock *clock = ++ container_of(entry, struct jset_entry_clock, entry); ++ ++ atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); ++ } ++ } ++ ++ return ret; ++} ++ ++static int journal_replay_early(struct bch_fs *c, ++ struct bch_sb_field_clean *clean) ++{ ++ struct jset_entry *entry; ++ int ret; ++ ++ if (clean) { ++ for (entry = clean->start; ++ entry != 
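journal_replay_entry_early() above is a plain dispatch on the entry type: usage counters, per-device usage, blacklist ranges and clock readings are copied into in-memory state before any btree is read. A very small sketch of that shape, with invented entry types and fields rather than the jset entry structures from the patch:

    #include <stdint.h>
    #include <stdio.h>

    enum toy_entry_type { TOY_NR_INODES, TOY_KEY_VERSION, TOY_CLOCK };

    struct toy_entry { enum toy_entry_type type; uint64_t v; };

    struct toy_state { uint64_t nr_inodes, key_version, io_clock; };

    static void replay_entry_early(struct toy_state *s, const struct toy_entry *e)
    {
        switch (e->type) {
        case TOY_NR_INODES:   s->nr_inodes   = e->v; break;
        case TOY_KEY_VERSION: s->key_version = e->v; break;
        case TOY_CLOCK:       s->io_clock    = e->v; break;
        }
    }

    int main(void)
    {
        static const struct toy_entry log[] = {
            { TOY_NR_INODES,   42 },
            { TOY_CLOCK,       123456 },
            { TOY_KEY_VERSION, 7 },
        };
        struct toy_state s = { 0 };
        size_t i;

        for (i = 0; i < sizeof(log) / sizeof(log[0]); i++)
            replay_entry_early(&s, &log[i]);

        printf("inodes %llu, key version %llu, clock %llu\n",
               (unsigned long long) s.nr_inodes,
               (unsigned long long) s.key_version,
               (unsigned long long) s.io_clock);
        return 0;
    }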
vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ ret = journal_replay_entry_early(c, entry); ++ if (ret) ++ return ret; ++ } ++ } else { ++ struct genradix_iter iter; ++ struct journal_replay *i, **_i; ++ ++ genradix_for_each(&c->journal_entries, iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ vstruct_for_each(&i->j, entry) { ++ ret = journal_replay_entry_early(c, entry); ++ if (ret) ++ return ret; ++ } ++ } ++ } ++ ++ bch2_fs_usage_initialize(c); ++ ++ return 0; ++} ++ ++/* sb clean section: */ ++ ++static struct bkey_i *btree_root_find(struct bch_fs *c, ++ struct bch_sb_field_clean *clean, ++ struct jset *j, ++ enum btree_id id, unsigned *level) ++{ ++ struct bkey_i *k; ++ struct jset_entry *entry, *start, *end; ++ ++ if (clean) { ++ start = clean->start; ++ end = vstruct_end(&clean->field); ++ } else { ++ start = j->start; ++ end = vstruct_last(j); ++ } ++ ++ for (entry = start; entry < end; entry = vstruct_next(entry)) ++ if (entry->type == BCH_JSET_ENTRY_btree_root && ++ entry->btree_id == id) ++ goto found; ++ ++ return NULL; ++found: ++ if (!entry->u64s) ++ return ERR_PTR(-EINVAL); ++ ++ k = entry->start; ++ *level = entry->level; ++ return k; ++} ++ ++static int verify_superblock_clean(struct bch_fs *c, ++ struct bch_sb_field_clean **cleanp, ++ struct jset *j) ++{ ++ unsigned i; ++ struct bch_sb_field_clean *clean = *cleanp; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ int ret = 0; ++ ++ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, ++ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", ++ le64_to_cpu(clean->journal_seq), ++ le64_to_cpu(j->seq))) { ++ kfree(clean); ++ *cleanp = NULL; ++ return 0; ++ } ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct bkey_i *k1, *k2; ++ unsigned l1 = 0, l2 = 0; ++ ++ k1 = btree_root_find(c, clean, NULL, i, &l1); ++ k2 = btree_root_find(c, NULL, j, i, &l2); ++ ++ if (!k1 && !k2) ++ continue; ++ ++ printbuf_reset(&buf1); ++ printbuf_reset(&buf2); ++ ++ if (k1) ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); ++ else ++ prt_printf(&buf1, "(none)"); ++ ++ if (k2) ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); ++ else ++ prt_printf(&buf2, "(none)"); ++ ++ mustfix_fsck_err_on(!k1 || !k2 || ++ IS_ERR(k1) || ++ IS_ERR(k2) || ++ k1->k.u64s != k2->k.u64s || ++ memcmp(k1, k2, bkey_bytes(k1)) || ++ l1 != l2, c, ++ "superblock btree root %u doesn't match journal after clean shutdown\n" ++ "sb: l=%u %s\n" ++ "journal: l=%u %s\n", i, ++ l1, buf1.buf, ++ l2, buf2.buf); ++ } ++fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) ++{ ++ struct bch_sb_field_clean *clean, *sb_clean; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ sb_clean = bch2_sb_get_clean(c->disk_sb.sb); ++ ++ if (fsck_err_on(!sb_clean, c, ++ "superblock marked clean but clean section not present")) { ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->sb.clean = false; ++ mutex_unlock(&c->sb_lock); ++ return NULL; ++ } ++ ++ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), ++ GFP_KERNEL); ++ if (!clean) { ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ ret = bch2_sb_clean_validate_late(c, clean, READ); ++ if (ret) { ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(ret); ++ } ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return clean; ++fsck_err: ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(ret); ++} ++ ++static bool btree_id_is_alloc(enum btree_id 
id) ++{ ++ switch (id) { ++ case BTREE_ID_alloc: ++ case BTREE_ID_backpointers: ++ case BTREE_ID_need_discard: ++ case BTREE_ID_freespace: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static int read_btree_roots(struct bch_fs *c) ++{ ++ unsigned i; ++ int ret = 0; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct btree_root *r = &c->btree_roots[i]; ++ ++ if (!r->alive) ++ continue; ++ ++ if (btree_id_is_alloc(i) && ++ c->opts.reconstruct_alloc) { ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); ++ continue; ++ } ++ ++ if (r->error) { ++ __fsck_err(c, btree_id_is_alloc(i) ++ ? FSCK_CAN_IGNORE : 0, ++ "invalid btree root %s", ++ bch2_btree_ids[i]); ++ if (i == BTREE_ID_alloc) ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); ++ } ++ ++ ret = bch2_btree_root_read(c, i, &r->key, r->level); ++ if (ret) { ++ __fsck_err(c, ++ btree_id_is_alloc(i) ++ ? FSCK_CAN_IGNORE : 0, ++ "error reading btree root %s", ++ bch2_btree_ids[i]); ++ if (i == BTREE_ID_alloc) ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); ++ } ++ } ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (!c->btree_roots[i].b) ++ bch2_btree_root_alloc(c, i); ++fsck_err: ++ return ret; ++} ++ ++static int bch2_fs_initialize_subvolumes(struct bch_fs *c) ++{ ++ struct bkey_i_snapshot root_snapshot; ++ struct bkey_i_subvolume root_volume; ++ int ret; ++ ++ bkey_snapshot_init(&root_snapshot.k_i); ++ root_snapshot.k.p.offset = U32_MAX; ++ root_snapshot.v.flags = 0; ++ root_snapshot.v.parent = 0; ++ root_snapshot.v.subvol = BCACHEFS_ROOT_SUBVOL; ++ root_snapshot.v.pad = 0; ++ SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); ++ ++ ret = bch2_btree_insert(c, BTREE_ID_snapshots, ++ &root_snapshot.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ bkey_subvolume_init(&root_volume.k_i); ++ root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; ++ root_volume.v.flags = 0; ++ root_volume.v.snapshot = cpu_to_le32(U32_MAX); ++ root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); ++ ++ ret = bch2_btree_insert(c, BTREE_ID_subvolumes, ++ &root_volume.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ return 0; ++} ++ ++static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_inode_unpacked inode; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, ++ SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!bkey_is_inode(k.k)) { ++ bch_err(trans->c, "root inode not found"); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ ret = bch2_inode_unpack(k, &inode); ++ BUG_ON(ret); ++ ++ inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; ++ ++ ret = bch2_inode_write(trans, &iter, &inode); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_fs_recovery(struct bch_fs *c) ++{ ++ const char *err = "cannot allocate memory"; ++ struct bch_sb_field_clean *clean = NULL; ++ struct jset *last_journal_entry = NULL; ++ u64 blacklist_seq, journal_seq; ++ bool write_sb = false; ++ int ret = 0; ++ ++ if (c->sb.clean) ++ clean = read_superblock_clean(c); ++ ret = PTR_ERR_OR_ZERO(clean); ++ if (ret) ++ goto err; ++ ++ if (c->sb.clean) ++ bch_info(c, "recovering from clean shutdown, journal seq %llu", ++ le64_to_cpu(clean->journal_seq)); ++ else ++ bch_info(c, "recovering from unclean shutdown"); ++ ++ if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { ++ bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); ++ ret = 
-EINVAL; ++ goto err; ++ } ++ ++ if (!c->sb.clean && ++ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { ++ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { ++ bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { ++ bch_info(c, "alloc_v2 feature bit not set, fsck required"); ++ c->opts.fsck = true; ++ c->opts.fix_errors = FSCK_OPT_YES; ++ } ++ ++ if (!c->opts.nochanges) { ++ if (c->sb.version < bcachefs_metadata_version_backpointers) { ++ bch_info(c, "version prior to backpointers, upgrade and fsck required"); ++ c->opts.version_upgrade = true; ++ c->opts.fsck = true; ++ c->opts.fix_errors = FSCK_OPT_YES; ++ } ++ } ++ ++ if (c->opts.fsck && c->opts.norecovery) { ++ bch_err(c, "cannot select both norecovery and fsck"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ret = bch2_blacklist_table_initialize(c); ++ if (ret) { ++ bch_err(c, "error initializing blacklist table"); ++ goto err; ++ } ++ ++ if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { ++ struct genradix_iter iter; ++ struct journal_replay **i; ++ ++ bch_verbose(c, "starting journal read"); ++ ret = bch2_journal_read(c, &blacklist_seq, &journal_seq); ++ if (ret) ++ goto err; ++ ++ genradix_for_each_reverse(&c->journal_entries, iter, i) ++ if (*i && !(*i)->ignore) { ++ last_journal_entry = &(*i)->j; ++ break; ++ } ++ ++ if (mustfix_fsck_err_on(c->sb.clean && ++ last_journal_entry && ++ !journal_entry_empty(last_journal_entry), c, ++ "filesystem marked clean but journal not empty")) { ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->sb.clean = false; ++ } ++ ++ if (!last_journal_entry) { ++ fsck_err_on(!c->sb.clean, c, "no journal entries found"); ++ goto use_clean; ++ } ++ ++ ret = journal_keys_sort(c); ++ if (ret) ++ goto err; ++ ++ if (c->sb.clean && last_journal_entry) { ++ ret = verify_superblock_clean(c, &clean, ++ last_journal_entry); ++ if (ret) ++ goto err; ++ } ++ } else { ++use_clean: ++ if (!clean) { ++ bch_err(c, "no superblock clean section found"); ++ ret = -BCH_ERR_fsck_repair_impossible; ++ goto err; ++ ++ } ++ blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; ++ } ++ ++ if (c->opts.reconstruct_alloc) { ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); ++ drop_alloc_keys(&c->journal_keys); ++ } ++ ++ zero_out_btree_mem_ptr(&c->journal_keys); ++ ++ ret = journal_replay_early(c, clean); ++ if (ret) ++ goto err; ++ ++ /* ++ * After an unclean shutdown, skip then next few journal sequence ++ * numbers as they may have been referenced by btree writes that ++ * happened before their corresponding journal writes - those btree ++ * writes need to be ignored, by skipping and blacklisting the next few ++ * journal sequence numbers: ++ */ ++ if (!c->sb.clean) ++ journal_seq += 8; ++ ++ if (blacklist_seq != journal_seq) { ++ ret = bch2_journal_seq_blacklist_add(c, ++ blacklist_seq, journal_seq); ++ if (ret) { ++ bch_err(c, "error creating new journal seq blacklist entry"); ++ goto err; ++ } ++ } ++ ++ /* ++ * note: cmd_list_journal needs the blacklist table fully up to date so ++ * it can asterisk ignored journal entries: ++ */ ++ if (c->opts.read_journal_only) ++ goto out; ++ ++ ret = bch2_fs_journal_start(&c->journal, 
journal_seq); ++ if (ret) ++ goto err; ++ ++ /* ++ * Skip past versions that might have possibly been used (as nonces), ++ * but hadn't had their pointers written: ++ */ ++ if (c->sb.encryption_type && !c->sb.clean) ++ atomic64_add(1 << 16, &c->key_version); ++ ++ ret = read_btree_roots(c); ++ if (ret) ++ goto err; ++ ++ bch_verbose(c, "starting alloc read"); ++ err = "error reading allocation information"; ++ ++ down_read(&c->gc_lock); ++ ret = bch2_alloc_read(c); ++ up_read(&c->gc_lock); ++ ++ if (ret) ++ goto err; ++ bch_verbose(c, "alloc read done"); ++ ++ bch_verbose(c, "starting stripes_read"); ++ err = "error reading stripes"; ++ ret = bch2_stripes_read(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "stripes_read done"); ++ ++ bch2_stripes_heap_start(c); ++ ++ if (c->opts.fsck) { ++ bool metadata_only = c->opts.norecovery; ++ ++ bch_info(c, "checking allocations"); ++ err = "error checking allocations"; ++ ret = bch2_gc(c, true, metadata_only); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking allocations"); ++ ++ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ ++ bch_info(c, "checking need_discard and freespace btrees"); ++ err = "error checking need_discard and freespace btrees"; ++ ret = bch2_check_alloc_info(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking need_discard and freespace btrees"); ++ ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); ++ ++ bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr); ++ err = "journal replay failed"; ++ ret = bch2_journal_replay(c); ++ if (ret) ++ goto err; ++ if (c->opts.verbose || !c->sb.clean) ++ bch_info(c, "journal replay done"); ++ ++ bch_info(c, "checking lrus"); ++ err = "error checking lrus"; ++ ret = bch2_check_lrus(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking lrus"); ++ set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); ++ ++ bch_info(c, "checking backpointers to alloc keys"); ++ err = "error checking backpointers to alloc keys"; ++ ret = bch2_check_btree_backpointers(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking backpointers to alloc keys"); ++ ++ bch_info(c, "checking backpointers to extents"); ++ err = "error checking backpointers to extents"; ++ ret = bch2_check_backpointers_to_extents(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking backpointers to extents"); ++ ++ bch_info(c, "checking extents to backpointers"); ++ err = "error checking extents to backpointers"; ++ ret = bch2_check_extents_to_backpointers(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking extents to backpointers"); ++ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); ++ ++ bch_info(c, "checking alloc to lru refs"); ++ err = "error checking alloc to lru refs"; ++ ret = bch2_check_alloc_to_lru_refs(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking alloc to lru refs"); ++ set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); ++ } else { ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); ++ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); ++ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); ++ set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); ++ set_bit(BCH_FS_FSCK_DONE, &c->flags); ++ ++ if (c->opts.norecovery) ++ goto out; ++ ++ bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); ++ err = "journal replay failed"; ++ ret = bch2_journal_replay(c); ++ if (ret) ++ goto err; ++ if (c->opts.verbose || !c->sb.clean) ++ bch_info(c, "journal replay done"); ++ } ++ ++ err = "error initializing freespace"; ++ ret = 
bch2_fs_freespace_init(c); ++ if (ret) ++ goto err; ++ ++ if (c->sb.version < bcachefs_metadata_version_snapshot_2) { ++ bch2_fs_lazy_rw(c); ++ ++ err = "error creating root snapshot node"; ++ ret = bch2_fs_initialize_subvolumes(c); ++ if (ret) ++ goto err; ++ } ++ ++ bch_verbose(c, "reading snapshots table"); ++ err = "error reading snapshots table"; ++ ret = bch2_fs_snapshots_start(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "reading snapshots done"); ++ ++ if (c->sb.version < bcachefs_metadata_version_snapshot_2) { ++ /* set bi_subvol on root inode */ ++ err = "error upgrade root inode for subvolumes"; ++ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, ++ bch2_fs_upgrade_for_subvolumes(&trans)); ++ if (ret) ++ goto err; ++ } ++ ++ if (c->opts.fsck) { ++ bch_info(c, "starting fsck"); ++ err = "error in fsck"; ++ ret = bch2_fsck_full(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "fsck done"); ++ } else if (!c->sb.clean) { ++ bch_verbose(c, "checking for deleted inodes"); ++ err = "error in recovery"; ++ ret = bch2_fsck_walk_inodes_only(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "check inodes done"); ++ } ++ ++ if (enabled_qtypes(c)) { ++ bch_verbose(c, "reading quotas"); ++ ret = bch2_fs_quota_read(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "quotas done"); ++ } ++ ++ mutex_lock(&c->sb_lock); ++ if (c->opts.version_upgrade) { ++ c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); ++ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); ++ write_sb = true; ++ } ++ ++ if (!test_bit(BCH_FS_ERROR, &c->flags)) { ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); ++ write_sb = true; ++ } ++ ++ if (c->opts.fsck && ++ !test_bit(BCH_FS_ERROR, &c->flags) && ++ !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { ++ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); ++ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0); ++ write_sb = true; ++ } ++ ++ if (write_sb) ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || ++ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || ++ le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { ++ struct bch_move_stats stats; ++ ++ bch_move_stats_init(&stats, "recovery"); ++ ++ bch_info(c, "scanning for old btree nodes"); ++ ret = bch2_fs_read_write(c); ++ if (ret) ++ goto err; ++ ++ ret = bch2_scan_old_btree_nodes(c, &stats); ++ if (ret) ++ goto err; ++ bch_info(c, "scanning for old btree nodes done"); ++ } ++ ++ if (c->journal_seq_blacklist_table && ++ c->journal_seq_blacklist_table->nr > 128) ++ queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); ++ ++ ret = 0; ++out: ++ set_bit(BCH_FS_FSCK_DONE, &c->flags); ++ bch2_flush_fsck_errs(c); ++ ++ if (!c->opts.keep_journal) { ++ bch2_journal_keys_free(&c->journal_keys); ++ bch2_journal_entries_free(c); ++ } ++ kfree(clean); ++ ++ if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) { ++ bch2_fs_read_write_early(c); ++ bch2_delete_dead_snapshots_async(c); ++ } ++ ++ if (ret) ++ bch_err(c, "Error in recovery: %s (%s)", err, bch2_err_str(ret)); ++ else ++ bch_verbose(c, "ret %s", bch2_err_str(ret)); ++ return ret; ++err: ++fsck_err: ++ bch2_fs_emergency_read_only(c); ++ goto out; ++} ++ ++int bch2_fs_initialize(struct bch_fs *c) ++{ ++ struct bch_inode_unpacked root_inode, lostfound_inode; ++ struct bkey_inode_buf packed_inode; ++ struct qstr lostfound = QSTR("lost+found"); ++ const char *err = "cannot allocate 
memory"; ++ struct bch_dev *ca; ++ unsigned i; ++ int ret; ++ ++ bch_notice(c, "initializing new filesystem"); ++ ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); ++ ++ if (c->sb.version < bcachefs_metadata_version_backpointers) ++ c->opts.version_upgrade = true; ++ ++ if (c->opts.version_upgrade) { ++ c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); ++ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++ ++ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); ++ set_bit(BCH_FS_FSCK_DONE, &c->flags); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ bch2_btree_root_alloc(c, i); ++ ++ for_each_online_member(ca, c, i) ++ bch2_dev_usage_init(ca); ++ ++ err = "unable to allocate journal buckets"; ++ for_each_online_member(ca, c, i) { ++ ret = bch2_dev_journal_alloc(ca); ++ if (ret) { ++ percpu_ref_put(&ca->io_ref); ++ goto err; ++ } ++ } ++ ++ /* ++ * journal_res_get() will crash if called before this has ++ * set up the journal.pin FIFO and journal.cur pointer: ++ */ ++ bch2_fs_journal_start(&c->journal, 1); ++ bch2_journal_set_replay_done(&c->journal); ++ ++ err = "error going read-write"; ++ ret = bch2_fs_read_write_early(c); ++ if (ret) ++ goto err; ++ ++ /* ++ * Write out the superblock and journal buckets, now that we can do ++ * btree updates ++ */ ++ bch_verbose(c, "marking superblocks"); ++ err = "error marking superblock and journal"; ++ for_each_member_device(ca, c, i) { ++ ret = bch2_trans_mark_dev_sb(c, ca); ++ if (ret) { ++ percpu_ref_put(&ca->ref); ++ goto err; ++ } ++ ++ ca->new_fs_bucket_idx = 0; ++ } ++ ++ bch_verbose(c, "initializing freespace"); ++ err = "error initializing freespace"; ++ ret = bch2_fs_freespace_init(c); ++ if (ret) ++ goto err; ++ ++ err = "error creating root snapshot node"; ++ ret = bch2_fs_initialize_subvolumes(c); ++ if (ret) ++ goto err; ++ ++ bch_verbose(c, "reading snapshots table"); ++ err = "error reading snapshots table"; ++ ret = bch2_fs_snapshots_start(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "reading snapshots done"); ++ ++ bch2_inode_init(c, &root_inode, 0, 0, ++ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); ++ root_inode.bi_inum = BCACHEFS_ROOT_INO; ++ root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; ++ bch2_inode_pack(c, &packed_inode, &root_inode); ++ packed_inode.inode.k.p.snapshot = U32_MAX; ++ ++ err = "error creating root directory"; ++ ret = bch2_btree_insert(c, BTREE_ID_inodes, ++ &packed_inode.inode.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ goto err; ++ ++ bch2_inode_init_early(c, &lostfound_inode); ++ ++ err = "error creating lost+found"; ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_create_trans(&trans, ++ BCACHEFS_ROOT_SUBVOL_INUM, ++ &root_inode, &lostfound_inode, ++ &lostfound, ++ 0, 0, S_IFDIR|0700, 0, ++ NULL, NULL, (subvol_inum) { 0 }, 0)); ++ if (ret) { ++ bch_err(c, "error creating lost+found"); ++ goto err; ++ } ++ ++ if (enabled_qtypes(c)) { ++ ret = bch2_fs_quota_read(c); ++ if (ret) ++ goto err; ++ } ++ ++ err = "error writing first journal entry"; ++ ret = bch2_journal_flush(&c->journal); ++ if (ret) ++ goto err; ++ ++ mutex_lock(&c->sb_lock); ++ SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++err: ++ pr_err("Error initializing 
new filesystem: %s (%i)", err, ret); ++ return ret; ++} +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +new file mode 100644 +index 000000000000..8c0348e8b84c +--- /dev/null ++++ b/fs/bcachefs/recovery.h +@@ -0,0 +1,58 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_RECOVERY_H ++#define _BCACHEFS_RECOVERY_H ++ ++struct journal_iter { ++ struct list_head list; ++ enum btree_id btree_id; ++ unsigned level; ++ size_t idx; ++ struct journal_keys *keys; ++}; ++ ++/* ++ * Iterate over keys in the btree, with keys from the journal overlaid on top: ++ */ ++ ++struct btree_and_journal_iter { ++ struct btree *b; ++ struct btree_node_iter node_iter; ++ struct bkey unpacked; ++ ++ struct journal_iter journal; ++ struct bpos pos; ++ bool at_end; ++}; ++ ++struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos, struct bpos, size_t *); ++struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos); ++ ++int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, ++ unsigned, struct bkey_i *); ++int bch2_journal_key_insert(struct bch_fs *, enum btree_id, ++ unsigned, struct bkey_i *); ++int bch2_journal_key_delete(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos); ++void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos); ++ ++void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); ++struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); ++ ++void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); ++void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, ++ struct bch_fs *, struct btree *, ++ struct btree_node_iter, struct bpos); ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, ++ struct bch_fs *, ++ struct btree *); ++ ++void bch2_journal_keys_free(struct journal_keys *); ++void bch2_journal_entries_free(struct bch_fs *); ++ ++int bch2_fs_recovery(struct bch_fs *); ++int bch2_fs_initialize(struct bch_fs *); ++ ++#endif /* _BCACHEFS_RECOVERY_H */ +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +new file mode 100644 +index 000000000000..d5c14bb2992d +--- /dev/null ++++ b/fs/bcachefs/reflink.c +@@ -0,0 +1,422 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "bkey_buf.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "extents.h" ++#include "inode.h" ++#include "io.h" ++#include "reflink.h" ++#include "subvolume.h" ++ ++#include <linux/sched/signal.h> ++ ++static inline unsigned bkey_type_to_indirect(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_extent: ++ return KEY_TYPE_reflink_v; ++ case KEY_TYPE_inline_data: ++ return KEY_TYPE_indirect_inline_data; ++ default: ++ return 0; ++ } ++} ++ ++/* reflink pointers */ ++ ++int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ ++ if (bkey_val_bytes(p.k) != sizeof(*p.v)) { ++ prt_printf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(p.k), sizeof(*p.v)); ++ return -EINVAL; ++ } ++ ++ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && ++ le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) { ++ prt_printf(err, "idx < front_pad (%llu < %u)", ++ le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_reflink_p_to_text(struct printbuf *out, 
struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ ++ prt_printf(out, "idx %llu front_pad %u back_pad %u", ++ le64_to_cpu(p.v->idx), ++ le32_to_cpu(p.v->front_pad), ++ le32_to_cpu(p.v->back_pad)); ++} ++ ++bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); ++ struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r); ++ ++ /* ++ * Disabled for now, the triggers code needs to be reworked for merging ++ * of reflink pointers to work: ++ */ ++ return false; ++ ++ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) ++ return false; ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ return true; ++} ++ ++/* indirect extents */ ++ ++int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ if (bkey_val_bytes(r.k) < sizeof(*r.v)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(r.k), sizeof(*r.v)); ++ return -EINVAL; ++ } ++ ++ return bch2_bkey_ptrs_invalid(c, k, rw, err); ++} ++ ++void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); ++ ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); ++ ++ return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); ++} ++ ++int bch2_trans_mark_reflink_v(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { ++ struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new); ++ ++ if (!r->v.refcount) { ++ r->k.type = KEY_TYPE_deleted; ++ r->k.size = 0; ++ set_bkey_val_u64s(&r->k, 0); ++ return 0; ++ } ++ } ++ ++ return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags); ++} ++ ++/* indirect inline data */ ++ ++int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_indirect_inline_data)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_indirect_inline_data_to_text(struct printbuf *out, ++ struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); ++ unsigned datalen = bkey_inline_data_bytes(k.k); ++ ++ prt_printf(out, "refcount %llu datalen %u: %*phN", ++ le64_to_cpu(d.v->refcount), datalen, ++ min(datalen, 32U), d.v->data); ++} ++ ++int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { ++ struct bkey_i_indirect_inline_data *r = ++ bkey_i_to_indirect_inline_data(new); ++ ++ if (!r->v.refcount) { ++ r->k.type = KEY_TYPE_deleted; ++ r->k.size = 0; ++ set_bkey_val_u64s(&r->k, 0); ++ } ++ } ++ ++ return 0; ++} ++ ++static int bch2_make_extent_indirect(struct btree_trans *trans, ++ struct btree_iter 
*extent_iter, ++ struct bkey_i *orig) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter reflink_iter = { NULL }; ++ struct bkey_s_c k; ++ struct bkey_i *r_v; ++ struct bkey_i_reflink_p *r_p; ++ __le64 *refcount; ++ int ret; ++ ++ if (orig->k.type == KEY_TYPE_inline_data) ++ bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); ++ ++ for_each_btree_key_norestart(trans, reflink_iter, BTREE_ID_reflink, ++ POS(0, c->reflink_hint), ++ BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { ++ if (reflink_iter.pos.inode) { ++ bch2_btree_iter_set_pos(&reflink_iter, POS_MIN); ++ continue; ++ } ++ ++ if (bkey_deleted(k.k) && orig->k.size <= k.k->size) ++ break; ++ } ++ ++ if (ret) ++ goto err; ++ ++ /* rewind iter to start of hole, if necessary: */ ++ bch2_btree_iter_set_pos_to_extent_start(&reflink_iter); ++ ++ r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); ++ ret = PTR_ERR_OR_ZERO(r_v); ++ if (ret) ++ goto err; ++ ++ bkey_init(&r_v->k); ++ r_v->k.type = bkey_type_to_indirect(&orig->k); ++ r_v->k.p = reflink_iter.pos; ++ bch2_key_resize(&r_v->k, orig->k.size); ++ r_v->k.version = orig->k.version; ++ ++ set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); ++ ++ refcount = bkey_refcount(r_v); ++ *refcount = 0; ++ memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); ++ ++ ret = bch2_trans_update(trans, &reflink_iter, r_v, 0); ++ if (ret) ++ goto err; ++ ++ /* ++ * orig is in a bkey_buf which statically allocates 5 64s for the val, ++ * so we know it will be big enough: ++ */ ++ orig->k.type = KEY_TYPE_reflink_p; ++ r_p = bkey_i_to_reflink_p(orig); ++ set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); ++ memset(&r_p->v, 0, sizeof(r_p->v)); ++ ++ r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); ++ ++ ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++err: ++ c->reflink_hint = reflink_iter.pos.offset; ++ bch2_trans_iter_exit(trans, &reflink_iter); ++ ++ return ret; ++} ++ ++static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) ++{ ++ struct bkey_s_c k; ++ int ret; ++ ++ for_each_btree_key_continue_norestart(*iter, 0, k, ret) { ++ if (bkey_cmp(iter->pos, end) >= 0) ++ break; ++ ++ if (bkey_extent_is_data(k.k)) ++ return k; ++ } ++ ++ if (bkey_cmp(iter->pos, end) >= 0) ++ bch2_btree_iter_set_pos(iter, end); ++ return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; ++} ++ ++s64 bch2_remap_range(struct bch_fs *c, ++ subvol_inum dst_inum, u64 dst_offset, ++ subvol_inum src_inum, u64 src_offset, ++ u64 remap_sectors, ++ u64 new_i_size, s64 *i_sectors_delta) ++{ ++ struct btree_trans trans; ++ struct btree_iter dst_iter, src_iter; ++ struct bkey_s_c src_k; ++ struct bkey_buf new_dst, new_src; ++ struct bpos dst_start = POS(dst_inum.inum, dst_offset); ++ struct bpos src_start = POS(src_inum.inum, src_offset); ++ struct bpos dst_end = dst_start, src_end = src_start; ++ struct bpos src_want; ++ u64 dst_done; ++ u32 dst_snapshot, src_snapshot; ++ int ret = 0, ret2 = 0; ++ ++ if (!percpu_ref_tryget_live(&c->writes)) ++ return -EROFS; ++ ++ bch2_check_set_feature(c, BCH_FEATURE_reflink); ++ ++ dst_end.offset += remap_sectors; ++ src_end.offset += remap_sectors; ++ ++ bch2_bkey_buf_init(&new_dst); ++ bch2_bkey_buf_init(&new_src); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); ++ ++ bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start, ++ BTREE_ITER_INTENT); ++ bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start, ++ BTREE_ITER_INTENT); ++ ++ while ((ret == 0 || ++ bch2_err_matches(ret, BCH_ERR_transaction_restart)) && ++ bkey_cmp(dst_iter.pos, dst_end) < 0) { ++ struct disk_reservation disk_res = { 0 }; ++ ++ bch2_trans_begin(&trans); ++ ++ if (fatal_signal_pending(current)) { ++ ret = -EINTR; ++ break; ++ } ++ ++ ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol, ++ &src_snapshot); ++ if (ret) ++ continue; ++ ++ bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol, ++ &dst_snapshot); ++ if (ret) ++ continue; ++ ++ bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); ++ ++ dst_done = dst_iter.pos.offset - dst_start.offset; ++ src_want = POS(src_start.inode, src_start.offset + dst_done); ++ bch2_btree_iter_set_pos(&src_iter, src_want); ++ ++ src_k = get_next_src(&src_iter, src_end); ++ ret = bkey_err(src_k); ++ if (ret) ++ continue; ++ ++ if (bkey_cmp(src_want, src_iter.pos) < 0) { ++ ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum, ++ min(dst_end.offset, ++ dst_iter.pos.offset + ++ src_iter.pos.offset - src_want.offset), ++ i_sectors_delta); ++ continue; ++ } ++ ++ if (src_k.k->type != KEY_TYPE_reflink_p) { ++ bch2_btree_iter_set_pos_to_extent_start(&src_iter); ++ ++ bch2_bkey_buf_reassemble(&new_src, c, src_k); ++ src_k = bkey_i_to_s_c(new_src.k); ++ ++ ret = bch2_make_extent_indirect(&trans, &src_iter, ++ new_src.k); ++ if (ret) ++ continue; ++ ++ BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); ++ } ++ ++ if (src_k.k->type == KEY_TYPE_reflink_p) { ++ struct bkey_s_c_reflink_p src_p = ++ bkey_s_c_to_reflink_p(src_k); ++ struct bkey_i_reflink_p *dst_p = ++ bkey_reflink_p_init(new_dst.k); ++ ++ u64 offset = le64_to_cpu(src_p.v->idx) + ++ (src_want.offset - ++ bkey_start_offset(src_k.k)); ++ ++ dst_p->v.idx = cpu_to_le64(offset); ++ } else { ++ BUG(); ++ } ++ ++ new_dst.k->k.p = dst_iter.pos; ++ bch2_key_resize(&new_dst.k->k, ++ min(src_k.k->p.offset - src_want.offset, ++ dst_end.offset - dst_iter.pos.offset)); ++ ++ ret = bch2_extent_update(&trans, dst_inum, &dst_iter, ++ new_dst.k, &disk_res, NULL, ++ new_i_size, i_sectors_delta, ++ true); ++ bch2_disk_reservation_put(c, &disk_res); ++ } ++ bch2_trans_iter_exit(&trans, &dst_iter); ++ bch2_trans_iter_exit(&trans, &src_iter); ++ ++ BUG_ON(!ret && bkey_cmp(dst_iter.pos, dst_end)); ++ BUG_ON(bkey_cmp(dst_iter.pos, dst_end) > 0); ++ ++ dst_done = dst_iter.pos.offset - 
dst_start.offset; ++ new_i_size = min(dst_iter.pos.offset << 9, new_i_size); ++ ++ do { ++ struct bch_inode_unpacked inode_u; ++ struct btree_iter inode_iter = { NULL }; ++ ++ bch2_trans_begin(&trans); ++ ++ ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u, ++ dst_inum, BTREE_ITER_INTENT); ++ ++ if (!ret2 && ++ inode_u.bi_size < new_i_size) { ++ inode_u.bi_size = new_i_size; ++ ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++ } ++ ++ bch2_trans_iter_exit(&trans, &inode_iter); ++ } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); ++ ++ bch2_trans_exit(&trans); ++ bch2_bkey_buf_exit(&new_src, c); ++ bch2_bkey_buf_exit(&new_dst, c); ++ ++ percpu_ref_put(&c->writes); ++ ++ return dst_done ?: ret ?: ret2; ++} +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +new file mode 100644 +index 000000000000..f9848dc3eebb +--- /dev/null ++++ b/fs/bcachefs/reflink.h +@@ -0,0 +1,76 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REFLINK_H ++#define _BCACHEFS_REFLINK_H ++ ++int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); ++void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); ++ ++#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ ++ .key_invalid = bch2_reflink_p_invalid, \ ++ .val_to_text = bch2_reflink_p_to_text, \ ++ .key_merge = bch2_reflink_p_merge, \ ++ .trans_trigger = bch2_trans_mark_reflink_p, \ ++ .atomic_trigger = bch2_mark_reflink_p, \ ++} ++ ++int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); ++void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, ++ struct bkey_s_c, struct bkey_i *, unsigned); ++ ++#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ ++ .key_invalid = bch2_reflink_v_invalid, \ ++ .val_to_text = bch2_reflink_v_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .trans_trigger = bch2_trans_mark_reflink_v, \ ++ .atomic_trigger = bch2_mark_extent, \ ++} ++ ++int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, ++ int, struct printbuf *); ++void bch2_indirect_inline_data_to_text(struct printbuf *, ++ struct bch_fs *, struct bkey_s_c); ++int bch2_trans_mark_indirect_inline_data(struct btree_trans *, ++ enum btree_id, unsigned, ++ struct bkey_s_c, struct bkey_i *, ++ unsigned); ++ ++#define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \ ++ .key_invalid = bch2_indirect_inline_data_invalid, \ ++ .val_to_text = bch2_indirect_inline_data_to_text, \ ++ .trans_trigger = bch2_trans_mark_indirect_inline_data, \ ++} ++ ++static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_reflink_v: ++ return &bkey_s_c_to_reflink_v(k).v->refcount; ++ case KEY_TYPE_indirect_inline_data: ++ return &bkey_s_c_to_indirect_inline_data(k).v->refcount; ++ default: ++ return NULL; ++ } ++} ++ ++static inline __le64 *bkey_refcount(struct bkey_i *k) ++{ ++ switch (k->k.type) { ++ case KEY_TYPE_reflink_v: ++ return &bkey_i_to_reflink_v(k)->v.refcount; ++ case KEY_TYPE_indirect_inline_data: ++ return &bkey_i_to_indirect_inline_data(k)->v.refcount; ++ default: ++ return NULL; ++ } ++} ++ ++s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, ++ subvol_inum, u64, u64, u64, s64 *); ++ ++#endif /* 
_BCACHEFS_REFLINK_H */ +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +new file mode 100644 +index 000000000000..9cb47ba62bc3 +--- /dev/null ++++ b/fs/bcachefs/replicas.c +@@ -0,0 +1,1073 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "buckets.h" ++#include "journal.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, ++ struct bch_replicas_cpu *); ++ ++/* Replicas tracking - in memory: */ ++ ++static void verify_replicas_entry(struct bch_replicas_entry *e) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned i; ++ ++ BUG_ON(e->data_type >= BCH_DATA_NR); ++ BUG_ON(!e->nr_devs); ++ BUG_ON(e->nr_required > 1 && ++ e->nr_required >= e->nr_devs); ++ ++ for (i = 0; i + 1 < e->nr_devs; i++) ++ BUG_ON(e->devs[i] >= e->devs[i + 1]); ++#endif ++} ++ ++void bch2_replicas_entry_sort(struct bch_replicas_entry *e) ++{ ++ bubble_sort(e->devs, e->nr_devs, u8_cmp); ++} ++ ++static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) ++{ ++ eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); ++} ++ ++void bch2_replicas_entry_v0_to_text(struct printbuf *out, ++ struct bch_replicas_entry_v0 *e) ++{ ++ unsigned i; ++ ++ if (e->data_type < BCH_DATA_NR) ++ prt_printf(out, "%s", bch2_data_types[e->data_type]); ++ else ++ prt_printf(out, "(invalid data type %u)", e->data_type); ++ ++ prt_printf(out, ": %u [", e->nr_devs); ++ for (i = 0; i < e->nr_devs; i++) ++ prt_printf(out, i ? " %u" : "%u", e->devs[i]); ++ prt_printf(out, "]"); ++} ++ ++void bch2_replicas_entry_to_text(struct printbuf *out, ++ struct bch_replicas_entry *e) ++{ ++ unsigned i; ++ ++ if (e->data_type < BCH_DATA_NR) ++ prt_printf(out, "%s", bch2_data_types[e->data_type]); ++ else ++ prt_printf(out, "(invalid data type %u)", e->data_type); ++ ++ prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs); ++ for (i = 0; i < e->nr_devs; i++) ++ prt_printf(out, i ? 
" %u" : "%u", e->devs[i]); ++ prt_printf(out, "]"); ++} ++ ++void bch2_cpu_replicas_to_text(struct printbuf *out, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_replicas_entry *e; ++ bool first = true; ++ ++ for_each_cpu_replicas_entry(r, e) { ++ if (!first) ++ prt_printf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_to_text(out, e); ++ } ++} ++ ++static void extent_to_replicas(struct bkey_s_c k, ++ struct bch_replicas_entry *r) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ r->nr_required = 1; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ if (p.ptr.cached) ++ continue; ++ ++ if (!p.has_ec) ++ r->devs[r->nr_devs++] = p.ptr.dev; ++ else ++ r->nr_required = 0; ++ } ++} ++ ++static void stripe_to_replicas(struct bkey_s_c k, ++ struct bch_replicas_entry *r) ++{ ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ const struct bch_extent_ptr *ptr; ++ ++ r->nr_required = s.v->nr_blocks - s.v->nr_redundant; ++ ++ for (ptr = s.v->ptrs; ++ ptr < s.v->ptrs + s.v->nr_blocks; ++ ptr++) ++ r->devs[r->nr_devs++] = ptr->dev; ++} ++ ++void bch2_bkey_to_replicas(struct bch_replicas_entry *e, ++ struct bkey_s_c k) ++{ ++ e->nr_devs = 0; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ e->data_type = BCH_DATA_btree; ++ extent_to_replicas(k, e); ++ break; ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ e->data_type = BCH_DATA_user; ++ extent_to_replicas(k, e); ++ break; ++ case KEY_TYPE_stripe: ++ e->data_type = BCH_DATA_parity; ++ stripe_to_replicas(k, e); ++ break; ++ } ++ ++ bch2_replicas_entry_sort(e); ++} ++ ++void bch2_devlist_to_replicas(struct bch_replicas_entry *e, ++ enum bch_data_type data_type, ++ struct bch_devs_list devs) ++{ ++ unsigned i; ++ ++ BUG_ON(!data_type || ++ data_type == BCH_DATA_sb || ++ data_type >= BCH_DATA_NR); ++ ++ e->data_type = data_type; ++ e->nr_devs = 0; ++ e->nr_required = 1; ++ ++ for (i = 0; i < devs.nr; i++) ++ e->devs[e->nr_devs++] = devs.devs[i]; ++ ++ bch2_replicas_entry_sort(e); ++} ++ ++static struct bch_replicas_cpu ++cpu_replicas_add_entry(struct bch_replicas_cpu *old, ++ struct bch_replicas_entry *new_entry) ++{ ++ unsigned i; ++ struct bch_replicas_cpu new = { ++ .nr = old->nr + 1, ++ .entry_size = max_t(unsigned, old->entry_size, ++ replicas_entry_bytes(new_entry)), ++ }; ++ ++ BUG_ON(!new_entry->data_type); ++ verify_replicas_entry(new_entry); ++ ++ new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); ++ if (!new.entries) ++ return new; ++ ++ for (i = 0; i < old->nr; i++) ++ memcpy(cpu_replicas_entry(&new, i), ++ cpu_replicas_entry(old, i), ++ old->entry_size); ++ ++ memcpy(cpu_replicas_entry(&new, old->nr), ++ new_entry, ++ replicas_entry_bytes(new_entry)); ++ ++ bch2_cpu_replicas_sort(&new); ++ return new; ++} ++ ++static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, ++ struct bch_replicas_entry *search) ++{ ++ int idx, entry_size = replicas_entry_bytes(search); ++ ++ if (unlikely(entry_size > r->entry_size)) ++ return -1; ++ ++ verify_replicas_entry(search); ++ ++#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) ++ idx = eytzinger0_find(r->entries, r->nr, r->entry_size, ++ entry_cmp, search); ++#undef entry_cmp ++ ++ return idx < r->nr ? 
idx : -1; ++} ++ ++int bch2_replicas_entry_idx(struct bch_fs *c, ++ struct bch_replicas_entry *search) ++{ ++ bch2_replicas_entry_sort(search); ++ ++ return __replicas_entry_idx(&c->replicas, search); ++} ++ ++static bool __replicas_has_entry(struct bch_replicas_cpu *r, ++ struct bch_replicas_entry *search) ++{ ++ return __replicas_entry_idx(r, search) >= 0; ++} ++ ++bool bch2_replicas_marked(struct bch_fs *c, ++ struct bch_replicas_entry *search) ++{ ++ bool marked; ++ ++ if (!search->nr_devs) ++ return true; ++ ++ verify_replicas_entry(search); ++ ++ percpu_down_read(&c->mark_lock); ++ marked = __replicas_has_entry(&c->replicas, search) && ++ (likely((!c->replicas_gc.entries)) || ++ __replicas_has_entry(&c->replicas_gc, search)); ++ percpu_up_read(&c->mark_lock); ++ ++ return marked; ++} ++ ++static void __replicas_table_update(struct bch_fs_usage *dst, ++ struct bch_replicas_cpu *dst_r, ++ struct bch_fs_usage *src, ++ struct bch_replicas_cpu *src_r) ++{ ++ int src_idx, dst_idx; ++ ++ *dst = *src; ++ ++ for (src_idx = 0; src_idx < src_r->nr; src_idx++) { ++ if (!src->replicas[src_idx]) ++ continue; ++ ++ dst_idx = __replicas_entry_idx(dst_r, ++ cpu_replicas_entry(src_r, src_idx)); ++ BUG_ON(dst_idx < 0); ++ ++ dst->replicas[dst_idx] = src->replicas[src_idx]; ++ } ++} ++ ++static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, ++ struct bch_replicas_cpu *dst_r, ++ struct bch_fs_usage __percpu *src_p, ++ struct bch_replicas_cpu *src_r) ++{ ++ unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; ++ struct bch_fs_usage *dst, *src = (void *) ++ bch2_acc_percpu_u64s((void *) src_p, src_nr); ++ ++ preempt_disable(); ++ dst = this_cpu_ptr(dst_p); ++ preempt_enable(); ++ ++ __replicas_table_update(dst, dst_r, src, src_r); ++} ++ ++/* ++ * Resize filesystem accounting: ++ */ ++static int replicas_table_update(struct bch_fs *c, ++ struct bch_replicas_cpu *new_r) ++{ ++ struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR]; ++ struct bch_fs_usage_online *new_scratch = NULL; ++ struct bch_fs_usage __percpu *new_gc = NULL; ++ struct bch_fs_usage *new_base = NULL; ++ unsigned i, bytes = sizeof(struct bch_fs_usage) + ++ sizeof(u64) * new_r->nr; ++ unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) + ++ sizeof(u64) * new_r->nr; ++ int ret = 0; ++ ++ memset(new_usage, 0, sizeof(new_usage)); ++ ++ for (i = 0; i < ARRAY_SIZE(new_usage); i++) ++ if (!(new_usage[i] = __alloc_percpu_gfp(bytes, ++ sizeof(u64), GFP_KERNEL))) ++ goto err; ++ ++ if (!(new_base = kzalloc(bytes, GFP_KERNEL)) || ++ !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) || ++ (c->usage_gc && ++ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) ++ goto err; ++ ++ for (i = 0; i < ARRAY_SIZE(new_usage); i++) ++ if (c->usage[i]) ++ __replicas_table_update_pcpu(new_usage[i], new_r, ++ c->usage[i], &c->replicas); ++ if (c->usage_base) ++ __replicas_table_update(new_base, new_r, ++ c->usage_base, &c->replicas); ++ if (c->usage_gc) ++ __replicas_table_update_pcpu(new_gc, new_r, ++ c->usage_gc, &c->replicas); ++ ++ for (i = 0; i < ARRAY_SIZE(new_usage); i++) ++ swap(c->usage[i], new_usage[i]); ++ swap(c->usage_base, new_base); ++ swap(c->usage_scratch, new_scratch); ++ swap(c->usage_gc, new_gc); ++ swap(c->replicas, *new_r); ++out: ++ free_percpu(new_gc); ++ kfree(new_scratch); ++ for (i = 0; i < ARRAY_SIZE(new_usage); i++) ++ free_percpu(new_usage[i]); ++ kfree(new_base); ++ return ret; ++err: ++ bch_err(c, "error updating replicas table: memory allocation failure"); ++ ret = 
-ENOMEM; ++ goto out; ++} ++ ++static unsigned reserve_journal_replicas(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_replicas_entry *e; ++ unsigned journal_res_u64s = 0; ++ ++ /* nr_inodes: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); ++ ++ /* key_version: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); ++ ++ /* persistent_reserved: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * ++ BCH_REPLICAS_MAX; ++ ++ for_each_cpu_replicas_entry(r, e) ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + ++ e->nr_devs, sizeof(u64)); ++ return journal_res_u64s; ++} ++ ++noinline ++static int bch2_mark_replicas_slowpath(struct bch_fs *c, ++ struct bch_replicas_entry *new_entry) ++{ ++ struct bch_replicas_cpu new_r, new_gc; ++ int ret = 0; ++ ++ verify_replicas_entry(new_entry); ++ ++ memset(&new_r, 0, sizeof(new_r)); ++ memset(&new_gc, 0, sizeof(new_gc)); ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (c->replicas_gc.entries && ++ !__replicas_has_entry(&c->replicas_gc, new_entry)) { ++ new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); ++ if (!new_gc.entries) ++ goto err; ++ } ++ ++ if (!__replicas_has_entry(&c->replicas, new_entry)) { ++ new_r = cpu_replicas_add_entry(&c->replicas, new_entry); ++ if (!new_r.entries) ++ goto err; ++ ++ ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); ++ if (ret) ++ goto err; ++ ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->replicas_journal_res, ++ reserve_journal_replicas(c, &new_r)); ++ } ++ ++ if (!new_r.entries && ++ !new_gc.entries) ++ goto out; ++ ++ /* allocations done, now commit: */ ++ ++ if (new_r.entries) ++ bch2_write_super(c); ++ ++ /* don't update in memory replicas until changes are persistent */ ++ percpu_down_write(&c->mark_lock); ++ if (new_r.entries) ++ ret = replicas_table_update(c, &new_r); ++ if (new_gc.entries) ++ swap(new_gc, c->replicas_gc); ++ percpu_up_write(&c->mark_lock); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ kfree(new_r.entries); ++ kfree(new_gc.entries); ++ ++ return ret; ++err: ++ bch_err(c, "error adding replicas entry: memory allocation failure"); ++ ret = -ENOMEM; ++ goto out; ++} ++ ++int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) ++{ ++ return likely(bch2_replicas_marked(c, r)) ++ ? 
0 : bch2_mark_replicas_slowpath(c, r); ++} ++ ++/* replicas delta list: */ ++ ++int bch2_replicas_delta_list_mark(struct bch_fs *c, ++ struct replicas_delta_list *r) ++{ ++ struct replicas_delta *d = r->d; ++ struct replicas_delta *top = (void *) r->d + r->used; ++ int ret = 0; ++ ++ for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) ++ ret = bch2_mark_replicas(c, &d->r); ++ return ret; ++} ++ ++/* ++ * Old replicas_gc mechanism: only used for journal replicas entries now, should ++ * die at some point: ++ */ ++ ++int bch2_replicas_gc_end(struct bch_fs *c, int ret) ++{ ++ unsigned i; ++ ++ lockdep_assert_held(&c->replicas_gc_lock); ++ ++ mutex_lock(&c->sb_lock); ++ percpu_down_write(&c->mark_lock); ++ ++ /* ++ * this is kind of crappy; the replicas gc mechanism needs to be ripped ++ * out ++ */ ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ struct bch_replicas_cpu n; ++ ++ if (!__replicas_has_entry(&c->replicas_gc, e) && ++ bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) { ++ n = cpu_replicas_add_entry(&c->replicas_gc, e); ++ if (!n.entries) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ swap(n, c->replicas_gc); ++ kfree(n.entries); ++ } ++ } ++ ++ if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ ret = replicas_table_update(c, &c->replicas_gc); ++err: ++ kfree(c->replicas_gc.entries); ++ c->replicas_gc.entries = NULL; ++ ++ percpu_up_write(&c->mark_lock); ++ ++ if (!ret) ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) ++{ ++ struct bch_replicas_entry *e; ++ unsigned i = 0; ++ ++ lockdep_assert_held(&c->replicas_gc_lock); ++ ++ mutex_lock(&c->sb_lock); ++ BUG_ON(c->replicas_gc.entries); ++ ++ c->replicas_gc.nr = 0; ++ c->replicas_gc.entry_size = 0; ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) ++ if (!((1 << e->data_type) & typemask)) { ++ c->replicas_gc.nr++; ++ c->replicas_gc.entry_size = ++ max_t(unsigned, c->replicas_gc.entry_size, ++ replicas_entry_bytes(e)); ++ } ++ ++ c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, ++ c->replicas_gc.entry_size, ++ GFP_KERNEL); ++ if (!c->replicas_gc.entries) { ++ mutex_unlock(&c->sb_lock); ++ bch_err(c, "error allocating c->replicas_gc"); ++ return -ENOMEM; ++ } ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) ++ if (!((1 << e->data_type) & typemask)) ++ memcpy(cpu_replicas_entry(&c->replicas_gc, i++), ++ e, c->replicas_gc.entry_size); ++ ++ bch2_cpu_replicas_sort(&c->replicas_gc); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++/* New much simpler mechanism for clearing out unneeded replicas entries: */ ++ ++int bch2_replicas_gc2(struct bch_fs *c) ++{ ++ struct bch_replicas_cpu new = { 0 }; ++ unsigned i, nr; ++ int ret = 0; ++ ++ bch2_journal_meta(&c->journal); ++retry: ++ nr = READ_ONCE(c->replicas.nr); ++ new.entry_size = READ_ONCE(c->replicas.entry_size); ++ new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); ++ if (!new.entries) { ++ bch_err(c, "error allocating c->replicas_gc"); ++ return -ENOMEM; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ percpu_down_write(&c->mark_lock); ++ ++ if (nr != c->replicas.nr || ++ new.entry_size != c->replicas.entry_size) { ++ percpu_up_write(&c->mark_lock); ++ mutex_unlock(&c->sb_lock); ++ kfree(new.entries); ++ goto retry; ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ if 
(e->data_type == BCH_DATA_journal || ++ c->usage_base->replicas[i] || ++ percpu_u64_get(&c->usage[0]->replicas[i]) || ++ percpu_u64_get(&c->usage[1]->replicas[i]) || ++ percpu_u64_get(&c->usage[2]->replicas[i]) || ++ percpu_u64_get(&c->usage[3]->replicas[i])) ++ memcpy(cpu_replicas_entry(&new, new.nr++), ++ e, new.entry_size); ++ } ++ ++ bch2_cpu_replicas_sort(&new); ++ ++ if (bch2_cpu_replicas_to_sb_replicas(c, &new)) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ ret = replicas_table_update(c, &new); ++err: ++ kfree(new.entries); ++ ++ percpu_up_write(&c->mark_lock); ++ ++ if (!ret) ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_replicas_set_usage(struct bch_fs *c, ++ struct bch_replicas_entry *r, ++ u64 sectors) ++{ ++ int ret, idx = bch2_replicas_entry_idx(c, r); ++ ++ if (idx < 0) { ++ struct bch_replicas_cpu n; ++ ++ n = cpu_replicas_add_entry(&c->replicas, r); ++ if (!n.entries) ++ return -ENOMEM; ++ ++ ret = replicas_table_update(c, &n); ++ if (ret) ++ return ret; ++ ++ kfree(n.entries); ++ ++ idx = bch2_replicas_entry_idx(c, r); ++ BUG_ON(ret < 0); ++ } ++ ++ c->usage_base->replicas[idx] = sectors; ++ ++ return 0; ++} ++ ++/* Replicas tracking - superblock: */ ++ ++static int ++__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, ++ struct bch_replicas_cpu *cpu_r) ++{ ++ struct bch_replicas_entry *e, *dst; ++ unsigned nr = 0, entry_size = 0, idx = 0; ++ ++ for_each_replicas_entry(sb_r, e) { ++ entry_size = max_t(unsigned, entry_size, ++ replicas_entry_bytes(e)); ++ nr++; ++ } ++ ++ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); ++ if (!cpu_r->entries) ++ return -ENOMEM; ++ ++ cpu_r->nr = nr; ++ cpu_r->entry_size = entry_size; ++ ++ for_each_replicas_entry(sb_r, e) { ++ dst = cpu_replicas_entry(cpu_r, idx++); ++ memcpy(dst, e, replicas_entry_bytes(e)); ++ bch2_replicas_entry_sort(dst); ++ } ++ ++ return 0; ++} ++ ++static int ++__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, ++ struct bch_replicas_cpu *cpu_r) ++{ ++ struct bch_replicas_entry_v0 *e; ++ unsigned nr = 0, entry_size = 0, idx = 0; ++ ++ for_each_replicas_entry(sb_r, e) { ++ entry_size = max_t(unsigned, entry_size, ++ replicas_entry_bytes(e)); ++ nr++; ++ } ++ ++ entry_size += sizeof(struct bch_replicas_entry) - ++ sizeof(struct bch_replicas_entry_v0); ++ ++ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); ++ if (!cpu_r->entries) ++ return -ENOMEM; ++ ++ cpu_r->nr = nr; ++ cpu_r->entry_size = entry_size; ++ ++ for_each_replicas_entry(sb_r, e) { ++ struct bch_replicas_entry *dst = ++ cpu_replicas_entry(cpu_r, idx++); ++ ++ dst->data_type = e->data_type; ++ dst->nr_devs = e->nr_devs; ++ dst->nr_required = 1; ++ memcpy(dst->devs, e->devs, e->nr_devs); ++ bch2_replicas_entry_sort(dst); ++ } ++ ++ return 0; ++} ++ ++int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) ++{ ++ struct bch_sb_field_replicas *sb_v1; ++ struct bch_sb_field_replicas_v0 *sb_v0; ++ struct bch_replicas_cpu new_r = { 0, 0, NULL }; ++ int ret = 0; ++ ++ if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) ++ ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); ++ else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) ++ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); ++ ++ if (ret) ++ return -ENOMEM; ++ ++ bch2_cpu_replicas_sort(&new_r); ++ ++ percpu_down_write(&c->mark_lock); ++ ++ ret = replicas_table_update(c, &new_r); ++ percpu_up_write(&c->mark_lock); ++ ++ kfree(new_r.entries); ++ ++ return 0; ++} ++ ++static int 
bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_sb_field_replicas_v0 *sb_r; ++ struct bch_replicas_entry_v0 *dst; ++ struct bch_replicas_entry *src; ++ size_t bytes; ++ ++ bytes = sizeof(struct bch_sb_field_replicas); ++ ++ for_each_cpu_replicas_entry(r, src) ++ bytes += replicas_entry_bytes(src) - 1; ++ ++ sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, ++ DIV_ROUND_UP(bytes, sizeof(u64))); ++ if (!sb_r) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); ++ sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb); ++ ++ memset(&sb_r->entries, 0, ++ vstruct_end(&sb_r->field) - ++ (void *) &sb_r->entries); ++ ++ dst = sb_r->entries; ++ for_each_cpu_replicas_entry(r, src) { ++ dst->data_type = src->data_type; ++ dst->nr_devs = src->nr_devs; ++ memcpy(dst->devs, src->devs, src->nr_devs); ++ ++ dst = replicas_entry_next(dst); ++ ++ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); ++ } ++ ++ return 0; ++} ++ ++static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_sb_field_replicas *sb_r; ++ struct bch_replicas_entry *dst, *src; ++ bool need_v1 = false; ++ size_t bytes; ++ ++ bytes = sizeof(struct bch_sb_field_replicas); ++ ++ for_each_cpu_replicas_entry(r, src) { ++ bytes += replicas_entry_bytes(src); ++ if (src->nr_required != 1) ++ need_v1 = true; ++ } ++ ++ if (!need_v1) ++ return bch2_cpu_replicas_to_sb_replicas_v0(c, r); ++ ++ sb_r = bch2_sb_resize_replicas(&c->disk_sb, ++ DIV_ROUND_UP(bytes, sizeof(u64))); ++ if (!sb_r) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); ++ sb_r = bch2_sb_get_replicas(c->disk_sb.sb); ++ ++ memset(&sb_r->entries, 0, ++ vstruct_end(&sb_r->field) - ++ (void *) &sb_r->entries); ++ ++ dst = sb_r->entries; ++ for_each_cpu_replicas_entry(r, src) { ++ memcpy(dst, src, replicas_entry_bytes(src)); ++ ++ dst = replicas_entry_next(dst); ++ ++ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); ++ } ++ ++ return 0; ++} ++ ++static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, ++ struct bch_sb *sb, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ unsigned i, j; ++ ++ sort_cmp_size(cpu_r->entries, ++ cpu_r->nr, ++ cpu_r->entry_size, ++ memcmp, NULL); ++ ++ for (i = 0; i < cpu_r->nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(cpu_r, i); ++ ++ if (e->data_type >= BCH_DATA_NR) { ++ prt_printf(err, "invalid data type in entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ ++ if (!e->nr_devs) { ++ prt_printf(err, "no devices in entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ ++ if (e->nr_required > 1 && ++ e->nr_required >= e->nr_devs) { ++ prt_printf(err, "bad nr_required in entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ ++ for (j = 0; j < e->nr_devs; j++) ++ if (!bch2_dev_exists(sb, mi, e->devs[j])) { ++ prt_printf(err, "invalid device %u in entry ", e->devs[j]); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ ++ if (i + 1 < cpu_r->nr) { ++ struct bch_replicas_entry *n = ++ cpu_replicas_entry(cpu_r, i + 1); ++ ++ BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); ++ ++ if (!memcmp(e, n, cpu_r->entry_size)) { ++ prt_printf(err, "duplicate replicas entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ 
struct printbuf *err) ++{ ++ struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); ++ struct bch_replicas_cpu cpu_r; ++ int ret; ++ ++ if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) ++ return -ENOMEM; ++ ++ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); ++ kfree(cpu_r.entries); ++ return ret; ++} ++ ++static void bch2_sb_replicas_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_replicas *r = field_to_type(f, replicas); ++ struct bch_replicas_entry *e; ++ bool first = true; ++ ++ for_each_replicas_entry(r, e) { ++ if (!first) ++ prt_printf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_to_text(out, e); ++ } ++ prt_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_replicas = { ++ .validate = bch2_sb_replicas_validate, ++ .to_text = bch2_sb_replicas_to_text, ++}; ++ ++static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); ++ struct bch_replicas_cpu cpu_r; ++ int ret; ++ ++ if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) ++ return -ENOMEM; ++ ++ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); ++ kfree(cpu_r.entries); ++ return ret; ++} ++ ++static void bch2_sb_replicas_v0_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); ++ struct bch_replicas_entry_v0 *e; ++ bool first = true; ++ ++ for_each_replicas_entry(sb_r, e) { ++ if (!first) ++ prt_printf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_v0_to_text(out, e); ++ } ++ prt_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { ++ .validate = bch2_sb_replicas_v0_validate, ++ .to_text = bch2_sb_replicas_v0_to_text, ++}; ++ ++/* Query replicas: */ ++ ++bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, ++ unsigned flags, bool print) ++{ ++ struct bch_replicas_entry *e; ++ bool ret = true; ++ ++ percpu_down_read(&c->mark_lock); ++ for_each_cpu_replicas_entry(&c->replicas, e) { ++ unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; ++ bool metadata = e->data_type < BCH_DATA_user; ++ ++ if (e->data_type == BCH_DATA_cached) ++ continue; ++ ++ for (i = 0; i < e->nr_devs; i++) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); ++ ++ nr_online += test_bit(e->devs[i], devs.d); ++ nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed; ++ } ++ ++ if (nr_failed == e->nr_devs) ++ continue; ++ ++ if (nr_online < e->nr_required) ++ dflags |= metadata ++ ? BCH_FORCE_IF_METADATA_LOST ++ : BCH_FORCE_IF_DATA_LOST; ++ ++ if (nr_online < e->nr_devs) ++ dflags |= metadata ++ ? 
BCH_FORCE_IF_METADATA_DEGRADED ++ : BCH_FORCE_IF_DATA_DEGRADED; ++ ++ if (dflags & ~flags) { ++ if (print) { ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_replicas_entry_to_text(&buf, e); ++ bch_err(c, "insufficient devices online (%u) for replicas entry %s", ++ nr_online, buf.buf); ++ printbuf_exit(&buf); ++ } ++ ret = false; ++ break; ++ } ++ ++ } ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) ++{ ++ struct bch_sb_field_replicas *replicas; ++ struct bch_sb_field_replicas_v0 *replicas_v0; ++ unsigned i, data_has = 0; ++ ++ replicas = bch2_sb_get_replicas(sb); ++ replicas_v0 = bch2_sb_get_replicas_v0(sb); ++ ++ if (replicas) { ++ struct bch_replicas_entry *r; ++ ++ for_each_replicas_entry(replicas, r) ++ for (i = 0; i < r->nr_devs; i++) ++ if (r->devs[i] == dev) ++ data_has |= 1 << r->data_type; ++ } else if (replicas_v0) { ++ struct bch_replicas_entry_v0 *r; ++ ++ for_each_replicas_entry_v0(replicas_v0, r) ++ for (i = 0; i < r->nr_devs; i++) ++ if (r->devs[i] == dev) ++ data_has |= 1 << r->data_type; ++ } ++ ++ ++ return data_has; ++} ++ ++unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned ret; ++ ++ mutex_lock(&c->sb_lock); ++ ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++void bch2_fs_replicas_exit(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ kfree(c->usage_scratch); ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ free_percpu(c->usage[i]); ++ kfree(c->usage_base); ++ kfree(c->replicas.entries); ++ kfree(c->replicas_gc.entries); ++ ++ mempool_exit(&c->replicas_delta_pool); ++} ++ ++int bch2_fs_replicas_init(struct bch_fs *c) ++{ ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->replicas_journal_res, ++ reserve_journal_replicas(c, &c->replicas)); ++ ++ return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1, ++ REPLICAS_DELTA_LIST_MAX) ?: ++ replicas_table_update(c, &c->replicas); ++} +diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h +new file mode 100644 +index 000000000000..87820b2e1ad3 +--- /dev/null ++++ b/fs/bcachefs/replicas.h +@@ -0,0 +1,106 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REPLICAS_H ++#define _BCACHEFS_REPLICAS_H ++ ++#include "eytzinger.h" ++#include "replicas_types.h" ++ ++void bch2_replicas_entry_sort(struct bch_replicas_entry *); ++void bch2_replicas_entry_to_text(struct printbuf *, ++ struct bch_replicas_entry *); ++void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); ++ ++static inline struct bch_replicas_entry * ++cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) ++{ ++ return (void *) r->entries + r->entry_size * i; ++} ++ ++int bch2_replicas_entry_idx(struct bch_fs *, ++ struct bch_replicas_entry *); ++ ++void bch2_devlist_to_replicas(struct bch_replicas_entry *, ++ enum bch_data_type, ++ struct bch_devs_list); ++bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); ++int bch2_mark_replicas(struct bch_fs *, ++ struct bch_replicas_entry *); ++ ++struct replicas_delta { ++ s64 delta; ++ struct bch_replicas_entry r; ++} __packed; ++ ++struct replicas_delta_list { ++ unsigned size; ++ unsigned used; ++ ++ struct {} memset_start; ++ u64 nr_inodes; ++ u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ struct {} memset_end; ++ struct replicas_delta d[0]; ++}; ++ ++static inline struct replicas_delta * ++replicas_delta_next(struct replicas_delta *d) ++{ ++ return (void *) d + replicas_entry_bytes(&d->r) + 8; ++} 
++ ++int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); ++ ++void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); ++ ++static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, ++ unsigned dev) ++{ ++ e->data_type = BCH_DATA_cached; ++ e->nr_devs = 1; ++ e->nr_required = 1; ++ e->devs[0] = dev; ++} ++ ++bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, ++ unsigned, bool); ++ ++unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); ++unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); ++ ++int bch2_replicas_gc_end(struct bch_fs *, int); ++int bch2_replicas_gc_start(struct bch_fs *, unsigned); ++int bch2_replicas_gc2(struct bch_fs *); ++ ++int bch2_replicas_set_usage(struct bch_fs *, ++ struct bch_replicas_entry *, ++ u64); ++ ++#define for_each_cpu_replicas_entry(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ ++ _i = (void *) (_i) + (_r)->entry_size) ++ ++/* iterate over superblock replicas - used by userspace tools: */ ++ ++#define replicas_entry_next(_i) \ ++ ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) ++ ++#define for_each_replicas_entry(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ ++ (_i) = replicas_entry_next(_i)) ++ ++#define for_each_replicas_entry_v0(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ ++ (_i) = replicas_entry_next(_i)) ++ ++int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; ++extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; ++ ++void bch2_fs_replicas_exit(struct bch_fs *); ++int bch2_fs_replicas_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_REPLICAS_H */ +diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h +new file mode 100644 +index 000000000000..0535b1d3760e +--- /dev/null ++++ b/fs/bcachefs/replicas_types.h +@@ -0,0 +1,10 @@ ++#ifndef _BCACHEFS_REPLICAS_TYPES_H ++#define _BCACHEFS_REPLICAS_TYPES_H ++ ++struct bch_replicas_cpu { ++ unsigned nr; ++ unsigned entry_size; ++ struct bch_replicas_entry *entries; ++}; ++ ++#endif /* _BCACHEFS_REPLICAS_TYPES_H */ +diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c +new file mode 100644 +index 000000000000..c062edb3fbc2 +--- /dev/null ++++ b/fs/bcachefs/siphash.c +@@ -0,0 +1,173 @@ ++// SPDX-License-Identifier: BSD-3-Clause ++/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ ++ ++/*- ++ * Copyright (c) 2013 Andre Oppermann ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. The name of the author may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE. ++ */ ++ ++/* ++ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d ++ * are the number of compression rounds and the number of finalization rounds. ++ * A compression round is identical to a finalization round and this round ++ * function is called SipRound. Given a 128-bit key k and a (possibly empty) ++ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). ++ * ++ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, ++ * by Jean-Philippe Aumasson and Daniel J. Bernstein, ++ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa ++ * https://131002.net/siphash/siphash.pdf ++ * https://131002.net/siphash/ ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "siphash.h" ++ ++static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) ++{ ++ while (rounds--) { ++ ctx->v[0] += ctx->v[1]; ++ ctx->v[2] += ctx->v[3]; ++ ctx->v[1] = rol64(ctx->v[1], 13); ++ ctx->v[3] = rol64(ctx->v[3], 16); ++ ++ ctx->v[1] ^= ctx->v[0]; ++ ctx->v[3] ^= ctx->v[2]; ++ ctx->v[0] = rol64(ctx->v[0], 32); ++ ++ ctx->v[2] += ctx->v[1]; ++ ctx->v[0] += ctx->v[3]; ++ ctx->v[1] = rol64(ctx->v[1], 17); ++ ctx->v[3] = rol64(ctx->v[3], 21); ++ ++ ctx->v[1] ^= ctx->v[2]; ++ ctx->v[3] ^= ctx->v[0]; ++ ctx->v[2] = rol64(ctx->v[2], 32); ++ } ++} ++ ++static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) ++{ ++ u64 m = get_unaligned_le64(ptr); ++ ++ ctx->v[3] ^= m; ++ SipHash_Rounds(ctx, rounds); ++ ctx->v[0] ^= m; ++} ++ ++void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) ++{ ++ u64 k0, k1; ++ ++ k0 = le64_to_cpu(key->k0); ++ k1 = le64_to_cpu(key->k1); ++ ++ ctx->v[0] = 0x736f6d6570736575ULL ^ k0; ++ ctx->v[1] = 0x646f72616e646f6dULL ^ k1; ++ ctx->v[2] = 0x6c7967656e657261ULL ^ k0; ++ ctx->v[3] = 0x7465646279746573ULL ^ k1; ++ ++ memset(ctx->buf, 0, sizeof(ctx->buf)); ++ ctx->bytes = 0; ++} ++ ++void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, ++ const void *src, size_t len) ++{ ++ const u8 *ptr = src; ++ size_t left, used; ++ ++ if (len == 0) ++ return; ++ ++ used = ctx->bytes % sizeof(ctx->buf); ++ ctx->bytes += len; ++ ++ if (used > 0) { ++ left = sizeof(ctx->buf) - used; ++ ++ if (len >= left) { ++ memcpy(&ctx->buf[used], ptr, left); ++ SipHash_CRounds(ctx, ctx->buf, rc); ++ len -= left; ++ ptr += left; ++ } else { ++ memcpy(&ctx->buf[used], ptr, len); ++ return; ++ } ++ } ++ ++ while (len >= sizeof(ctx->buf)) { ++ SipHash_CRounds(ctx, ptr, rc); ++ len -= sizeof(ctx->buf); ++ ptr += sizeof(ctx->buf); ++ } ++ ++ if (len > 0) ++ memcpy(&ctx->buf[used], ptr, len); ++} ++ ++void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) ++{ ++ u64 r; ++ ++ r = SipHash_End(ctx, rc, rf); ++ ++ *((__le64 *) dst) = cpu_to_le64(r); ++} 
++ ++u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) ++{ ++ u64 r; ++ size_t left, used; ++ ++ used = ctx->bytes % sizeof(ctx->buf); ++ left = sizeof(ctx->buf) - used; ++ memset(&ctx->buf[used], 0, left - 1); ++ ctx->buf[7] = ctx->bytes; ++ ++ SipHash_CRounds(ctx, ctx->buf, rc); ++ ctx->v[2] ^= 0xff; ++ SipHash_Rounds(ctx, rf); ++ ++ r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); ++ memset(ctx, 0, sizeof(*ctx)); ++ return (r); ++} ++ ++u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) ++{ ++ SIPHASH_CTX ctx; ++ ++ SipHash_Init(&ctx, key); ++ SipHash_Update(&ctx, rc, rf, src, len); ++ return SipHash_End(&ctx, rc, rf); ++} +diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h +new file mode 100644 +index 000000000000..3dfaf34a43b2 +--- /dev/null ++++ b/fs/bcachefs/siphash.h +@@ -0,0 +1,87 @@ ++/* SPDX-License-Identifier: BSD-3-Clause */ ++/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ ++/*- ++ * Copyright (c) 2013 Andre Oppermann ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. The name of the author may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE. ++ * ++ * $FreeBSD$ ++ */ ++ ++/* ++ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) ++ * optimized for speed on short messages returning a 64bit hash/digest value. 
++ * ++ * The number of rounds is defined during the initialization: ++ * SipHash24_Init() for the fast and resonable strong version ++ * SipHash48_Init() for the strong version (half as fast) ++ * ++ * struct SIPHASH_CTX ctx; ++ * SipHash24_Init(&ctx); ++ * SipHash_SetKey(&ctx, "16bytes long key"); ++ * SipHash_Update(&ctx, pointer_to_string, length_of_string); ++ * SipHash_Final(output, &ctx); ++ */ ++ ++#ifndef _SIPHASH_H_ ++#define _SIPHASH_H_ ++ ++#include ++ ++#define SIPHASH_BLOCK_LENGTH 8 ++#define SIPHASH_KEY_LENGTH 16 ++#define SIPHASH_DIGEST_LENGTH 8 ++ ++typedef struct _SIPHASH_CTX { ++ u64 v[4]; ++ u8 buf[SIPHASH_BLOCK_LENGTH]; ++ u32 bytes; ++} SIPHASH_CTX; ++ ++typedef struct { ++ __le64 k0; ++ __le64 k1; ++} SIPHASH_KEY; ++ ++void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); ++void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); ++u64 SipHash_End(SIPHASH_CTX *, int, int); ++void SipHash_Final(void *, SIPHASH_CTX *, int, int); ++u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t); ++ ++#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) ++#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) ++#define SipHash24_End(_d) SipHash_End((_d), 2, 4) ++#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) ++#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) ++ ++#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) ++#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) ++#define SipHash48_End(_d) SipHash_End((_d), 4, 8) ++#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) ++#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) ++ ++#endif /* _SIPHASH_H_ */ +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +new file mode 100644 +index 000000000000..591bbb9f8beb +--- /dev/null ++++ b/fs/bcachefs/str_hash.h +@@ -0,0 +1,351 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_STR_HASH_H ++#define _BCACHEFS_STR_HASH_H ++ ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "checksum.h" ++#include "error.h" ++#include "inode.h" ++#include "siphash.h" ++#include "subvolume.h" ++#include "super.h" ++ ++#include ++#include ++#include ++ ++static inline enum bch_str_hash_type ++bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) ++{ ++ switch (opt) { ++ case BCH_STR_HASH_OPT_crc32c: ++ return BCH_STR_HASH_crc32c; ++ case BCH_STR_HASH_OPT_crc64: ++ return BCH_STR_HASH_crc64; ++ case BCH_STR_HASH_OPT_siphash: ++ return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) ++ ? BCH_STR_HASH_siphash ++ : BCH_STR_HASH_siphash_old; ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_hash_info { ++ u8 type; ++ /* ++ * For crc32 or crc64 string hashes the first key value of ++ * the siphash_key (k0) is used as the key. 
++ */ ++ SIPHASH_KEY siphash_key; ++}; ++ ++static inline struct bch_hash_info ++bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) ++{ ++ /* XXX ick */ ++ struct bch_hash_info info = { ++ .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & ++ ~(~0U << INODE_STR_HASH_BITS), ++ .siphash_key = { .k0 = bi->bi_hash_seed } ++ }; ++ ++ if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { ++ SHASH_DESC_ON_STACK(desc, c->sha256); ++ u8 digest[SHA256_DIGEST_SIZE]; ++ ++ desc->tfm = c->sha256; ++ ++ crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, ++ sizeof(bi->bi_hash_seed), digest); ++ memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); ++ } ++ ++ return info; ++} ++ ++struct bch_str_hash_ctx { ++ union { ++ u32 crc32c; ++ u64 crc64; ++ SIPHASH_CTX siphash; ++ }; ++}; ++ ++static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_crc32c: ++ ctx->crc32c = crc32c(~0, &info->siphash_key.k0, ++ sizeof(info->siphash_key.k0)); ++ break; ++ case BCH_STR_HASH_crc64: ++ ctx->crc64 = crc64_be(~0, &info->siphash_key.k0, ++ sizeof(info->siphash_key.k0)); ++ break; ++ case BCH_STR_HASH_siphash_old: ++ case BCH_STR_HASH_siphash: ++ SipHash24_Init(&ctx->siphash, &info->siphash_key); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info, ++ const void *data, size_t len) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_crc32c: ++ ctx->crc32c = crc32c(ctx->crc32c, data, len); ++ break; ++ case BCH_STR_HASH_crc64: ++ ctx->crc64 = crc64_be(ctx->crc64, data, len); ++ break; ++ case BCH_STR_HASH_siphash_old: ++ case BCH_STR_HASH_siphash: ++ SipHash24_Update(&ctx->siphash, data, len); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_crc32c: ++ return ctx->crc32c; ++ case BCH_STR_HASH_crc64: ++ return ctx->crc64 >> 1; ++ case BCH_STR_HASH_siphash_old: ++ case BCH_STR_HASH_siphash: ++ return SipHash24_End(&ctx->siphash) >> 1; ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_hash_desc { ++ enum btree_id btree_id; ++ u8 key_type; ++ ++ u64 (*hash_key)(const struct bch_hash_info *, const void *); ++ u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); ++ bool (*cmp_key)(struct bkey_s_c, const void *); ++ bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); ++ bool (*is_visible)(subvol_inum inum, struct bkey_s_c); ++}; ++ ++static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k) ++{ ++ return k.k->type == desc.key_type && ++ (!desc.is_visible || desc.is_visible(inum, k)); ++} ++ ++static __always_inline int ++bch2_hash_lookup(struct btree_trans *trans, ++ struct btree_iter *iter, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ subvol_inum inum, const void *key, ++ unsigned flags) ++{ ++ struct bkey_s_c k; ++ u32 snapshot; ++ int ret; ++ ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ return ret; ++ ++ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, ++ SPOS(inum.inum, desc.hash_key(info, key), snapshot), ++ POS(inum.inum, U64_MAX), ++ BTREE_ITER_SLOTS|flags, k, ret) { ++ if (is_visible_key(desc, inum, k)) { ++ if (!desc.cmp_key(k, key)) ++ return 0; ++ } else if (k.k->type == KEY_TYPE_hash_whiteout) { ++ ; ++ } else { ++ 
/* hole, not found */ ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, iter); ++ ++ return ret ?: -ENOENT; ++} ++ ++static __always_inline int ++bch2_hash_hole(struct btree_trans *trans, ++ struct btree_iter *iter, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ subvol_inum inum, const void *key) ++{ ++ struct bkey_s_c k; ++ u32 snapshot; ++ int ret; ++ ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ return ret; ++ ++ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, ++ SPOS(inum.inum, desc.hash_key(info, key), snapshot), ++ POS(inum.inum, U64_MAX), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) ++ if (!is_visible_key(desc, inum, k)) ++ return 0; ++ bch2_trans_iter_exit(trans, iter); ++ ++ return ret ?: -ENOSPC; ++} ++ ++static __always_inline ++int bch2_hash_needs_whiteout(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ struct btree_iter *start) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_copy_iter(&iter, start); ++ ++ bch2_btree_iter_advance(&iter); ++ ++ for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) { ++ if (k.k->type != desc.key_type && ++ k.k->type != KEY_TYPE_hash_whiteout) ++ break; ++ ++ if (k.k->type == desc.key_type && ++ desc.hash_bkey(info, k) <= start->pos.offset) { ++ ret = 1; ++ break; ++ } ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static __always_inline ++int bch2_hash_set(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ subvol_inum inum, ++ struct bkey_i *insert, int flags) ++{ ++ struct btree_iter iter, slot = { NULL }; ++ struct bkey_s_c k; ++ bool found = false; ++ u32 snapshot; ++ int ret; ++ ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ return ret; ++ ++ for_each_btree_key_upto_norestart(trans, iter, desc.btree_id, ++ SPOS(inum.inum, ++ desc.hash_bkey(info, bkey_i_to_s_c(insert)), ++ snapshot), ++ POS(inum.inum, U64_MAX), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (is_visible_key(desc, inum, k)) { ++ if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) ++ goto found; ++ ++ /* hash collision: */ ++ continue; ++ } ++ ++ if (!slot.path && ++ !(flags & BCH_HASH_SET_MUST_REPLACE)) ++ bch2_trans_copy_iter(&slot, &iter); ++ ++ if (k.k->type != KEY_TYPE_hash_whiteout) ++ goto not_found; ++ } ++ ++ if (!ret) ++ ret = -ENOSPC; ++out: ++ bch2_trans_iter_exit(trans, &slot); ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++found: ++ found = true; ++not_found: ++ ++ if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { ++ ret = -ENOENT; ++ } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { ++ ret = -EEXIST; ++ } else { ++ if (!found && slot.path) ++ swap(iter, slot); ++ ++ insert->k.p = iter.pos; ++ ret = bch2_trans_update(trans, &iter, insert, 0); ++ } ++ ++ goto out; ++} ++ ++static __always_inline ++int bch2_hash_delete_at(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ struct btree_iter *iter, ++ unsigned update_flags) ++{ ++ struct bkey_i *delete; ++ int ret; ++ ++ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); ++ ret = PTR_ERR_OR_ZERO(delete); ++ if (ret) ++ return ret; ++ ++ ret = bch2_hash_needs_whiteout(trans, desc, info, iter); ++ if (ret < 0) ++ return ret; ++ ++ bkey_init(&delete->k); ++ delete->k.p = iter->pos; ++ delete->k.type = ret ? 
KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; ++ ++ return bch2_trans_update(trans, iter, delete, update_flags); ++} ++ ++static __always_inline ++int bch2_hash_delete(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ subvol_inum inum, const void *key) ++{ ++ struct btree_iter iter; ++ int ret; ++ ++ ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key, ++ BTREE_ITER_INTENT); ++ if (ret) ++ return ret; ++ ++ ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++#endif /* _BCACHEFS_STR_HASH_H */ +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +new file mode 100644 +index 000000000000..b5b0f5e39f97 +--- /dev/null ++++ b/fs/bcachefs/subvolume.c +@@ -0,0 +1,1108 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_key_cache.h" ++#include "btree_update.h" ++#include "errcode.h" ++#include "error.h" ++#include "fs.h" ++#include "subvolume.h" ++ ++/* Snapshot tree: */ ++ ++void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); ++ ++ prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u", ++ BCH_SNAPSHOT_SUBVOL(s.v), ++ BCH_SNAPSHOT_DELETED(s.v), ++ le32_to_cpu(s.v->parent), ++ le32_to_cpu(s.v->children[0]), ++ le32_to_cpu(s.v->children[1]), ++ le32_to_cpu(s.v->subvol)); ++} ++ ++int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ struct bkey_s_c_snapshot s; ++ u32 i, id; ++ ++ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 || ++ bkey_cmp(k.k->p, POS(0, 1)) < 0) { ++ prt_printf(err, "bad pos"); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) { ++ prt_printf(err, "bad val size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_snapshot)); ++ return -EINVAL; ++ } ++ ++ s = bkey_s_c_to_snapshot(k); ++ ++ id = le32_to_cpu(s.v->parent); ++ if (id && id <= k.k->p.offset) { ++ prt_printf(err, "bad parent node (%u <= %llu)", ++ id, k.k->p.offset); ++ return -EINVAL; ++ } ++ ++ if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { ++ prt_printf(err, "children not normalized"); ++ return -EINVAL; ++ } ++ ++ if (s.v->children[0] && ++ s.v->children[0] == s.v->children[1]) { ++ prt_printf(err, "duplicate child nodes"); ++ return -EINVAL; ++ } ++ ++ for (i = 0; i < 2; i++) { ++ id = le32_to_cpu(s.v->children[i]); ++ ++ if (id >= k.k->p.offset) { ++ prt_printf(err, "bad child node (%u >= %llu)", ++ id, k.k->p.offset); ++ return -EINVAL; ++ } ++ } ++ ++ return 0; ++} ++ ++int bch2_mark_snapshot(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct snapshot_t *t; ++ ++ t = genradix_ptr_alloc(&c->snapshots, ++ U32_MAX - new.k->p.offset, ++ GFP_KERNEL); ++ if (!t) ++ return -ENOMEM; ++ ++ if (new.k->type == KEY_TYPE_snapshot) { ++ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); ++ ++ t->parent = le32_to_cpu(s.v->parent); ++ t->children[0] = le32_to_cpu(s.v->children[0]); ++ t->children[1] = le32_to_cpu(s.v->children[1]); ++ t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? 
le32_to_cpu(s.v->subvol) : 0; ++ } else { ++ t->parent = 0; ++ t->children[0] = 0; ++ t->children[1] = 0; ++ t->subvol = 0; ++ } ++ ++ return 0; ++} ++ ++static int snapshot_lookup(struct btree_trans *trans, u32 id, ++ struct bch_snapshot *s) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT; ++ ++ if (!ret) ++ *s = *bkey_s_c_to_snapshot(k).v; ++ ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int snapshot_live(struct btree_trans *trans, u32 id) ++{ ++ struct bch_snapshot v; ++ int ret; ++ ++ if (!id) ++ return 0; ++ ++ ret = snapshot_lookup(trans, id, &v); ++ if (ret == -ENOENT) ++ bch_err(trans->c, "snapshot node %u not found", id); ++ if (ret) ++ return ret; ++ ++ return !BCH_SNAPSHOT_DELETED(&v); ++} ++ ++static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned i, nr_live = 0, live_idx = 0; ++ struct bkey_s_c_snapshot snap; ++ u32 id = k.k->p.offset, child[2]; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ ++ child[0] = le32_to_cpu(snap.v->children[0]); ++ child[1] = le32_to_cpu(snap.v->children[1]); ++ ++ for (i = 0; i < 2; i++) { ++ int ret = snapshot_live(trans, child[i]); ++ if (ret < 0) ++ return ret; ++ ++ if (ret) ++ live_idx = i; ++ nr_live += ret; ++ } ++ ++ snapshot_t(c, id)->equiv = nr_live == 1 ++ ? snapshot_t(c, child[live_idx])->equiv ++ : id; ++ return 0; ++} ++ ++/* fsck: */ ++static int check_snapshot(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c_snapshot s; ++ struct bch_subvolume subvol; ++ struct bch_snapshot v; ++ struct printbuf buf = PRINTBUF; ++ bool should_have_subvol; ++ u32 i, id; ++ int ret = 0; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ s = bkey_s_c_to_snapshot(k); ++ id = le32_to_cpu(s.v->parent); ++ if (id) { ++ ret = snapshot_lookup(trans, id, &v); ++ if (ret == -ENOENT) ++ bch_err(c, "snapshot with nonexistent parent:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); ++ if (ret) ++ goto err; ++ ++ if (le32_to_cpu(v.children[0]) != s.k->p.offset && ++ le32_to_cpu(v.children[1]) != s.k->p.offset) { ++ bch_err(c, "snapshot parent %u missing pointer to child %llu", ++ id, s.k->p.offset); ++ ret = -EINVAL; ++ goto err; ++ } ++ } ++ ++ for (i = 0; i < 2 && s.v->children[i]; i++) { ++ id = le32_to_cpu(s.v->children[i]); ++ ++ ret = snapshot_lookup(trans, id, &v); ++ if (ret == -ENOENT) ++ bch_err(c, "snapshot node %llu has nonexistent child %u", ++ s.k->p.offset, id); ++ if (ret) ++ goto err; ++ ++ if (le32_to_cpu(v.parent) != s.k->p.offset) { ++ bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", ++ id, le32_to_cpu(v.parent), s.k->p.offset); ++ ret = -EINVAL; ++ goto err; ++ } ++ } ++ ++ should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) && ++ !BCH_SNAPSHOT_DELETED(s.v); ++ ++ if (should_have_subvol) { ++ id = le32_to_cpu(s.v->subvol); ++ ret = bch2_subvolume_get(trans, id, 0, false, &subvol); ++ if (ret == -ENOENT) ++ bch_err(c, "snapshot points to nonexistent subvolume:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); ++ if (ret) ++ goto err; ++ ++ if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { ++ bch_err(c, "snapshot 
node %llu has wrong BCH_SNAPSHOT_SUBVOL", ++ s.k->p.offset); ++ ret = -EINVAL; ++ goto err; ++ } ++ } else { ++ if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ++ struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u)); ++ ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&u->k_i, s.s_c); ++ u->v.subvol = 0; ++ ret = bch2_trans_update(trans, iter, &u->k_i, 0); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ if (BCH_SNAPSHOT_DELETED(s.v)) ++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_fs_check_snapshots(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_snapshot(&trans, &iter, k)); ++ ++ if (ret) ++ bch_err(c, "error %i checking snapshots", ret); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int check_subvol(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_subvolume subvol; ++ struct bch_snapshot snapshot; ++ unsigned snapid; ++ int ret; ++ ++ if (k.k->type != KEY_TYPE_subvolume) ++ return 0; ++ ++ subvol = bkey_s_c_to_subvolume(k); ++ snapid = le32_to_cpu(subvol.v->snapshot); ++ ret = snapshot_lookup(trans, snapid, &snapshot); ++ ++ if (ret == -ENOENT) ++ bch_err(trans->c, "subvolume %llu points to nonexistent snapshot %u", ++ k.k->p.offset, snapid); ++ if (ret) ++ return ret; ++ ++ if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { ++ ret = bch2_subvolume_delete(trans, iter->pos.offset); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(trans->c, "error deleting subvolume %llu: %s", ++ iter->pos.offset, bch2_err_str(ret)); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_fs_check_subvols(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_subvol(&trans, &iter, k)); ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++void bch2_fs_snapshots_exit(struct bch_fs *c) ++{ ++ genradix_free(&c->snapshots); ++} ++ ++int bch2_fs_snapshots_start(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0) ?: ++ bch2_snapshot_set_equiv(&trans, k)); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error starting snapshots: %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++/* ++ * Mark a snapshot as deleted, for future cleanup: ++ */ ++static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_snapshot *s; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != 
KEY_TYPE_snapshot) { ++ bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ /* already deleted? */ ++ if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) ++ goto err; ++ ++ s = bch2_trans_kmalloc(trans, sizeof(*s)); ++ ret = PTR_ERR_OR_ZERO(s); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&s->k_i, k); ++ SET_BCH_SNAPSHOT_DELETED(&s->v, true); ++ SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); ++ s->v.subvol = 0; ++ ++ ret = bch2_trans_update(trans, &iter, &s->k_i, 0); ++ if (ret) ++ goto err; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) ++{ ++ struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; ++ struct bkey_s_c k; ++ struct bkey_s_c_snapshot s; ++ struct bkey_i_snapshot *parent; ++ u32 parent_id; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_snapshot) { ++ bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ s = bkey_s_c_to_snapshot(k); ++ ++ BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); ++ parent_id = le32_to_cpu(s.v->parent); ++ ++ if (parent_id) { ++ bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots, ++ POS(0, parent_id), ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&p_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_snapshot) { ++ bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ parent = bch2_trans_kmalloc(trans, sizeof(*parent)); ++ ret = PTR_ERR_OR_ZERO(parent); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&parent->k_i, k); ++ ++ for (i = 0; i < 2; i++) ++ if (le32_to_cpu(parent->v.children[i]) == id) ++ break; ++ ++ if (i == 2) ++ bch_err(trans->c, "snapshot %u missing child pointer to %u", ++ parent_id, id); ++ else ++ parent->v.children[i] = 0; ++ ++ if (le32_to_cpu(parent->v.children[0]) < ++ le32_to_cpu(parent->v.children[1])) ++ swap(parent->v.children[0], ++ parent->v.children[1]); ++ ++ ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++err: ++ bch2_trans_iter_exit(trans, &p_iter); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, ++ u32 *new_snapids, ++ u32 *snapshot_subvols, ++ unsigned nr_snapids) ++{ ++ struct btree_iter iter; ++ struct bkey_i_snapshot *n; ++ struct bkey_s_c k; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, ++ POS_MIN, BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ for (i = 0; i < nr_snapids; i++) { ++ k = bch2_btree_iter_prev_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k || !k.k->p.offset) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ n = bch2_trans_kmalloc(trans, sizeof(*n)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto err; ++ ++ bkey_snapshot_init(&n->k_i); ++ n->k.p = iter.pos; ++ n->v.flags = 0; ++ n->v.parent = cpu_to_le32(parent); ++ n->v.subvol = cpu_to_le32(snapshot_subvols[i]); ++ n->v.pad = 0; ++ SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); ++ ++ ret = bch2_trans_update(trans, &iter, &n->k_i, 0) ?: ++ bch2_mark_snapshot(trans, bkey_s_c_null, 
bkey_i_to_s_c(&n->k_i), 0); ++ if (ret) ++ goto err; ++ ++ new_snapids[i] = iter.pos.offset; ++ } ++ ++ if (parent) { ++ bch2_btree_iter_set_pos(&iter, POS(0, parent)); ++ k = bch2_btree_iter_peek(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_snapshot) { ++ bch_err(trans->c, "snapshot %u not found", parent); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ n = bch2_trans_kmalloc(trans, sizeof(*n)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&n->k_i, k); ++ ++ if (n->v.children[0] || n->v.children[1]) { ++ bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ n->v.children[0] = cpu_to_le32(new_snapids[0]); ++ n->v.children[1] = cpu_to_le32(new_snapids[1]); ++ n->v.subvol = 0; ++ SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); ++ ret = bch2_trans_update(trans, &iter, &n->k_i, 0); ++ if (ret) ++ goto err; ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int snapshot_delete_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ snapshot_id_list *deleted, ++ snapshot_id_list *equiv_seen, ++ struct bpos *last_pos) ++{ ++ struct bch_fs *c = trans->c; ++ u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; ++ ++ if (bkey_cmp(k.k->p, *last_pos)) ++ equiv_seen->nr = 0; ++ *last_pos = k.k->p; ++ ++ if (snapshot_list_has_id(deleted, k.k->p.snapshot) || ++ snapshot_list_has_id(equiv_seen, equiv)) { ++ return bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ } else { ++ return snapshot_list_add(c, equiv_seen, equiv); ++ } ++} ++ ++static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_snapshot snap; ++ u32 children[2]; ++ int ret; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ if (BCH_SNAPSHOT_DELETED(snap.v) || ++ BCH_SNAPSHOT_SUBVOL(snap.v)) ++ return 0; ++ ++ children[0] = le32_to_cpu(snap.v->children[0]); ++ children[1] = le32_to_cpu(snap.v->children[1]); ++ ++ ret = snapshot_live(trans, children[0]) ?: ++ snapshot_live(trans, children[1]); ++ if (ret < 0) ++ return ret; ++ ++ if (!ret) ++ return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); ++ return 0; ++} ++ ++int bch2_delete_dead_snapshots(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_snapshot snap; ++ snapshot_id_list deleted = { 0 }; ++ u32 i, id; ++ int ret = 0; ++ ++ if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) ++ return 0; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) { ++ ret = bch2_fs_read_write_early(c); ++ if (ret) { ++ bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ } ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ /* ++ * For every snapshot node: If we have no live children and it's not ++ * pointed to by a subvolume, delete it: ++ */ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ NULL, NULL, 0, ++ bch2_delete_redundant_snapshot(&trans, &iter, k)); ++ if (ret) { ++ bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ bch2_snapshot_set_equiv(&trans, k)); ++ if (ret) { ++ bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ 
for_each_btree_key(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_snapshot) ++ continue; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ if (BCH_SNAPSHOT_DELETED(snap.v)) { ++ ret = snapshot_list_add(c, &deleted, k.k->p.offset); ++ if (ret) ++ break; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) { ++ bch_err(c, "error walking snapshots: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ struct bpos last_pos = POS_MIN; ++ snapshot_id_list equiv_seen = { 0 }; ++ ++ if (!btree_type_has_snapshots(id)) ++ continue; ++ ++ ret = for_each_btree_key_commit(&trans, iter, ++ id, POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)); ++ ++ darray_exit(&equiv_seen); ++ ++ if (ret) { ++ bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ } ++ ++ for (i = 0; i < deleted.nr; i++) { ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_snapshot_node_delete(&trans, deleted.data[i])); ++ if (ret) { ++ bch_err(c, "error deleting snapshot %u: %s", ++ deleted.data[i], bch2_err_str(ret)); ++ goto err; ++ } ++ } ++ ++ clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++err: ++ darray_exit(&deleted); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static void bch2_delete_dead_snapshots_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); ++ ++ bch2_delete_dead_snapshots(c); ++ percpu_ref_put(&c->writes); ++} ++ ++void bch2_delete_dead_snapshots_async(struct bch_fs *c) ++{ ++ if (!percpu_ref_tryget_live(&c->writes)) ++ return; ++ ++ if (!queue_work(system_long_wq, &c->snapshot_delete_work)) ++ percpu_ref_put(&c->writes); ++} ++ ++static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, ++ struct btree_trans_commit_hook *h) ++{ ++ struct bch_fs *c = trans->c; ++ ++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++ ++ if (!test_bit(BCH_FS_FSCK_DONE, &c->flags)) ++ return 0; ++ ++ bch2_delete_dead_snapshots_async(c); ++ return 0; ++} ++ ++/* Subvolumes: */ ++ ++int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0 || ++ bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) { ++ prt_printf(err, "invalid pos"); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) { ++ prt_printf(err, "incorrect value size (%zu != %zu)", ++ bkey_val_bytes(k.k), sizeof(struct bch_subvolume)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); ++ ++ prt_printf(out, "root %llu snapshot id %u", ++ le64_to_cpu(s.v->inode), ++ le32_to_cpu(s.v->snapshot)); ++} ++ ++int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, ++ bool inconsistent_if_not_found, ++ int iter_flags, ++ struct bch_subvolume *s) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), ++ iter_flags); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 
0 : -ENOENT; ++ ++ if (ret == -ENOENT && inconsistent_if_not_found) ++ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); ++ if (!ret) ++ *s = *bkey_s_c_to_subvolume(k).v; ++ ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, ++ struct bch_subvolume *subvol) ++{ ++ struct bch_snapshot snap; ++ ++ return snapshot_lookup(trans, snapshot, &snap) ?: ++ bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); ++} ++ ++int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, ++ u32 *snapid) ++{ ++ struct bch_subvolume s; ++ int ret; ++ ++ ret = bch2_subvolume_get(trans, subvol, true, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_WITH_UPDATES, ++ &s); ++ ++ *snapid = le32_to_cpu(s.snapshot); ++ return ret; ++} ++ ++/* ++ * Delete subvolume, mark snapshot ID as deleted, queue up snapshot ++ * deletion/cleanup: ++ */ ++int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_subvolume subvol; ++ struct btree_trans_commit_hook *h; ++ u32 snapid; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, ++ POS(0, subvolid), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_subvolume) { ++ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); ++ ret = -EIO; ++ goto err; ++ } ++ ++ subvol = bkey_s_c_to_subvolume(k); ++ snapid = le32_to_cpu(subvol.v->snapshot); ++ ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++ if (ret) ++ goto err; ++ ++ ret = bch2_snapshot_node_set_deleted(trans, snapid); ++ ++ h = bch2_trans_kmalloc(trans, sizeof(*h)); ++ ret = PTR_ERR_OR_ZERO(h); ++ if (ret) ++ goto err; ++ ++ h->fn = bch2_delete_dead_snapshots_hook; ++ bch2_trans_commit_hook(trans, h); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, ++ snapshot_wait_for_pagecache_and_delete_work); ++ snapshot_id_list s; ++ u32 *id; ++ int ret = 0; ++ ++ while (!ret) { ++ mutex_lock(&c->snapshots_unlinked_lock); ++ s = c->snapshots_unlinked; ++ darray_init(&c->snapshots_unlinked); ++ mutex_unlock(&c->snapshots_unlinked_lock); ++ ++ if (!s.nr) ++ break; ++ ++ bch2_evict_subvolume_inodes(c, &s); ++ ++ for (id = s.data; id < s.data + s.nr; id++) { ++ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_subvolume_delete(&trans, *id)); ++ if (ret) { ++ bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret)); ++ break; ++ } ++ } ++ ++ darray_exit(&s); ++ } ++ ++ percpu_ref_put(&c->writes); ++} ++ ++struct subvolume_unlink_hook { ++ struct btree_trans_commit_hook h; ++ u32 subvol; ++}; ++ ++int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, ++ struct btree_trans_commit_hook *_h) ++{ ++ struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h); ++ struct bch_fs *c = trans->c; ++ int ret = 0; ++ ++ mutex_lock(&c->snapshots_unlinked_lock); ++ if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) ++ ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol); ++ mutex_unlock(&c->snapshots_unlinked_lock); ++ ++ if (ret) ++ return ret; ++ ++ if (unlikely(!percpu_ref_tryget_live(&c->writes))) ++ return -EROFS; ++ ++ if (!queue_work(system_long_wq, 
&c->snapshot_wait_for_pagecache_and_delete_work)) ++ percpu_ref_put(&c->writes); ++ return 0; ++} ++ ++int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_subvolume *n; ++ struct subvolume_unlink_hook *h; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, ++ POS(0, subvolid), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_subvolume) { ++ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); ++ ret = -EIO; ++ goto err; ++ } ++ ++ n = bch2_trans_kmalloc(trans, sizeof(*n)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&n->k_i, k); ++ SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); ++ ++ ret = bch2_trans_update(trans, &iter, &n->k_i, 0); ++ if (ret) ++ goto err; ++ ++ h = bch2_trans_kmalloc(trans, sizeof(*h)); ++ ret = PTR_ERR_OR_ZERO(h); ++ if (ret) ++ goto err; ++ ++ h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook; ++ h->subvol = subvolid; ++ bch2_trans_commit_hook(trans, &h->h); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_subvolume_create(struct btree_trans *trans, u64 inode, ++ u32 src_subvolid, ++ u32 *new_subvolid, ++ u32 *new_snapshotid, ++ bool ro) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; ++ struct bkey_i_subvolume *new_subvol = NULL; ++ struct bkey_i_subvolume *src_subvol = NULL; ++ struct bkey_s_c k; ++ u32 parent = 0, new_nodes[2], snapshot_subvols[2]; ++ int ret = 0; ++ ++ for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) ++ break; ++ ++ /* ++ * bch2_subvolume_delete() doesn't flush the btree key cache - ++ * ideally it would but that's tricky ++ */ ++ if (bkey_deleted(k.k) && ++ !bch2_btree_key_cache_find(c, BTREE_ID_subvolumes, dst_iter.pos)) ++ goto found_slot; ++ } ++ ++ if (!ret) ++ ret = -ENOSPC; ++ goto err; ++found_slot: ++ snapshot_subvols[0] = dst_iter.pos.offset; ++ snapshot_subvols[1] = src_subvolid; ++ ++ if (src_subvolid) { ++ /* Creating a snapshot: */ ++ src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol)); ++ ret = PTR_ERR_OR_ZERO(src_subvol); ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes, ++ POS(0, src_subvolid), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&src_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_subvolume) { ++ bch_err(c, "subvolume %u not found", src_subvolid); ++ ret = -ENOENT; ++ goto err; ++ } ++ ++ bkey_reassemble(&src_subvol->k_i, k); ++ parent = le32_to_cpu(src_subvol->v.snapshot); ++ } ++ ++ ret = bch2_snapshot_node_create(trans, parent, new_nodes, ++ snapshot_subvols, ++ src_subvolid ? 
2 : 1); ++ if (ret) ++ goto err; ++ ++ if (src_subvolid) { ++ src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); ++ ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); ++ if (ret) ++ goto err; ++ } ++ ++ new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); ++ ret = PTR_ERR_OR_ZERO(new_subvol); ++ if (ret) ++ goto err; ++ ++ bkey_subvolume_init(&new_subvol->k_i); ++ new_subvol->v.flags = 0; ++ new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); ++ new_subvol->v.inode = cpu_to_le64(inode); ++ SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); ++ SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); ++ new_subvol->k.p = dst_iter.pos; ++ ret = bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); ++ if (ret) ++ goto err; ++ ++ *new_subvolid = new_subvol->k.p.offset; ++ *new_snapshotid = new_nodes[0]; ++err: ++ bch2_trans_iter_exit(trans, &src_iter); ++ bch2_trans_iter_exit(trans, &dst_iter); ++ return ret; ++} ++ ++int bch2_fs_subvolumes_init(struct bch_fs *c) ++{ ++ INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); ++ INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, ++ bch2_subvolume_wait_for_pagecache_and_delete); ++ mutex_init(&c->snapshots_unlinked_lock); ++ return 0; ++} +diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h +new file mode 100644 +index 000000000000..02a636644988 +--- /dev/null ++++ b/fs/bcachefs/subvolume.h +@@ -0,0 +1,137 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUBVOLUME_H ++#define _BCACHEFS_SUBVOLUME_H ++ ++#include "darray.h" ++#include "subvolume_types.h" ++ ++void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, ++ int rw, struct printbuf *); ++ ++#define bch2_bkey_ops_snapshot (struct bkey_ops) { \ ++ .key_invalid = bch2_snapshot_invalid, \ ++ .val_to_text = bch2_snapshot_to_text, \ ++} ++ ++int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c, ++ struct bkey_s_c, unsigned); ++ ++static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) ++{ ++ return genradix_ptr(&c->snapshots, U32_MAX - id); ++} ++ ++static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) ++{ ++ return snapshot_t(c, id)->parent; ++} ++ ++static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) ++{ ++ return snapshot_t(c, id)->equiv; ++} ++ ++static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) ++{ ++ return id == snapshot_t(c, id)->equiv; ++} ++ ++static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id) ++{ ++ struct snapshot_t *s = snapshot_t(c, id); ++ ++ return s->children[0] || s->children[1]; ++} ++ ++static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) ++{ ++ struct snapshot_t *s; ++ u32 parent = bch2_snapshot_parent(c, id); ++ ++ if (!parent) ++ return 0; ++ ++ s = snapshot_t(c, bch2_snapshot_parent(c, id)); ++ if (id == s->children[0]) ++ return s->children[1]; ++ if (id == s->children[1]) ++ return s->children[0]; ++ return 0; ++} ++ ++static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) ++{ ++ while (id && id < ancestor) ++ id = bch2_snapshot_parent(c, id); ++ ++ return id == ancestor; ++} ++ ++static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) ++{ ++ u32 *i; ++ ++ darray_for_each(*s, i) ++ if (*i == id) ++ return true; ++ return false; ++} ++ ++static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) ++{ ++ u32 *i; ++ ++ darray_for_each(*s, i) ++ 
if (bch2_snapshot_is_ancestor(c, id, *i)) ++ return true; ++ return false; ++} ++ ++static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) ++{ ++ int ret; ++ ++ BUG_ON(snapshot_list_has_id(s, id)); ++ ret = darray_push(s, id); ++ if (ret) ++ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); ++ return ret; ++} ++ ++int bch2_fs_check_snapshots(struct bch_fs *); ++int bch2_fs_check_subvols(struct bch_fs *); ++ ++void bch2_fs_snapshots_exit(struct bch_fs *); ++int bch2_fs_snapshots_start(struct bch_fs *); ++ ++int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, ++ int rw, struct printbuf *); ++void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_subvolume (struct bkey_ops) { \ ++ .key_invalid = bch2_subvolume_invalid, \ ++ .val_to_text = bch2_subvolume_to_text, \ ++} ++ ++int bch2_subvolume_get(struct btree_trans *, unsigned, ++ bool, int, struct bch_subvolume *); ++int bch2_snapshot_get_subvol(struct btree_trans *, u32, ++ struct bch_subvolume *); ++int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); ++ ++/* only exported for tests: */ ++int bch2_snapshot_node_create(struct btree_trans *, u32, ++ u32 *, u32 *, unsigned); ++ ++int bch2_delete_dead_snapshots(struct bch_fs *); ++void bch2_delete_dead_snapshots_async(struct bch_fs *); ++ ++int bch2_subvolume_delete(struct btree_trans *, u32); ++int bch2_subvolume_unlink(struct btree_trans *, u32); ++int bch2_subvolume_create(struct btree_trans *, u64, u32, ++ u32 *, u32 *, bool); ++ ++int bch2_fs_subvolumes_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_SUBVOLUME_H */ +diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h +new file mode 100644 +index 000000000000..f7562b5d51df +--- /dev/null ++++ b/fs/bcachefs/subvolume_types.h +@@ -0,0 +1,9 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUBVOLUME_TYPES_H ++#define _BCACHEFS_SUBVOLUME_TYPES_H ++ ++#include "darray.h" ++ ++typedef DARRAY(u32) snapshot_id_list; ++ ++#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +new file mode 100644 +index 000000000000..8b8130993a59 +--- /dev/null ++++ b/fs/bcachefs/super-io.c +@@ -0,0 +1,1602 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "io.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_sb.h" ++#include "journal_seq_blacklist.h" ++#include "replicas.h" ++#include "quota.h" ++#include "super-io.h" ++#include "super.h" ++#include "vstructs.h" ++#include "counters.h" ++ ++#include ++#include ++#include ++ ++#include ++ ++const char * const bch2_sb_fields[] = { ++#define x(name, nr) #name, ++ BCH_SB_FIELDS() ++#undef x ++ NULL ++}; ++ ++static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, ++ struct printbuf *); ++ ++struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, ++ enum bch_sb_field_type type) ++{ ++ struct bch_sb_field *f; ++ ++ /* XXX: need locking around superblock to access optional fields */ ++ ++ vstruct_for_each(sb, f) ++ if (le32_to_cpu(f->type) == type) ++ return f; ++ return NULL; ++} ++ ++static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, ++ struct bch_sb_field *f, ++ unsigned u64s) ++{ ++ unsigned old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; ++ unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; ++ ++ BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size); ++ ++ if (!f && !u64s) { ++ /* nothing to do: */ ++ } else if (!f) { ++ f = vstruct_last(sb->sb); ++ memset(f, 0, sizeof(u64) * u64s); ++ f->u64s = cpu_to_le32(u64s); ++ f->type = 0; ++ } else { ++ void *src, *dst; ++ ++ src = vstruct_end(f); ++ ++ if (u64s) { ++ f->u64s = cpu_to_le32(u64s); ++ dst = vstruct_end(f); ++ } else { ++ dst = f; ++ } ++ ++ memmove(dst, src, vstruct_end(sb->sb) - src); ++ ++ if (dst > src) ++ memset(src, 0, dst - src); ++ } ++ ++ sb->sb->u64s = cpu_to_le32(sb_u64s); ++ ++ return u64s ? f : NULL; ++} ++ ++void bch2_sb_field_delete(struct bch_sb_handle *sb, ++ enum bch_sb_field_type type) ++{ ++ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); ++ ++ if (f) ++ __bch2_sb_field_resize(sb, f, 0); ++} ++ ++/* Superblock realloc/free: */ ++ ++void bch2_free_super(struct bch_sb_handle *sb) ++{ ++ if (sb->bio) ++ bio_put(sb->bio); ++ if (!IS_ERR_OR_NULL(sb->bdev)) ++ blkdev_put(sb->bdev, sb->mode); ++ ++ kfree(sb->sb); ++ memset(sb, 0, sizeof(*sb)); ++} ++ ++int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) ++{ ++ size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); ++ size_t new_buffer_size; ++ struct bch_sb *new_sb; ++ struct bio *bio; ++ ++ if (sb->bdev) ++ new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev)); ++ ++ new_buffer_size = roundup_pow_of_two(new_bytes); ++ ++ if (sb->sb && sb->buffer_size >= new_buffer_size) ++ return 0; ++ ++ if (sb->have_layout) { ++ u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; ++ ++ if (new_bytes > max_bytes) { ++ char buf[BDEVNAME_SIZE]; ++ ++ pr_err("%s: superblock too big: want %zu but have %llu", ++ bdevname(sb->bdev, buf), new_bytes, max_bytes); ++ return -ENOSPC; ++ } ++ } ++ ++ if (sb->buffer_size >= new_buffer_size && sb->sb) ++ return 0; ++ ++ if (dynamic_fault("bcachefs:add:super_realloc")) ++ return -ENOMEM; ++ ++ if (sb->have_bio) { ++ bio = bio_kmalloc(GFP_KERNEL, ++ DIV_ROUND_UP(new_buffer_size, PAGE_SIZE)); ++ if (!bio) ++ return -ENOMEM; ++ ++ if (sb->bio) ++ bio_put(sb->bio); ++ sb->bio = bio; ++ } ++ ++ new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); ++ if (!new_sb) ++ return -ENOMEM; ++ ++ sb->sb = new_sb; ++ sb->buffer_size = new_buffer_size; ++ ++ return 0; ++} ++ ++struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, ++ enum bch_sb_field_type type, ++ unsigned u64s) ++{ ++ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); ++ ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; ++ ssize_t d = -old_u64s + u64s; ++ ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) ++ return NULL; ++ ++ if (sb->fs_sb) { ++ struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ /* XXX: we're not checking that offline device have enough space */ ++ ++ for_each_online_member(ca, c, i) { ++ struct bch_sb_handle *sb = &ca->disk_sb; ++ ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { ++ percpu_ref_put(&ca->ref); ++ return NULL; ++ } ++ } ++ } ++ ++ f = bch2_sb_field_get(sb->sb, type); ++ f = __bch2_sb_field_resize(sb, f, u64s); ++ if (f) ++ f->type = cpu_to_le32(type); ++ return f; ++} ++ ++/* Superblock validate: */ ++ ++static inline void __bch2_sb_layout_size_assert(void) ++{ ++ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); ++} ++ ++static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) ++{ ++ u64 offset, prev_offset, max_sectors; ++ unsigned i; ++ ++ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) { ++ prt_printf(out, "Not a bcachefs superblock layout"); ++ return -EINVAL; ++ } ++ ++ if (layout->layout_type != 0) { ++ prt_printf(out, "Invalid superblock layout type %u", ++ layout->layout_type); ++ return -EINVAL; ++ } ++ ++ if (!layout->nr_superblocks) { ++ prt_printf(out, "Invalid superblock layout: no superblocks"); ++ return -EINVAL; ++ } ++ ++ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { ++ prt_printf(out, "Invalid superblock layout: too many superblocks"); ++ return -EINVAL; ++ } ++ ++ max_sectors = 1 << layout->sb_max_size_bits; ++ ++ prev_offset = le64_to_cpu(layout->sb_offset[0]); ++ ++ for (i = 1; i < layout->nr_superblocks; i++) { ++ offset = le64_to_cpu(layout->sb_offset[i]); ++ ++ if (offset < prev_offset + max_sectors) { ++ prt_printf(out, "Invalid superblock layout: superblocks overlap\n" ++ " (sb %u ends at %llu next starts at %llu", ++ i - 1, prev_offset + max_sectors, offset); ++ return -EINVAL; ++ } ++ prev_offset = offset; ++ } ++ ++ return 0; ++} ++ ++static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, ++ int rw) ++{ ++ struct bch_sb *sb = disk_sb->sb; ++ struct bch_sb_field *f; ++ struct bch_sb_field_members *mi; ++ enum bch_opt_id opt_id; ++ u32 version, version_min; ++ u16 block_size; ++ int ret; ++ ++ version = le16_to_cpu(sb->version); ++ version_min = version >= bcachefs_metadata_version_bkey_renumber ++ ? 
le16_to_cpu(sb->version_min) ++ : version; ++ ++ if (version >= bcachefs_metadata_version_max) { ++ prt_printf(out, "Unsupported superblock version %u (min %u, max %u)", ++ version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } ++ ++ if (version_min < bcachefs_metadata_version_min) { ++ prt_printf(out, "Unsupported superblock version %u (min %u, max %u)", ++ version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } ++ ++ if (version_min > version) { ++ prt_printf(out, "Bad minimum version %u, greater than version field %u", ++ version_min, version); ++ return -EINVAL; ++ } ++ ++ if (sb->features[1] || ++ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { ++ prt_printf(out, "Filesystem has incompatible features"); ++ return -EINVAL; ++ } ++ ++ block_size = le16_to_cpu(sb->block_size); ++ ++ if (block_size > PAGE_SECTORS) { ++ prt_printf(out, "Block size too big (got %u, max %u)", ++ block_size, PAGE_SECTORS); ++ return -EINVAL; ++ } ++ ++ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) { ++ prt_printf(out, "Bad user UUID (got zeroes)"); ++ return -EINVAL; ++ } ++ ++ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) { ++ prt_printf(out, "Bad internal UUID (got zeroes)"); ++ return -EINVAL; ++ } ++ ++ if (!sb->nr_devices || ++ sb->nr_devices > BCH_SB_MEMBERS_MAX) { ++ prt_printf(out, "Bad number of member devices %u (max %u)", ++ sb->nr_devices, BCH_SB_MEMBERS_MAX); ++ return -EINVAL; ++ } ++ ++ if (sb->dev_idx >= sb->nr_devices) { ++ prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)", ++ sb->dev_idx, sb->nr_devices); ++ return -EINVAL; ++ } ++ ++ if (!sb->time_precision || ++ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { ++ prt_printf(out, "Invalid time precision: %u (min 1, max %lu)", ++ le32_to_cpu(sb->time_precision), NSEC_PER_SEC); ++ return -EINVAL; ++ } ++ ++ if (rw == READ) { ++ /* ++ * Been seeing a bug where these are getting inexplicably ++ * zeroed, so we're now validating them, but we have to be ++ * careful not to prevent people's filesystems from mounting: ++ */ ++ if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); ++ if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); ++ } ++ ++ for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { ++ const struct bch_option *opt = bch2_opt_table + opt_id; ++ ++ if (opt->get_sb != BCH2_NO_SB_OPT) { ++ u64 v = bch2_opt_from_sb(sb, opt_id); ++ ++ prt_printf(out, "Invalid option "); ++ ret = bch2_opt_validate(opt, v, out); ++ if (ret) ++ return ret; ++ ++ printbuf_reset(out); ++ } ++ } ++ ++ /* validate layout */ ++ ret = validate_sb_layout(&sb->layout, out); ++ if (ret) ++ return ret; ++ ++ vstruct_for_each(sb, f) { ++ if (!f->u64s) { ++ prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)", ++ le32_to_cpu(f->type)); ++ return -EINVAL; ++ } ++ ++ if (vstruct_next(f) > vstruct_last(sb)) { ++ prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", ++ le32_to_cpu(f->type)); ++ return -EINVAL; ++ } ++ } ++ ++ /* members must be validated first: */ ++ mi = bch2_sb_get_members(sb); ++ if (!mi) { ++ prt_printf(out, "Invalid superblock: member info area missing"); ++ return -EINVAL; ++ } ++ ++ ret = bch2_sb_field_validate(sb, &mi->field, out); ++ if (ret) ++ return ret; ++ ++ vstruct_for_each(sb, f) { ++ if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) ++ continue; ++ ++ ret = bch2_sb_field_validate(sb, f, out); ++ if (ret) ++ return ret;
++ } ++ ++ return 0; ++} ++ ++/* device open: */ ++ ++static void bch2_sb_update(struct bch_fs *c) ++{ ++ struct bch_sb *src = c->disk_sb.sb; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(src); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ c->sb.uuid = src->uuid; ++ c->sb.user_uuid = src->user_uuid; ++ c->sb.version = le16_to_cpu(src->version); ++ c->sb.version_min = le16_to_cpu(src->version_min); ++ c->sb.nr_devices = src->nr_devices; ++ c->sb.clean = BCH_SB_CLEAN(src); ++ c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); ++ ++ c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); ++ c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; ++ ++ /* XXX this is wrong, we need a 96 or 128 bit integer type */ ++ c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo), ++ c->sb.nsec_per_time_unit); ++ c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); ++ ++ c->sb.features = le64_to_cpu(src->features[0]); ++ c->sb.compat = le64_to_cpu(src->compat[0]); ++ ++ for_each_member_device(ca, c, i) ++ ca->mi = bch2_mi_to_cpu(mi->members + i); ++} ++ ++static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) ++{ ++ struct bch_sb_field *src_f, *dst_f; ++ struct bch_sb *dst = dst_handle->sb; ++ unsigned i; ++ ++ dst->version = src->version; ++ dst->version_min = src->version_min; ++ dst->seq = src->seq; ++ dst->uuid = src->uuid; ++ dst->user_uuid = src->user_uuid; ++ memcpy(dst->label, src->label, sizeof(dst->label)); ++ ++ dst->block_size = src->block_size; ++ dst->nr_devices = src->nr_devices; ++ ++ dst->time_base_lo = src->time_base_lo; ++ dst->time_base_hi = src->time_base_hi; ++ dst->time_precision = src->time_precision; ++ ++ memcpy(dst->flags, src->flags, sizeof(dst->flags)); ++ memcpy(dst->features, src->features, sizeof(dst->features)); ++ memcpy(dst->compat, src->compat, sizeof(dst->compat)); ++ ++ for (i = 0; i < BCH_SB_FIELD_NR; i++) { ++ if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) ++ continue; ++ ++ src_f = bch2_sb_field_get(src, i); ++ dst_f = bch2_sb_field_get(dst, i); ++ dst_f = __bch2_sb_field_resize(dst_handle, dst_f, ++ src_f ? le32_to_cpu(src_f->u64s) : 0); ++ ++ if (src_f) ++ memcpy(dst_f, src_f, vstruct_bytes(src_f)); ++ } ++} ++ ++int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) ++{ ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(src); ++ unsigned journal_u64s = journal_buckets ++ ? le32_to_cpu(journal_buckets->field.u64s) ++ : 0; ++ int ret; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ ret = bch2_sb_realloc(&c->disk_sb, ++ le32_to_cpu(src->u64s) - journal_u64s); ++ if (ret) ++ return ret; ++ ++ __copy_super(&c->disk_sb, src); ++ ++ ret = bch2_sb_replicas_to_cpu_replicas(c); ++ if (ret) ++ return ret; ++ ++ ret = bch2_sb_disk_groups_to_cpu(c); ++ if (ret) ++ return ret; ++ ++ bch2_sb_update(c); ++ return 0; ++} ++ ++int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb; ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(dst); ++ unsigned journal_u64s = journal_buckets ++ ? 
le32_to_cpu(journal_buckets->field.u64s) ++ : 0; ++ unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; ++ int ret; ++ ++ ret = bch2_sb_realloc(&ca->disk_sb, u64s); ++ if (ret) ++ return ret; ++ ++ __copy_super(&ca->disk_sb, src); ++ return 0; ++} ++ ++/* read superblock: */ ++ ++static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) ++{ ++ struct bch_csum csum; ++ u32 version, version_min; ++ size_t bytes; ++ int ret; ++reread: ++ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); ++ sb->bio->bi_iter.bi_sector = offset; ++ bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); ++ ++ ret = submit_bio_wait(sb->bio); ++ if (ret) { ++ prt_printf(err, "IO error: %i", ret); ++ return ret; ++ } ++ ++ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) { ++ prt_printf(err, "Not a bcachefs superblock"); ++ return -EINVAL; ++ } ++ ++ version = le16_to_cpu(sb->sb->version); ++ version_min = version >= bcachefs_metadata_version_bkey_renumber ++ ? le16_to_cpu(sb->sb->version_min) ++ : version; ++ ++ if (version >= bcachefs_metadata_version_max) { ++ prt_printf(err, "Unsupported superblock version %u (min %u, max %u)", ++ version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } ++ ++ if (version_min < bcachefs_metadata_version_min) { ++ prt_printf(err, "Unsupported superblock version %u (min %u, max %u)", ++ version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } ++ ++ bytes = vstruct_bytes(sb->sb); ++ ++ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { ++ prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", ++ bytes, 512UL << sb->sb->layout.sb_max_size_bits); ++ return -EINVAL; ++ } ++ ++ if (bytes > sb->buffer_size) { ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) ++ return -ENOMEM; ++ goto reread; ++ } ++ ++ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { ++ prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); ++ return -EINVAL; ++ } ++ ++ /* XXX: verify MACs */ ++ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), ++ null_nonce(), sb->sb); ++ ++ if (bch2_crc_cmp(csum, sb->sb->csum)) { ++ prt_printf(err, "bad checksum"); ++ return -EINVAL; ++ } ++ ++ sb->seq = le64_to_cpu(sb->sb->seq); ++ ++ return 0; ++} ++ ++int bch2_read_super(const char *path, struct bch_opts *opts, ++ struct bch_sb_handle *sb) ++{ ++ u64 offset = opt_get(*opts, sb); ++ struct bch_sb_layout layout; ++ struct printbuf err = PRINTBUF; ++ __le64 *i; ++ int ret; ++ ++ pr_verbose_init(*opts, ""); ++ ++ memset(sb, 0, sizeof(*sb)); ++ sb->mode = FMODE_READ; ++ sb->have_bio = true; ++ ++ if (!opt_get(*opts, noexcl)) ++ sb->mode |= FMODE_EXCL; ++ ++ if (!opt_get(*opts, nochanges)) ++ sb->mode |= FMODE_WRITE; ++ ++ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); ++ if (IS_ERR(sb->bdev) && ++ PTR_ERR(sb->bdev) == -EACCES && ++ opt_get(*opts, read_only)) { ++ sb->mode &= ~FMODE_WRITE; ++ ++ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); ++ if (!IS_ERR(sb->bdev)) ++ opt_set(*opts, nochanges, true); ++ } ++ ++ if (IS_ERR(sb->bdev)) { ++ ret = PTR_ERR(sb->bdev); ++ goto out; ++ } ++ ++ ret = bch2_sb_realloc(sb, 0); ++ if (ret) { ++ prt_printf(&err, "error allocating memory for superblock"); ++ goto err; ++ } ++ ++ if (bch2_fs_init_fault("read_super")) { ++ prt_printf(&err, "dynamic fault"); ++ ret = -EFAULT; ++ goto err; ++ } ++ ++ ret = read_one_super(sb, offset, &err); ++ if (!ret) ++ goto got_super; ++ ++ if (opt_defined(*opts, sb)) ++ goto err; ++ ++ 
printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", ++ path, err.buf); ++ printbuf_reset(&err); ++ ++ /* ++ * Error reading primary superblock - read location of backup ++ * superblocks: ++ */ ++ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); ++ sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; ++ /* ++ * use sb buffer to read layout, since sb buffer is page aligned but ++ * layout won't be: ++ */ ++ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); ++ ++ ret = submit_bio_wait(sb->bio); ++ if (ret) { ++ prt_printf(&err, "IO error: %i", ret); ++ goto err; ++ } ++ ++ memcpy(&layout, sb->sb, sizeof(layout)); ++ ret = validate_sb_layout(&layout, &err); ++ if (ret) ++ goto err; ++ ++ for (i = layout.sb_offset; ++ i < layout.sb_offset + layout.nr_superblocks; i++) { ++ offset = le64_to_cpu(*i); ++ ++ if (offset == opt_get(*opts, sb)) ++ continue; ++ ++ ret = read_one_super(sb, offset, &err); ++ if (!ret) ++ goto got_super; ++ } ++ ++ goto err; ++ ++got_super: ++ if (le16_to_cpu(sb->sb->block_size) << 9 < ++ bdev_logical_block_size(sb->bdev)) { ++ prt_printf(&err, "block size (%u) smaller than device block size (%u)", ++ le16_to_cpu(sb->sb->block_size) << 9, ++ bdev_logical_block_size(sb->bdev)); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ret = 0; ++ sb->have_layout = true; ++ ++ ret = bch2_sb_validate(sb, &err, READ); ++ if (ret) { ++ printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", ++ path, err.buf); ++ goto err_no_print; ++ } ++out: ++ pr_verbose_init(*opts, "ret %i", ret); ++ printbuf_exit(&err); ++ return ret; ++err: ++ printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", ++ path, err.buf); ++err_no_print: ++ bch2_free_super(sb); ++ goto out; ++} ++ ++/* write superblock: */ ++ ++static void write_super_endio(struct bio *bio) ++{ ++ struct bch_dev *ca = bio->bi_private; ++ ++ /* XXX: return errors directly */ ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s", ++ bch2_blk_status_to_str(bio->bi_status))) ++ ca->sb_write_error = 1; ++ ++ closure_put(&ca->fs->sb_write); ++ percpu_ref_put(&ca->io_ref); ++} ++ ++static void read_back_super(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_sb *sb = ca->disk_sb.sb; ++ struct bio *bio = ca->disk_sb.bio; ++ ++ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); ++ bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); ++ bio->bi_end_io = write_super_endio; ++ bio->bi_private = ca; ++ bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); ++ ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], ++ bio_sectors(bio)); ++ ++ percpu_ref_get(&ca->io_ref); ++ closure_bio_submit(bio, &c->sb_write); ++} ++ ++static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) ++{ ++ struct bch_sb *sb = ca->disk_sb.sb; ++ struct bio *bio = ca->disk_sb.bio; ++ ++ sb->offset = sb->layout.sb_offset[idx]; ++ ++ SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false)); ++ sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), ++ null_nonce(), sb); ++ ++ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); ++ bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); ++ bio->bi_end_io = write_super_endio; ++ bio->bi_private = ca; ++ bch2_bio_map(bio, sb, ++ roundup((size_t) vstruct_bytes(sb), ++ bdev_logical_block_size(ca->disk_sb.bdev))); ++ ++ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], ++ bio_sectors(bio)); ++ ++ percpu_ref_get(&ca->io_ref); ++ closure_bio_submit(bio, &c->sb_write); ++} ++ ++int 
bch2_write_super(struct bch_fs *c) ++{ ++ struct closure *cl = &c->sb_write; ++ struct bch_dev *ca; ++ struct printbuf err = PRINTBUF; ++ unsigned i, sb = 0, nr_wrote; ++ struct bch_devs_mask sb_written; ++ bool wrote, can_mount_without_written, can_mount_with_written; ++ unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; ++ int ret = 0; ++ ++ trace_write_super(c, _RET_IP_); ++ ++ if (c->opts.very_degraded) ++ degraded_flags |= BCH_FORCE_IF_LOST; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ closure_init_stack(cl); ++ memset(&sb_written, 0, sizeof(sb_written)); ++ ++ le64_add_cpu(&c->disk_sb.sb->seq, 1); ++ ++ if (test_bit(BCH_FS_ERROR, &c->flags)) ++ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); ++ if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags)) ++ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); ++ ++ SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); ++ ++ bch2_sb_counters_from_cpu(c); ++ ++ for_each_online_member(ca, c, i) ++ bch2_sb_from_fs(c, ca); ++ ++ for_each_online_member(ca, c, i) { ++ printbuf_reset(&err); ++ ++ ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); ++ if (ret) { ++ bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); ++ percpu_ref_put(&ca->io_ref); ++ goto out; ++ } ++ } ++ ++ if (c->opts.nochanges) ++ goto out; ++ ++ /* ++ * Defer writing the superblock until filesystem initialization is ++ * complete - don't write out a partly initialized superblock: ++ */ ++ if (!BCH_SB_INITIALIZED(c->disk_sb.sb)) ++ goto out; ++ ++ for_each_online_member(ca, c, i) { ++ __set_bit(ca->dev_idx, sb_written.d); ++ ca->sb_write_error = 0; ++ } ++ ++ for_each_online_member(ca, c, i) ++ read_back_super(c, ca); ++ closure_sync(cl); ++ ++ for_each_online_member(ca, c, i) { ++ if (ca->sb_write_error) ++ continue; ++ ++ if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { ++ bch2_fs_fatal_error(c, ++ "Superblock write was silently dropped! 
(seq %llu expected %llu)", ++ le64_to_cpu(ca->sb_read_scratch->seq), ++ ca->disk_sb.seq); ++ percpu_ref_put(&ca->io_ref); ++ ret = -EROFS; ++ goto out; ++ } ++ ++ if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { ++ bch2_fs_fatal_error(c, ++ "Superblock modified by another process (seq %llu expected %llu)", ++ le64_to_cpu(ca->sb_read_scratch->seq), ++ ca->disk_sb.seq); ++ percpu_ref_put(&ca->io_ref); ++ ret = -EROFS; ++ goto out; ++ } ++ } ++ ++ do { ++ wrote = false; ++ for_each_online_member(ca, c, i) ++ if (!ca->sb_write_error && ++ sb < ca->disk_sb.sb->layout.nr_superblocks) { ++ write_one_super(c, ca, sb); ++ wrote = true; ++ } ++ closure_sync(cl); ++ sb++; ++ } while (wrote); ++ ++ for_each_online_member(ca, c, i) { ++ if (ca->sb_write_error) ++ __clear_bit(ca->dev_idx, sb_written.d); ++ else ++ ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); ++ } ++ ++ nr_wrote = dev_mask_nr(&sb_written); ++ ++ can_mount_with_written = ++ bch2_have_enough_devs(c, sb_written, degraded_flags, false); ++ ++ for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) ++ sb_written.d[i] = ~sb_written.d[i]; ++ ++ can_mount_without_written = ++ bch2_have_enough_devs(c, sb_written, degraded_flags, false); ++ ++ /* ++ * If we would be able to mount _without_ the devices we successfully ++ * wrote superblocks to, we weren't able to write to enough devices: ++ * ++ * Exception: if we can mount without the successes because we haven't ++ * written anything (new filesystem), we continue if we'd be able to ++ * mount with the devices we did successfully write to: ++ */ ++ if (bch2_fs_fatal_err_on(!nr_wrote || ++ !can_mount_with_written || ++ (can_mount_without_written && ++ !can_mount_with_written), c, ++ "Unable to write superblock to sufficient devices (from %ps)", ++ (void *) _RET_IP_)) ++ ret = -1; ++out: ++ /* Make new options visible after they're persistent: */ ++ bch2_sb_update(c); ++ printbuf_exit(&err); ++ return ret; ++} ++ ++void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) ++{ ++ mutex_lock(&c->sb_lock); ++ if (!(c->sb.features & (1ULL << feat))) { ++ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); ++ ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++} ++ ++/* BCH_SB_FIELD_members: */ ++ ++static int bch2_sb_members_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_members *mi = field_to_type(f, members); ++ unsigned i; ++ ++ if ((void *) (mi->members + sb->nr_devices) > ++ vstruct_end(&mi->field)) { ++ prt_printf(err, "too many devices for section size"); ++ return -EINVAL; ++ } ++ ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ if (le64_to_cpu(m->nbuckets) > LONG_MAX) { ++ prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", ++ i, le64_to_cpu(m->nbuckets), LONG_MAX); ++ return -EINVAL; ++ } ++ ++ if (le64_to_cpu(m->nbuckets) - ++ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { ++ prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", ++ i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); ++ return -EINVAL; ++ } ++ ++ if (le16_to_cpu(m->bucket_size) < ++ le16_to_cpu(sb->block_size)) { ++ prt_printf(err, "device %u: bucket size %u smaller than block size %u", ++ i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); ++ return -EINVAL; ++ } ++ ++ if (le16_to_cpu(m->bucket_size) < ++ BCH_SB_BTREE_NODE_SIZE(sb)) { ++ prt_printf(err, "device %u: bucket size %u smaller than btree node 
size %llu", ++ i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); ++ return -EINVAL; ++ } ++ } ++ ++ return 0; ++} ++ ++static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_members *mi = field_to_type(f, members); ++ struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); ++ unsigned i; ++ ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ unsigned data_have = bch2_sb_dev_has_data(sb, i); ++ u64 bucket_size = le16_to_cpu(m->bucket_size); ++ u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ prt_printf(out, "Device:"); ++ prt_tab(out); ++ prt_printf(out, "%u", i); ++ prt_newline(out); ++ ++ printbuf_indent_add(out, 2); ++ ++ prt_printf(out, "UUID:"); ++ prt_tab(out); ++ pr_uuid(out, m->uuid.b); ++ prt_newline(out); ++ ++ prt_printf(out, "Size:"); ++ prt_tab(out); ++ prt_units_u64(out, device_size << 9); ++ prt_newline(out); ++ ++ prt_printf(out, "Bucket size:"); ++ prt_tab(out); ++ prt_units_u64(out, bucket_size << 9); ++ prt_newline(out); ++ ++ prt_printf(out, "First bucket:"); ++ prt_tab(out); ++ prt_printf(out, "%u", le16_to_cpu(m->first_bucket)); ++ prt_newline(out); ++ ++ prt_printf(out, "Buckets:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", le64_to_cpu(m->nbuckets)); ++ prt_newline(out); ++ ++ prt_printf(out, "Last mount:"); ++ prt_tab(out); ++ if (m->last_mount) ++ pr_time(out, le64_to_cpu(m->last_mount)); ++ else ++ prt_printf(out, "(never)"); ++ prt_newline(out); ++ ++ prt_printf(out, "State:"); ++ prt_tab(out); ++ prt_printf(out, "%s", ++ BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR ++ ? bch2_member_states[BCH_MEMBER_STATE(m)] ++ : "unknown"); ++ prt_newline(out); ++ ++ prt_printf(out, "Label:"); ++ prt_tab(out); ++ if (BCH_MEMBER_GROUP(m)) { ++ unsigned idx = BCH_MEMBER_GROUP(m) - 1; ++ ++ if (idx < disk_groups_nr(gi)) ++ prt_printf(out, "%s (%u)", ++ gi->entries[idx].label, idx); ++ else ++ prt_printf(out, "(bad disk labels section)"); ++ } else { ++ prt_printf(out, "(none)"); ++ } ++ prt_newline(out); ++ ++ prt_printf(out, "Data allowed:"); ++ prt_tab(out); ++ if (BCH_MEMBER_DATA_ALLOWED(m)) ++ prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); ++ else ++ prt_printf(out, "(none)"); ++ prt_newline(out); ++ ++ prt_printf(out, "Has data:"); ++ prt_tab(out); ++ if (data_have) ++ prt_bitflags(out, bch2_data_types, data_have); ++ else ++ prt_printf(out, "(none)"); ++ prt_newline(out); ++ ++ prt_printf(out, "Discard:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m)); ++ prt_newline(out); ++ ++ prt_printf(out, "Freespace initialized:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); ++ prt_newline(out); ++ ++ printbuf_indent_sub(out, 2); ++ } ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_members = { ++ .validate = bch2_sb_members_validate, ++ .to_text = bch2_sb_members_to_text, ++}; ++ ++/* BCH_SB_FIELD_crypt: */ ++ ++static int bch2_sb_crypt_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); ++ ++ if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { ++ prt_printf(err, "wrong size (got %zu should be %zu)", ++ vstruct_bytes(&crypt->field), sizeof(*crypt)); ++ return -EINVAL; ++ } ++ ++ if (BCH_CRYPT_KDF_TYPE(crypt)) { ++ prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); ++ return -EINVAL; ++ } ++ ++ 
return 0; ++} ++ ++static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); ++ ++ prt_printf(out, "KDF: %llu", BCH_CRYPT_KDF_TYPE(crypt)); ++ prt_newline(out); ++ prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); ++ prt_newline(out); ++ prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); ++ prt_newline(out); ++ prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); ++ prt_newline(out); ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { ++ .validate = bch2_sb_crypt_validate, ++ .to_text = bch2_sb_crypt_to_text, ++}; ++ ++/* BCH_SB_FIELD_clean: */ ++ ++int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) ++{ ++ struct jset_entry *entry; ++ int ret; ++ ++ for (entry = clean->start; ++ entry < (struct jset_entry *) vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ ret = bch2_journal_entry_validate(c, "superblock", entry, ++ le16_to_cpu(c->disk_sb.sb->version), ++ BCH_SB_BIG_ENDIAN(c->disk_sb.sb), ++ write); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_fs_mark_dirty(struct bch_fs *c) ++{ ++ int ret; ++ ++ /* ++ * Unconditionally write superblock, to verify it hasn't changed before ++ * we go rw: ++ */ ++ ++ mutex_lock(&c->sb_lock); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); ++ c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); ++ ret = bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) ++{ ++ struct jset_entry *entry = *end; ++ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); ++ ++ memset(entry, 0, u64s * sizeof(u64)); ++ /* ++ * The u64s field counts from the start of data, ignoring the shared ++ * fields.
++ */ ++ entry->u64s = cpu_to_le16(u64s - 1); ++ ++ *end = vstruct_next(*end); ++ return entry; ++} ++ ++void bch2_journal_super_entries_add_common(struct bch_fs *c, ++ struct jset_entry **end, ++ u64 journal_seq) ++{ ++ struct bch_dev *ca; ++ unsigned i, dev; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ if (!journal_seq) { ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ bch2_fs_usage_acc_to_base(c, i); ++ } else { ++ bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); ++ } ++ ++ { ++ struct jset_entry_usage *u = ++ container_of(jset_entry_init(end, sizeof(*u)), ++ struct jset_entry_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = BCH_FS_USAGE_inodes; ++ u->v = cpu_to_le64(c->usage_base->nr_inodes); ++ } ++ ++ { ++ struct jset_entry_usage *u = ++ container_of(jset_entry_init(end, sizeof(*u)), ++ struct jset_entry_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = BCH_FS_USAGE_key_version; ++ u->v = cpu_to_le64(atomic64_read(&c->key_version)); ++ } ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) { ++ struct jset_entry_usage *u = ++ container_of(jset_entry_init(end, sizeof(*u)), ++ struct jset_entry_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = BCH_FS_USAGE_reserved; ++ u->entry.level = i; ++ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ struct jset_entry_data_usage *u = ++ container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), ++ struct jset_entry_data_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_data_usage; ++ u->v = cpu_to_le64(c->usage_base->replicas[i]); ++ memcpy(&u->r, e, replicas_entry_bytes(e)); ++ } ++ ++ for_each_member_device(ca, c, dev) { ++ unsigned b = sizeof(struct jset_entry_dev_usage) + ++ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; ++ struct jset_entry_dev_usage *u = ++ container_of(jset_entry_init(end, b), ++ struct jset_entry_dev_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_dev_usage; ++ u->dev = cpu_to_le32(dev); ++ u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); ++ ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); ++ u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); ++ u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); ++ } ++ } ++ ++ percpu_up_read(&c->mark_lock); ++ ++ for (i = 0; i < 2; i++) { ++ struct jset_entry_clock *clock = ++ container_of(jset_entry_init(end, sizeof(*clock)), ++ struct jset_entry_clock, entry); ++ ++ clock->entry.type = BCH_JSET_ENTRY_clock; ++ clock->rw = i; ++ clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); ++ } ++} ++ ++void bch2_fs_mark_clean(struct bch_fs *c) ++{ ++ struct bch_sb_field_clean *sb_clean; ++ struct jset_entry *entry; ++ unsigned u64s; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ if (BCH_SB_CLEAN(c->disk_sb.sb)) ++ goto out; ++ ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, true); ++ ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); ++ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); ++ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); ++ ++ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; ++ ++ sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); ++ 
if (!sb_clean) { ++ bch_err(c, "error resizing superblock while setting filesystem clean"); ++ goto out; ++ } ++ ++ sb_clean->flags = 0; ++ sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); ++ ++ /* Trying to catch outstanding bug: */ ++ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); ++ ++ entry = sb_clean->start; ++ bch2_journal_super_entries_add_common(c, &entry, 0); ++ entry = bch2_btree_roots_to_journal_entries(c, entry, entry); ++ BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); ++ ++ memset(entry, 0, ++ vstruct_end(&sb_clean->field) - (void *) entry); ++ ++ /* ++ * this should be in the write path, and we should be validating every ++ * superblock section: ++ */ ++ ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); ++ if (ret) { ++ bch_err(c, "error writing marking filesystem clean: validate error"); ++ goto out; ++ } ++ ++ bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++} ++ ++static int bch2_sb_clean_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_clean *clean = field_to_type(f, clean); ++ ++ if (vstruct_bytes(&clean->field) < sizeof(*clean)) { ++ prt_printf(err, "wrong size (got %zu should be %zu)", ++ vstruct_bytes(&clean->field), sizeof(*clean)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_clean *clean = field_to_type(f, clean); ++ struct jset_entry *entry; ++ ++ prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); ++ prt_newline(out); ++ prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); ++ prt_newline(out); ++ ++ for (entry = clean->start; ++ entry != vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ if (entry->type == BCH_JSET_ENTRY_btree_keys && ++ !entry->u64s) ++ continue; ++ ++ bch2_journal_entry_to_text(out, NULL, entry); ++ prt_newline(out); ++ } ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_clean = { ++ .validate = bch2_sb_clean_validate, ++ .to_text = bch2_sb_clean_to_text, ++}; ++ ++static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { ++#define x(f, nr) \ ++ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, ++ BCH_SB_FIELDS() ++#undef x ++}; ++ ++static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ unsigned type = le32_to_cpu(f->type); ++ struct printbuf field_err = PRINTBUF; ++ int ret; ++ ++ if (type >= BCH_SB_FIELD_NR) ++ return 0; ++ ++ ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err); ++ if (ret) { ++ prt_printf(err, "Invalid superblock section %s: %s", ++ bch2_sb_fields[type], ++ field_err.buf); ++ prt_newline(err); ++ bch2_sb_field_to_text(err, sb, f); ++ } ++ ++ printbuf_exit(&field_err); ++ return ret; ++} ++ ++void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ unsigned type = le32_to_cpu(f->type); ++ const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR ++ ? 
bch2_sb_field_ops[type] : NULL; ++ ++ if (!out->tabstops[0]) ++ out->tabstops[0] = 32; ++ ++ if (ops) ++ prt_printf(out, "%s", bch2_sb_fields[type]); ++ else ++ prt_printf(out, "(unknown field %u)", type); ++ ++ prt_printf(out, " (size %zu):", vstruct_bytes(f)); ++ prt_newline(out); ++ ++ if (ops && ops->to_text) { ++ printbuf_indent_add(out, 2); ++ bch2_sb_field_ops[type]->to_text(out, sb, f); ++ printbuf_indent_sub(out, 2); ++ } ++} ++ ++void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) ++{ ++ unsigned i; ++ ++ prt_printf(out, "Type: %u", l->layout_type); ++ prt_newline(out); ++ ++ prt_str(out, "Superblock max size: "); ++ prt_units_u64(out, 512 << l->sb_max_size_bits); ++ prt_newline(out); ++ ++ prt_printf(out, "Nr superblocks: %u", l->nr_superblocks); ++ prt_newline(out); ++ ++ prt_str(out, "Offsets: "); ++ for (i = 0; i < l->nr_superblocks; i++) { ++ if (i) ++ prt_str(out, ", "); ++ prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i])); ++ } ++ prt_newline(out); ++} ++ ++void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, ++ bool print_layout, unsigned fields) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_sb_field *f; ++ u64 fields_have = 0; ++ unsigned nr_devices = 0; ++ ++ if (!out->tabstops[0]) ++ out->tabstops[0] = 32; ++ ++ mi = bch2_sb_get_members(sb); ++ if (mi) { ++ struct bch_member *m; ++ ++ for (m = mi->members; ++ m < mi->members + sb->nr_devices; ++ m++) ++ nr_devices += bch2_member_exists(m); ++ } ++ ++ prt_printf(out, "External UUID:"); ++ prt_tab(out); ++ pr_uuid(out, sb->user_uuid.b); ++ prt_newline(out); ++ ++ prt_printf(out, "Internal UUID:"); ++ prt_tab(out); ++ pr_uuid(out, sb->uuid.b); ++ prt_newline(out); ++ ++ prt_str(out, "Device index:"); ++ prt_tab(out); ++ prt_printf(out, "%u", sb->dev_idx); ++ prt_newline(out); ++ ++ prt_str(out, "Label:"); ++ prt_tab(out); ++ prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); ++ prt_newline(out); ++ ++ prt_str(out, "Version:"); ++ prt_tab(out); ++ prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version)]); ++ prt_newline(out); ++ ++ prt_printf(out, "Oldest version on disk:"); ++ prt_tab(out); ++ prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version_min)]); ++ prt_newline(out); ++ ++ prt_printf(out, "Created:"); ++ prt_tab(out); ++ if (sb->time_base_lo) ++ pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); ++ else ++ prt_printf(out, "(not set)"); ++ prt_newline(out); ++ ++ prt_printf(out, "Sequence number:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", le64_to_cpu(sb->seq)); ++ prt_newline(out); ++ ++ prt_printf(out, "Superblock size:"); ++ prt_tab(out); ++ prt_printf(out, "%zu", vstruct_bytes(sb)); ++ prt_newline(out); ++ ++ prt_printf(out, "Clean:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", BCH_SB_CLEAN(sb)); ++ prt_newline(out); ++ ++ prt_printf(out, "Devices:"); ++ prt_tab(out); ++ prt_printf(out, "%u", nr_devices); ++ prt_newline(out); ++ ++ prt_printf(out, "Sections:"); ++ vstruct_for_each(sb, f) ++ fields_have |= 1 << le32_to_cpu(f->type); ++ prt_tab(out); ++ prt_bitflags(out, bch2_sb_fields, fields_have); ++ prt_newline(out); ++ ++ prt_printf(out, "Features:"); ++ prt_tab(out); ++ prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); ++ prt_newline(out); ++ ++ prt_printf(out, "Compat features:"); ++ prt_tab(out); ++ prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); ++ prt_newline(out); ++ ++ prt_newline(out); ++ prt_printf(out, "Options:"); ++ prt_newline(out); ++ printbuf_indent_add(out, 
2); ++ { ++ enum bch_opt_id id; ++ ++ for (id = 0; id < bch2_opts_nr; id++) { ++ const struct bch_option *opt = bch2_opt_table + id; ++ ++ if (opt->get_sb != BCH2_NO_SB_OPT) { ++ u64 v = bch2_opt_from_sb(sb, id); ++ ++ prt_printf(out, "%s:", opt->attr.name); ++ prt_tab(out); ++ bch2_opt_to_text(out, NULL, sb, opt, v, ++ OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); ++ prt_newline(out); ++ } ++ } ++ } ++ ++ printbuf_indent_sub(out, 2); ++ ++ if (print_layout) { ++ prt_newline(out); ++ prt_printf(out, "layout:"); ++ prt_newline(out); ++ printbuf_indent_add(out, 2); ++ bch2_sb_layout_to_text(out, &sb->layout); ++ printbuf_indent_sub(out, 2); ++ } ++ ++ vstruct_for_each(sb, f) ++ if (fields & (1 << le32_to_cpu(f->type))) { ++ prt_newline(out); ++ bch2_sb_field_to_text(out, sb, f); ++ } ++} +diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h +new file mode 100644 +index 000000000000..14a25f6fe29a +--- /dev/null ++++ b/fs/bcachefs/super-io.h +@@ -0,0 +1,126 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_IO_H ++#define _BCACHEFS_SUPER_IO_H ++ ++#include "extents.h" ++#include "eytzinger.h" ++#include "super_types.h" ++#include "super.h" ++ ++#include ++ ++struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); ++struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, ++ enum bch_sb_field_type, unsigned); ++void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); ++ ++#define field_to_type(_f, _name) \ ++ container_of_or_null(_f, struct bch_sb_field_##_name, field) ++ ++#define x(_name, _nr) \ ++static inline struct bch_sb_field_##_name * \ ++bch2_sb_get_##_name(struct bch_sb *sb) \ ++{ \ ++ return field_to_type(bch2_sb_field_get(sb, \ ++ BCH_SB_FIELD_##_name), _name); \ ++} \ ++ \ ++static inline struct bch_sb_field_##_name * \ ++bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \ ++{ \ ++ return field_to_type(bch2_sb_field_resize(sb, \ ++ BCH_SB_FIELD_##_name, u64s), _name); \ ++} ++ ++BCH_SB_FIELDS() ++#undef x ++ ++extern const char * const bch2_sb_fields[]; ++ ++struct bch_sb_field_ops { ++ int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *); ++ void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); ++}; ++ ++static inline __le64 bch2_sb_magic(struct bch_fs *c) ++{ ++ __le64 ret; ++ memcpy(&ret, &c->sb.uuid, sizeof(ret)); ++ return ret; ++} ++ ++static inline __u64 jset_magic(struct bch_fs *c) ++{ ++ return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); ++} ++ ++static inline __u64 bset_magic(struct bch_fs *c) ++{ ++ return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); ++} ++ ++int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); ++int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); ++ ++void bch2_free_super(struct bch_sb_handle *); ++int bch2_sb_realloc(struct bch_sb_handle *, unsigned); ++ ++int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); ++int bch2_write_super(struct bch_fs *); ++void __bch2_check_set_feature(struct bch_fs *, unsigned); ++ ++static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) ++{ ++ if (!(c->sb.features & (1ULL << feat))) ++ __bch2_check_set_feature(c, feat); ++} ++ ++/* BCH_SB_FIELD_members: */ ++ ++static inline bool bch2_member_exists(struct bch_member *m) ++{ ++ return !bch2_is_zero(m->uuid.b, sizeof(uuid_le)); ++} ++ ++static inline bool bch2_dev_exists(struct bch_sb *sb, ++ struct bch_sb_field_members *mi, ++ unsigned dev) ++{ ++ return dev < sb->nr_devices && ++ 
bch2_member_exists(&mi->members[dev]); ++} ++ ++static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ++{ ++ return (struct bch_member_cpu) { ++ .nbuckets = le64_to_cpu(mi->nbuckets), ++ .first_bucket = le16_to_cpu(mi->first_bucket), ++ .bucket_size = le16_to_cpu(mi->bucket_size), ++ .group = BCH_MEMBER_GROUP(mi), ++ .state = BCH_MEMBER_STATE(mi), ++ .discard = BCH_MEMBER_DISCARD(mi), ++ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), ++ .durability = BCH_MEMBER_DURABILITY(mi) ++ ? BCH_MEMBER_DURABILITY(mi) - 1 ++ : 1, ++ .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), ++ .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), ++ }; ++} ++ ++/* BCH_SB_FIELD_clean: */ ++ ++void bch2_journal_super_entries_add_common(struct bch_fs *, ++ struct jset_entry **, u64); ++ ++int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); ++ ++int bch2_fs_mark_dirty(struct bch_fs *); ++void bch2_fs_mark_clean(struct bch_fs *); ++ ++void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, ++ struct bch_sb_field *); ++void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); ++void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned); ++ ++#endif /* _BCACHEFS_SUPER_IO_H */ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +new file mode 100644 +index 000000000000..7c6348001ae3 +--- /dev/null ++++ b/fs/bcachefs/super.c +@@ -0,0 +1,1950 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcachefs setup/teardown code, and some metadata io - read a superblock and ++ * figure out what to do with it. ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "bkey_sort.h" ++#include "btree_cache.h" ++#include "btree_gc.h" ++#include "btree_key_cache.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "buckets_waiting_for_journal.h" ++#include "chardev.h" ++#include "checksum.h" ++#include "clock.h" ++#include "compress.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "errcode.h" ++#include "error.h" ++#include "fs.h" ++#include "fs-io.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "move.h" ++#include "migrate.h" ++#include "movinggc.h" ++#include "quota.h" ++#include "rebalance.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "subvolume.h" ++#include "super.h" ++#include "super-io.h" ++#include "sysfs.h" ++#include "counters.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Kent Overstreet "); ++ ++#define KTYPE(type) \ ++static const struct attribute_group type ## _group = { \ ++ .attrs = type ## _files \ ++}; \ ++ \ ++static const struct attribute_group *type ## _groups[] = { \ ++ &type ## _group, \ ++ NULL \ ++}; \ ++ \ ++static const struct kobj_type type ## _ktype = { \ ++ .release = type ## _release, \ ++ .sysfs_ops = &type ## _sysfs_ops, \ ++ .default_groups = type ## _groups \ ++} ++ ++static void bch2_fs_release(struct kobject *); ++static void bch2_dev_release(struct kobject *); ++static void bch2_fs_counters_release(struct kobject *k) ++{ ++} ++ ++static void bch2_fs_internal_release(struct kobject *k) ++{ ++} ++ ++static void bch2_fs_opts_dir_release(struct kobject 
*k) ++{ ++} ++ ++static void bch2_fs_time_stats_release(struct kobject *k) ++{ ++} ++ ++KTYPE(bch2_fs); ++KTYPE(bch2_fs_counters); ++KTYPE(bch2_fs_internal); ++KTYPE(bch2_fs_opts_dir); ++KTYPE(bch2_fs_time_stats); ++KTYPE(bch2_dev); ++ ++static struct kset *bcachefs_kset; ++static LIST_HEAD(bch_fs_list); ++static DEFINE_MUTEX(bch_fs_list_lock); ++ ++static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); ++ ++static void bch2_dev_free(struct bch_dev *); ++static int bch2_dev_alloc(struct bch_fs *, unsigned); ++static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); ++static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); ++ ++struct bch_fs *bch2_dev_to_fs(dev_t dev) ++{ ++ struct bch_fs *c; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ mutex_lock(&bch_fs_list_lock); ++ rcu_read_lock(); ++ ++ list_for_each_entry(c, &bch_fs_list, list) ++ for_each_member_device_rcu(ca, c, i, NULL) ++ if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { ++ closure_get(&c->cl); ++ goto found; ++ } ++ c = NULL; ++found: ++ rcu_read_unlock(); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ return c; ++} ++ ++static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid) ++{ ++ struct bch_fs *c; ++ ++ lockdep_assert_held(&bch_fs_list_lock); ++ ++ list_for_each_entry(c, &bch_fs_list, list) ++ if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le))) ++ return c; ++ ++ return NULL; ++} ++ ++struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) ++{ ++ struct bch_fs *c; ++ ++ mutex_lock(&bch_fs_list_lock); ++ c = __bch2_uuid_to_fs(uuid); ++ if (c) ++ closure_get(&c->cl); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ return c; ++} ++ ++static void bch2_dev_usage_journal_reserve(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i, nr = 0, u64s = ++ ((sizeof(struct jset_entry_dev_usage) + ++ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) / ++ sizeof(u64); ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, NULL) ++ nr++; ++ rcu_read_unlock(); ++ ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->dev_usage_journal_res, u64s * nr); ++} ++ ++/* Filesystem RO/RW: */ ++ ++/* ++ * For startup/shutdown of RW stuff, the dependencies are: ++ * ++ * - foreground writes depend on copygc and rebalance (to free up space) ++ * ++ * - copygc and rebalance depend on mark and sweep gc (they actually probably ++ * don't because they either reserve ahead of time or don't block if ++ * allocations fail, but allocations can require mark and sweep gc to run ++ * because of generation number wraparound) ++ * ++ * - all of the above depends on the allocator threads ++ * ++ * - allocator depends on the journal (when it rewrites prios and gens) ++ */ ++ ++static void __bch2_fs_read_only(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i, clean_passes = 0; ++ u64 seq = 0; ++ ++ bch2_rebalance_stop(c); ++ bch2_copygc_stop(c); ++ bch2_gc_thread_stop(c); ++ ++ bch_verbose(c, "flushing journal and stopping allocators"); ++ ++ do { ++ clean_passes++; ++ ++ if (bch2_btree_interior_updates_flush(c) || ++ bch2_journal_flush_all_pins(&c->journal) || ++ bch2_btree_flush_all_writes(c) || ++ seq != atomic64_read(&c->journal.seq)) { ++ seq = atomic64_read(&c->journal.seq); ++ clean_passes = 0; ++ } ++ } while (clean_passes < 2); ++ ++ bch_verbose(c, "flushing journal and stopping allocators complete"); ++ ++ if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && ++ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) ++ set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); ++ bch2_fs_journal_stop(&c->journal); ++ ++ /* ++ * After stopping 
journal: ++ */ ++ for_each_member_device(ca, c, i) ++ bch2_dev_allocator_remove(c, ca); ++} ++ ++static void bch2_writes_disabled(struct percpu_ref *writes) ++{ ++ struct bch_fs *c = container_of(writes, struct bch_fs, writes); ++ ++ set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); ++ wake_up(&bch_read_only_wait); ++} ++ ++void bch2_fs_read_only(struct bch_fs *c) ++{ ++ if (!test_bit(BCH_FS_RW, &c->flags)) { ++ bch2_journal_reclaim_stop(&c->journal); ++ return; ++ } ++ ++ BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); ++ ++ /* ++ * Block new foreground-end write operations from starting - any new ++ * writes will return -EROFS: ++ */ ++ percpu_ref_kill(&c->writes); ++ ++ cancel_work_sync(&c->ec_stripe_delete_work); ++ ++ /* ++ * If we're not doing an emergency shutdown, we want to wait on ++ * outstanding writes to complete so they don't see spurious errors due ++ * to shutting down the allocator: ++ * ++ * If we are doing an emergency shutdown outstanding writes may ++ * hang until we shutdown the allocator so we don't want to wait ++ * on outstanding writes before shutting everything down - but ++ * we do need to wait on them before returning and signalling ++ * that going RO is complete: ++ */ ++ wait_event(bch_read_only_wait, ++ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || ++ test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); ++ ++ __bch2_fs_read_only(c); ++ ++ wait_event(bch_read_only_wait, ++ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); ++ ++ clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); ++ ++ if (!bch2_journal_error(&c->journal) && ++ !test_bit(BCH_FS_ERROR, &c->flags) && ++ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && ++ test_bit(BCH_FS_STARTED, &c->flags) && ++ test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) && ++ !c->opts.norecovery) { ++ bch_verbose(c, "marking filesystem clean"); ++ bch2_fs_mark_clean(c); ++ } ++ ++ clear_bit(BCH_FS_RW, &c->flags); ++} ++ ++static void bch2_fs_read_only_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, read_only_work); ++ ++ down_write(&c->state_lock); ++ bch2_fs_read_only(c); ++ up_write(&c->state_lock); ++} ++ ++static void bch2_fs_read_only_async(struct bch_fs *c) ++{ ++ queue_work(system_long_wq, &c->read_only_work); ++} ++ ++bool bch2_fs_emergency_read_only(struct bch_fs *c) ++{ ++ bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); ++ ++ bch2_journal_halt(&c->journal); ++ bch2_fs_read_only_async(c); ++ ++ wake_up(&bch_read_only_wait); ++ return ret; ++} ++ ++static int bch2_fs_read_write_late(struct bch_fs *c) ++{ ++ int ret; ++ ++ ret = bch2_gc_thread_start(c); ++ if (ret) { ++ bch_err(c, "error starting gc thread"); ++ return ret; ++ } ++ ++ ret = bch2_copygc_start(c); ++ if (ret) { ++ bch_err(c, "error starting copygc thread"); ++ return ret; ++ } ++ ++ ret = bch2_rebalance_start(c); ++ if (ret) { ++ bch_err(c, "error starting rebalance thread"); ++ return ret; ++ } ++ ++ schedule_work(&c->ec_stripe_delete_work); ++ ++ return 0; ++} ++ ++static int __bch2_fs_read_write(struct bch_fs *c, bool early) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret; ++ ++ if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) { ++ bch_err(c, "cannot go rw, unfixed btree errors"); ++ return -EROFS; ++ } ++ ++ if (test_bit(BCH_FS_RW, &c->flags)) ++ return 0; ++ ++ /* ++ * nochanges is used for fsck -n mode - we have to allow going rw ++ * during recovery for that to work: ++ */ ++ if (c->opts.norecovery || ++ (c->opts.nochanges && ++ (!early || c->opts.read_only))) ++ return 
-EROFS; ++ ++ bch_info(c, "going read-write"); ++ ++ ret = bch2_fs_mark_dirty(c); ++ if (ret) ++ goto err; ++ ++ clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); ++ ++ for_each_rw_member(ca, c, i) ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++ ++ bch2_do_discards(c); ++ bch2_do_invalidates(c); ++ ++ if (!early) { ++ ret = bch2_fs_read_write_late(c); ++ if (ret) ++ goto err; ++ } ++ ++ percpu_ref_reinit(&c->writes); ++ set_bit(BCH_FS_RW, &c->flags); ++ set_bit(BCH_FS_WAS_RW, &c->flags); ++ return 0; ++err: ++ __bch2_fs_read_only(c); ++ return ret; ++} ++ ++int bch2_fs_read_write(struct bch_fs *c) ++{ ++ return __bch2_fs_read_write(c, false); ++} ++ ++int bch2_fs_read_write_early(struct bch_fs *c) ++{ ++ lockdep_assert_held(&c->state_lock); ++ ++ return __bch2_fs_read_write(c, true); ++} ++ ++/* Filesystem startup/shutdown: */ ++ ++static void __bch2_fs_free(struct bch_fs *c) ++{ ++ unsigned i; ++ int cpu; ++ ++ for (i = 0; i < BCH_TIME_STAT_NR; i++) ++ bch2_time_stats_exit(&c->times[i]); ++ ++ bch2_fs_counters_exit(c); ++ bch2_fs_snapshots_exit(c); ++ bch2_fs_quota_exit(c); ++ bch2_fs_fsio_exit(c); ++ bch2_fs_ec_exit(c); ++ bch2_fs_encryption_exit(c); ++ bch2_fs_io_exit(c); ++ bch2_fs_buckets_waiting_for_journal_exit(c); ++ bch2_fs_btree_interior_update_exit(c); ++ bch2_fs_btree_iter_exit(c); ++ bch2_fs_btree_key_cache_exit(&c->btree_key_cache); ++ bch2_fs_btree_cache_exit(c); ++ bch2_fs_replicas_exit(c); ++ bch2_fs_journal_exit(&c->journal); ++ bch2_io_clock_exit(&c->io_clock[WRITE]); ++ bch2_io_clock_exit(&c->io_clock[READ]); ++ bch2_fs_compress_exit(c); ++ bch2_journal_keys_free(&c->journal_keys); ++ bch2_journal_entries_free(c); ++ percpu_free_rwsem(&c->mark_lock); ++ ++ if (c->btree_paths_bufs) ++ for_each_possible_cpu(cpu) ++ kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path); ++ ++ free_percpu(c->online_reserved); ++ free_percpu(c->btree_paths_bufs); ++ free_percpu(c->pcpu); ++ mempool_exit(&c->large_bkey_pool); ++ mempool_exit(&c->btree_bounce_pool); ++ bioset_exit(&c->btree_bio); ++ mempool_exit(&c->fill_iter); ++ percpu_ref_exit(&c->writes); ++ kfree(rcu_dereference_protected(c->disk_groups, 1)); ++ kfree(c->journal_seq_blacklist_table); ++ kfree(c->unused_inode_hints); ++ free_heap(&c->copygc_heap); ++ ++ if (c->io_complete_wq ) ++ destroy_workqueue(c->io_complete_wq ); ++ if (c->copygc_wq) ++ destroy_workqueue(c->copygc_wq); ++ if (c->btree_io_complete_wq) ++ destroy_workqueue(c->btree_io_complete_wq); ++ if (c->btree_update_wq) ++ destroy_workqueue(c->btree_update_wq); ++ ++ bch2_free_super(&c->disk_sb); ++ kvpfree(c, sizeof(*c)); ++ module_put(THIS_MODULE); ++} ++ ++static void bch2_fs_release(struct kobject *kobj) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ __bch2_fs_free(c); ++} ++ ++void __bch2_fs_stop(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ bch_verbose(c, "shutting down"); ++ ++ set_bit(BCH_FS_STOPPING, &c->flags); ++ ++ cancel_work_sync(&c->journal_seq_blacklist_gc_work); ++ ++ down_write(&c->state_lock); ++ bch2_fs_read_only(c); ++ up_write(&c->state_lock); ++ ++ for_each_member_device(ca, c, i) ++ if (ca->kobj.state_in_sysfs && ++ ca->disk_sb.bdev) ++ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); ++ ++ if (c->kobj.state_in_sysfs) ++ kobject_del(&c->kobj); ++ ++ bch2_fs_debug_exit(c); ++ bch2_fs_chardev_exit(c); ++ ++ kobject_put(&c->counters_kobj); ++ kobject_put(&c->time_stats); ++ kobject_put(&c->opts_dir); ++ kobject_put(&c->internal); ++ ++ /* btree prefetch might have kicked off 
reads in the background: */ ++ bch2_btree_flush_all_reads(c); ++ ++ for_each_member_device(ca, c, i) ++ cancel_work_sync(&ca->io_error_work); ++ ++ cancel_work_sync(&c->read_only_work); ++ ++ for (i = 0; i < c->sb.nr_devices; i++) ++ if (c->devs[i]) ++ bch2_free_super(&c->devs[i]->disk_sb); ++} ++ ++void bch2_fs_free(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ mutex_lock(&bch_fs_list_lock); ++ list_del(&c->list); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ closure_sync(&c->cl); ++ closure_debug_destroy(&c->cl); ++ ++ for (i = 0; i < c->sb.nr_devices; i++) ++ if (c->devs[i]) ++ bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); ++ ++ bch_verbose(c, "shutdown complete"); ++ ++ kobject_put(&c->kobj); ++} ++ ++void bch2_fs_stop(struct bch_fs *c) ++{ ++ __bch2_fs_stop(c); ++ bch2_fs_free(c); ++} ++ ++static int bch2_fs_online(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ ++ lockdep_assert_held(&bch_fs_list_lock); ++ ++ if (__bch2_uuid_to_fs(c->sb.uuid)) { ++ bch_err(c, "filesystem UUID already open"); ++ return -EINVAL; ++ } ++ ++ ret = bch2_fs_chardev_init(c); ++ if (ret) { ++ bch_err(c, "error creating character device"); ++ return ret; ++ } ++ ++ bch2_fs_debug_init(c); ++ ++ ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: ++ kobject_add(&c->internal, &c->kobj, "internal") ?: ++ kobject_add(&c->opts_dir, &c->kobj, "options") ?: ++ kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: ++ kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: ++ bch2_opts_create_sysfs_files(&c->opts_dir); ++ if (ret) { ++ bch_err(c, "error creating sysfs objects"); ++ return ret; ++ } ++ ++ down_write(&c->state_lock); ++ ++ for_each_member_device(ca, c, i) { ++ ret = bch2_dev_sysfs_online(c, ca); ++ if (ret) { ++ bch_err(c, "error creating sysfs objects"); ++ percpu_ref_put(&ca->ref); ++ goto err; ++ } ++ } ++ ++ BUG_ON(!list_empty(&c->list)); ++ list_add(&c->list, &bch_fs_list); ++err: ++ up_write(&c->state_lock); ++ return ret; ++} ++ ++static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_fs *c; ++ struct printbuf name = PRINTBUF; ++ unsigned i, iter_size; ++ int ret = 0; ++ ++ pr_verbose_init(opts, ""); ++ ++ c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); ++ if (!c) { ++ c = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ __module_get(THIS_MODULE); ++ ++ closure_init(&c->cl, NULL); ++ ++ c->kobj.kset = bcachefs_kset; ++ kobject_init(&c->kobj, &bch2_fs_ktype); ++ kobject_init(&c->internal, &bch2_fs_internal_ktype); ++ kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); ++ kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); ++ kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype); ++ ++ c->minor = -1; ++ c->disk_sb.fs_sb = true; ++ ++ init_rwsem(&c->state_lock); ++ mutex_init(&c->sb_lock); ++ mutex_init(&c->replicas_gc_lock); ++ mutex_init(&c->btree_root_lock); ++ INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); ++ ++ init_rwsem(&c->gc_lock); ++ mutex_init(&c->gc_gens_lock); ++ ++ for (i = 0; i < BCH_TIME_STAT_NR; i++) ++ bch2_time_stats_init(&c->times[i]); ++ ++ bch2_fs_copygc_init(c); ++ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); ++ bch2_fs_allocator_background_init(c); ++ bch2_fs_allocator_foreground_init(c); ++ bch2_fs_rebalance_init(c); ++ bch2_fs_quota_init(c); ++ bch2_fs_ec_init_early(c); ++ ++ INIT_LIST_HEAD(&c->list); ++ ++ mutex_init(&c->usage_scratch_lock); ++ ++ mutex_init(&c->bio_bounce_pages_lock); ++ 
mutex_init(&c->snapshot_table_lock); ++ ++ spin_lock_init(&c->btree_write_error_lock); ++ ++ INIT_WORK(&c->journal_seq_blacklist_gc_work, ++ bch2_blacklist_entries_gc); ++ ++ INIT_LIST_HEAD(&c->journal_iters); ++ ++ INIT_LIST_HEAD(&c->fsck_errors); ++ mutex_init(&c->fsck_error_lock); ++ ++ INIT_LIST_HEAD(&c->ec_stripe_head_list); ++ mutex_init(&c->ec_stripe_head_lock); ++ ++ INIT_LIST_HEAD(&c->ec_stripe_new_list); ++ mutex_init(&c->ec_stripe_new_lock); ++ ++ INIT_LIST_HEAD(&c->data_progress_list); ++ mutex_init(&c->data_progress_lock); ++ ++ spin_lock_init(&c->ec_stripes_heap_lock); ++ ++ seqcount_init(&c->gc_pos_lock); ++ ++ seqcount_init(&c->usage_lock); ++ ++ sema_init(&c->io_in_flight, 64); ++ ++ c->copy_gc_enabled = 1; ++ c->rebalance.enabled = 1; ++ c->promote_whole_extents = true; ++ ++ c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; ++ c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; ++ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; ++ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; ++ ++ bch2_fs_btree_cache_init_early(&c->btree_cache); ++ ++ mutex_init(&c->sectors_available_lock); ++ ++ ret = percpu_init_rwsem(&c->mark_lock); ++ if (ret) ++ goto err; ++ ++ mutex_lock(&c->sb_lock); ++ ret = bch2_sb_to_fs(c, sb); ++ mutex_unlock(&c->sb_lock); ++ ++ if (ret) ++ goto err; ++ ++ pr_uuid(&name, c->sb.user_uuid.b); ++ strlcpy(c->name, name.buf, sizeof(c->name)); ++ printbuf_exit(&name); ++ ++ ret = name.allocation_failure ? -ENOMEM : 0; ++ if (ret) ++ goto err; ++ ++ /* Compat: */ ++ if (sb->version <= bcachefs_metadata_version_inode_v2 && ++ !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); ++ ++ if (sb->version <= bcachefs_metadata_version_inode_v2 && ++ !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); ++ ++ c->opts = bch2_opts_default; ++ ret = bch2_opts_from_sb(&c->opts, sb); ++ if (ret) ++ goto err; ++ ++ bch2_opts_apply(&c->opts, opts); ++ ++ /* key cache currently disabled for inodes, because of snapshots: */ ++ c->opts.inodes_use_key_cache = 0; ++ ++ c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; ++ if (c->opts.inodes_use_key_cache) ++ c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; ++ ++ c->block_bits = ilog2(block_sectors(c)); ++ c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); ++ ++ if (bch2_fs_init_fault("fs_alloc")) { ++ bch_err(c, "fs_alloc fault injected"); ++ ret = -EFAULT; ++ goto err; ++ } ++ ++ iter_size = sizeof(struct sort_iter) + ++ (btree_blocks(c) + 1) * 2 * ++ sizeof(struct sort_iter_set); ++ ++ c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); ++ ++ if (!(c->btree_update_wq = alloc_workqueue("bcachefs", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->io_complete_wq = alloc_workqueue("bcachefs_io", ++ WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || ++ percpu_ref_init(&c->writes, bch2_writes_disabled, ++ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || ++ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || ++ bioset_init(&c->btree_bio, 1, ++ max(offsetof(struct btree_read_bio, bio), ++ offsetof(struct btree_write_bio, wbio.bio)), ++ BIOSET_NEED_BVECS) || ++ !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || ++ !(c->btree_paths_bufs 
= alloc_percpu(struct btree_path_buf)) || ++ !(c->online_reserved = alloc_percpu(u64)) || ++ mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, ++ btree_bytes(c)) || ++ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || ++ !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, ++ sizeof(u64), GFP_KERNEL))) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = bch2_io_clock_init(&c->io_clock[READ]) ?: ++ bch2_io_clock_init(&c->io_clock[WRITE]) ?: ++ bch2_fs_journal_init(&c->journal) ?: ++ bch2_fs_replicas_init(c) ?: ++ bch2_fs_btree_cache_init(c) ?: ++ bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: ++ bch2_fs_btree_iter_init(c) ?: ++ bch2_fs_btree_interior_update_init(c) ?: ++ bch2_fs_buckets_waiting_for_journal_init(c) ?: ++ bch2_fs_subvolumes_init(c) ?: ++ bch2_fs_io_init(c) ?: ++ bch2_fs_encryption_init(c) ?: ++ bch2_fs_compress_init(c) ?: ++ bch2_fs_ec_init(c) ?: ++ bch2_fs_fsio_init(c) ?: ++ bch2_fs_counters_init(c); ++ if (ret) ++ goto err; ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for (i = 0; i < c->sb.nr_devices; i++) ++ if (bch2_dev_exists(c->disk_sb.sb, mi, i) && ++ bch2_dev_alloc(c, i)) { ++ ret = -EEXIST; ++ goto err; ++ } ++ ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->btree_root_journal_res, ++ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); ++ bch2_dev_usage_journal_reserve(c); ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->clock_journal_res, ++ (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); ++ ++ mutex_lock(&bch_fs_list_lock); ++ ret = bch2_fs_online(c); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ if (ret) ++ goto err; ++out: ++ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); ++ return c; ++err: ++ bch2_fs_free(c); ++ c = ERR_PTR(ret); ++ goto out; ++} ++ ++noinline_for_stack ++static void print_mount_opts(struct bch_fs *c) ++{ ++ enum bch_opt_id i; ++ struct printbuf p = PRINTBUF; ++ bool first = true; ++ ++ if (c->opts.read_only) { ++ prt_printf(&p, "ro"); ++ first = false; ++ } ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ const struct bch_option *opt = &bch2_opt_table[i]; ++ u64 v = bch2_opt_get_by_id(&c->opts, i); ++ ++ if (!(opt->flags & OPT_MOUNT)) ++ continue; ++ ++ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) ++ continue; ++ ++ if (!first) ++ prt_printf(&p, ","); ++ first = false; ++ bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); ++ } ++ ++ if (!p.pos) ++ prt_printf(&p, "(null)"); ++ ++ bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf); ++ printbuf_exit(&p); ++} ++ ++int bch2_fs_start(struct bch_fs *c) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ time64_t now = ktime_get_real_seconds(); ++ unsigned i; ++ int ret = -EINVAL; ++ ++ down_write(&c->state_lock); ++ ++ BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); ++ ++ mutex_lock(&c->sb_lock); ++ ++ for_each_online_member(ca, c, i) ++ bch2_sb_from_fs(c, ca); ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for_each_online_member(ca, c, i) ++ mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ for_each_rw_member(ca, c, i) ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++ ++ ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ++ ? 
bch2_fs_recovery(c) ++ : bch2_fs_initialize(c); ++ if (ret) ++ goto err; ++ ++ ret = bch2_opts_check_may_set(c); ++ if (ret) ++ goto err; ++ ++ ret = -EINVAL; ++ if (bch2_fs_init_fault("fs_start")) { ++ bch_err(c, "fs_start fault injected"); ++ goto err; ++ } ++ ++ set_bit(BCH_FS_STARTED, &c->flags); ++ ++ if (c->opts.read_only || c->opts.nochanges) { ++ bch2_fs_read_only(c); ++ } else { ++ ret = !test_bit(BCH_FS_RW, &c->flags) ++ ? bch2_fs_read_write(c) ++ : bch2_fs_read_write_late(c); ++ if (ret) ++ goto err; ++ } ++ ++ print_mount_opts(c); ++ ret = 0; ++out: ++ up_write(&c->state_lock); ++ return ret; ++err: ++ bch_err(c, "error starting filesystem: %s", bch2_err_str(ret)); ++ ++ if (ret < -BCH_ERR_START) ++ ret = -EINVAL; ++ goto out; ++} ++ ++static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) ++{ ++ struct bch_sb_field_members *sb_mi; ++ ++ sb_mi = bch2_sb_get_members(sb); ++ if (!sb_mi) ++ return "Invalid superblock: member info area missing"; ++ ++ if (le16_to_cpu(sb->block_size) != block_sectors(c)) ++ return "mismatched block size"; ++ ++ if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < ++ BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) ++ return "new cache bucket size is too small"; ++ ++ return NULL; ++} ++ ++static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) ++{ ++ struct bch_sb *newest = ++ le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(newest); ++ ++ if (uuid_le_cmp(fs->uuid, sb->uuid)) ++ return "device not a member of filesystem"; ++ ++ if (!bch2_dev_exists(newest, mi, sb->dev_idx)) ++ return "device has been removed"; ++ ++ if (fs->block_size != sb->block_size) ++ return "mismatched block size"; ++ ++ return NULL; ++} ++ ++/* Device startup/shutdown: */ ++ ++static void bch2_dev_release(struct kobject *kobj) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ ++ kfree(ca); ++} ++ ++static void bch2_dev_free(struct bch_dev *ca) ++{ ++ cancel_work_sync(&ca->io_error_work); ++ ++ if (ca->kobj.state_in_sysfs && ++ ca->disk_sb.bdev) ++ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); ++ ++ if (ca->kobj.state_in_sysfs) ++ kobject_del(&ca->kobj); ++ ++ bch2_free_super(&ca->disk_sb); ++ bch2_dev_journal_exit(ca); ++ ++ free_percpu(ca->io_done); ++ bioset_exit(&ca->replica_set); ++ bch2_dev_buckets_free(ca); ++ free_page((unsigned long) ca->sb_read_scratch); ++ ++ bch2_time_stats_exit(&ca->io_latency[WRITE]); ++ bch2_time_stats_exit(&ca->io_latency[READ]); ++ ++ percpu_ref_exit(&ca->io_ref); ++ percpu_ref_exit(&ca->ref); ++ kobject_put(&ca->kobj); ++} ++ ++static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) ++{ ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ if (percpu_ref_is_zero(&ca->io_ref)) ++ return; ++ ++ __bch2_dev_read_only(c, ca); ++ ++ reinit_completion(&ca->io_ref_completion); ++ percpu_ref_kill(&ca->io_ref); ++ wait_for_completion(&ca->io_ref_completion); ++ ++ if (ca->kobj.state_in_sysfs) { ++ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); ++ sysfs_remove_link(&ca->kobj, "block"); ++ } ++ ++ bch2_free_super(&ca->disk_sb); ++ bch2_dev_journal_exit(ca); ++} ++ ++static void bch2_dev_ref_complete(struct percpu_ref *ref) ++{ ++ struct bch_dev *ca = container_of(ref, struct bch_dev, ref); ++ ++ complete(&ca->ref_completion); ++} ++ ++static void bch2_dev_io_ref_complete(struct percpu_ref *ref) ++{ ++ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); ++ ++ complete(&ca->io_ref_completion); 
++} ++ ++static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) ++{ ++ int ret; ++ ++ if (!c->kobj.state_in_sysfs) ++ return 0; ++ ++ if (!ca->kobj.state_in_sysfs) { ++ ret = kobject_add(&ca->kobj, &c->kobj, ++ "dev-%u", ca->dev_idx); ++ if (ret) ++ return ret; ++ } ++ ++ if (ca->disk_sb.bdev) { ++ struct kobject *block = bdev_kobj(ca->disk_sb.bdev); ++ ++ ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); ++ if (ret) ++ return ret; ++ ++ ret = sysfs_create_link(&ca->kobj, block, "block"); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ++ struct bch_member *member) ++{ ++ struct bch_dev *ca; ++ ++ ca = kzalloc(sizeof(*ca), GFP_KERNEL); ++ if (!ca) ++ return NULL; ++ ++ kobject_init(&ca->kobj, &bch2_dev_ktype); ++ init_completion(&ca->ref_completion); ++ init_completion(&ca->io_ref_completion); ++ ++ init_rwsem(&ca->bucket_lock); ++ ++ INIT_WORK(&ca->io_error_work, bch2_io_error_work); ++ ++ bch2_time_stats_init(&ca->io_latency[READ]); ++ bch2_time_stats_init(&ca->io_latency[WRITE]); ++ ++ ca->mi = bch2_mi_to_cpu(member); ++ ca->uuid = member->uuid; ++ ++ ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ++ ca->mi.bucket_size / btree_sectors(c)); ++ ++ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, ++ 0, GFP_KERNEL) || ++ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, ++ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || ++ !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || ++ bch2_dev_buckets_alloc(c, ca) || ++ bioset_init(&ca->replica_set, 4, ++ offsetof(struct bch_write_bio, bio), 0) || ++ !(ca->io_done = alloc_percpu(*ca->io_done))) ++ goto err; ++ ++ return ca; ++err: ++ bch2_dev_free(ca); ++ return NULL; ++} ++ ++static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, ++ unsigned dev_idx) ++{ ++ ca->dev_idx = dev_idx; ++ __set_bit(ca->dev_idx, ca->self.d); ++ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); ++ ++ ca->fs = c; ++ rcu_assign_pointer(c->devs[ca->dev_idx], ca); ++ ++ if (bch2_dev_sysfs_online(c, ca)) ++ pr_warn("error creating sysfs objects"); ++} ++ ++static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ++{ ++ struct bch_member *member = ++ bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx; ++ struct bch_dev *ca = NULL; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ if (bch2_fs_init_fault("dev_alloc")) ++ goto err; ++ ++ ca = __bch2_dev_alloc(c, member); ++ if (!ca) ++ goto err; ++ ++ ca->fs = c; ++ ++ bch2_dev_attach(c, ca, dev_idx); ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++err: ++ if (ca) ++ bch2_dev_free(ca); ++ ret = -ENOMEM; ++ goto out; ++} ++ ++static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ++{ ++ unsigned ret; ++ ++ if (bch2_dev_is_online(ca)) { ++ bch_err(ca, "already have device online in slot %u", ++ sb->sb->dev_idx); ++ return -EINVAL; ++ } ++ ++ if (get_capacity(sb->bdev->bd_disk) < ++ ca->mi.bucket_size * ca->mi.nbuckets) { ++ bch_err(ca, "cannot online: device too small"); ++ return -EINVAL; ++ } ++ ++ BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); ++ ++ if (get_capacity(sb->bdev->bd_disk) < ++ ca->mi.bucket_size * ca->mi.nbuckets) { ++ bch_err(ca, "device too small"); ++ return -EINVAL; ++ } ++ ++ ret = bch2_dev_journal_init(ca, sb->sb); ++ if (ret) ++ return ret; ++ ++ /* Commit: */ ++ ca->disk_sb = *sb; ++ if (sb->mode & FMODE_EXCL) ++ ca->disk_sb.bdev->bd_holder = ca; ++ memset(sb, 0, sizeof(*sb)); ++ ++ ca->dev = ca->disk_sb.bdev->bd_dev; ++ ++ 
percpu_ref_reinit(&ca->io_ref); ++ ++ return 0; ++} ++ ++static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ if (le64_to_cpu(sb->sb->seq) > ++ le64_to_cpu(c->disk_sb.sb->seq)) ++ bch2_sb_to_fs(c, sb->sb); ++ ++ BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || ++ !c->devs[sb->sb->dev_idx]); ++ ++ ca = bch_dev_locked(c, sb->sb->dev_idx); ++ ++ ret = __bch2_dev_attach_bdev(ca, sb); ++ if (ret) ++ return ret; ++ ++ bch2_dev_sysfs_online(c, ca); ++ ++ if (c->sb.nr_devices == 1) ++ bdevname(ca->disk_sb.bdev, c->name); ++ bdevname(ca->disk_sb.bdev, ca->name); ++ ++ rebalance_wakeup(c); ++ return 0; ++} ++ ++/* Device management: */ ++ ++/* ++ * Note: this function is also used by the error paths - when a particular ++ * device sees an error, we call it to determine whether we can just set the ++ * device RO, or - if this function returns false - we'll set the whole ++ * filesystem RO: ++ * ++ * XXX: maybe we should be more explicit about whether we're changing state ++ * because we got an error or what have you? ++ */ ++bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ struct bch_devs_mask new_online_devs; ++ struct bch_dev *ca2; ++ int i, nr_rw = 0, required; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ switch (new_state) { ++ case BCH_MEMBER_STATE_rw: ++ return true; ++ case BCH_MEMBER_STATE_ro: ++ if (ca->mi.state != BCH_MEMBER_STATE_rw) ++ return true; ++ ++ /* do we have enough devices to write to? */ ++ for_each_member_device(ca2, c, i) ++ if (ca2 != ca) ++ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; ++ ++ required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) ++ ? c->opts.metadata_replicas ++ : c->opts.metadata_replicas_required, ++ !(flags & BCH_FORCE_IF_DATA_DEGRADED) ++ ? c->opts.data_replicas ++ : c->opts.data_replicas_required); ++ ++ return nr_rw >= required; ++ case BCH_MEMBER_STATE_failed: ++ case BCH_MEMBER_STATE_spare: ++ if (ca->mi.state != BCH_MEMBER_STATE_rw && ++ ca->mi.state != BCH_MEMBER_STATE_ro) ++ return true; ++ ++ /* do we have enough devices to read from? 
*/ ++ new_online_devs = bch2_online_devs(c); ++ __clear_bit(ca->dev_idx, new_online_devs.d); ++ ++ return bch2_have_enough_devs(c, new_online_devs, flags, false); ++ default: ++ BUG(); ++ } ++} ++ ++static bool bch2_fs_may_start(struct bch_fs *c) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ unsigned i, flags = 0; ++ ++ if (c->opts.very_degraded) ++ flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; ++ ++ if (c->opts.degraded) ++ flags |= BCH_FORCE_IF_DEGRADED; ++ ++ if (!c->opts.degraded && ++ !c->opts.very_degraded) { ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ ++ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { ++ if (!bch2_dev_exists(c->disk_sb.sb, mi, i)) ++ continue; ++ ++ ca = bch_dev_locked(c, i); ++ ++ if (!bch2_dev_is_online(ca) && ++ (ca->mi.state == BCH_MEMBER_STATE_rw || ++ ca->mi.state == BCH_MEMBER_STATE_ro)) { ++ mutex_unlock(&c->sb_lock); ++ return false; ++ } ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); ++} ++ ++static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) ++{ ++ /* ++ * Device going read only means the copygc reserve get smaller, so we ++ * don't want that happening while copygc is in progress: ++ */ ++ bch2_copygc_stop(c); ++ ++ /* ++ * The allocator thread itself allocates btree nodes, so stop it first: ++ */ ++ bch2_dev_allocator_remove(c, ca); ++ bch2_dev_journal_stop(&c->journal, ca); ++ ++ bch2_copygc_start(c); ++} ++ ++static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) ++{ ++ lockdep_assert_held(&c->state_lock); ++ ++ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw); ++ ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++} ++ ++int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ struct bch_sb_field_members *mi; ++ int ret = 0; ++ ++ if (ca->mi.state == new_state) ++ return 0; ++ ++ if (!bch2_dev_state_allowed(c, ca, new_state, flags)) ++ return -EINVAL; ++ ++ if (new_state != BCH_MEMBER_STATE_rw) ++ __bch2_dev_read_only(c, ca); ++ ++ bch_notice(ca, "%s", bch2_member_states[new_state]); ++ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ if (new_state == BCH_MEMBER_STATE_rw) ++ __bch2_dev_read_write(c, ca); ++ ++ rebalance_wakeup(c); ++ ++ return ret; ++} ++ ++int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ int ret; ++ ++ down_write(&c->state_lock); ++ ret = __bch2_dev_set_state(c, ca, new_state, flags); ++ up_write(&c->state_lock); ++ ++ return ret; ++} ++ ++/* Device add/removal: */ ++ ++static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bpos start = POS(ca->dev_idx, 0); ++ struct bpos end = POS(ca->dev_idx, U64_MAX); ++ int ret; ++ ++ /* ++ * We clear the LRU and need_discard btrees first so that we don't race ++ * with bch2_do_invalidates() and bch2_do_discards() ++ */ ++ ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, ++ BTREE_TRIGGER_NORUN, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, ++ BTREE_TRIGGER_NORUN, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, ++ BTREE_TRIGGER_NORUN, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, ++ BTREE_TRIGGER_NORUN, NULL) ?: ++ bch2_btree_delete_range(c, 
BTREE_ID_alloc, start, end, ++ BTREE_TRIGGER_NORUN, NULL); ++ if (ret) ++ bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret)); ++ ++ return ret; ++} ++ ++int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ++{ ++ struct bch_sb_field_members *mi; ++ unsigned dev_idx = ca->dev_idx, data; ++ int ret = -EINVAL; ++ ++ down_write(&c->state_lock); ++ ++ /* ++ * We consume a reference to ca->ref, regardless of whether we succeed ++ * or fail: ++ */ ++ percpu_ref_put(&ca->ref); ++ ++ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { ++ bch_err(ca, "Cannot remove without losing data"); ++ goto err; ++ } ++ ++ __bch2_dev_read_only(c, ca); ++ ++ ret = bch2_dev_data_drop(c, ca->dev_idx, flags); ++ if (ret) { ++ bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ ret = bch2_dev_remove_alloc(c, ca); ++ if (ret) { ++ bch_err(ca, "Remove failed, error deleting alloc info"); ++ goto err; ++ } ++ ++ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); ++ if (ret) { ++ bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ ret = bch2_journal_flush(&c->journal); ++ if (ret) { ++ bch_err(ca, "Remove failed, journal error"); ++ goto err; ++ } ++ ++ ret = bch2_replicas_gc2(c); ++ if (ret) { ++ bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ data = bch2_dev_has_data(c, ca); ++ if (data) { ++ struct printbuf data_has = PRINTBUF; ++ ++ prt_bitflags(&data_has, bch2_data_types, data); ++ bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); ++ printbuf_exit(&data_has); ++ ret = -EBUSY; ++ goto err; ++ } ++ ++ __bch2_dev_offline(c, ca); ++ ++ mutex_lock(&c->sb_lock); ++ rcu_assign_pointer(c->devs[ca->dev_idx], NULL); ++ mutex_unlock(&c->sb_lock); ++ ++ percpu_ref_kill(&ca->ref); ++ wait_for_completion(&ca->ref_completion); ++ ++ bch2_dev_free(ca); ++ ++ /* ++ * Free this device's slot in the bch_member array - all pointers to ++ * this device must be gone: ++ */ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); ++ ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ up_write(&c->state_lock); ++ ++ bch2_dev_usage_journal_reserve(c); ++ return 0; ++err: ++ if (ca->mi.state == BCH_MEMBER_STATE_rw && ++ !percpu_ref_is_zero(&ca->io_ref)) ++ __bch2_dev_read_write(c, ca); ++ up_write(&c->state_lock); ++ return ret; ++} ++ ++/* Add new device to running filesystem: */ ++int bch2_dev_add(struct bch_fs *c, const char *path) ++{ ++ struct bch_opts opts = bch2_opts_empty(); ++ struct bch_sb_handle sb; ++ const char *err; ++ struct bch_dev *ca = NULL; ++ struct bch_sb_field_members *mi; ++ struct bch_member dev_mi; ++ unsigned dev_idx, nr_devices, u64s; ++ struct printbuf errbuf = PRINTBUF; ++ int ret; ++ ++ ret = bch2_read_super(path, &opts, &sb); ++ if (ret) { ++ bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; ++ ++ err = bch2_dev_may_add(sb.sb, c); ++ if (err) { ++ bch_err(c, "device add error: %s", err); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ca = __bch2_dev_alloc(c, &dev_mi); ++ if (!ca) { ++ bch2_free_super(&sb); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ bch2_dev_usage_init(ca); ++ ++ ret = __bch2_dev_attach_bdev(ca, &sb); ++ if (ret) { ++ bch2_dev_free(ca); ++ goto err; ++ } ++ ++ ret = 
bch2_dev_journal_alloc(ca); ++ if (ret) { ++ bch_err(c, "device add error: journal alloc failed"); ++ goto err; ++ } ++ ++ down_write(&c->state_lock); ++ mutex_lock(&c->sb_lock); ++ ++ ret = bch2_sb_from_fs(c, ca); ++ if (ret) { ++ bch_err(c, "device add error: new device superblock too small"); ++ goto err_unlock; ++ } ++ ++ mi = bch2_sb_get_members(ca->disk_sb.sb); ++ ++ if (!bch2_sb_resize_members(&ca->disk_sb, ++ le32_to_cpu(mi->field.u64s) + ++ sizeof(dev_mi) / sizeof(u64))) { ++ bch_err(c, "device add error: new device superblock too small"); ++ ret = -ENOSPC; ++ goto err_unlock; ++ } ++ ++ if (dynamic_fault("bcachefs:add:no_slot")) ++ goto no_slot; ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) ++ if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) ++ goto have_slot; ++no_slot: ++ bch_err(c, "device add error: already have maximum number of devices"); ++ ret = -ENOSPC; ++ goto err_unlock; ++ ++have_slot: ++ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); ++ u64s = (sizeof(struct bch_sb_field_members) + ++ sizeof(struct bch_member) * nr_devices) / sizeof(u64); ++ ++ mi = bch2_sb_resize_members(&c->disk_sb, u64s); ++ if (!mi) { ++ bch_err(c, "device add error: no room in superblock for member info"); ++ ret = -ENOSPC; ++ goto err_unlock; ++ } ++ ++ /* success: */ ++ ++ mi->members[dev_idx] = dev_mi; ++ mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds()); ++ c->disk_sb.sb->nr_devices = nr_devices; ++ ++ ca->disk_sb.sb->dev_idx = dev_idx; ++ bch2_dev_attach(c, ca, dev_idx); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ bch2_dev_usage_journal_reserve(c); ++ ++ ret = bch2_trans_mark_dev_sb(c, ca); ++ if (ret) { ++ bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret)); ++ goto err_late; ++ } ++ ++ ret = bch2_fs_freespace_init(c); ++ if (ret) { ++ bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); ++ goto err_late; ++ } ++ ++ ca->new_fs_bucket_idx = 0; ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_rw) ++ __bch2_dev_read_write(c, ca); ++ ++ up_write(&c->state_lock); ++ return 0; ++ ++err_unlock: ++ mutex_unlock(&c->sb_lock); ++ up_write(&c->state_lock); ++err: ++ if (ca) ++ bch2_dev_free(ca); ++ bch2_free_super(&sb); ++ printbuf_exit(&errbuf); ++ return ret; ++err_late: ++ up_write(&c->state_lock); ++ ca = NULL; ++ goto err; ++} ++ ++/* Hot add existing device to running filesystem: */ ++int bch2_dev_online(struct bch_fs *c, const char *path) ++{ ++ struct bch_opts opts = bch2_opts_empty(); ++ struct bch_sb_handle sb = { NULL }; ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ unsigned dev_idx; ++ const char *err; ++ int ret; ++ ++ down_write(&c->state_lock); ++ ++ ret = bch2_read_super(path, &opts, &sb); ++ if (ret) { ++ up_write(&c->state_lock); ++ return ret; ++ } ++ ++ dev_idx = sb.sb->dev_idx; ++ ++ err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); ++ if (err) { ++ bch_err(c, "error bringing %s online: %s", path, err); ++ goto err; ++ } ++ ++ ret = bch2_dev_attach_bdev(c, &sb); ++ if (ret) ++ goto err; ++ ++ ca = bch_dev_locked(c, dev_idx); ++ ++ ret = bch2_trans_mark_dev_sb(c, ca); ++ if (ret) { ++ bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s", ++ path, bch2_err_str(ret)); ++ goto err; ++ } ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_rw) ++ __bch2_dev_read_write(c, ca); ++ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ ++ 
mi->members[ca->dev_idx].last_mount = ++ cpu_to_le64(ktime_get_real_seconds()); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ up_write(&c->state_lock); ++ return 0; ++err: ++ up_write(&c->state_lock); ++ bch2_free_super(&sb); ++ return -EINVAL; ++} ++ ++int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) ++{ ++ down_write(&c->state_lock); ++ ++ if (!bch2_dev_is_online(ca)) { ++ bch_err(ca, "Already offline"); ++ up_write(&c->state_lock); ++ return 0; ++ } ++ ++ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { ++ bch_err(ca, "Cannot offline required disk"); ++ up_write(&c->state_lock); ++ return -EINVAL; ++ } ++ ++ __bch2_dev_offline(c, ca); ++ ++ up_write(&c->state_lock); ++ return 0; ++} ++ ++int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ++{ ++ struct bch_member *mi; ++ int ret = 0; ++ ++ down_write(&c->state_lock); ++ ++ if (nbuckets < ca->mi.nbuckets) { ++ bch_err(ca, "Cannot shrink yet"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ if (bch2_dev_is_online(ca) && ++ get_capacity(ca->disk_sb.bdev->bd_disk) < ++ ca->mi.bucket_size * nbuckets) { ++ bch_err(ca, "New size larger than device"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ret = bch2_dev_buckets_resize(c, ca, nbuckets); ++ if (ret) { ++ bch_err(ca, "Resize error: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ ret = bch2_trans_mark_dev_sb(c, ca); ++ if (ret) { ++ goto err; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ mi->nbuckets = cpu_to_le64(nbuckets); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ bch2_recalc_capacity(c); ++err: ++ up_write(&c->state_lock); ++ return ret; ++} ++ ++/* return with ref on ca->ref: */ ++struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, NULL) ++ if (!strcmp(name, ca->name)) ++ goto found; ++ ca = ERR_PTR(-ENOENT); ++found: ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++/* Filesystem open: */ ++ ++struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, ++ struct bch_opts opts) ++{ ++ struct bch_sb_handle *sb = NULL; ++ struct bch_fs *c = NULL; ++ struct bch_sb_field_members *mi; ++ unsigned i, best_sb = 0; ++ const char *err; ++ struct printbuf errbuf = PRINTBUF; ++ int ret = 0; ++ ++ if (!try_module_get(THIS_MODULE)) ++ return ERR_PTR(-ENODEV); ++ ++ pr_verbose_init(opts, ""); ++ ++ if (!nr_devices) { ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); ++ if (!sb) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ for (i = 0; i < nr_devices; i++) { ++ ret = bch2_read_super(devices[i], &opts, &sb[i]); ++ if (ret) ++ goto err; ++ ++ } ++ ++ for (i = 1; i < nr_devices; i++) ++ if (le64_to_cpu(sb[i].sb->seq) > ++ le64_to_cpu(sb[best_sb].sb->seq)) ++ best_sb = i; ++ ++ mi = bch2_sb_get_members(sb[best_sb].sb); ++ ++ i = 0; ++ while (i < nr_devices) { ++ if (i != best_sb && ++ !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) { ++ char buf[BDEVNAME_SIZE]; ++ pr_info("%s has been removed, skipping", ++ bdevname(sb[i].bdev, buf)); ++ bch2_free_super(&sb[i]); ++ array_remove_item(sb, nr_devices, i); ++ continue; ++ } ++ ++ err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); ++ if (err) ++ goto err_print; ++ i++; ++ } ++ ++ c = bch2_fs_alloc(sb[best_sb].sb, opts); ++ if (IS_ERR(c)) { ++ ret = PTR_ERR(c); ++ goto err; ++ } ++ ++ down_write(&c->state_lock); ++ for (i = 0; i < 
nr_devices; i++) { ++ ret = bch2_dev_attach_bdev(c, &sb[i]); ++ if (ret) { ++ up_write(&c->state_lock); ++ goto err; ++ } ++ } ++ up_write(&c->state_lock); ++ ++ err = "insufficient devices"; ++ if (!bch2_fs_may_start(c)) ++ goto err_print; ++ ++ if (!c->opts.nostart) { ++ ret = bch2_fs_start(c); ++ if (ret) ++ goto err; ++ } ++out: ++ kfree(sb); ++ printbuf_exit(&errbuf); ++ module_put(THIS_MODULE); ++ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); ++ return c; ++err_print: ++ pr_err("bch_fs_open err opening %s: %s", ++ devices[0], err); ++ ret = -EINVAL; ++err: ++ if (!IS_ERR_OR_NULL(c)) ++ bch2_fs_stop(c); ++ if (sb) ++ for (i = 0; i < nr_devices; i++) ++ bch2_free_super(&sb[i]); ++ c = ERR_PTR(ret); ++ goto out; ++} ++ ++/* Global interfaces/init */ ++ ++static void bcachefs_exit(void) ++{ ++ bch2_debug_exit(); ++ bch2_vfs_exit(); ++ bch2_chardev_exit(); ++ bch2_btree_key_cache_exit(); ++ if (bcachefs_kset) ++ kset_unregister(bcachefs_kset); ++} ++ ++static int __init bcachefs_init(void) ++{ ++ bch2_bkey_pack_test(); ++ ++ if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || ++ bch2_btree_key_cache_init() || ++ bch2_chardev_init() || ++ bch2_vfs_init() || ++ bch2_debug_init()) ++ goto err; ++ ++ return 0; ++err: ++ bcachefs_exit(); ++ return -ENOMEM; ++} ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ bool bch2_##name; \ ++ module_param_named(name, bch2_##name, bool, 0644); \ ++ MODULE_PARM_DESC(name, description); ++BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++module_exit(bcachefs_exit); ++module_init(bcachefs_init); +diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h +new file mode 100644 +index 000000000000..8501adaff4c2 +--- /dev/null ++++ b/fs/bcachefs/super.h +@@ -0,0 +1,264 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_H ++#define _BCACHEFS_SUPER_H ++ ++#include "extents.h" ++ ++#include "bcachefs_ioctl.h" ++ ++#include <linux/math64.h> ++ ++static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) ++{ ++ return div_u64(s, ca->mi.bucket_size); ++} ++ ++static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) ++{ ++ return ((sector_t) b) * ca->mi.bucket_size; ++} ++ ++static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) ++{ ++ u32 remainder; ++ ++ div_u64_rem(s, ca->mi.bucket_size, &remainder); ++ return remainder; ++} ++ ++static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, ++ u32 *offset) ++{ ++ return div_u64_rem(s, ca->mi.bucket_size, offset); ++} ++ ++static inline bool bch2_dev_is_online(struct bch_dev *ca) ++{ ++ return !percpu_ref_is_zero(&ca->io_ref); ++} ++ ++static inline bool bch2_dev_is_readable(struct bch_dev *ca) ++{ ++ return bch2_dev_is_online(ca) && ++ ca->mi.state != BCH_MEMBER_STATE_failed; ++} ++ ++static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) ++{ ++ if (!percpu_ref_tryget(&ca->io_ref)) ++ return false; ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_rw || ++ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) ++ return true; ++ ++ percpu_ref_put(&ca->io_ref); ++ return false; ++} ++ ++static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) ++{ ++ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); ++} ++ ++static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, ++ unsigned dev) ++{ ++ unsigned i; ++ ++ for (i = 0; i < devs.nr; i++) ++ if (devs.devs[i] == dev) ++ return true; ++ ++ return false; ++} ++ ++static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, ++ 
unsigned dev) ++{ ++ unsigned i; ++ ++ for (i = 0; i < devs->nr; i++) ++ if (devs->devs[i] == dev) { ++ array_remove_item(devs->devs, devs->nr, i); ++ return; ++ } ++} ++ ++static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, ++ unsigned dev) ++{ ++ BUG_ON(bch2_dev_list_has_dev(*devs, dev)); ++ BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); ++ devs->devs[devs->nr++] = dev; ++} ++ ++static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) ++{ ++ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; ++} ++ ++static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, ++ const struct bch_devs_mask *mask) ++{ ++ struct bch_dev *ca = NULL; ++ ++ while ((*iter = mask ++ ? find_next_bit(mask->d, c->sb.nr_devices, *iter) ++ : *iter) < c->sb.nr_devices && ++ !(ca = rcu_dereference_check(c->devs[*iter], ++ lockdep_is_held(&c->state_lock)))) ++ (*iter)++; ++ ++ return ca; ++} ++ ++#define for_each_member_device_rcu(ca, c, iter, mask) \ ++ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) ++ ++static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) ++{ ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ if ((ca = __bch2_next_dev(c, iter, NULL))) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++/* ++ * If you break early, you must drop your ref on the current device ++ */ ++#define for_each_member_device(ca, c, iter) \ ++ for ((iter) = 0; \ ++ (ca = bch2_get_next_dev(c, &(iter))); \ ++ percpu_ref_put(&ca->ref), (iter)++) ++ ++static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, ++ unsigned *iter, ++ int state_mask) ++{ ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ while ((ca = __bch2_next_dev(c, iter, NULL)) && ++ (!((1 << ca->mi.state) & state_mask) || ++ !percpu_ref_tryget(&ca->io_ref))) ++ (*iter)++; ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++#define __for_each_online_member(ca, c, iter, state_mask) \ ++ for ((iter) = 0; \ ++ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ ++ percpu_ref_put(&ca->io_ref), (iter)++) ++ ++#define for_each_online_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, ~0) ++ ++#define for_each_rw_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) ++ ++#define for_each_readable_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, \ ++ (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) ++ ++/* ++ * If a key exists that references a device, the device won't be going away and ++ * we can omit rcu_read_lock(): ++ */ ++static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) ++{ ++ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); ++ ++ return rcu_dereference_check(c->devs[idx], 1); ++} ++ ++static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) ++{ ++ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); ++ ++ return rcu_dereference_protected(c->devs[idx], ++ lockdep_is_held(&c->sb_lock) || ++ lockdep_is_held(&c->state_lock)); ++} ++ ++/* XXX kill, move to struct bch_fs */ ++static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) ++{ ++ struct bch_devs_mask devs; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ memset(&devs, 0, sizeof(devs)); ++ for_each_online_member(ca, c, i) ++ __set_bit(ca->dev_idx, devs.d); ++ return devs; ++} ++ ++static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) ++{ ++ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; ++ u64 b_offset = 
bucket_to_sector(ca, b); ++ u64 b_end = bucket_to_sector(ca, b + 1); ++ unsigned i; ++ ++ if (!b) ++ return true; ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ u64 end = offset + (1 << layout->sb_max_size_bits); ++ ++ if (!(offset >= b_end || end <= b_offset)) ++ return true; ++ } ++ ++ return false; ++} ++ ++struct bch_fs *bch2_dev_to_fs(dev_t); ++struct bch_fs *bch2_uuid_to_fs(uuid_le); ++ ++bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++int bch2_dev_set_state(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++ ++int bch2_dev_fail(struct bch_dev *, int); ++int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int); ++int bch2_dev_add(struct bch_fs *, const char *); ++int bch2_dev_online(struct bch_fs *, const char *); ++int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); ++int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); ++struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); ++ ++bool bch2_fs_emergency_read_only(struct bch_fs *); ++void bch2_fs_read_only(struct bch_fs *); ++ ++int bch2_fs_read_write(struct bch_fs *); ++int bch2_fs_read_write_early(struct bch_fs *); ++ ++/* ++ * Only for use in the recovery/fsck path: ++ */ ++static inline void bch2_fs_lazy_rw(struct bch_fs *c) ++{ ++ if (percpu_ref_is_zero(&c->writes)) ++ bch2_fs_read_write_early(c); ++} ++ ++void __bch2_fs_stop(struct bch_fs *); ++void bch2_fs_free(struct bch_fs *); ++void bch2_fs_stop(struct bch_fs *); ++ ++int bch2_fs_start(struct bch_fs *); ++struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); ++ ++#endif /* _BCACHEFS_SUPER_H */ +diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h +new file mode 100644 +index 000000000000..89419fc7930d +--- /dev/null ++++ b/fs/bcachefs/super_types.h +@@ -0,0 +1,51 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_TYPES_H ++#define _BCACHEFS_SUPER_TYPES_H ++ ++struct bch_sb_handle { ++ struct bch_sb *sb; ++ struct block_device *bdev; ++ struct bio *bio; ++ size_t buffer_size; ++ fmode_t mode; ++ unsigned have_layout:1; ++ unsigned have_bio:1; ++ unsigned fs_sb:1; ++ u64 seq; ++}; ++ ++struct bch_devs_mask { ++ unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; ++}; ++ ++struct bch_devs_list { ++ u8 nr; ++ u8 devs[BCH_BKEY_PTRS_MAX]; ++}; ++ ++struct bch_member_cpu { ++ u64 nbuckets; /* device size */ ++ u16 first_bucket; /* index of first bucket used */ ++ u16 bucket_size; /* sectors */ ++ u16 group; ++ u8 state; ++ u8 discard; ++ u8 data_allowed; ++ u8 durability; ++ u8 freespace_initialized; ++ u8 valid; ++}; ++ ++struct bch_disk_group_cpu { ++ bool deleted; ++ u16 parent; ++ struct bch_devs_mask devs; ++}; ++ ++struct bch_disk_groups_cpu { ++ struct rcu_head rcu; ++ unsigned nr; ++ struct bch_disk_group_cpu entries[]; ++}; ++ ++#endif /* _BCACHEFS_SUPER_TYPES_H */ +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +new file mode 100644 +index 000000000000..2c650055f530 +--- /dev/null ++++ b/fs/bcachefs/sysfs.c +@@ -0,0 +1,943 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcache sysfs interfaces ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#ifndef NO_BCACHEFS_SYSFS ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "sysfs.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "inode.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "opts.h" ++#include "rebalance.h" ++#include "replicas.h" ++#include "super-io.h" ++#include "tests.h" ++ ++#include ++#include ++#include ++#include ++ ++#include "util.h" ++ ++#define SYSFS_OPS(type) \ ++const struct sysfs_ops type ## _sysfs_ops = { \ ++ .show = type ## _show, \ ++ .store = type ## _store \ ++} ++ ++#define SHOW(fn) \ ++static ssize_t fn ## _to_text(struct printbuf *, \ ++ struct kobject *, struct attribute *);\ ++ \ ++static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ ++ char *buf) \ ++{ \ ++ struct printbuf out = PRINTBUF; \ ++ ssize_t ret = fn ## _to_text(&out, kobj, attr); \ ++ \ ++ if (out.pos && out.buf[out.pos - 1] != '\n') \ ++ prt_newline(&out); \ ++ \ ++ if (!ret && out.allocation_failure) \ ++ ret = -ENOMEM; \ ++ \ ++ if (!ret) { \ ++ ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \ ++ memcpy(buf, out.buf, ret); \ ++ } \ ++ printbuf_exit(&out); \ ++ return ret; \ ++} \ ++ \ ++static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\ ++ struct attribute *attr) ++ ++#define STORE(fn) \ ++static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ ++ const char *buf, size_t size) \ ++ ++#define __sysfs_attribute(_name, _mode) \ ++ static struct attribute sysfs_##_name = \ ++ { .name = #_name, .mode = _mode } ++ ++#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) ++#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) ++#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) ++ ++#define sysfs_printf(file, fmt, ...) 
\ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ prt_printf(out, fmt "\n", __VA_ARGS__); \ ++} while (0) ++ ++#define sysfs_print(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ snprint(out, var); \ ++} while (0) ++ ++#define sysfs_hprint(file, val) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ prt_human_readable_s64(out, val); \ ++} while (0) ++ ++#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) ++#define var_print(_var) sysfs_print(_var, var(_var)) ++#define var_hprint(_var) sysfs_hprint(_var, var(_var)) ++ ++#define sysfs_strtoul(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoul_safe(buf, var) ?: (ssize_t) size; \ ++} while (0) ++ ++#define sysfs_strtoul_clamp(file, var, min, max) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoul_safe_clamp(buf, var, min, max) \ ++ ?: (ssize_t) size; \ ++} while (0) ++ ++#define strtoul_or_return(cp) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (_r) \ ++ return _r; \ ++ _v; \ ++}) ++ ++#define strtoul_restrict_or_return(cp, min, max) \ ++({ \ ++ unsigned long __v = 0; \ ++ int _r = strtoul_safe_restrict(cp, __v, min, max); \ ++ if (_r) \ ++ return _r; \ ++ __v; \ ++}) ++ ++#define strtoi_h_or_return(cp) \ ++({ \ ++ u64 _v; \ ++ int _r = strtoi_h(cp, &_v); \ ++ if (_r) \ ++ return _r; \ ++ _v; \ ++}) ++ ++#define sysfs_hatoi(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoi_h(buf, &var) ?: (ssize_t) size; \ ++} while (0) ++ ++write_attribute(trigger_gc); ++write_attribute(trigger_discards); ++write_attribute(trigger_invalidates); ++write_attribute(prune_cache); ++rw_attribute(btree_gc_periodic); ++rw_attribute(gc_gens_pos); ++ ++read_attribute(uuid); ++read_attribute(minor); ++read_attribute(bucket_size); ++read_attribute(first_bucket); ++read_attribute(nbuckets); ++read_attribute(durability); ++read_attribute(iodone); ++ ++read_attribute(io_latency_read); ++read_attribute(io_latency_write); ++read_attribute(io_latency_stats_read); ++read_attribute(io_latency_stats_write); ++read_attribute(congested); ++ ++read_attribute(btree_avg_write_size); ++ ++read_attribute(btree_cache_size); ++read_attribute(compression_stats); ++read_attribute(journal_debug); ++read_attribute(btree_updates); ++read_attribute(btree_cache); ++read_attribute(btree_key_cache); ++read_attribute(stripes_heap); ++read_attribute(open_buckets); ++ ++read_attribute(internal_uuid); ++ ++read_attribute(has_data); ++read_attribute(alloc_debug); ++ ++read_attribute(read_realloc_races); ++read_attribute(extent_migrate_done); ++read_attribute(extent_migrate_raced); ++read_attribute(bucket_alloc_fail); ++ ++#define x(t, n, ...) 
read_attribute(t); ++BCH_PERSISTENT_COUNTERS() ++#undef x ++ ++rw_attribute(discard); ++rw_attribute(label); ++ ++rw_attribute(copy_gc_enabled); ++read_attribute(copy_gc_wait); ++ ++rw_attribute(rebalance_enabled); ++sysfs_pd_controller_attribute(rebalance); ++read_attribute(rebalance_work); ++rw_attribute(promote_whole_extents); ++ ++read_attribute(new_stripes); ++ ++read_attribute(io_timers_read); ++read_attribute(io_timers_write); ++ ++read_attribute(data_jobs); ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++write_attribute(perf_test); ++#endif /* CONFIG_BCACHEFS_TESTS */ ++ ++#define x(_name) \ ++ static struct attribute sysfs_time_stat_##_name = \ ++ { .name = #_name, .mode = S_IRUGO }; ++ BCH_TIME_STATS() ++#undef x ++ ++static struct attribute sysfs_state_rw = { ++ .name = "state", ++ .mode = S_IRUGO ++}; ++ ++static size_t bch2_btree_cache_size(struct bch_fs *c) ++{ ++ size_t ret = 0; ++ struct btree *b; ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_for_each_entry(b, &c->btree_cache.live, list) ++ ret += btree_bytes(c); ++ ++ mutex_unlock(&c->btree_cache.lock); ++ return ret; ++} ++ ++static size_t bch2_btree_avg_write_size(struct bch_fs *c) ++{ ++ u64 nr = atomic64_read(&c->btree_writes_nr); ++ u64 sectors = atomic64_read(&c->btree_writes_sectors); ++ ++ return nr ? div64_u64(sectors, nr) : 0; ++} ++ ++static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ long ret = 0; ++ struct bch_move_stats *stats; ++ ++ mutex_lock(&c->data_progress_lock); ++ list_for_each_entry(stats, &c->data_progress_list, list) { ++ prt_printf(out, "%s: data type %s btree_id %s position: ", ++ stats->name, ++ bch2_data_types[stats->data_type], ++ bch2_btree_ids[stats->btree_id]); ++ bch2_bpos_to_text(out, stats->pos); ++ prt_printf(out, "%s", "\n"); ++ } ++ ++ mutex_unlock(&c->data_progress_lock); ++ return ret; ++} ++ ++static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ enum btree_id id; ++ u64 nr_uncompressed_extents = 0, ++ nr_compressed_extents = 0, ++ nr_incompressible_extents = 0, ++ uncompressed_sectors = 0, ++ incompressible_sectors = 0, ++ compressed_sectors_compressed = 0, ++ compressed_sectors_uncompressed = 0; ++ int ret; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EPERM; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ if (!((1U << id) & BTREE_ID_HAS_PTRS)) ++ continue; ++ ++ for_each_btree_key(&trans, iter, id, POS_MIN, ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ bool compressed = false, uncompressed = false, incompressible = false; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ switch (p.crc.compression_type) { ++ case BCH_COMPRESSION_TYPE_none: ++ uncompressed = true; ++ uncompressed_sectors += k.k->size; ++ break; ++ case BCH_COMPRESSION_TYPE_incompressible: ++ incompressible = true; ++ incompressible_sectors += k.k->size; ++ break; ++ default: ++ compressed_sectors_compressed += ++ p.crc.compressed_size; ++ compressed_sectors_uncompressed += ++ p.crc.uncompressed_size; ++ compressed = true; ++ break; ++ } ++ } ++ ++ if (incompressible) ++ nr_incompressible_extents++; ++ else if (uncompressed) ++ nr_uncompressed_extents++; ++ else if (compressed) ++ nr_compressed_extents++; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ return 
ret; ++ ++ prt_printf(out, "uncompressed:\n"); ++ prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents); ++ prt_printf(out, " size: "); ++ prt_human_readable_u64(out, uncompressed_sectors << 9); ++ prt_printf(out, "\n"); ++ ++ prt_printf(out, "compressed:\n"); ++ prt_printf(out, " nr extents: %llu\n", nr_compressed_extents); ++ prt_printf(out, " compressed size: "); ++ prt_human_readable_u64(out, compressed_sectors_compressed << 9); ++ prt_printf(out, "\n"); ++ prt_printf(out, " uncompressed size: "); ++ prt_human_readable_u64(out, compressed_sectors_uncompressed << 9); ++ prt_printf(out, "\n"); ++ ++ prt_printf(out, "incompressible:\n"); ++ prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents); ++ prt_printf(out, " size: "); ++ prt_human_readable_u64(out, incompressible_sectors << 9); ++ prt_printf(out, "\n"); ++ return 0; ++} ++ ++static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]); ++ bch2_bpos_to_text(out, c->gc_gens_pos); ++ prt_printf(out, "\n"); ++} ++ ++SHOW(bch2_fs) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ sysfs_print(minor, c->minor); ++ sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); ++ ++ sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); ++ sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); ++ ++ sysfs_print(read_realloc_races, ++ atomic_long_read(&c->read_realloc_races)); ++ sysfs_print(extent_migrate_done, ++ atomic_long_read(&c->extent_migrate_done)); ++ sysfs_print(extent_migrate_raced, ++ atomic_long_read(&c->extent_migrate_raced)); ++ sysfs_print(bucket_alloc_fail, ++ atomic_long_read(&c->bucket_alloc_fail)); ++ ++ sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); ++ ++ if (attr == &sysfs_gc_gens_pos) ++ bch2_gc_gens_pos_to_text(out, c); ++ ++ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); ++ ++ sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); ++ sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ ++ sysfs_hprint(copy_gc_wait, ++ max(0LL, c->copygc_wait - ++ atomic64_read(&c->io_clock[WRITE].now)) << 9); ++ ++ if (attr == &sysfs_rebalance_work) ++ bch2_rebalance_work_to_text(out, c); ++ ++ sysfs_print(promote_whole_extents, c->promote_whole_extents); ++ ++ /* Debugging: */ ++ ++ if (attr == &sysfs_journal_debug) ++ bch2_journal_debug_to_text(out, &c->journal); ++ ++ if (attr == &sysfs_btree_updates) ++ bch2_btree_updates_to_text(out, c); ++ ++ if (attr == &sysfs_btree_cache) ++ bch2_btree_cache_to_text(out, c); ++ ++ if (attr == &sysfs_btree_key_cache) ++ bch2_btree_key_cache_to_text(out, &c->btree_key_cache); ++ ++ if (attr == &sysfs_stripes_heap) ++ bch2_stripes_heap_to_text(out, c); ++ ++ if (attr == &sysfs_open_buckets) ++ bch2_open_buckets_to_text(out, c); ++ ++ if (attr == &sysfs_compression_stats) ++ bch2_compression_stats_to_text(out, c); ++ ++ if (attr == &sysfs_new_stripes) ++ bch2_new_stripes_to_text(out, c); ++ ++ if (attr == &sysfs_io_timers_read) ++ bch2_io_timers_to_text(out, &c->io_clock[READ]); ++ ++ if (attr == &sysfs_io_timers_write) ++ bch2_io_timers_to_text(out, &c->io_clock[WRITE]); ++ ++ if (attr == &sysfs_data_jobs) ++ data_progress_to_text(out, c); ++ ++ return 0; ++} ++ ++STORE(bch2_fs) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ if (attr == &sysfs_btree_gc_periodic) { ++ ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) ++ ?: (ssize_t) size; ++ ++ wake_up_process(c->gc_thread); ++ return ret; ++ 
} ++ ++ if (attr == &sysfs_copy_gc_enabled) { ++ ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) ++ ?: (ssize_t) size; ++ ++ if (c->copygc_thread) ++ wake_up_process(c->copygc_thread); ++ return ret; ++ } ++ ++ if (attr == &sysfs_rebalance_enabled) { ++ ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) ++ ?: (ssize_t) size; ++ ++ rebalance_wakeup(c); ++ return ret; ++ } ++ ++ sysfs_pd_controller_store(rebalance, &c->rebalance.pd); ++ ++ sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); ++ ++ /* Debugging: */ ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EPERM; ++ ++ /* Debugging: */ ++ ++ if (!test_bit(BCH_FS_RW, &c->flags)) ++ return -EROFS; ++ ++ if (attr == &sysfs_prune_cache) { ++ struct shrink_control sc; ++ ++ sc.gfp_mask = GFP_KERNEL; ++ sc.nr_to_scan = strtoul_or_return(buf); ++ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); ++ } ++ ++ if (attr == &sysfs_trigger_gc) { ++ /* ++ * Full gc is currently incompatible with btree key cache: ++ */ ++#if 0 ++ down_read(&c->state_lock); ++ bch2_gc(c, false, false); ++ up_read(&c->state_lock); ++#else ++ bch2_gc_gens(c); ++#endif ++ } ++ ++ if (attr == &sysfs_trigger_discards) ++ bch2_do_discards(c); ++ ++ if (attr == &sysfs_trigger_invalidates) ++ bch2_do_invalidates(c); ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++ if (attr == &sysfs_perf_test) { ++ char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; ++ char *test = strsep(&p, " \t\n"); ++ char *nr_str = strsep(&p, " \t\n"); ++ char *threads_str = strsep(&p, " \t\n"); ++ unsigned threads; ++ u64 nr; ++ int ret = -EINVAL; ++ ++ if (threads_str && ++ !(ret = kstrtouint(threads_str, 10, &threads)) && ++ !(ret = bch2_strtoull_h(nr_str, &nr))) ++ ret = bch2_btree_perf_test(c, test, nr, threads); ++ kfree(tmp); ++ ++ if (ret) ++ size = ret; ++ } ++#endif ++ return size; ++} ++SYSFS_OPS(bch2_fs); ++ ++struct attribute *bch2_fs_files[] = { ++ &sysfs_minor, ++ &sysfs_btree_cache_size, ++ &sysfs_btree_avg_write_size, ++ ++ &sysfs_promote_whole_extents, ++ ++ &sysfs_compression_stats, ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++ &sysfs_perf_test, ++#endif ++ NULL ++}; ++ ++/* counters dir */ ++ ++SHOW(bch2_fs_counters) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj); ++ u64 counter = 0; ++ u64 counter_since_mount = 0; ++ ++ out->tabstops[0] = 32; ++ #define x(t, ...) \ ++ if (attr == &sysfs_##t) { \ ++ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ ++ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ ++ prt_printf(out, "since mount:"); \ ++ prt_tab(out); \ ++ prt_human_readable_u64(out, counter_since_mount << 9); \ ++ prt_newline(out); \ ++ \ ++ prt_printf(out, "since filesystem creation:"); \ ++ prt_tab(out); \ ++ prt_human_readable_u64(out, counter << 9); \ ++ prt_newline(out); \ ++ } ++ BCH_PERSISTENT_COUNTERS() ++ #undef x ++ return 0; ++} ++ ++STORE(bch2_fs_counters) { ++ return 0; ++} ++ ++SYSFS_OPS(bch2_fs_counters); ++ ++struct attribute *bch2_fs_counters_files[] = { ++#define x(t, ...) 
\ ++ &sysfs_##t, ++ BCH_PERSISTENT_COUNTERS() ++#undef x ++ NULL ++}; ++/* internal dir - just a wrapper */ ++ ++SHOW(bch2_fs_internal) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); ++ return bch2_fs_to_text(out, &c->kobj, attr); ++} ++ ++STORE(bch2_fs_internal) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); ++ return bch2_fs_store(&c->kobj, attr, buf, size); ++} ++SYSFS_OPS(bch2_fs_internal); ++ ++struct attribute *bch2_fs_internal_files[] = { ++ &sysfs_journal_debug, ++ &sysfs_btree_updates, ++ &sysfs_btree_cache, ++ &sysfs_btree_key_cache, ++ &sysfs_new_stripes, ++ &sysfs_stripes_heap, ++ &sysfs_open_buckets, ++ &sysfs_io_timers_read, ++ &sysfs_io_timers_write, ++ ++ &sysfs_trigger_gc, ++ &sysfs_trigger_discards, ++ &sysfs_trigger_invalidates, ++ &sysfs_prune_cache, ++ ++ &sysfs_read_realloc_races, ++ &sysfs_extent_migrate_done, ++ &sysfs_extent_migrate_raced, ++ &sysfs_bucket_alloc_fail, ++ ++ &sysfs_gc_gens_pos, ++ ++ &sysfs_copy_gc_enabled, ++ &sysfs_copy_gc_wait, ++ ++ &sysfs_rebalance_enabled, ++ &sysfs_rebalance_work, ++ sysfs_pd_controller_files(rebalance), ++ ++ &sysfs_data_jobs, ++ ++ &sysfs_internal_uuid, ++ NULL ++}; ++ ++/* options */ ++ ++SHOW(bch2_fs_opts_dir) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); ++ const struct bch_option *opt = container_of(attr, struct bch_option, attr); ++ int id = opt - bch2_opt_table; ++ u64 v = bch2_opt_get_by_id(&c->opts, id); ++ ++ bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); ++ prt_char(out, '\n'); ++ ++ return 0; ++} ++ ++STORE(bch2_fs_opts_dir) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); ++ const struct bch_option *opt = container_of(attr, struct bch_option, attr); ++ int ret, id = opt - bch2_opt_table; ++ char *tmp; ++ u64 v; ++ ++ /* ++ * We don't need to take c->writes for correctness, but it eliminates an ++ * unsightly error message in the dmesg log when we're RO: ++ */ ++ if (unlikely(!percpu_ref_tryget_live(&c->writes))) ++ return -EROFS; ++ ++ tmp = kstrdup(buf, GFP_KERNEL); ++ if (!tmp) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL); ++ kfree(tmp); ++ ++ if (ret < 0) ++ goto err; ++ ++ ret = bch2_opt_check_may_set(c, id, v); ++ if (ret < 0) ++ goto err; ++ ++ bch2_opt_set_sb(c, opt, v); ++ bch2_opt_set_by_id(&c->opts, id, v); ++ ++ if ((id == Opt_background_target || ++ id == Opt_background_compression) && v) { ++ bch2_rebalance_add_work(c, S64_MAX); ++ rebalance_wakeup(c); ++ } ++ ++ ret = size; ++err: ++ percpu_ref_put(&c->writes); ++ return ret; ++} ++SYSFS_OPS(bch2_fs_opts_dir); ++ ++struct attribute *bch2_fs_opts_dir_files[] = { NULL }; ++ ++int bch2_opts_create_sysfs_files(struct kobject *kobj) ++{ ++ const struct bch_option *i; ++ int ret; ++ ++ for (i = bch2_opt_table; ++ i < bch2_opt_table + bch2_opts_nr; ++ i++) { ++ if (!(i->flags & OPT_FS)) ++ continue; ++ ++ ret = sysfs_create_file(kobj, &i->attr); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* time stats */ ++ ++SHOW(bch2_fs_time_stats) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); ++ ++#define x(name) \ ++ if (attr == &sysfs_time_stat_##name) \ ++ bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]); ++ BCH_TIME_STATS() ++#undef x ++ ++ return 0; ++} ++ ++STORE(bch2_fs_time_stats) ++{ ++ return size; ++} ++SYSFS_OPS(bch2_fs_time_stats); ++ ++struct attribute *bch2_fs_time_stats_files[] = { ++#define x(name) \ ++ &sysfs_time_stat_##name, ++ 
BCH_TIME_STATS() ++#undef x ++ NULL ++}; ++ ++static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) ++{ ++ struct bch_fs *c = ca->fs; ++ struct bch_dev_usage stats = bch2_dev_usage_read(ca); ++ unsigned i, nr[BCH_DATA_NR]; ++ ++ memset(nr, 0, sizeof(nr)); ++ ++ for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) ++ nr[c->open_buckets[i].data_type]++; ++ ++ prt_printf(out, ++ "\t\t\t buckets\t sectors fragmented\n" ++ "capacity\t%16llu\n", ++ ca->mi.nbuckets - ca->mi.first_bucket); ++ ++ for (i = 0; i < BCH_DATA_NR; i++) ++ prt_printf(out, "%-16s%16llu%16llu%16llu\n", ++ bch2_data_types[i], stats.d[i].buckets, ++ stats.d[i].sectors, stats.d[i].fragmented); ++ ++ prt_printf(out, ++ "ec\t\t%16llu\n" ++ "\n" ++ "freelist_wait\t\t%s\n" ++ "open buckets allocated\t%u\n" ++ "open buckets this dev\t%u\n" ++ "open buckets total\t%u\n" ++ "open_buckets_wait\t%s\n" ++ "open_buckets_btree\t%u\n" ++ "open_buckets_user\t%u\n" ++ "buckets_to_invalidate\t%llu\n" ++ "btree reserve cache\t%u\n", ++ stats.buckets_ec, ++ c->freelist_wait.list.first ? "waiting" : "empty", ++ OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, ++ ca->nr_open_buckets, ++ OPEN_BUCKETS_COUNT, ++ c->open_buckets_wait.list.first ? "waiting" : "empty", ++ nr[BCH_DATA_btree], ++ nr[BCH_DATA_user], ++ should_invalidate_buckets(ca, stats), ++ c->btree_reserve_cache_nr); ++} ++ ++static const char * const bch2_rw[] = { ++ "read", ++ "write", ++ NULL ++}; ++ ++static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) ++{ ++ int rw, i; ++ ++ for (rw = 0; rw < 2; rw++) { ++ prt_printf(out, "%s:\n", bch2_rw[rw]); ++ ++ for (i = 1; i < BCH_DATA_NR; i++) ++ prt_printf(out, "%-12s:%12llu\n", ++ bch2_data_types[i], ++ percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); ++ } ++} ++ ++SHOW(bch2_dev) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ struct bch_fs *c = ca->fs; ++ ++ sysfs_printf(uuid, "%pU\n", ca->uuid.b); ++ ++ sysfs_print(bucket_size, bucket_bytes(ca)); ++ sysfs_print(first_bucket, ca->mi.first_bucket); ++ sysfs_print(nbuckets, ca->mi.nbuckets); ++ sysfs_print(durability, ca->mi.durability); ++ sysfs_print(discard, ca->mi.discard); ++ ++ if (attr == &sysfs_label) { ++ if (ca->mi.group) { ++ mutex_lock(&c->sb_lock); ++ bch2_disk_path_to_text(out, c->disk_sb.sb, ++ ca->mi.group - 1); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ prt_char(out, '\n'); ++ } ++ ++ if (attr == &sysfs_has_data) { ++ prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca)); ++ prt_char(out, '\n'); ++ } ++ ++ if (attr == &sysfs_state_rw) { ++ prt_string_option(out, bch2_member_states, ca->mi.state); ++ prt_char(out, '\n'); ++ } ++ ++ if (attr == &sysfs_iodone) ++ dev_iodone_to_text(out, ca); ++ ++ sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); ++ sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); ++ ++ if (attr == &sysfs_io_latency_stats_read) ++ bch2_time_stats_to_text(out, &ca->io_latency[READ]); ++ ++ if (attr == &sysfs_io_latency_stats_write) ++ bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); ++ ++ sysfs_printf(congested, "%u%%", ++ clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) ++ * 100 / CONGESTED_MAX); ++ ++ if (attr == &sysfs_alloc_debug) ++ dev_alloc_debug_to_text(out, ca); ++ ++ return 0; ++} ++ ++STORE(bch2_dev) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ struct bch_fs *c = ca->fs; ++ struct bch_member *mi; ++ ++ if (attr == &sysfs_discard) { ++ bool v = strtoul_or_return(buf); ++ ++ mutex_lock(&c->sb_lock); ++ 
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ ++ if (v != BCH_MEMBER_DISCARD(mi)) { ++ SET_BCH_MEMBER_DISCARD(mi, v); ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ if (attr == &sysfs_label) { ++ char *tmp; ++ int ret; ++ ++ tmp = kstrdup(buf, GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ ++ ret = bch2_dev_group_set(c, ca, strim(tmp)); ++ kfree(tmp); ++ if (ret) ++ return ret; ++ } ++ ++ return size; ++} ++SYSFS_OPS(bch2_dev); ++ ++struct attribute *bch2_dev_files[] = { ++ &sysfs_uuid, ++ &sysfs_bucket_size, ++ &sysfs_first_bucket, ++ &sysfs_nbuckets, ++ &sysfs_durability, ++ ++ /* settings: */ ++ &sysfs_discard, ++ &sysfs_state_rw, ++ &sysfs_label, ++ ++ &sysfs_has_data, ++ &sysfs_iodone, ++ ++ &sysfs_io_latency_read, ++ &sysfs_io_latency_write, ++ &sysfs_io_latency_stats_read, ++ &sysfs_io_latency_stats_write, ++ &sysfs_congested, ++ ++ /* debug: */ ++ &sysfs_alloc_debug, ++ NULL ++}; ++ ++#endif /* _BCACHEFS_SYSFS_H_ */ +diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h +new file mode 100644 +index 000000000000..222cd5062702 +--- /dev/null ++++ b/fs/bcachefs/sysfs.h +@@ -0,0 +1,48 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SYSFS_H_ ++#define _BCACHEFS_SYSFS_H_ ++ ++#include ++ ++#ifndef NO_BCACHEFS_SYSFS ++ ++struct attribute; ++struct sysfs_ops; ++ ++extern struct attribute *bch2_fs_files[]; ++extern struct attribute *bch2_fs_counters_files[]; ++extern struct attribute *bch2_fs_internal_files[]; ++extern struct attribute *bch2_fs_opts_dir_files[]; ++extern struct attribute *bch2_fs_time_stats_files[]; ++extern struct attribute *bch2_dev_files[]; ++ ++extern const struct sysfs_ops bch2_fs_sysfs_ops; ++extern const struct sysfs_ops bch2_fs_counters_sysfs_ops; ++extern const struct sysfs_ops bch2_fs_internal_sysfs_ops; ++extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; ++extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; ++extern const struct sysfs_ops bch2_dev_sysfs_ops; ++ ++int bch2_opts_create_sysfs_files(struct kobject *); ++ ++#else ++ ++static struct attribute *bch2_fs_files[] = {}; ++static struct attribute *bch2_fs_counters_files[] = {}; ++static struct attribute *bch2_fs_internal_files[] = {}; ++static struct attribute *bch2_fs_opts_dir_files[] = {}; ++static struct attribute *bch2_fs_time_stats_files[] = {}; ++static struct attribute *bch2_dev_files[] = {}; ++ ++static const struct sysfs_ops bch2_fs_sysfs_ops; ++static const struct sysfs_ops bch2_fs_counters_sysfs_ops; ++static const struct sysfs_ops bch2_fs_internal_sysfs_ops; ++static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; ++static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; ++static const struct sysfs_ops bch2_dev_sysfs_ops; ++ ++static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } ++ ++#endif /* NO_BCACHEFS_SYSFS */ ++ ++#endif /* _BCACHEFS_SYSFS_H_ */ +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +new file mode 100644 +index 000000000000..56058a56f2a2 +--- /dev/null ++++ b/fs/bcachefs/tests.c +@@ -0,0 +1,976 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifdef CONFIG_BCACHEFS_TESTS ++ ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "journal_reclaim.h" ++#include "subvolume.h" ++#include "tests.h" ++ ++#include "linux/kthread.h" ++#include "linux/random.h" ++ ++static void delete_test_keys(struct bch_fs *c) ++{ ++ int ret; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), SPOS_MAX, ++ 0, ++ NULL); ++ BUG_ON(ret); ++ ++ ret = 
bch2_btree_delete_range(c, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), SPOS_MAX, ++ 0, NULL); ++ BUG_ON(ret); ++} ++ ++/* unit tests */ ++ ++static int test_delete(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_i_cookie k; ++ int ret; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.snapshot = U32_MAX; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, ++ BTREE_ITER_INTENT); ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(&trans, &iter, &k.k_i, 0)); ++ if (ret) { ++ bch_err(c, "update error in test_delete: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ pr_info("deleting once"); ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_btree_delete_at(&trans, &iter, 0)); ++ if (ret) { ++ bch_err(c, "delete error (first) in test_delete: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ pr_info("deleting twice"); ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_btree_delete_at(&trans, &iter, 0)); ++ if (ret) { ++ bch_err(c, "delete error (second) in test_delete: %s", bch2_err_str(ret)); ++ goto err; ++ } ++err: ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int test_delete_written(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_i_cookie k; ++ int ret; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.snapshot = U32_MAX; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, ++ BTREE_ITER_INTENT); ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(&trans, &iter, &k.k_i, 0)); ++ if (ret) { ++ bch_err(c, "update error in test_delete_written: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ bch2_trans_unlock(&trans); ++ bch2_journal_flush_all_pins(&c->journal); ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(&iter) ?: ++ bch2_btree_delete_at(&trans, &iter, 0)); ++ if (ret) { ++ bch_err(c, "delete error in test_delete_written: %s", bch2_err_str(ret)); ++ goto err; ++ } ++err: ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int test_iterate(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter = { NULL }; ++ struct bkey_s_c k; ++ u64 i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i++) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i; ++ k.k.p.snapshot = U32_MAX; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, ++ NULL, NULL, 0); ++ if (ret) { ++ bch_err(c, "insert error in test_iterate: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ({ ++ BUG_ON(k.k->p.offset != i++); ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); ++ goto err; ++ } ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating backwards"); ++ ++ ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, U64_MAX, U32_MAX), 0, k, ++ ({ ++ BUG_ON(k.k->p.offset != --i); ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating backwards: %s", __func__, 
bch2_err_str(ret)); ++ goto err; ++ } ++ ++ BUG_ON(i); ++err: ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int test_iterate_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter = { NULL }; ++ struct bkey_s_c k; ++ u64 i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test extents"); ++ ++ for (i = 0; i < nr; i += 8) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i + 8; ++ k.k.p.snapshot = U32_MAX; ++ k.k.size = 8; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, ++ NULL, NULL, 0); ++ if (ret) { ++ bch_err(c, "insert error in test_iterate_extents: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), 0, k, ({ ++ BUG_ON(bkey_start_offset(k.k) != i); ++ i = k.k->p.offset; ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); ++ goto err; ++ } ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating backwards"); ++ ++ ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents, ++ SPOS(0, U64_MAX, U32_MAX), 0, k, ++ ({ ++ BUG_ON(k.k->p.offset != i); ++ i = bkey_start_offset(k.k); ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret)); ++ goto err; ++ } ++ ++ BUG_ON(i); ++err: ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int test_iterate_slots(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter = { NULL }; ++ struct bkey_s_c k; ++ u64 i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i++) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i * 2; ++ k.k.p.snapshot = U32_MAX; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, ++ NULL, NULL, 0); ++ if (ret) { ++ bch_err(c, "insert error in test_iterate_slots: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ({ ++ BUG_ON(k.k->p.offset != i); ++ i += 2; ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); ++ goto err; ++ } ++ ++ BUG_ON(i != nr * 2); ++ ++ pr_info("iterating forwards by slots"); ++ ++ i = 0; ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), ++ BTREE_ITER_SLOTS, k, ({ ++ if (i >= nr * 2) ++ break; ++ ++ BUG_ON(k.k->p.offset != i); ++ BUG_ON(bkey_deleted(k.k) != (i & 1)); ++ ++ i++; ++ 0; ++ })); ++ if (ret < 0) { ++ bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret)); ++ goto err; ++ } ++ ret = 0; ++err: ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter = { NULL }; ++ struct bkey_s_c k; ++ u64 i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i += 16) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i + 16; ++ k.k.p.snapshot = U32_MAX; ++ k.k.size = 8; ++ ++ ret = 
bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, ++ NULL, NULL, 0); ++ if (ret) { ++ bch_err(c, "insert error in test_iterate_slots_extents: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), 0, k, ({ ++ BUG_ON(bkey_start_offset(k.k) != i + 8); ++ BUG_ON(k.k->size != 8); ++ i += 16; ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); ++ goto err; ++ } ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating forwards by slots"); ++ ++ i = 0; ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), ++ BTREE_ITER_SLOTS, k, ({ ++ if (i == nr) ++ break; ++ BUG_ON(bkey_deleted(k.k) != !(i % 16)); ++ ++ BUG_ON(bkey_start_offset(k.k) != i); ++ BUG_ON(k.k->size != 8); ++ i = k.k->p.offset; ++ 0; ++ })); ++ if (ret) { ++ bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret)); ++ goto err; ++ } ++ ret = 0; ++err: ++ bch2_trans_exit(&trans); ++ return 0; ++} ++ ++/* ++ * XXX: we really want to make sure we've got a btree with depth > 0 for these ++ * tests ++ */ ++static int test_peek_end(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0); ++ ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ++ BUG_ON(k.k); ++ ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ++ BUG_ON(k.k); ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return 0; ++} ++ ++static int test_peek_end_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), 0); ++ ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ++ BUG_ON(k.k); ++ ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ++ BUG_ON(k.k); ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return 0; ++} ++ ++/* extent unit tests */ ++ ++u64 test_version; ++ ++static int insert_test_extent(struct bch_fs *c, ++ u64 start, u64 end) ++{ ++ struct bkey_i_cookie k; ++ int ret; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k_i.k.p.offset = end; ++ k.k_i.k.p.snapshot = U32_MAX; ++ k.k_i.k.size = end - start; ++ k.k_i.k.version.lo = test_version++; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ bch_err(c, "insert error in insert_test_extent: %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++static int __test_extent_overwrite(struct bch_fs *c, ++ u64 e1_start, u64 e1_end, ++ u64 e2_start, u64 e2_end) ++{ ++ int ret; ++ ++ ret = insert_test_extent(c, e1_start, e1_end) ?: ++ insert_test_extent(c, e2_start, e2_end); ++ ++ delete_test_keys(c); ++ return ret; ++} ++ ++static int test_extent_overwrite_front(struct bch_fs *c, u64 nr) ++{ ++ return __test_extent_overwrite(c, 0, 64, 0, 32) ?: ++ __test_extent_overwrite(c, 8, 64, 0, 32); ++} ++ ++static int test_extent_overwrite_back(struct bch_fs *c, u64 nr) ++{ ++ return __test_extent_overwrite(c, 0, 64, 32, 64) ?: ++ __test_extent_overwrite(c, 0, 64, 32, 72); ++} ++ ++static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr) ++{ ++ return __test_extent_overwrite(c, 0, 64, 32, 40); ++} 
++ ++static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) ++{ ++ return __test_extent_overwrite(c, 32, 64, 0, 64) ?: ++ __test_extent_overwrite(c, 32, 64, 0, 128) ?: ++ __test_extent_overwrite(c, 32, 64, 32, 64) ?: ++ __test_extent_overwrite(c, 32, 64, 32, 128); ++} ++ ++/* snapshot unit tests */ ++ ++/* Test skipping over keys in unrelated snapshots: */ ++static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_cookie cookie; ++ int ret; ++ ++ bkey_cookie_init(&cookie.k_i); ++ cookie.k.p.snapshot = snapid_hi; ++ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, snapid_lo), 0); ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ++ ++ BUG_ON(k.k->p.snapshot != U32_MAX); ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int test_snapshots(struct bch_fs *c, u64 nr) ++{ ++ struct bkey_i_cookie cookie; ++ u32 snapids[2]; ++ u32 snapid_subvols[2] = { 1, 1 }; ++ int ret; ++ ++ bkey_cookie_init(&cookie.k_i); ++ cookie.k.p.snapshot = U32_MAX; ++ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_snapshot_node_create(&trans, U32_MAX, ++ snapids, ++ snapid_subvols, ++ 2)); ++ if (ret) ++ return ret; ++ ++ if (snapids[0] > snapids[1]) ++ swap(snapids[0], snapids[1]); ++ ++ ret = test_snapshot_filter(c, snapids[0], snapids[1]); ++ if (ret) { ++ bch_err(c, "err from test_snapshot_filter: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* perf tests */ ++ ++static u64 test_rand(void) ++{ ++ u64 v; ++#if 0 ++ v = prandom_u32(); ++#else ++ prandom_bytes(&v, sizeof(v)); ++#endif ++ return v; ++} ++ ++static int rand_insert(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct bkey_i_cookie k; ++ int ret = 0; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = test_rand(); ++ k.k.p.snapshot = U32_MAX; ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); ++ if (ret) { ++ bch_err(c, "error in rand_insert: %s", bch2_err_str(ret)); ++ break; ++ } ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int rand_insert_multi(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct bkey_i_cookie k[8]; ++ int ret = 0; ++ unsigned j; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i += ARRAY_SIZE(k)) { ++ for (j = 0; j < ARRAY_SIZE(k); j++) { ++ bkey_cookie_init(&k[j].k_i); ++ k[j].k.p.offset = test_rand(); ++ k[j].k.p.snapshot = U32_MAX; ++ } ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?: ++ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i)); ++ if (ret) { ++ bch_err(c, "error in rand_insert_multi: %s", 
bch2_err_str(ret)); ++ break; ++ } ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int rand_lookup(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0); ++ ++ for (i = 0; i < nr; i++) { ++ bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); ++ ++ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ++ ret = bkey_err(k); ++ if (ret) { ++ bch_err(c, "error in rand_lookup: %s", bch2_err_str(ret)); ++ break; ++ } ++ } ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int rand_mixed_trans(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i_cookie *cookie, ++ u64 i, u64 pos) ++{ ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); ++ ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(iter))); ++ ret = bkey_err(k); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err(trans->c, "lookup error in rand_mixed: %s", bch2_err_str(ret)); ++ if (ret) ++ return ret; ++ ++ if (!(i & 3) && k.k) { ++ bkey_cookie_init(&cookie->k_i); ++ cookie->k.p = iter->pos; ++ ret = bch2_trans_update(trans, iter, &cookie->k_i, 0); ++ } ++ ++ return ret; ++} ++ ++static int rand_mixed(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_i_cookie cookie; ++ int ret = 0; ++ u64 i, rand; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0); ++ ++ for (i = 0; i < nr; i++) { ++ rand = test_rand(); ++ ret = commit_do(&trans, NULL, NULL, 0, ++ rand_mixed_trans(&trans, &iter, &cookie, i, rand)); ++ if (ret) { ++ bch_err(c, "update error in rand_mixed: %s", bch2_err_str(ret)); ++ break; ++ } ++ } ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int __do_delete(struct btree_trans *trans, struct bpos pos) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, ++ BTREE_ITER_INTENT); ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k) ++ goto err; ++ ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int rand_delete(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ int ret = 0; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ struct bpos pos = SPOS(0, test_rand(), U32_MAX); ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ __do_delete(&trans, pos)); ++ if (ret) { ++ bch_err(c, "error in rand_delete: %s", bch2_err_str(ret)); ++ break; ++ } ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int seq_insert(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_cookie insert; ++ int ret = 0; ++ ++ bkey_cookie_init(&insert.k_i); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ++ NULL, NULL, 0, ++ ({ ++ if (iter.pos.offset >= nr) ++ break; ++ insert.k.p = iter.pos; ++ bch2_trans_update(&trans, &iter, 
&insert.k_i, 0); ++ })); ++ if (ret) ++ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int seq_lookup(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ++ 0); ++ if (ret) ++ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int seq_overwrite(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), ++ BTREE_ITER_INTENT, k, ++ NULL, NULL, 0, ++ ({ ++ struct bkey_i_cookie u; ++ ++ bkey_reassemble(&u.k_i, k); ++ bch2_trans_update(&trans, &iter, &u.k_i, 0); ++ })); ++ if (ret) ++ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int seq_delete(struct bch_fs *c, u64 nr) ++{ ++ int ret; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), SPOS_MAX, ++ 0, NULL); ++ if (ret) ++ bch_err(c, "error in seq_delete: %s", bch2_err_str(ret)); ++ return ret; ++} ++ ++typedef int (*perf_test_fn)(struct bch_fs *, u64); ++ ++struct test_job { ++ struct bch_fs *c; ++ u64 nr; ++ unsigned nr_threads; ++ perf_test_fn fn; ++ ++ atomic_t ready; ++ wait_queue_head_t ready_wait; ++ ++ atomic_t done; ++ struct completion done_completion; ++ ++ u64 start; ++ u64 finish; ++ int ret; ++}; ++ ++static int btree_perf_test_thread(void *data) ++{ ++ struct test_job *j = data; ++ int ret; ++ ++ if (atomic_dec_and_test(&j->ready)) { ++ wake_up(&j->ready_wait); ++ j->start = sched_clock(); ++ } else { ++ wait_event(j->ready_wait, !atomic_read(&j->ready)); ++ } ++ ++ ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); ++ if (ret) { ++ bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret)); ++ j->ret = ret; ++ } ++ ++ if (atomic_dec_and_test(&j->done)) { ++ j->finish = sched_clock(); ++ complete(&j->done_completion); ++ } ++ ++ return 0; ++} ++ ++int bch2_btree_perf_test(struct bch_fs *c, const char *testname, ++ u64 nr, unsigned nr_threads) ++{ ++ struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; ++ char name_buf[20]; ++ struct printbuf nr_buf = PRINTBUF; ++ struct printbuf per_sec_buf = PRINTBUF; ++ unsigned i; ++ u64 time; ++ ++ atomic_set(&j.ready, nr_threads); ++ init_waitqueue_head(&j.ready_wait); ++ ++ atomic_set(&j.done, nr_threads); ++ init_completion(&j.done_completion); ++ ++#define perf_test(_test) \ ++ if (!strcmp(testname, #_test)) j.fn = _test ++ ++ perf_test(rand_insert); ++ perf_test(rand_insert_multi); ++ perf_test(rand_lookup); ++ perf_test(rand_mixed); ++ perf_test(rand_delete); ++ ++ perf_test(seq_insert); ++ perf_test(seq_lookup); ++ perf_test(seq_overwrite); ++ perf_test(seq_delete); ++ ++ /* a unit test, not a perf test: */ ++ perf_test(test_delete); ++ perf_test(test_delete_written); ++ perf_test(test_iterate); ++ perf_test(test_iterate_extents); ++ perf_test(test_iterate_slots); ++ perf_test(test_iterate_slots_extents); ++ perf_test(test_peek_end); ++ perf_test(test_peek_end_extents); ++ ++ perf_test(test_extent_overwrite_front); ++ perf_test(test_extent_overwrite_back); ++ perf_test(test_extent_overwrite_middle); ++ 
perf_test(test_extent_overwrite_all);
++
++	perf_test(test_snapshots);
++
++	if (!j.fn) {
++		pr_err("unknown test %s", testname);
++		return -EINVAL;
++	}
++
++	//pr_info("running test %s:", testname);
++
++	if (nr_threads == 1)
++		btree_perf_test_thread(&j);
++	else
++		for (i = 0; i < nr_threads; i++)
++			kthread_run(btree_perf_test_thread, &j,
++				    "bcachefs perf test[%u]", i);
++
++	while (wait_for_completion_interruptible(&j.done_completion))
++		;
++
++	time = j.finish - j.start;
++
++	scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
++	prt_human_readable_u64(&nr_buf, nr);
++	prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time));
++	printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
++		name_buf, nr_buf.buf, nr_threads,
++		div_u64(time, NSEC_PER_SEC),
++		div_u64(time * nr_threads, nr),
++		per_sec_buf.buf);
++	printbuf_exit(&per_sec_buf);
++	printbuf_exit(&nr_buf);
++	return j.ret;
++}
++
++#endif /* CONFIG_BCACHEFS_TESTS */
+diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h
+new file mode 100644
+index 000000000000..c73b18aea7e0
+--- /dev/null
++++ b/fs/bcachefs/tests.h
+@@ -0,0 +1,15 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++#ifndef _BCACHEFS_TEST_H
++#define _BCACHEFS_TEST_H
++
++struct bch_fs;
++
++#ifdef CONFIG_BCACHEFS_TESTS
++
++int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
++
++#else
++
++#endif /* CONFIG_BCACHEFS_TESTS */
++
++#endif /* _BCACHEFS_TEST_H */
+diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
+new file mode 100644
+index 000000000000..59e8dfa3d245
+--- /dev/null
++++ b/fs/bcachefs/trace.c
+@@ -0,0 +1,12 @@
++// SPDX-License-Identifier: GPL-2.0
++#include "bcachefs.h"
++#include "alloc_types.h"
++#include "buckets.h"
++#include "btree_types.h"
++#include "keylist.h"
++
++#include
++#include "keylist.h"
++
++#define CREATE_TRACE_POINTS
++#include
+diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
+new file mode 100644
+index 000000000000..ee2c7d9e7050
+--- /dev/null
++++ b/fs/bcachefs/util.c
+@@ -0,0 +1,964 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * random utility code, for bcache but in theory not specific to bcache
++ *
++ * Copyright 2010, 2011 Kent Overstreet
++ * Copyright 2012 Google, Inc.
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "eytzinger.h" ++#include "util.h" ++ ++static const char si_units[] = "?kMGTPEZY"; ++ ++/* string_get_size units: */ ++static const char *const units_2[] = { ++ "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" ++}; ++static const char *const units_10[] = { ++ "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" ++}; ++ ++static int parse_u64(const char *cp, u64 *res) ++{ ++ const char *start = cp; ++ u64 v = 0; ++ ++ if (!isdigit(*cp)) ++ return -EINVAL; ++ ++ do { ++ if (v > U64_MAX / 10) ++ return -ERANGE; ++ v *= 10; ++ if (v > U64_MAX - (*cp - '0')) ++ return -ERANGE; ++ v += *cp - '0'; ++ cp++; ++ } while (isdigit(*cp)); ++ ++ *res = v; ++ return cp - start; ++} ++ ++static int bch2_pow(u64 n, u64 p, u64 *res) ++{ ++ *res = 1; ++ ++ while (p--) { ++ if (*res > div_u64(U64_MAX, n)) ++ return -ERANGE; ++ *res *= n; ++ } ++ return 0; ++} ++ ++static int parse_unit_suffix(const char *cp, u64 *res) ++{ ++ const char *start = cp; ++ u64 base = 1024; ++ unsigned u; ++ int ret; ++ ++ if (*cp == ' ') ++ cp++; ++ ++ for (u = 1; u < strlen(si_units); u++) ++ if (*cp == si_units[u]) { ++ cp++; ++ goto got_unit; ++ } ++ ++ for (u = 0; u < ARRAY_SIZE(units_2); u++) ++ if (!strncmp(cp, units_2[u], strlen(units_2[u]))) { ++ cp += strlen(units_2[u]); ++ goto got_unit; ++ } ++ ++ for (u = 0; u < ARRAY_SIZE(units_10); u++) ++ if (!strncmp(cp, units_10[u], strlen(units_10[u]))) { ++ cp += strlen(units_10[u]); ++ base = 1000; ++ goto got_unit; ++ } ++ ++ *res = 1; ++ return 0; ++got_unit: ++ ret = bch2_pow(base, u, res); ++ if (ret) ++ return ret; ++ ++ return cp - start; ++} ++ ++#define parse_or_ret(cp, _f) \ ++do { \ ++ int ret = _f; \ ++ if (ret < 0) \ ++ return ret; \ ++ cp += ret; \ ++} while (0) ++ ++static int __bch2_strtou64_h(const char *cp, u64 *res) ++{ ++ const char *start = cp; ++ u64 v = 0, b, f_n = 0, f_d = 1; ++ int ret; ++ ++ parse_or_ret(cp, parse_u64(cp, &v)); ++ ++ if (*cp == '.') { ++ cp++; ++ ret = parse_u64(cp, &f_n); ++ if (ret < 0) ++ return ret; ++ cp += ret; ++ ++ ret = bch2_pow(10, ret, &f_d); ++ if (ret) ++ return ret; ++ } ++ ++ parse_or_ret(cp, parse_unit_suffix(cp, &b)); ++ ++ if (v > div_u64(U64_MAX, b)) ++ return -ERANGE; ++ v *= b; ++ ++ if (f_n > div_u64(U64_MAX, b)) ++ return -ERANGE; ++ ++ f_n = div_u64(f_n * b, f_d); ++ if (v + f_n < v) ++ return -ERANGE; ++ v += f_n; ++ ++ *res = v; ++ return cp - start; ++} ++ ++static int __bch2_strtoh(const char *cp, u64 *res, ++ u64 t_max, bool t_signed) ++{ ++ bool positive = *cp != '-'; ++ u64 v = 0; ++ ++ if (*cp == '+' || *cp == '-') ++ cp++; ++ ++ parse_or_ret(cp, __bch2_strtou64_h(cp, &v)); ++ ++ if (*cp == '\n') ++ cp++; ++ if (*cp) ++ return -EINVAL; ++ ++ if (positive) { ++ if (v > t_max) ++ return -ERANGE; ++ } else { ++ if (v && !t_signed) ++ return -ERANGE; ++ ++ if (v > t_max + 1) ++ return -ERANGE; ++ v = -v; ++ } ++ ++ *res = v; ++ return 0; ++} ++ ++#define STRTO_H(name, type) \ ++int bch2_ ## name ## _h(const char *cp, type *res) \ ++{ \ ++ u64 v = 0; \ ++ int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ ++ ANYSINT_MAX(type) != ((type) ~0ULL)); \ ++ *res = v; \ ++ return ret; \ ++} ++ ++STRTO_H(strtoint, int) ++STRTO_H(strtouint, unsigned int) ++STRTO_H(strtoll, long long) ++STRTO_H(strtoull, unsigned long long) ++STRTO_H(strtou64, u64) ++ ++u64 bch2_read_flag_list(char *opt, const char * const list[]) ++{ ++ u64 
ret = 0; ++ char *p, *s, *d = kstrdup(opt, GFP_KERNEL); ++ ++ if (!d) ++ return -ENOMEM; ++ ++ s = strim(d); ++ ++ while ((p = strsep(&s, ","))) { ++ int flag = match_string(list, -1, p); ++ if (flag < 0) { ++ ret = -1; ++ break; ++ } ++ ++ ret |= 1 << flag; ++ } ++ ++ kfree(d); ++ ++ return ret; ++} ++ ++bool bch2_is_zero(const void *_p, size_t n) ++{ ++ const char *p = _p; ++ size_t i; ++ ++ for (i = 0; i < n; i++) ++ if (p[i]) ++ return false; ++ return true; ++} ++ ++static void bch2_quantiles_update(struct quantiles *q, u64 v) ++{ ++ unsigned i = 0; ++ ++ while (i < ARRAY_SIZE(q->entries)) { ++ struct quantile_entry *e = q->entries + i; ++ ++ if (unlikely(!e->step)) { ++ e->m = v; ++ e->step = max_t(unsigned, v / 2, 1024); ++ } else if (e->m > v) { ++ e->m = e->m >= e->step ++ ? e->m - e->step ++ : 0; ++ } else if (e->m < v) { ++ e->m = e->m + e->step > e->m ++ ? e->m + e->step ++ : U32_MAX; ++ } ++ ++ if ((e->m > v ? e->m - v : v - e->m) < e->step) ++ e->step = max_t(unsigned, e->step / 2, 1); ++ ++ if (v >= e->m) ++ break; ++ ++ i = eytzinger0_child(i, v > e->m); ++ } ++} ++ ++/* time stats: */ ++ ++static void bch2_time_stats_update_one(struct time_stats *stats, ++ u64 start, u64 end) ++{ ++ u64 duration, freq; ++ ++ duration = time_after64(end, start) ++ ? end - start : 0; ++ freq = time_after64(end, stats->last_event) ++ ? end - stats->last_event : 0; ++ ++ stats->count++; ++ ++ stats->average_duration = stats->average_duration ++ ? ewma_add(stats->average_duration, duration, 6) ++ : duration; ++ ++ stats->average_frequency = stats->average_frequency ++ ? ewma_add(stats->average_frequency, freq, 6) ++ : freq; ++ ++ stats->max_duration = max(stats->max_duration, duration); ++ ++ stats->last_event = end; ++ ++ bch2_quantiles_update(&stats->quantiles, duration); ++} ++ ++void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) ++{ ++ unsigned long flags; ++ ++ if (!stats->buffer) { ++ spin_lock_irqsave(&stats->lock, flags); ++ bch2_time_stats_update_one(stats, start, end); ++ ++ if (stats->average_frequency < 32 && ++ stats->count > 1024) ++ stats->buffer = ++ alloc_percpu_gfp(struct time_stat_buffer, ++ GFP_ATOMIC); ++ spin_unlock_irqrestore(&stats->lock, flags); ++ } else { ++ struct time_stat_buffer_entry *i; ++ struct time_stat_buffer *b; ++ ++ preempt_disable(); ++ b = this_cpu_ptr(stats->buffer); ++ ++ BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); ++ b->entries[b->nr++] = (struct time_stat_buffer_entry) { ++ .start = start, ++ .end = end ++ }; ++ ++ if (b->nr == ARRAY_SIZE(b->entries)) { ++ spin_lock_irqsave(&stats->lock, flags); ++ for (i = b->entries; ++ i < b->entries + ARRAY_SIZE(b->entries); ++ i++) ++ bch2_time_stats_update_one(stats, i->start, i->end); ++ spin_unlock_irqrestore(&stats->lock, flags); ++ ++ b->nr = 0; ++ } ++ ++ preempt_enable(); ++ } ++} ++ ++static const struct time_unit { ++ const char *name; ++ u32 nsecs; ++} time_units[] = { ++ { "ns", 1 }, ++ { "us", NSEC_PER_USEC }, ++ { "ms", NSEC_PER_MSEC }, ++ { "sec", NSEC_PER_SEC }, ++}; ++ ++static const struct time_unit *pick_time_units(u64 ns) ++{ ++ const struct time_unit *u; ++ ++ for (u = time_units; ++ u + 1 < time_units + ARRAY_SIZE(time_units) && ++ ns >= u[1].nsecs << 1; ++ u++) ++ ; ++ ++ return u; ++} ++ ++static void pr_time_units(struct printbuf *out, u64 ns) ++{ ++ const struct time_unit *u = pick_time_units(ns); ++ ++ prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); ++} ++ ++void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) ++{ ++ const struct 
time_unit *u;
++	u64 freq = READ_ONCE(stats->average_frequency);
++	u64 q, last_q = 0;
++	int i;
++
++	prt_printf(out, "count:\t\t%llu",
++		   stats->count);
++	prt_newline(out);
++	prt_printf(out, "rate:\t\t%llu/sec",
++		   freq ? div64_u64(NSEC_PER_SEC, freq) : 0);
++	prt_newline(out);
++
++	prt_printf(out, "frequency:\t");
++	pr_time_units(out, freq);
++
++	prt_newline(out);
++	prt_printf(out, "avg duration:\t");
++	pr_time_units(out, stats->average_duration);
++
++	prt_newline(out);
++	prt_printf(out, "max duration:\t");
++	pr_time_units(out, stats->max_duration);
++
++	i = eytzinger0_first(NR_QUANTILES);
++	u = pick_time_units(stats->quantiles.entries[i].m);
++
++	prt_newline(out);
++	prt_printf(out, "quantiles (%s):\t", u->name);
++	eytzinger0_for_each(i, NR_QUANTILES) {
++		bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
++
++		q = max(stats->quantiles.entries[i].m, last_q);
++		prt_printf(out, "%llu ",
++			   div_u64(q, u->nsecs));
++		if (is_last)
++			prt_newline(out);
++		last_q = q;
++	}
++}
++
++void bch2_time_stats_exit(struct time_stats *stats)
++{
++	free_percpu(stats->buffer);
++}
++
++void bch2_time_stats_init(struct time_stats *stats)
++{
++	memset(stats, 0, sizeof(*stats));
++	spin_lock_init(&stats->lock);
++}
++
++/* ratelimit: */
++
++/**
++ * bch2_ratelimit_delay() - return how long to delay until the next time to do
++ * some work
++ *
++ * @d - the struct bch_ratelimit to update
++ *
++ * Returns the amount of time to delay by, in jiffies
++ */
++u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
++{
++	u64 now = local_clock();
++
++	return time_after64(d->next, now)
++		? nsecs_to_jiffies(d->next - now)
++		: 0;
++}
++
++/**
++ * bch2_ratelimit_increment() - increment @d by the amount of work done
++ *
++ * @d - the struct bch_ratelimit to update
++ * @done - the amount of work done, in arbitrary units
++ */
++void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
++{
++	u64 now = local_clock();
++
++	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
++
++	if (time_before64(now + NSEC_PER_SEC, d->next))
++		d->next = now + NSEC_PER_SEC;
++
++	if (time_after64(now - NSEC_PER_SEC * 2, d->next))
++		d->next = now - NSEC_PER_SEC * 2;
++}
++
++/* pd controller: */
++
++/*
++ * Updates pd_controller. Attempts to scale input values to units per second.
++ * @target: desired value
++ * @actual: current value
++ *
++ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing
++ * it makes actual go down.
++ */ ++void bch2_pd_controller_update(struct bch_pd_controller *pd, ++ s64 target, s64 actual, int sign) ++{ ++ s64 proportional, derivative, change; ++ ++ unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; ++ ++ if (seconds_since_update == 0) ++ return; ++ ++ pd->last_update = jiffies; ++ ++ proportional = actual - target; ++ proportional *= seconds_since_update; ++ proportional = div_s64(proportional, pd->p_term_inverse); ++ ++ derivative = actual - pd->last_actual; ++ derivative = div_s64(derivative, seconds_since_update); ++ derivative = ewma_add(pd->smoothed_derivative, derivative, ++ (pd->d_term / seconds_since_update) ?: 1); ++ derivative = derivative * pd->d_term; ++ derivative = div_s64(derivative, pd->p_term_inverse); ++ ++ change = proportional + derivative; ++ ++ /* Don't increase rate if not keeping up */ ++ if (change > 0 && ++ pd->backpressure && ++ time_after64(local_clock(), ++ pd->rate.next + NSEC_PER_MSEC)) ++ change = 0; ++ ++ change *= (sign * -1); ++ ++ pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, ++ 1, UINT_MAX); ++ ++ pd->last_actual = actual; ++ pd->last_derivative = derivative; ++ pd->last_proportional = proportional; ++ pd->last_change = change; ++ pd->last_target = target; ++} ++ ++void bch2_pd_controller_init(struct bch_pd_controller *pd) ++{ ++ pd->rate.rate = 1024; ++ pd->last_update = jiffies; ++ pd->p_term_inverse = 6000; ++ pd->d_term = 30; ++ pd->d_smooth = pd->d_term; ++ pd->backpressure = 1; ++} ++ ++void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd) ++{ ++ out->tabstops[0] = 20; ++ ++ prt_printf(out, "rate:"); ++ prt_tab(out); ++ prt_human_readable_s64(out, pd->rate.rate); ++ prt_newline(out); ++ ++ prt_printf(out, "target:"); ++ prt_tab(out); ++ prt_human_readable_u64(out, pd->last_target); ++ prt_newline(out); ++ ++ prt_printf(out, "actual:"); ++ prt_tab(out); ++ prt_human_readable_u64(out, pd->last_actual); ++ prt_newline(out); ++ ++ prt_printf(out, "proportional:"); ++ prt_tab(out); ++ prt_human_readable_s64(out, pd->last_proportional); ++ prt_newline(out); ++ ++ prt_printf(out, "derivative:"); ++ prt_tab(out); ++ prt_human_readable_s64(out, pd->last_derivative); ++ prt_newline(out); ++ ++ prt_printf(out, "change:"); ++ prt_tab(out); ++ prt_human_readable_s64(out, pd->last_change); ++ prt_newline(out); ++ ++ prt_printf(out, "next io:"); ++ prt_tab(out); ++ prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); ++ prt_newline(out); ++} ++ ++/* misc: */ ++ ++void bch2_bio_map(struct bio *bio, void *base, size_t size) ++{ ++ while (size) { ++ struct page *page = is_vmalloc_addr(base) ++ ? 
vmalloc_to_page(base) ++ : virt_to_page(base); ++ unsigned offset = offset_in_page(base); ++ unsigned len = min_t(size_t, PAGE_SIZE - offset, size); ++ ++ BUG_ON(!bio_add_page(bio, page, len, offset)); ++ size -= len; ++ base += len; ++ } ++} ++ ++int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) ++{ ++ while (size) { ++ struct page *page = alloc_page(gfp_mask); ++ unsigned len = min_t(size_t, PAGE_SIZE, size); ++ ++ if (!page) ++ return -ENOMEM; ++ ++ if (unlikely(!bio_add_page(bio, page, len, 0))) { ++ __free_page(page); ++ break; ++ } ++ ++ size -= len; ++ } ++ ++ return 0; ++} ++ ++size_t bch2_rand_range(size_t max) ++{ ++ size_t rand; ++ ++ if (!max) ++ return 0; ++ ++ do { ++ rand = get_random_long(); ++ rand &= roundup_pow_of_two(max) - 1; ++ } while (rand >= max); ++ ++ return rand; ++} ++ ++void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ ++ __bio_for_each_segment(bv, dst, iter, dst_iter) { ++ void *dstp = kmap_atomic(bv.bv_page); ++ memcpy(dstp + bv.bv_offset, src, bv.bv_len); ++ kunmap_atomic(dstp); ++ ++ src += bv.bv_len; ++ } ++} ++ ++void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ ++ __bio_for_each_segment(bv, src, iter, src_iter) { ++ void *srcp = kmap_atomic(bv.bv_page); ++ memcpy(dst, srcp + bv.bv_offset, bv.bv_len); ++ kunmap_atomic(srcp); ++ ++ dst += bv.bv_len; ++ } ++} ++ ++#include "eytzinger.h" ++ ++static int alignment_ok(const void *base, size_t align) ++{ ++ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || ++ ((unsigned long)base & (align - 1)) == 0; ++} ++ ++static void u32_swap(void *a, void *b, size_t size) ++{ ++ u32 t = *(u32 *)a; ++ *(u32 *)a = *(u32 *)b; ++ *(u32 *)b = t; ++} ++ ++static void u64_swap(void *a, void *b, size_t size) ++{ ++ u64 t = *(u64 *)a; ++ *(u64 *)a = *(u64 *)b; ++ *(u64 *)b = t; ++} ++ ++static void generic_swap(void *a, void *b, size_t size) ++{ ++ char t; ++ ++ do { ++ t = *(char *)a; ++ *(char *)a++ = *(char *)b; ++ *(char *)b++ = t; ++ } while (--size > 0); ++} ++ ++static inline int do_cmp(void *base, size_t n, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ size_t l, size_t r) ++{ ++ return cmp_func(base + inorder_to_eytzinger0(l, n) * size, ++ base + inorder_to_eytzinger0(r, n) * size, ++ size); ++} ++ ++static inline void do_swap(void *base, size_t n, size_t size, ++ void (*swap_func)(void *, void *, size_t), ++ size_t l, size_t r) ++{ ++ swap_func(base + inorder_to_eytzinger0(l, n) * size, ++ base + inorder_to_eytzinger0(r, n) * size, ++ size); ++} ++ ++void eytzinger0_sort(void *base, size_t n, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)) ++{ ++ int i, c, r; ++ ++ if (!swap_func) { ++ if (size == 4 && alignment_ok(base, 4)) ++ swap_func = u32_swap; ++ else if (size == 8 && alignment_ok(base, 8)) ++ swap_func = u64_swap; ++ else ++ swap_func = generic_swap; ++ } ++ ++ /* heapify */ ++ for (i = n / 2 - 1; i >= 0; --i) { ++ for (r = i; r * 2 + 1 < n; r = c) { ++ c = r * 2 + 1; ++ ++ if (c + 1 < n && ++ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) ++ c++; ++ ++ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) ++ break; ++ ++ do_swap(base, n, size, swap_func, r, c); ++ } ++ } ++ ++ /* sort */ ++ for (i = n - 1; i > 0; --i) { ++ do_swap(base, n, size, swap_func, 0, i); ++ ++ for (r = 0; r * 2 + 1 < i; r = c) { ++ c = r * 2 + 1; ++ ++ if (c 
+ 1 < i && ++ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) ++ c++; ++ ++ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) ++ break; ++ ++ do_swap(base, n, size, swap_func, r, c); ++ } ++ } ++} ++ ++void sort_cmp_size(void *base, size_t num, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t size)) ++{ ++ /* pre-scale counters for performance */ ++ int i = (num/2 - 1) * size, n = num * size, c, r; ++ ++ if (!swap_func) { ++ if (size == 4 && alignment_ok(base, 4)) ++ swap_func = u32_swap; ++ else if (size == 8 && alignment_ok(base, 8)) ++ swap_func = u64_swap; ++ else ++ swap_func = generic_swap; ++ } ++ ++ /* heapify */ ++ for ( ; i >= 0; i -= size) { ++ for (r = i; r * 2 + size < n; r = c) { ++ c = r * 2 + size; ++ if (c < n - size && ++ cmp_func(base + c, base + c + size, size) < 0) ++ c += size; ++ if (cmp_func(base + r, base + c, size) >= 0) ++ break; ++ swap_func(base + r, base + c, size); ++ } ++ } ++ ++ /* sort */ ++ for (i = n - size; i > 0; i -= size) { ++ swap_func(base, base + i, size); ++ for (r = 0; r * 2 + size < i; r = c) { ++ c = r * 2 + size; ++ if (c < i - size && ++ cmp_func(base + c, base + c + size, size) < 0) ++ c += size; ++ if (cmp_func(base + r, base + c, size) >= 0) ++ break; ++ swap_func(base + r, base + c, size); ++ } ++ } ++} ++ ++static void mempool_free_vp(void *element, void *pool_data) ++{ ++ size_t size = (size_t) pool_data; ++ ++ vpfree(element, size); ++} ++ ++static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) ++{ ++ size_t size = (size_t) pool_data; ++ ++ return vpmalloc(size, gfp_mask); ++} ++ ++int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) ++{ ++ return size < PAGE_SIZE ++ ? mempool_init_kmalloc_pool(pool, min_nr, size) ++ : mempool_init(pool, min_nr, mempool_alloc_vp, ++ mempool_free_vp, (void *) size); ++} ++ ++#if 0 ++void eytzinger1_test(void) ++{ ++ unsigned inorder, eytz, size; ++ ++ pr_info("1 based eytzinger test:"); ++ ++ for (size = 2; ++ size < 65536; ++ size++) { ++ unsigned extra = eytzinger1_extra(size); ++ ++ if (!(size % 4096)) ++ pr_info("tree size %u", size); ++ ++ BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); ++ BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); ++ ++ BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); ++ BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); ++ ++ inorder = 1; ++ eytzinger1_for_each(eytz, size) { ++ BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); ++ BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); ++ BUG_ON(eytz != eytzinger1_last(size) && ++ eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); ++ ++ inorder++; ++ } ++ } ++} ++ ++void eytzinger0_test(void) ++{ ++ ++ unsigned inorder, eytz, size; ++ ++ pr_info("0 based eytzinger test:"); ++ ++ for (size = 1; ++ size < 65536; ++ size++) { ++ unsigned extra = eytzinger0_extra(size); ++ ++ if (!(size % 4096)) ++ pr_info("tree size %u", size); ++ ++ BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); ++ BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); ++ ++ BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); ++ BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); ++ ++ inorder = 0; ++ eytzinger0_for_each(eytz, size) { ++ BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); ++ BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); ++ BUG_ON(eytz != eytzinger0_last(size) && ++ eytzinger0_prev(eytzinger0_next(eytz, size), size) != 
eytz); ++ ++ inorder++; ++ } ++ } ++} ++ ++static inline int cmp_u16(const void *_l, const void *_r, size_t size) ++{ ++ const u16 *l = _l, *r = _r; ++ ++ return (*l > *r) - (*r - *l); ++} ++ ++static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) ++{ ++ int i, c1 = -1, c2 = -1; ++ ssize_t r; ++ ++ r = eytzinger0_find_le(test_array, nr, ++ sizeof(test_array[0]), ++ cmp_u16, &search); ++ if (r >= 0) ++ c1 = test_array[r]; ++ ++ for (i = 0; i < nr; i++) ++ if (test_array[i] <= search && test_array[i] > c2) ++ c2 = test_array[i]; ++ ++ if (c1 != c2) { ++ eytzinger0_for_each(i, nr) ++ pr_info("[%3u] = %12u", i, test_array[i]); ++ pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", ++ i, r, c1, c2); ++ } ++} ++ ++void eytzinger0_find_test(void) ++{ ++ unsigned i, nr, allocated = 1 << 12; ++ u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); ++ ++ for (nr = 1; nr < allocated; nr++) { ++ pr_info("testing %u elems", nr); ++ ++ get_random_bytes(test_array, nr * sizeof(test_array[0])); ++ eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); ++ ++ /* verify array is sorted correctly: */ ++ eytzinger0_for_each(i, nr) ++ BUG_ON(i != eytzinger0_last(nr) && ++ test_array[i] > test_array[eytzinger0_next(i, nr)]); ++ ++ for (i = 0; i < U16_MAX; i += 1 << 12) ++ eytzinger0_find_test_val(test_array, nr, i); ++ ++ for (i = 0; i < nr; i++) { ++ eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); ++ eytzinger0_find_test_val(test_array, nr, test_array[i]); ++ eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); ++ } ++ } ++ ++ kfree(test_array); ++} ++#endif ++ ++/* ++ * Accumulate percpu counters onto one cpu's copy - only valid when access ++ * against any percpu counter is guarded against ++ */ ++u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) ++{ ++ u64 *ret; ++ int cpu; ++ ++ /* access to pcpu vars has to be blocked by other locking */ ++ preempt_disable(); ++ ret = this_cpu_ptr(p); ++ preempt_enable(); ++ ++ for_each_possible_cpu(cpu) { ++ u64 *i = per_cpu_ptr(p, cpu); ++ ++ if (i != ret) { ++ acc_u64s(ret, i, nr); ++ memset(i, 0, nr * sizeof(u64)); ++ } ++ } ++ ++ return ret; ++} +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +new file mode 100644 +index 000000000000..1fe66fd91ccc +--- /dev/null ++++ b/fs/bcachefs/util.h +@@ -0,0 +1,783 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_UTIL_H ++#define _BCACHEFS_UTIL_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct closure; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++#define EBUG_ON(cond) BUG_ON(cond) ++#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) ++#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) ++#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0) ++#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0) ++#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0) ++#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0) ++#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0) ++#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i) ++#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) ++#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) ++ ++#else /* DEBUG */ ++ ++#define EBUG_ON(cond) ++#define atomic_dec_bug(v) atomic_dec(v) ++#define atomic_inc_bug(v, i) 
atomic_inc(v) ++#define atomic_sub_bug(i, v) atomic_sub(i, v) ++#define atomic_add_bug(i, v) atomic_add(i, v) ++#define atomic_long_dec_bug(v) atomic_long_dec(v) ++#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v) ++#define atomic64_dec_bug(v) atomic64_dec(v) ++#define atomic64_inc_bug(v, i) atomic64_inc(v) ++#define atomic64_sub_bug(i, v) atomic64_sub(i, v) ++#define atomic64_add_bug(i, v) atomic64_add(i, v) ++ ++#endif ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++#define CPU_BIG_ENDIAN 0 ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++#define CPU_BIG_ENDIAN 1 ++#endif ++ ++/* type hackery */ ++ ++#define type_is_exact(_val, _type) \ ++ __builtin_types_compatible_p(typeof(_val), _type) ++ ++#define type_is(_val, _type) \ ++ (__builtin_types_compatible_p(typeof(_val), _type) || \ ++ __builtin_types_compatible_p(typeof(_val), const _type)) ++ ++/* Userspace doesn't align allocations as nicely as the kernel allocators: */ ++static inline size_t buf_pages(void *p, size_t len) ++{ ++ return DIV_ROUND_UP(len + ++ ((unsigned long) p & (PAGE_SIZE - 1)), ++ PAGE_SIZE); ++} ++ ++static inline void vpfree(void *p, size_t size) ++{ ++ if (is_vmalloc_addr(p)) ++ vfree(p); ++ else ++ free_pages((unsigned long) p, get_order(size)); ++} ++ ++static inline void *vpmalloc(size_t size, gfp_t gfp_mask) ++{ ++ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, ++ get_order(size)) ?: ++ __vmalloc(size, gfp_mask); ++} ++ ++static inline void kvpfree(void *p, size_t size) ++{ ++ if (size < PAGE_SIZE) ++ kfree(p); ++ else ++ vpfree(p, size); ++} ++ ++static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) ++{ ++ return size < PAGE_SIZE ++ ? kmalloc(size, gfp_mask) ++ : vpmalloc(size, gfp_mask); ++} ++ ++int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); ++ ++#define HEAP(type) \ ++struct { \ ++ size_t size, used; \ ++ type *data; \ ++} ++ ++#define DECLARE_HEAP(type, name) HEAP(type) name ++ ++#define init_heap(heap, _size, gfp) \ ++({ \ ++ (heap)->used = 0; \ ++ (heap)->size = (_size); \ ++ (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ ++ (gfp)); \ ++}) ++ ++#define free_heap(heap) \ ++do { \ ++ kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ ++ (heap)->data = NULL; \ ++} while (0) ++ ++#define heap_set_backpointer(h, i, _fn) \ ++do { \ ++ void (*fn)(typeof(h), size_t) = _fn; \ ++ if (fn) \ ++ fn(h, i); \ ++} while (0) ++ ++#define heap_swap(h, i, j, set_backpointer) \ ++do { \ ++ swap((h)->data[i], (h)->data[j]); \ ++ heap_set_backpointer(h, i, set_backpointer); \ ++ heap_set_backpointer(h, j, set_backpointer); \ ++} while (0) ++ ++#define heap_peek(h) \ ++({ \ ++ EBUG_ON(!(h)->used); \ ++ (h)->data[0]; \ ++}) ++ ++#define heap_full(h) ((h)->used == (h)->size) ++ ++#define heap_sift_down(h, i, cmp, set_backpointer) \ ++do { \ ++ size_t _c, _j = i; \ ++ \ ++ for (; _j * 2 + 1 < (h)->used; _j = _c) { \ ++ _c = _j * 2 + 1; \ ++ if (_c + 1 < (h)->used && \ ++ cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \ ++ _c++; \ ++ \ ++ if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ ++ break; \ ++ heap_swap(h, _c, _j, set_backpointer); \ ++ } \ ++} while (0) ++ ++#define heap_sift_up(h, i, cmp, set_backpointer) \ ++do { \ ++ while (i) { \ ++ size_t p = (i - 1) / 2; \ ++ if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ ++ break; \ ++ heap_swap(h, i, p, set_backpointer); \ ++ i = p; \ ++ } \ ++} while (0) ++ ++#define __heap_add(h, d, cmp, set_backpointer) \ ++({ \ ++ size_t _i = (h)->used++; \ ++ (h)->data[_i] = d; \ ++ heap_set_backpointer(h, _i, 
set_backpointer); \ ++ \ ++ heap_sift_up(h, _i, cmp, set_backpointer); \ ++ _i; \ ++}) ++ ++#define heap_add(h, d, cmp, set_backpointer) \ ++({ \ ++ bool _r = !heap_full(h); \ ++ if (_r) \ ++ __heap_add(h, d, cmp, set_backpointer); \ ++ _r; \ ++}) ++ ++#define heap_add_or_replace(h, new, cmp, set_backpointer) \ ++do { \ ++ if (!heap_add(h, new, cmp, set_backpointer) && \ ++ cmp(h, new, heap_peek(h)) >= 0) { \ ++ (h)->data[0] = new; \ ++ heap_set_backpointer(h, 0, set_backpointer); \ ++ heap_sift_down(h, 0, cmp, set_backpointer); \ ++ } \ ++} while (0) ++ ++#define heap_del(h, i, cmp, set_backpointer) \ ++do { \ ++ size_t _i = (i); \ ++ \ ++ BUG_ON(_i >= (h)->used); \ ++ (h)->used--; \ ++ if ((_i) < (h)->used) { \ ++ heap_swap(h, _i, (h)->used, set_backpointer); \ ++ heap_sift_up(h, _i, cmp, set_backpointer); \ ++ heap_sift_down(h, _i, cmp, set_backpointer); \ ++ } \ ++} while (0) ++ ++#define heap_pop(h, d, cmp, set_backpointer) \ ++({ \ ++ bool _r = (h)->used; \ ++ if (_r) { \ ++ (d) = (h)->data[0]; \ ++ heap_del(h, 0, cmp, set_backpointer); \ ++ } \ ++ _r; \ ++}) ++ ++#define heap_resort(heap, cmp, set_backpointer) \ ++do { \ ++ ssize_t _i; \ ++ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ ++ heap_sift_down(heap, _i, cmp, set_backpointer); \ ++} while (0) ++ ++#define ANYSINT_MAX(t) \ ++ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) ++ ++ ++#ifdef __KERNEL__ ++static inline void pr_time(struct printbuf *out, u64 time) ++{ ++ prt_printf(out, "%llu", time); ++} ++#else ++#include ++static inline void pr_time(struct printbuf *out, u64 _time) ++{ ++ char time_str[64]; ++ time_t time = _time; ++ struct tm *tm = localtime(&time); ++ size_t err = strftime(time_str, sizeof(time_str), "%c", tm); ++ if (!err) ++ prt_printf(out, "(formatting error)"); ++ else ++ prt_printf(out, "%s", time_str); ++} ++#endif ++ ++#ifdef __KERNEL__ ++static inline void uuid_unparse_lower(u8 *uuid, char *out) ++{ ++ sprintf(out, "%pUb", uuid); ++} ++#else ++#include ++#endif ++ ++static inline void pr_uuid(struct printbuf *out, u8 *uuid) ++{ ++ char uuid_str[40]; ++ ++ uuid_unparse_lower(uuid, uuid_str); ++ prt_printf(out, "%s", uuid_str); ++} ++ ++int bch2_strtoint_h(const char *, int *); ++int bch2_strtouint_h(const char *, unsigned int *); ++int bch2_strtoll_h(const char *, long long *); ++int bch2_strtoull_h(const char *, unsigned long long *); ++int bch2_strtou64_h(const char *, u64 *); ++ ++static inline int bch2_strtol_h(const char *cp, long *res) ++{ ++#if BITS_PER_LONG == 32 ++ return bch2_strtoint_h(cp, (int *) res); ++#else ++ return bch2_strtoll_h(cp, (long long *) res); ++#endif ++} ++ ++static inline int bch2_strtoul_h(const char *cp, long *res) ++{ ++#if BITS_PER_LONG == 32 ++ return bch2_strtouint_h(cp, (unsigned int *) res); ++#else ++ return bch2_strtoull_h(cp, (unsigned long long *) res); ++#endif ++} ++ ++#define strtoi_h(cp, res) \ ++ ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ ++ : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ ++ : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ ++ : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ ++ : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ ++ : type_is(*res, unsigned long long) ? 
bch2_strtoull_h(cp, (void *) res)\ ++ : -EINVAL) ++ ++#define strtoul_safe(cp, var) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r) \ ++ var = _v; \ ++ _r; \ ++}) ++ ++#define strtoul_safe_clamp(cp, var, min, max) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r) \ ++ var = clamp_t(typeof(var), _v, min, max); \ ++ _r; \ ++}) ++ ++#define strtoul_safe_restrict(cp, var, min, max) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r && _v >= min && _v <= max) \ ++ var = _v; \ ++ else \ ++ _r = -EINVAL; \ ++ _r; \ ++}) ++ ++#define snprint(out, var) \ ++ prt_printf(out, \ ++ type_is(var, int) ? "%i\n" \ ++ : type_is(var, unsigned) ? "%u\n" \ ++ : type_is(var, long) ? "%li\n" \ ++ : type_is(var, unsigned long) ? "%lu\n" \ ++ : type_is(var, s64) ? "%lli\n" \ ++ : type_is(var, u64) ? "%llu\n" \ ++ : type_is(var, char *) ? "%s\n" \ ++ : "%i\n", var) ++ ++bool bch2_is_zero(const void *, size_t); ++ ++u64 bch2_read_flag_list(char *, const char * const[]); ++ ++#define NR_QUANTILES 15 ++#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) ++#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) ++#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) ++ ++struct quantiles { ++ struct quantile_entry { ++ u64 m; ++ u64 step; ++ } entries[NR_QUANTILES]; ++}; ++ ++struct time_stat_buffer { ++ unsigned nr; ++ struct time_stat_buffer_entry { ++ u64 start; ++ u64 end; ++ } entries[32]; ++}; ++ ++struct time_stats { ++ spinlock_t lock; ++ u64 count; ++ /* all fields are in nanoseconds */ ++ u64 average_duration; ++ u64 average_frequency; ++ u64 max_duration; ++ u64 last_event; ++ struct quantiles quantiles; ++ ++ struct time_stat_buffer __percpu *buffer; ++}; ++ ++void __bch2_time_stats_update(struct time_stats *stats, u64, u64); ++ ++static inline void bch2_time_stats_update(struct time_stats *stats, u64 start) ++{ ++ __bch2_time_stats_update(stats, start, local_clock()); ++} ++ ++void bch2_time_stats_to_text(struct printbuf *, struct time_stats *); ++ ++void bch2_time_stats_exit(struct time_stats *); ++void bch2_time_stats_init(struct time_stats *); ++ ++#define ewma_add(ewma, val, weight) \ ++({ \ ++ typeof(ewma) _ewma = (ewma); \ ++ typeof(weight) _weight = (weight); \ ++ \ ++ (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ ++}) ++ ++struct bch_ratelimit { ++ /* Next time we want to do some work, in nanoseconds */ ++ u64 next; ++ ++ /* ++ * Rate at which we want to do work, in units per nanosecond ++ * The units here correspond to the units passed to ++ * bch2_ratelimit_increment() ++ */ ++ unsigned rate; ++}; ++ ++static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) ++{ ++ d->next = local_clock(); ++} ++ ++u64 bch2_ratelimit_delay(struct bch_ratelimit *); ++void bch2_ratelimit_increment(struct bch_ratelimit *, u64); ++ ++struct bch_pd_controller { ++ struct bch_ratelimit rate; ++ unsigned long last_update; ++ ++ s64 last_actual; ++ s64 smoothed_derivative; ++ ++ unsigned p_term_inverse; ++ unsigned d_smooth; ++ unsigned d_term; ++ ++ /* for exporting to sysfs (no effect on behavior) */ ++ s64 last_derivative; ++ s64 last_proportional; ++ s64 last_change; ++ s64 last_target; ++ ++ /* If true, the rate will not increase if bch2_ratelimit_delay() ++ * is not being called often enough. 
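/*
 * Aside, not part of the patch: ewma_add() above maintains an exponentially
 * weighted moving average using only shifts and adds; the update it computes
 * is new = ((old << w) - old + val) >> w, i.e. old + (val - old) / 2^w.
 * A minimal standalone sketch of the same arithmetic follows; demo_ewma_add
 * and the sample values are invented for illustration only.
 */
#include <stdio.h>

static unsigned long demo_ewma_add(unsigned long ewma, unsigned long val,
				   unsigned weight)
{
	/* same shift/subtract form as the ewma_add() macro above */
	return (((ewma << weight) - ewma) + val) >> weight;
}

int main(void)
{
	unsigned long avg = 0;
	unsigned i;

	/* with weight 3, each sample moves the average about 1/8 of the way */
	for (i = 0; i < 32; i++)
		avg = demo_ewma_add(avg, 1000, 3);
	printf("average after 32 samples of 1000: %lu\n", avg);
	return 0;
}
/* Build with: cc -o ewma_demo ewma_demo.c */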
*/ ++ bool backpressure; ++}; ++ ++void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); ++void bch2_pd_controller_init(struct bch_pd_controller *); ++void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *); ++ ++#define sysfs_pd_controller_attribute(name) \ ++ rw_attribute(name##_rate); \ ++ rw_attribute(name##_rate_bytes); \ ++ rw_attribute(name##_rate_d_term); \ ++ rw_attribute(name##_rate_p_term_inverse); \ ++ read_attribute(name##_rate_debug) ++ ++#define sysfs_pd_controller_files(name) \ ++ &sysfs_##name##_rate, \ ++ &sysfs_##name##_rate_bytes, \ ++ &sysfs_##name##_rate_d_term, \ ++ &sysfs_##name##_rate_p_term_inverse, \ ++ &sysfs_##name##_rate_debug ++ ++#define sysfs_pd_controller_show(name, var) \ ++do { \ ++ sysfs_hprint(name##_rate, (var)->rate.rate); \ ++ sysfs_print(name##_rate_bytes, (var)->rate.rate); \ ++ sysfs_print(name##_rate_d_term, (var)->d_term); \ ++ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ ++ \ ++ if (attr == &sysfs_##name##_rate_debug) \ ++ bch2_pd_controller_debug_to_text(out, var); \ ++} while (0) ++ ++#define sysfs_pd_controller_store(name, var) \ ++do { \ ++ sysfs_strtoul_clamp(name##_rate, \ ++ (var)->rate.rate, 1, UINT_MAX); \ ++ sysfs_strtoul_clamp(name##_rate_bytes, \ ++ (var)->rate.rate, 1, UINT_MAX); \ ++ sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ ++ sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ ++ (var)->p_term_inverse, 1, INT_MAX); \ ++} while (0) ++ ++#define container_of_or_null(ptr, type, member) \ ++({ \ ++ typeof(ptr) _ptr = ptr; \ ++ _ptr ? container_of(_ptr, type, member) : NULL; \ ++}) ++ ++/* Does linear interpolation between powers of two */ ++static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) ++{ ++ unsigned fract = x & ~(~0 << fract_bits); ++ ++ x >>= fract_bits; ++ x = 1 << x; ++ x += (x * fract) >> fract_bits; ++ ++ return x; ++} ++ ++void bch2_bio_map(struct bio *bio, void *base, size_t); ++int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); ++ ++static inline sector_t bdev_sectors(struct block_device *bdev) ++{ ++ return bdev->bd_inode->i_size >> 9; ++} ++ ++#define closure_bio_submit(bio, cl) \ ++do { \ ++ closure_get(cl); \ ++ submit_bio(bio); \ ++} while (0) ++ ++#define kthread_wait_freezable(cond) \ ++({ \ ++ int _ret = 0; \ ++ while (1) { \ ++ set_current_state(TASK_INTERRUPTIBLE); \ ++ if (kthread_should_stop()) { \ ++ _ret = -1; \ ++ break; \ ++ } \ ++ \ ++ if (cond) \ ++ break; \ ++ \ ++ schedule(); \ ++ try_to_freeze(); \ ++ } \ ++ set_current_state(TASK_RUNNING); \ ++ _ret; \ ++}) ++ ++size_t bch2_rand_range(size_t); ++ ++void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); ++void memcpy_from_bio(void *, struct bio *, struct bvec_iter); ++ ++static inline void memcpy_u64s_small(void *dst, const void *src, ++ unsigned u64s) ++{ ++ u64 *d = dst; ++ const u64 *s = src; ++ ++ while (u64s--) ++ *d++ = *s++; ++} ++ ++static inline void __memcpy_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++#ifdef CONFIG_X86_64 ++ long d0, d1, d2; ++ asm volatile("rep ; movsq" ++ : "=&c" (d0), "=&D" (d1), "=&S" (d2) ++ : "0" (u64s), "1" (dst), "2" (src) ++ : "memory"); ++#else ++ u64 *d = dst; ++ const u64 *s = src; ++ ++ while (u64s--) ++ *d++ = *s++; ++#endif ++} ++ ++static inline void memcpy_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(!(dst >= src + u64s * sizeof(u64) || ++ dst + u64s * sizeof(u64) <= src)); ++ ++ __memcpy_u64s(dst, src, u64s); ++} ++ ++static inline void 
__memmove_u64s_down(void *dst, const void *src, ++ unsigned u64s) ++{ ++ __memcpy_u64s(dst, src, u64s); ++} ++ ++static inline void memmove_u64s_down(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst > src); ++ ++ __memmove_u64s_down(dst, src, u64s); ++} ++ ++static inline void __memmove_u64s_up_small(void *_dst, const void *_src, ++ unsigned u64s) ++{ ++ u64 *dst = (u64 *) _dst + u64s; ++ u64 *src = (u64 *) _src + u64s; ++ ++ while (u64s--) ++ *--dst = *--src; ++} ++ ++static inline void memmove_u64s_up_small(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst < src); ++ ++ __memmove_u64s_up_small(dst, src, u64s); ++} ++ ++static inline void __memmove_u64s_up(void *_dst, const void *_src, ++ unsigned u64s) ++{ ++ u64 *dst = (u64 *) _dst + u64s - 1; ++ u64 *src = (u64 *) _src + u64s - 1; ++ ++#ifdef CONFIG_X86_64 ++ long d0, d1, d2; ++ asm volatile("std ;\n" ++ "rep ; movsq\n" ++ "cld ;\n" ++ : "=&c" (d0), "=&D" (d1), "=&S" (d2) ++ : "0" (u64s), "1" (dst), "2" (src) ++ : "memory"); ++#else ++ while (u64s--) ++ *dst-- = *src--; ++#endif ++} ++ ++static inline void memmove_u64s_up(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst < src); ++ ++ __memmove_u64s_up(dst, src, u64s); ++} ++ ++static inline void memmove_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++ if (dst < src) ++ __memmove_u64s_down(dst, src, u64s); ++ else ++ __memmove_u64s_up(dst, src, u64s); ++} ++ ++/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */ ++static inline void memset_u64s_tail(void *s, int c, unsigned bytes) ++{ ++ unsigned rem = round_up(bytes, sizeof(u64)) - bytes; ++ ++ memset(s + bytes, c, rem); ++} ++ ++void sort_cmp_size(void *base, size_t num, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)); ++ ++/* just the memmove, doesn't update @_nr */ ++#define __array_insert_item(_array, _nr, _pos) \ ++ memmove(&(_array)[(_pos) + 1], \ ++ &(_array)[(_pos)], \ ++ sizeof((_array)[0]) * ((_nr) - (_pos))) ++ ++#define array_insert_item(_array, _nr, _pos, _new_item) \ ++do { \ ++ __array_insert_item(_array, _nr, _pos); \ ++ (_nr)++; \ ++ (_array)[(_pos)] = (_new_item); \ ++} while (0) ++ ++#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ ++do { \ ++ (_nr) -= (_nr_to_remove); \ ++ memmove(&(_array)[(_pos)], \ ++ &(_array)[(_pos) + (_nr_to_remove)], \ ++ sizeof((_array)[0]) * ((_nr) - (_pos))); \ ++} while (0) ++ ++#define array_remove_item(_array, _nr, _pos) \ ++ array_remove_items(_array, _nr, _pos, 1) ++ ++static inline void __move_gap(void *array, size_t element_size, ++ size_t nr, size_t size, ++ size_t old_gap, size_t new_gap) ++{ ++ size_t gap_end = old_gap + size - nr; ++ ++ if (new_gap < old_gap) { ++ size_t move = old_gap - new_gap; ++ ++ memmove(array + element_size * (gap_end - move), ++ array + element_size * (old_gap - move), ++ element_size * move); ++ } else if (new_gap > old_gap) { ++ size_t move = new_gap - old_gap; ++ ++ memmove(array + element_size * old_gap, ++ array + element_size * gap_end, ++ element_size * move); ++ } ++} ++ ++/* Move the gap in a gap buffer: */ ++#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \ ++ __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap) ++ ++#define bubble_sort(_base, _nr, _cmp) \ ++do { \ ++ ssize_t _i, _end; \ ++ bool _swapped = true; \ ++ \ ++ for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\ ++ _swapped = false; \ ++ for (_i = 0; _i < _end; _i++) \ ++ if 
(_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ ++ swap((_base)[_i], (_base)[_i + 1]); \ ++ _swapped = true; \ ++ } \ ++ } \ ++} while (0) ++ ++static inline u64 percpu_u64_get(u64 __percpu *src) ++{ ++ u64 ret = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ ret += *per_cpu_ptr(src, cpu); ++ return ret; ++} ++ ++static inline void percpu_u64_set(u64 __percpu *dst, u64 src) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ *per_cpu_ptr(dst, cpu) = 0; ++ this_cpu_write(*dst, src); ++} ++ ++static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) ++{ ++ unsigned i; ++ ++ for (i = 0; i < nr; i++) ++ acc[i] += src[i]; ++} ++ ++static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, ++ unsigned nr) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ acc_u64s(acc, per_cpu_ptr(src, cpu), nr); ++} ++ ++static inline void percpu_memset(void __percpu *p, int c, size_t bytes) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ memset(per_cpu_ptr(p, cpu), c, bytes); ++} ++ ++u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); ++ ++#define cmp_int(l, r) ((l > r) - (l < r)) ++ ++static inline int u8_cmp(u8 l, u8 r) ++{ ++ return cmp_int(l, r); ++} ++ ++#endif /* _BCACHEFS_UTIL_H */ +diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c +new file mode 100644 +index 000000000000..5143b603bf67 +--- /dev/null ++++ b/fs/bcachefs/varint.c +@@ -0,0 +1,121 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_VALGRIND ++#include ++#endif ++ ++#include "varint.h" ++ ++/** ++ * bch2_varint_encode - encode a variable length integer ++ * @out - destination to encode to ++ * @v - unsigned integer to encode ++ * ++ * Returns the size in bytes of the encoded integer - at most 9 bytes ++ */ ++int bch2_varint_encode(u8 *out, u64 v) ++{ ++ unsigned bits = fls64(v|1); ++ unsigned bytes = DIV_ROUND_UP(bits, 7); ++ ++ if (likely(bytes < 9)) { ++ v <<= bytes; ++ v |= ~(~0 << (bytes - 1)); ++ v = cpu_to_le64(v); ++ memcpy(out, &v, bytes); ++ } else { ++ *out++ = 255; ++ bytes = 9; ++ put_unaligned_le64(v, out); ++ } ++ ++ return bytes; ++} ++ ++/** ++ * bch2_varint_decode - encode a variable length integer ++ * @in - varint to decode ++ * @end - end of buffer to decode from ++ * @out - on success, decoded integer ++ * ++ * Returns the size in bytes of the decoded integer - or -1 on failure (would ++ * have read past the end of the buffer) ++ */ ++int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) ++{ ++ unsigned bytes = likely(in < end) ++ ? ffz(*in & 255) + 1 ++ : 1; ++ u64 v; ++ ++ if (unlikely(in + bytes > end)) ++ return -1; ++ ++ if (likely(bytes < 9)) { ++ v = 0; ++ memcpy(&v, in, bytes); ++ v = le64_to_cpu(v); ++ v >>= bytes; ++ } else { ++ v = get_unaligned_le64(++in); ++ } ++ ++ *out = v; ++ return bytes; ++} ++ ++/** ++ * bch2_varint_encode_fast - fast version of bch2_varint_encode ++ * ++ * This version assumes it's always safe to write 8 bytes to @out, even if the ++ * encoded integer would be smaller. 
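/*
 * Aside, not part of the patch: the varint layout implemented by
 * bch2_varint_encode()/bch2_varint_decode() above stores the encoded length
 * as the number of trailing 1-bits in the first byte (plus one), and falls
 * back to a 0xff prefix followed by the raw little-endian u64 when 9 bytes
 * are needed. The standalone round-trip below mirrors that layout for
 * illustration; demo_varint_encode/demo_varint_decode and the local
 * bit-length helper are invented names, not the kernel helpers.
 */
#include <assert.h>
#include <stdint.h>

static unsigned demo_fls64(uint64_t v)
{
	unsigned r = 0;

	while (v) {
		r++;
		v >>= 1;
	}
	return r;
}

static int demo_varint_encode(uint8_t *out, uint64_t v)
{
	unsigned bits = demo_fls64(v | 1);
	unsigned bytes = (bits + 6) / 7;
	unsigned i;

	if (bytes < 9) {
		v = (v << bytes) | ((1ULL << (bytes - 1)) - 1);
		for (i = 0; i < bytes; i++)
			out[i] = v >> (8 * i);
	} else {
		out[0] = 255;
		bytes = 9;
		for (i = 0; i < 8; i++)
			out[1 + i] = v >> (8 * i);
	}
	return bytes;
}

static int demo_varint_decode(const uint8_t *in, uint64_t *out)
{
	unsigned bytes = 1, i;
	uint64_t v = 0;

	while (bytes <= 8 && (in[0] & (1u << (bytes - 1))))
		bytes++;

	if (bytes < 9) {
		for (i = 0; i < bytes; i++)
			v |= (uint64_t) in[i] << (8 * i);
		v >>= bytes;
	} else {
		for (i = 0; i < 8; i++)
			v |= (uint64_t) in[1 + i] << (8 * i);
	}
	*out = v;
	return bytes;
}

int main(void)
{
	uint64_t vals[] = { 0, 127, 128, (1ULL << 56) - 1, 1ULL << 56, ~0ULL };
	unsigned i;

	for (i = 0; i < sizeof(vals) / sizeof(vals[0]); i++) {
		uint8_t buf[9];
		uint64_t got;
		int n = demo_varint_encode(buf, vals[i]);

		assert(demo_varint_decode(buf, &got) == n);
		assert(got == vals[i]);
	}
	return 0;
}
/* e.g. 127 encodes in 1 byte, 128 in 2, and 1 << 56 needs the 9-byte form. */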
++ */ ++int bch2_varint_encode_fast(u8 *out, u64 v) ++{ ++ unsigned bits = fls64(v|1); ++ unsigned bytes = DIV_ROUND_UP(bits, 7); ++ ++ if (likely(bytes < 9)) { ++ v <<= bytes; ++ v |= ~(~0 << (bytes - 1)); ++ } else { ++ *out++ = 255; ++ bytes = 9; ++ } ++ ++ put_unaligned_le64(v, out); ++ return bytes; ++} ++ ++/** ++ * bch2_varint_decode_fast - fast version of bch2_varint_decode ++ * ++ * This version assumes that it is safe to read at most 8 bytes past the end of ++ * @end (we still return an error if the varint extends past @end). ++ */ ++int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) ++{ ++#ifdef CONFIG_VALGRIND ++ VALGRIND_MAKE_MEM_DEFINED(in, 8); ++#endif ++ u64 v = get_unaligned_le64(in); ++ unsigned bytes = ffz(*in) + 1; ++ ++ if (unlikely(in + bytes > end)) ++ return -1; ++ ++ if (likely(bytes < 9)) { ++ v >>= bytes; ++ v &= ~(~0ULL << (7 * bytes)); ++ } else { ++ v = get_unaligned_le64(++in); ++ } ++ ++ *out = v; ++ return bytes; ++} +diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h +new file mode 100644 +index 000000000000..92a182fb3d7a +--- /dev/null ++++ b/fs/bcachefs/varint.h +@@ -0,0 +1,11 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_VARINT_H ++#define _BCACHEFS_VARINT_H ++ ++int bch2_varint_encode(u8 *, u64); ++int bch2_varint_decode(const u8 *, const u8 *, u64 *); ++ ++int bch2_varint_encode_fast(u8 *, u64); ++int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *); ++ ++#endif /* _BCACHEFS_VARINT_H */ +diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h +new file mode 100644 +index 000000000000..53a694d71967 +--- /dev/null ++++ b/fs/bcachefs/vstructs.h +@@ -0,0 +1,63 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _VSTRUCTS_H ++#define _VSTRUCTS_H ++ ++#include "util.h" ++ ++/* ++ * NOTE: we can't differentiate between __le64 and u64 with type_is - this ++ * assumes u64 is little endian: ++ */ ++#define __vstruct_u64s(_s) \ ++({ \ ++ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ ++ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ ++ : type_is((_s)->u64s, u16) ? 
le16_to_cpu((__force __le16) (_s)->u64s) \ ++ : ((__force u8) ((_s)->u64s))); \ ++}) ++ ++#define __vstruct_bytes(_type, _u64s) \ ++({ \ ++ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ ++ \ ++ (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ ++}) ++ ++#define vstruct_bytes(_s) \ ++ __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) ++ ++#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ ++ (round_up(__vstruct_bytes(_type, _u64s), \ ++ 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) ++ ++#define vstruct_blocks(_s, _sector_block_bits) \ ++ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) ++ ++#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ ++ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ ++ __vstruct_u64s(_s) + (_u64s)) ++ ++#define vstruct_sectors(_s, _sector_block_bits) \ ++ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) ++ ++#define vstruct_next(_s) \ ++ ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) ++#define vstruct_last(_s) \ ++ ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) ++#define vstruct_end(_s) \ ++ ((void *) ((_s)->_data + __vstruct_u64s(_s))) ++ ++#define vstruct_for_each(_s, _i) \ ++ for (_i = (_s)->start; \ ++ _i < vstruct_last(_s); \ ++ _i = vstruct_next(_i)) ++ ++#define vstruct_for_each_safe(_s, _i, _t) \ ++ for (_i = (_s)->start; \ ++ _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ ++ _i = _t) ++ ++#define vstruct_idx(_s, _idx) \ ++ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) ++ ++#endif /* _VSTRUCTS_H */ +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +new file mode 100644 +index 000000000000..186ffab542d5 +--- /dev/null ++++ b/fs/bcachefs/xattr.c +@@ -0,0 +1,648 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "extents.h" ++#include "fs.h" ++#include "rebalance.h" ++#include "str_hash.h" ++#include "xattr.h" ++ ++#include ++#include ++#include ++ ++static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); ++ ++static u64 bch2_xattr_hash(const struct bch_hash_info *info, ++ const struct xattr_search_key *key) ++{ ++ struct bch_str_hash_ctx ctx; ++ ++ bch2_str_hash_init(&ctx, info); ++ bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); ++ bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); ++ ++ return bch2_str_hash_end(&ctx, info); ++} ++ ++static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key) ++{ ++ return bch2_xattr_hash(info, key); ++} ++ ++static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) ++{ ++ struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); ++ ++ return bch2_xattr_hash(info, ++ &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); ++} ++ ++static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) ++{ ++ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); ++ const struct xattr_search_key *r = _r; ++ ++ return l.v->x_type != r->type || ++ l.v->x_name_len != r->name.len || ++ memcmp(l.v->x_name, r->name.name, r->name.len); ++} ++ ++static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); ++ struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); ++ ++ return l.v->x_type != r.v->x_type || ++ l.v->x_name_len != r.v->x_name_len || ++ memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); ++} ++ ++const struct bch_hash_desc bch2_xattr_hash_desc = { ++ .btree_id = BTREE_ID_xattrs, ++ .key_type 
= KEY_TYPE_xattr, ++ .hash_key = xattr_hash_key, ++ .hash_bkey = xattr_hash_bkey, ++ .cmp_key = xattr_cmp_key, ++ .cmp_bkey = xattr_cmp_bkey, ++}; ++ ++int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) ++{ ++ const struct xattr_handler *handler; ++ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*xattr.v)); ++ return -EINVAL; ++ } ++ ++ if (bkey_val_u64s(k.k) < ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len))) { ++ prt_printf(err, "value too small (%zu < %u)", ++ bkey_val_u64s(k.k), ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len))); ++ return -EINVAL; ++ } ++ ++ /* XXX why +4 ? */ ++ if (bkey_val_u64s(k.k) > ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len) + 4)) { ++ prt_printf(err, "value too big (%zu > %u)", ++ bkey_val_u64s(k.k), ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len) + 4)); ++ return -EINVAL; ++ } ++ ++ handler = bch2_xattr_type_to_handler(xattr.v->x_type); ++ if (!handler) { ++ prt_printf(err, "invalid type (%u)", xattr.v->x_type); ++ return -EINVAL; ++ } ++ ++ if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) { ++ prt_printf(err, "xattr name has invalid characters"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct xattr_handler *handler; ++ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); ++ ++ handler = bch2_xattr_type_to_handler(xattr.v->x_type); ++ if (handler && handler->prefix) ++ prt_printf(out, "%s", handler->prefix); ++ else if (handler) ++ prt_printf(out, "(type %u)", xattr.v->x_type); ++ else ++ prt_printf(out, "(unknown type %u)", xattr.v->x_type); ++ ++ prt_printf(out, "%.*s:%.*s", ++ xattr.v->x_name_len, ++ xattr.v->x_name, ++ le16_to_cpu(xattr.v->x_val_len), ++ (char *) xattr_val(xattr.v)); ++} ++ ++static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, ++ const char *name, void *buffer, size_t size, int type) ++{ ++ struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); ++ struct btree_iter iter; ++ struct bkey_s_c_xattr xattr; ++ struct bkey_s_c k; ++ int ret; ++ ++ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, ++ inode_inum(inode), ++ &X_SEARCH(type, name, strlen(name)), ++ 0); ++ if (ret) ++ goto err1; ++ ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err2; ++ ++ xattr = bkey_s_c_to_xattr(k); ++ ret = le16_to_cpu(xattr.v->x_val_len); ++ if (buffer) { ++ if (ret > size) ++ ret = -ERANGE; ++ else ++ memcpy(buffer, xattr_val(xattr.v), ret); ++ } ++err2: ++ bch2_trans_iter_exit(trans, &iter); ++err1: ++ return ret == -ENOENT ? 
-ENODATA : ret; ++} ++ ++int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, ++ const char *name, void *buffer, size_t size, int type) ++{ ++ return bch2_trans_do(c, NULL, NULL, 0, ++ bch2_xattr_get_trans(&trans, inode, name, buffer, size, type)); ++} ++ ++int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, ++ const struct bch_hash_info *hash_info, ++ const char *name, const void *value, size_t size, ++ int type, int flags) ++{ ++ struct btree_iter inode_iter = { NULL }; ++ struct bch_inode_unpacked inode_u; ++ int ret; ++ ++ /* ++ * We need to do an inode update so that bi_journal_sync gets updated ++ * and fsync works: ++ * ++ * Perhaps we should be updating bi_mtime too? ++ */ ++ ++ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, BTREE_ITER_INTENT) ?: ++ bch2_inode_write(trans, &inode_iter, &inode_u); ++ bch2_trans_iter_exit(trans, &inode_iter); ++ ++ if (ret) ++ return ret; ++ ++ if (value) { ++ struct bkey_i_xattr *xattr; ++ unsigned namelen = strlen(name); ++ unsigned u64s = BKEY_U64s + ++ xattr_val_u64s(namelen, size); ++ ++ if (u64s > U8_MAX) ++ return -ERANGE; ++ ++ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(xattr)) ++ return PTR_ERR(xattr); ++ ++ bkey_xattr_init(&xattr->k_i); ++ xattr->k.u64s = u64s; ++ xattr->v.x_type = type; ++ xattr->v.x_name_len = namelen; ++ xattr->v.x_val_len = cpu_to_le16(size); ++ memcpy(xattr->v.x_name, name, namelen); ++ memcpy(xattr_val(&xattr->v), value, size); ++ ++ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, ++ inum, &xattr->k_i, ++ (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| ++ (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); ++ } else { ++ struct xattr_search_key search = ++ X_SEARCH(type, name, strlen(name)); ++ ++ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, ++ hash_info, inum, &search); ++ } ++ ++ if (ret == -ENOENT) ++ ret = flags & XATTR_REPLACE ? -ENODATA : 0; ++ ++ return ret; ++} ++ ++struct xattr_buf { ++ char *buf; ++ size_t len; ++ size_t used; ++}; ++ ++static int __bch2_xattr_emit(const char *prefix, ++ const char *name, size_t name_len, ++ struct xattr_buf *buf) ++{ ++ const size_t prefix_len = strlen(prefix); ++ const size_t total_len = prefix_len + name_len + 1; ++ ++ if (buf->buf) { ++ if (buf->used + total_len > buf->len) ++ return -ERANGE; ++ ++ memcpy(buf->buf + buf->used, prefix, prefix_len); ++ memcpy(buf->buf + buf->used + prefix_len, ++ name, name_len); ++ buf->buf[buf->used + prefix_len + name_len] = '\0'; ++ } ++ ++ buf->used += total_len; ++ return 0; ++} ++ ++static int bch2_xattr_emit(struct dentry *dentry, ++ const struct bch_xattr *xattr, ++ struct xattr_buf *buf) ++{ ++ const struct xattr_handler *handler = ++ bch2_xattr_type_to_handler(xattr->x_type); ++ ++ return handler && (!handler->list || handler->list(dentry)) ++ ? __bch2_xattr_emit(handler->prefix ?: handler->name, ++ xattr->x_name, xattr->x_name_len, buf) ++ : 0; ++} ++ ++static int bch2_xattr_list_bcachefs(struct bch_fs *c, ++ struct bch_inode_unpacked *inode, ++ struct xattr_buf *buf, ++ bool all) ++{ ++ const char *prefix = all ? "bcachefs_effective." 
: "bcachefs."; ++ unsigned id; ++ int ret = 0; ++ u64 v; ++ ++ for (id = 0; id < Inode_opt_nr; id++) { ++ v = bch2_inode_opt_get(inode, id); ++ if (!v) ++ continue; ++ ++ if (!all && ++ !(inode->bi_fields_set & (1 << id))) ++ continue; ++ ++ ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], ++ strlen(bch2_inode_opts[id]), buf); ++ if (ret) ++ break; ++ } ++ ++ return ret; ++} ++ ++ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) ++{ ++ struct bch_fs *c = dentry->d_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; ++ u64 offset = 0, inum = inode->ei_inode.bi_inum; ++ u32 snapshot; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ iter = (struct btree_iter) { NULL }; ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs, ++ SPOS(inum, offset, snapshot), ++ POS(inum, U64_MAX), 0, k, ret) { ++ if (k.k->type != KEY_TYPE_xattr) ++ continue; ++ ++ ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); ++ if (ret) ++ break; ++ } ++ ++ offset = iter.pos.offset; ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ return ret; ++ ++ ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false); ++ if (ret) ++ return ret; ++ ++ ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); ++ if (ret) ++ return ret; ++ ++ return buf.used; ++} ++ ++static int bch2_xattr_get_handler(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ return bch2_xattr_get(c, inode, name, buffer, size, handler->flags); ++} ++ ++static int bch2_xattr_set_handler(const struct xattr_handler *handler, ++ struct user_namespace *mnt_userns, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, const void *value, ++ size_t size, int flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); ++ ++ return bch2_trans_do(c, NULL, NULL, 0, ++ bch2_xattr_set(&trans, inode_inum(inode), &hash, ++ name, value, size, ++ handler->flags, flags)); ++} ++ ++static const struct xattr_handler bch_xattr_user_handler = { ++ .prefix = XATTR_USER_PREFIX, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_USER, ++}; ++ ++static bool bch2_xattr_trusted_list(struct dentry *dentry) ++{ ++ return capable(CAP_SYS_ADMIN); ++} ++ ++static const struct xattr_handler bch_xattr_trusted_handler = { ++ .prefix = XATTR_TRUSTED_PREFIX, ++ .list = bch2_xattr_trusted_list, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, ++}; ++ ++static const struct xattr_handler bch_xattr_security_handler = { ++ .prefix = XATTR_SECURITY_PREFIX, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_SECURITY, ++}; ++ ++#ifndef NO_BCACHEFS_FS ++ ++static int opt_to_inode_opt(int id) ++{ ++ switch (id) { ++#define 
x(name, ...) \ ++ case Opt_##name: return Inode_opt_##name; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ return -1; ++ } ++} ++ ++static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size, ++ bool all) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_opts opts = ++ bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); ++ const struct bch_option *opt; ++ int id, inode_opt_id; ++ struct printbuf out = PRINTBUF; ++ int ret; ++ u64 v; ++ ++ id = bch2_opt_lookup(name); ++ if (id < 0 || !bch2_opt_is_inode_opt(id)) ++ return -EINVAL; ++ ++ inode_opt_id = opt_to_inode_opt(id); ++ if (inode_opt_id < 0) ++ return -EINVAL; ++ ++ opt = bch2_opt_table + id; ++ ++ if (!bch2_opt_defined_by_id(&opts, id)) ++ return -ENODATA; ++ ++ if (!all && ++ !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) ++ return -ENODATA; ++ ++ v = bch2_opt_get_by_id(&opts, id); ++ bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0); ++ ++ ret = out.pos; ++ ++ if (out.allocation_failure) { ++ ret = -ENOMEM; ++ } else if (buffer) { ++ if (out.pos > size) ++ ret = -ERANGE; ++ else ++ memcpy(buffer, out.buf, out.pos); ++ } ++ ++ printbuf_exit(&out); ++ return ret; ++} ++ ++static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, ++ name, buffer, size, false); ++} ++ ++struct inode_opt_set { ++ int id; ++ u64 v; ++ bool defined; ++}; ++ ++static int inode_opt_set_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct inode_opt_set *s = p; ++ ++ if (s->defined) ++ bi->bi_fields_set |= 1U << s->id; ++ else ++ bi->bi_fields_set &= ~(1U << s->id); ++ ++ bch2_inode_opt_set(bi, s->id, s->v); ++ ++ return 0; ++} ++ ++static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, ++ struct user_namespace *mnt_userns, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, const void *value, ++ size_t size, int flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ const struct bch_option *opt; ++ char *buf; ++ struct inode_opt_set s; ++ int opt_id, inode_opt_id, ret; ++ ++ opt_id = bch2_opt_lookup(name); ++ if (opt_id < 0) ++ return -EINVAL; ++ ++ opt = bch2_opt_table + opt_id; ++ ++ inode_opt_id = opt_to_inode_opt(opt_id); ++ if (inode_opt_id < 0) ++ return -EINVAL; ++ ++ s.id = inode_opt_id; ++ ++ if (value) { ++ u64 v = 0; ++ ++ buf = kmalloc(size + 1, GFP_KERNEL); ++ if (!buf) ++ return -ENOMEM; ++ memcpy(buf, value, size); ++ buf[size] = '\0'; ++ ++ ret = bch2_opt_parse(c, opt, buf, &v, NULL); ++ kfree(buf); ++ ++ if (ret < 0) ++ return ret; ++ ++ ret = bch2_opt_check_may_set(c, opt_id, v); ++ if (ret < 0) ++ return ret; ++ ++ s.v = v + 1; ++ s.defined = true; ++ } else { ++ if (!IS_ROOT(dentry)) { ++ struct bch_inode_info *dir = ++ to_bch_ei(d_inode(dentry->d_parent)); ++ ++ s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); ++ } else { ++ s.v = 0; ++ } ++ ++ s.defined = false; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ if (inode_opt_id == Inode_opt_project) { ++ /* ++ * inode fields accessible via the xattr interface are stored ++ * with a +1 bias, so that 0 means unset: ++ */ ++ ret = bch2_set_projid(c, inode, s.v ? 
s.v - 1 : 0); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); ++err: ++ mutex_unlock(&inode->ei_update_lock); ++ ++ if (value && ++ (opt_id == Opt_background_compression || ++ opt_id == Opt_background_target)) ++ bch2_rebalance_add_work(c, inode->v.i_blocks); ++ ++ return ret; ++} ++ ++static const struct xattr_handler bch_xattr_bcachefs_handler = { ++ .prefix = "bcachefs.", ++ .get = bch2_xattr_bcachefs_get, ++ .set = bch2_xattr_bcachefs_set, ++}; ++ ++static int bch2_xattr_bcachefs_get_effective( ++ const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, ++ name, buffer, size, true); ++} ++ ++static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { ++ .prefix = "bcachefs_effective.", ++ .get = bch2_xattr_bcachefs_get_effective, ++ .set = bch2_xattr_bcachefs_set, ++}; ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++const struct xattr_handler *bch2_xattr_handlers[] = { ++ &bch_xattr_user_handler, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ &posix_acl_access_xattr_handler, ++ &posix_acl_default_xattr_handler, ++#endif ++ &bch_xattr_trusted_handler, ++ &bch_xattr_security_handler, ++#ifndef NO_BCACHEFS_FS ++ &bch_xattr_bcachefs_handler, ++ &bch_xattr_bcachefs_effective_handler, ++#endif ++ NULL ++}; ++ ++static const struct xattr_handler *bch_xattr_handler_map[] = { ++ [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, ++ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = ++ &posix_acl_access_xattr_handler, ++ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = ++ &posix_acl_default_xattr_handler, ++ [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, ++ [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, ++}; ++ ++static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) ++{ ++ return type < ARRAY_SIZE(bch_xattr_handler_map) ++ ? 
bch_xattr_handler_map[type] ++ : NULL; ++} +diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h +new file mode 100644 +index 000000000000..66d7a1e30350 +--- /dev/null ++++ b/fs/bcachefs/xattr.h +@@ -0,0 +1,50 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_XATTR_H ++#define _BCACHEFS_XATTR_H ++ ++#include "str_hash.h" ++ ++extern const struct bch_hash_desc bch2_xattr_hash_desc; ++ ++int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_xattr (struct bkey_ops) { \ ++ .key_invalid = bch2_xattr_invalid, \ ++ .val_to_text = bch2_xattr_to_text, \ ++} ++ ++static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) ++{ ++ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + ++ name_len + val_len, sizeof(u64)); ++} ++ ++#define xattr_val(_xattr) \ ++ ((void *) (_xattr)->x_name + (_xattr)->x_name_len) ++ ++struct xattr_search_key { ++ u8 type; ++ struct qstr name; ++}; ++ ++#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ ++ { .type = _type, .name = QSTR_INIT(_name, _len) }) ++ ++struct dentry; ++struct xattr_handler; ++struct bch_hash_info; ++struct bch_inode_info; ++ ++int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, ++ const char *, void *, size_t, int); ++ ++int bch2_xattr_set(struct btree_trans *, subvol_inum, ++ const struct bch_hash_info *, ++ const char *, const void *, size_t, int, int); ++ ++ssize_t bch2_xattr_list(struct dentry *, char *, size_t); ++ ++extern const struct xattr_handler *bch2_xattr_handlers[]; ++ ++#endif /* _BCACHEFS_XATTR_H */ +diff --git a/fs/d_path.c b/fs/d_path.c +index e4e0ebad1f15..1bd9e85f2f65 100644 +--- a/fs/d_path.c ++++ b/fs/d_path.c +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + #include + #include "mount.h" + +@@ -294,6 +295,40 @@ char *d_path(const struct path *path, char *buf, int buflen) + } + EXPORT_SYMBOL(d_path); + ++/** ++ * prt_path - format a path for output ++ * @out: printbuf to output to ++ * @path: path to write into the sequence buffer. ++ * @esc: set of characters to escape in the output ++ * ++ * Write a path name into the sequence buffer. 
++ * ++ * Returns 0 on success, or error code from d_path ++ */ ++int prt_path(struct printbuf *out, const struct path *path, const char *esc) ++{ ++ char *p, *buf; ++ size_t size; ++again: ++ buf = out->buf + out->pos; ++ size = printbuf_remaining_size(out); ++ ++ p = d_path(path, buf, size); ++ if (IS_ERR(p)) { ++ printbuf_make_room(out, max_t(size_t, 64, size * 2)); ++ if (printbuf_remaining_size(out) > size) ++ goto again; ++ ++ return PTR_ERR(p); ++ } ++ ++ p = mangle_path(buf, p, esc); ++ if (p) ++ out->pos += p - buf; ++ return 0; ++} ++EXPORT_SYMBOL(prt_path); ++ + /* + * Helper function for dentry_operations.d_dname() members + */ +diff --git a/fs/dcache.c b/fs/dcache.c +index 93f4f5ee07bf..d90ed65e2a75 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -3193,9 +3193,8 @@ void d_genocide(struct dentry *parent) + + EXPORT_SYMBOL(d_genocide); + +-void d_tmpfile(struct dentry *dentry, struct inode *inode) ++void d_mark_tmpfile(struct dentry *dentry, struct inode *inode) + { +- inode_dec_link_count(inode); + BUG_ON(dentry->d_name.name != dentry->d_iname || + !hlist_unhashed(&dentry->d_u.d_alias) || + !d_unlinked(dentry)); +@@ -3205,6 +3204,13 @@ void d_tmpfile(struct dentry *dentry, struct inode *inode) + (unsigned long long)inode->i_ino); + spin_unlock(&dentry->d_lock); + spin_unlock(&dentry->d_parent->d_lock); ++} ++EXPORT_SYMBOL(d_mark_tmpfile); ++ ++void d_tmpfile(struct dentry *dentry, struct inode *inode) ++{ ++ inode_dec_link_count(inode); ++ d_mark_tmpfile(dentry, inode); + d_instantiate(dentry, inode); + } + EXPORT_SYMBOL(d_tmpfile); +diff --git a/fs/inode.c b/fs/inode.c +index bd4da9c5207e..ac0da28a1ac6 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -56,8 +56,23 @@ + + static unsigned int i_hash_mask __read_mostly; + static unsigned int i_hash_shift __read_mostly; +-static struct hlist_head *inode_hashtable __read_mostly; +-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); ++static struct hlist_bl_head *inode_hashtable __read_mostly; ++ ++static unsigned long hash(struct super_block *sb, unsigned long hashval) ++{ ++ unsigned long tmp; ++ ++ tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / ++ L1_CACHE_BYTES; ++ tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); ++ return tmp & i_hash_mask; ++} ++ ++static inline struct hlist_bl_head *i_hash_head(struct super_block *sb, ++ unsigned int hashval) ++{ ++ return inode_hashtable + hash(sb, hashval); ++} + + /* + * Empty aops. Can be used for the cases where the user does not +@@ -417,7 +432,7 @@ EXPORT_SYMBOL(address_space_init_once); + void inode_init_once(struct inode *inode) + { + memset(inode, 0, sizeof(*inode)); +- INIT_HLIST_NODE(&inode->i_hash); ++ INIT_HLIST_BL_NODE(&inode->i_hash); + INIT_LIST_HEAD(&inode->i_devices); + INIT_LIST_HEAD(&inode->i_io_list); + INIT_LIST_HEAD(&inode->i_wb_list); +@@ -505,14 +520,15 @@ static inline void inode_sb_list_del(struct inode *inode) + } + } + +-static unsigned long hash(struct super_block *sb, unsigned long hashval) ++/* ++ * Ensure that we store the hash head in the inode when we insert the inode into ++ * the hlist_bl_head... 
++ */ ++static inline void ++__insert_inode_hash_head(struct inode *inode, struct hlist_bl_head *b) + { +- unsigned long tmp; +- +- tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / +- L1_CACHE_BYTES; +- tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); +- return tmp & i_hash_mask; ++ hlist_bl_add_head_rcu(&inode->i_hash, b); ++ inode->i_hash_head = b; + } + + /** +@@ -525,13 +541,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) + */ + void __insert_inode_hash(struct inode *inode, unsigned long hashval) + { +- struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); ++ struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval); + +- spin_lock(&inode_hash_lock); ++ hlist_bl_lock(b); + spin_lock(&inode->i_lock); +- hlist_add_head_rcu(&inode->i_hash, b); ++ __insert_inode_hash_head(inode, b); + spin_unlock(&inode->i_lock); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + } + EXPORT_SYMBOL(__insert_inode_hash); + +@@ -543,11 +559,44 @@ EXPORT_SYMBOL(__insert_inode_hash); + */ + void __remove_inode_hash(struct inode *inode) + { +- spin_lock(&inode_hash_lock); +- spin_lock(&inode->i_lock); +- hlist_del_init_rcu(&inode->i_hash); +- spin_unlock(&inode->i_lock); +- spin_unlock(&inode_hash_lock); ++ struct hlist_bl_head *b = inode->i_hash_head; ++ ++ /* ++ * There are some callers that come through here without synchronisation ++ * and potentially with multiple references to the inode. Hence we have ++ * to handle the case that we might race with a remove and insert to a ++ * different list. Coda, in particular, seems to have a userspace API ++ * that can directly trigger "unhash/rehash to different list" behaviour ++ * without any serialisation at all. ++ * ++ * Hence we have to handle the situation where the inode->i_hash_head ++ * might point to a different list than what we expect, indicating that ++ * we raced with another unhash and potentially a new insertion. This ++ * means we have to retest the head once we have everything locked up ++ * and loop again if it doesn't match. ++ */ ++ while (b) { ++ hlist_bl_lock(b); ++ spin_lock(&inode->i_lock); ++ if (b != inode->i_hash_head) { ++ hlist_bl_unlock(b); ++ b = inode->i_hash_head; ++ spin_unlock(&inode->i_lock); ++ continue; ++ } ++ /* ++ * Need to set the pprev pointer to NULL after list removal so ++ * that both RCU traversals and hlist_bl_unhashed() work ++ * correctly at this point. ++ */ ++ hlist_bl_del_rcu(&inode->i_hash); ++ inode->i_hash.pprev = NULL; ++ inode->i_hash_head = NULL; ++ spin_unlock(&inode->i_lock); ++ hlist_bl_unlock(b); ++ break; ++ } ++ + } + EXPORT_SYMBOL(__remove_inode_hash); + +@@ -897,26 +946,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) + return freed; + } + +-static void __wait_on_freeing_inode(struct inode *inode); ++static void __wait_on_freeing_inode(struct hlist_bl_head *b, ++ struct inode *inode); + /* + * Called with the inode lock held. 
+ */ + static struct inode *find_inode(struct super_block *sb, +- struct hlist_head *head, ++ struct hlist_bl_head *b, + int (*test)(struct inode *, void *), + void *data) + { ++ struct hlist_bl_node *node; + struct inode *inode = NULL; + + repeat: +- hlist_for_each_entry(inode, head, i_hash) { ++ hlist_bl_for_each_entry(inode, node, b, i_hash) { + if (inode->i_sb != sb) + continue; + if (!test(inode, data)) + continue; + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE)) { +- __wait_on_freeing_inode(inode); ++ __wait_on_freeing_inode(b, inode); + goto repeat; + } + if (unlikely(inode->i_state & I_CREATING)) { +@@ -935,19 +986,20 @@ static struct inode *find_inode(struct super_block *sb, + * iget_locked for details. + */ + static struct inode *find_inode_fast(struct super_block *sb, +- struct hlist_head *head, unsigned long ino) ++ struct hlist_bl_head *b, unsigned long ino) + { ++ struct hlist_bl_node *node; + struct inode *inode = NULL; + + repeat: +- hlist_for_each_entry(inode, head, i_hash) { ++ hlist_bl_for_each_entry(inode, node, b, i_hash) { + if (inode->i_ino != ino) + continue; + if (inode->i_sb != sb) + continue; + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE)) { +- __wait_on_freeing_inode(inode); ++ __wait_on_freeing_inode(b, inode); + goto repeat; + } + if (unlikely(inode->i_state & I_CREATING)) { +@@ -1156,26 +1208,26 @@ EXPORT_SYMBOL(unlock_two_nondirectories); + * return it locked, hashed, and with the I_NEW flag set. The file system gets + * to fill it in before unlocking it via unlock_new_inode(). + * +- * Note both @test and @set are called with the inode_hash_lock held, so can't +- * sleep. ++ * Note both @test and @set are called with the inode hash chain lock held, ++ * so can't sleep. + */ + struct inode *inode_insert5(struct inode *inode, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) + { +- struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); ++ struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval); + struct inode *old; + bool creating = inode->i_state & I_CREATING; + + again: +- spin_lock(&inode_hash_lock); +- old = find_inode(inode->i_sb, head, test, data); ++ hlist_bl_lock(b); ++ old = find_inode(inode->i_sb, b, test, data); + if (unlikely(old)) { + /* + * Uhhuh, somebody else created the same inode under us. + * Use the old inode instead of the preallocated one. 
+ */ +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + if (IS_ERR(old)) + return NULL; + wait_on_inode(old); +@@ -1197,12 +1249,12 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, + */ + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; +- hlist_add_head_rcu(&inode->i_hash, head); ++ __insert_inode_hash_head(inode, b); + spin_unlock(&inode->i_lock); + if (!creating) + inode_sb_list_add(inode); + unlock: +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + + return inode; + } +@@ -1263,12 +1315,12 @@ EXPORT_SYMBOL(iget5_locked); + */ + struct inode *iget_locked(struct super_block *sb, unsigned long ino) + { +- struct hlist_head *head = inode_hashtable + hash(sb, ino); ++ struct hlist_bl_head *b = i_hash_head(sb, ino); + struct inode *inode; + again: +- spin_lock(&inode_hash_lock); +- inode = find_inode_fast(sb, head, ino); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_lock(b); ++ inode = find_inode_fast(sb, b, ino); ++ hlist_bl_unlock(b); + if (inode) { + if (IS_ERR(inode)) + return NULL; +@@ -1284,17 +1336,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) + if (inode) { + struct inode *old; + +- spin_lock(&inode_hash_lock); ++ hlist_bl_lock(b); + /* We released the lock, so.. */ +- old = find_inode_fast(sb, head, ino); ++ old = find_inode_fast(sb, b, ino); + if (!old) { + inode->i_ino = ino; + spin_lock(&inode->i_lock); + inode->i_state = I_NEW; +- hlist_add_head_rcu(&inode->i_hash, head); ++ __insert_inode_hash_head(inode, b); + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents +@@ -1307,7 +1359,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) + * us. Use the old inode instead of the one we just + * allocated. + */ +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + destroy_inode(inode); + if (IS_ERR(old)) + return NULL; +@@ -1331,10 +1383,11 @@ EXPORT_SYMBOL(iget_locked); + */ + static int test_inode_iunique(struct super_block *sb, unsigned long ino) + { +- struct hlist_head *b = inode_hashtable + hash(sb, ino); ++ struct hlist_bl_head *b = i_hash_head(sb, ino); ++ struct hlist_bl_node *node; + struct inode *inode; + +- hlist_for_each_entry_rcu(inode, b, i_hash) { ++ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { + if (inode->i_ino == ino && inode->i_sb == sb) + return 0; + } +@@ -1418,12 +1471,12 @@ EXPORT_SYMBOL(igrab); + struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) + { +- struct hlist_head *head = inode_hashtable + hash(sb, hashval); ++ struct hlist_bl_head *b = i_hash_head(sb, hashval); + struct inode *inode; + +- spin_lock(&inode_hash_lock); +- inode = find_inode(sb, head, test, data); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_lock(b); ++ inode = find_inode(sb, b, test, data); ++ hlist_bl_unlock(b); + + return IS_ERR(inode) ? 
NULL : inode; + } +@@ -1473,12 +1526,12 @@ EXPORT_SYMBOL(ilookup5); + */ + struct inode *ilookup(struct super_block *sb, unsigned long ino) + { +- struct hlist_head *head = inode_hashtable + hash(sb, ino); ++ struct hlist_bl_head *b = i_hash_head(sb, ino); + struct inode *inode; + again: +- spin_lock(&inode_hash_lock); +- inode = find_inode_fast(sb, head, ino); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_lock(b); ++ inode = find_inode_fast(sb, b, ino); ++ hlist_bl_unlock(b); + + if (inode) { + if (IS_ERR(inode)) +@@ -1522,12 +1575,13 @@ struct inode *find_inode_nowait(struct super_block *sb, + void *), + void *data) + { +- struct hlist_head *head = inode_hashtable + hash(sb, hashval); ++ struct hlist_bl_head *b = i_hash_head(sb, hashval); ++ struct hlist_bl_node *node; + struct inode *inode, *ret_inode = NULL; + int mval; + +- spin_lock(&inode_hash_lock); +- hlist_for_each_entry(inode, head, i_hash) { ++ hlist_bl_lock(b); ++ hlist_bl_for_each_entry(inode, node, b, i_hash) { + if (inode->i_sb != sb) + continue; + mval = match(inode, hashval, data); +@@ -1538,7 +1592,7 @@ struct inode *find_inode_nowait(struct super_block *sb, + goto out; + } + out: +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + return ret_inode; + } + EXPORT_SYMBOL(find_inode_nowait); +@@ -1567,13 +1621,14 @@ EXPORT_SYMBOL(find_inode_nowait); + struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) + { +- struct hlist_head *head = inode_hashtable + hash(sb, hashval); ++ struct hlist_bl_head *b = i_hash_head(sb, hashval); ++ struct hlist_bl_node *node; + struct inode *inode; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "suspicious find_inode_rcu() usage"); + +- hlist_for_each_entry_rcu(inode, head, i_hash) { ++ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { + if (inode->i_sb == sb && + !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && + test(inode, data)) +@@ -1605,13 +1660,14 @@ EXPORT_SYMBOL(find_inode_rcu); + struct inode *find_inode_by_ino_rcu(struct super_block *sb, + unsigned long ino) + { +- struct hlist_head *head = inode_hashtable + hash(sb, ino); ++ struct hlist_bl_head *b = i_hash_head(sb, ino); ++ struct hlist_bl_node *node; + struct inode *inode; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "suspicious find_inode_by_ino_rcu() usage"); + +- hlist_for_each_entry_rcu(inode, head, i_hash) { ++ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { + if (inode->i_ino == ino && + inode->i_sb == sb && + !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) +@@ -1625,39 +1681,42 @@ int insert_inode_locked(struct inode *inode) + { + struct super_block *sb = inode->i_sb; + ino_t ino = inode->i_ino; +- struct hlist_head *head = inode_hashtable + hash(sb, ino); ++ struct hlist_bl_head *b = i_hash_head(sb, ino); + + while (1) { +- struct inode *old = NULL; +- spin_lock(&inode_hash_lock); +- hlist_for_each_entry(old, head, i_hash) { +- if (old->i_ino != ino) ++ struct hlist_bl_node *node; ++ struct inode *old = NULL, *t; ++ ++ hlist_bl_lock(b); ++ hlist_bl_for_each_entry(t, node, b, i_hash) { ++ if (t->i_ino != ino) + continue; +- if (old->i_sb != sb) ++ if (t->i_sb != sb) + continue; +- spin_lock(&old->i_lock); +- if (old->i_state & (I_FREEING|I_WILL_FREE)) { +- spin_unlock(&old->i_lock); ++ spin_lock(&t->i_lock); ++ if (t->i_state & (I_FREEING|I_WILL_FREE)) { ++ spin_unlock(&t->i_lock); + continue; + } ++ old = t; + break; + } + if (likely(!old)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW | I_CREATING; 
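/*
 * Aside, not part of the patch: the inode-cache changes above drop the single
 * global inode_hash_lock in favour of one lock per hash chain (the hlist_bl
 * bit lock), and the insert paths become "lock the chain, search it, insert
 * only if nothing live was found, otherwise reuse the existing inode". The
 * user-space sketch below shows the same shape with invented names, a plain
 * pthread mutex standing in for hlist_bl_lock(), and none of the kernel's
 * I_FREEING/I_WILL_FREE waiting or RCU lookup paths.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_BUCKETS 64

struct demo_node {
	unsigned long key;
	struct demo_node *next;
};

static struct demo_bucket {
	pthread_mutex_t lock;		/* per-chain lock, like hlist_bl_lock() */
	struct demo_node *head;
} demo_table[DEMO_BUCKETS];

static unsigned demo_hash(unsigned long key)
{
	return (unsigned) ((key * 0x9E3779B97F4A7C15ULL) >> 58);	/* 6 bits */
}

/* Return the entry for @key, inserting a preallocated one only on a miss. */
static struct demo_node *demo_insert_or_get(unsigned long key)
{
	struct demo_bucket *b = &demo_table[demo_hash(key)];
	struct demo_node *n, *new_node = malloc(sizeof(*new_node));

	if (!new_node)
		return NULL;
	new_node->key = key;

	pthread_mutex_lock(&b->lock);
	for (n = b->head; n; n = n->next)
		if (n->key == key)
			break;
	if (!n) {
		/* miss: publish the preallocated node under the chain lock */
		new_node->next = b->head;
		b->head = new_node;
		n = new_node;
		new_node = NULL;
	}
	pthread_mutex_unlock(&b->lock);

	free(new_node);		/* non-NULL only if an existing entry won */
	return n;
}

int main(void)
{
	unsigned i;

	for (i = 0; i < DEMO_BUCKETS; i++)
		pthread_mutex_init(&demo_table[i].lock, NULL);

	if (demo_insert_or_get(42) == demo_insert_or_get(42))
		printf("second call reused the entry inserted by the first\n");
	return 0;
}
/* Build with: cc -o hash_demo hash_demo.c -lpthread */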
+- hlist_add_head_rcu(&inode->i_hash, head); ++ __insert_inode_hash_head(inode, b); + spin_unlock(&inode->i_lock); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + return 0; + } + if (unlikely(old->i_state & I_CREATING)) { + spin_unlock(&old->i_lock); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + return -EBUSY; + } + __iget(old); + spin_unlock(&old->i_lock); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + wait_on_inode(old); + if (unlikely(!inode_unhashed(old))) { + iput(old); +@@ -2131,17 +2190,18 @@ EXPORT_SYMBOL(inode_needs_sync); + * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list + * will DTRT. + */ +-static void __wait_on_freeing_inode(struct inode *inode) ++static void __wait_on_freeing_inode(struct hlist_bl_head *b, ++ struct inode *inode) + { + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + wq = bit_waitqueue(&inode->i_state, __I_NEW); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); +- spin_unlock(&inode_hash_lock); ++ hlist_bl_unlock(b); + schedule(); + finish_wait(wq, &wait.wq_entry); +- spin_lock(&inode_hash_lock); ++ hlist_bl_lock(b); + } + + static __initdata unsigned long ihash_entries; +@@ -2167,7 +2227,7 @@ void __init inode_init_early(void) + + inode_hashtable = + alloc_large_system_hash("Inode-cache", +- sizeof(struct hlist_head), ++ sizeof(struct hlist_bl_head), + ihash_entries, + 14, + HASH_EARLY | HASH_ZERO, +@@ -2193,7 +2253,7 @@ void __init inode_init(void) + + inode_hashtable = + alloc_large_system_hash("Inode-cache", +- sizeof(struct hlist_head), ++ sizeof(struct hlist_bl_head), + ihash_entries, + 14, + HASH_ZERO, +diff --git a/include/linux/bio.h b/include/linux/bio.h +index 992ee987f273..6d5acc1b407f 100644 +--- a/include/linux/bio.h ++++ b/include/linux/bio.h +@@ -480,7 +480,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, + extern void bio_copy_data(struct bio *dst, struct bio *src); + extern void bio_free_pages(struct bio *bio); + void guard_bio_eod(struct bio *bio); +-void zero_fill_bio(struct bio *bio); ++void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); ++ ++static inline void zero_fill_bio(struct bio *bio) ++{ ++ zero_fill_bio_iter(bio, bio->bi_iter); ++} + + static inline void bio_release_pages(struct bio *bio, bool mark_dirty) + { +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 2f7b43444c5f..4ef515977abc 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -884,6 +884,7 @@ extern const char *blk_op_str(unsigned int op); + + int blk_status_to_errno(blk_status_t status); + blk_status_t errno_to_blk_status(int errno); ++const char *blk_status_to_str(blk_status_t status); + + /* only poll the hardware once, don't continue until a completion was found */ + #define BLK_POLL_ONESHOT (1 << 0) +diff --git a/drivers/md/bcache/closure.h b/include/linux/closure.h +similarity index 94% +rename from drivers/md/bcache/closure.h +rename to include/linux/closure.h +index c88cdc4ae4ec..36b4a83f9b77 100644 +--- a/drivers/md/bcache/closure.h ++++ b/include/linux/closure.h +@@ -155,7 +155,7 @@ struct closure { + + atomic_t remaining; + +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG ++#ifdef CONFIG_DEBUG_CLOSURES + #define CLOSURE_MAGIC_DEAD 0xc054dead + #define CLOSURE_MAGIC_ALIVE 0xc054a11e + +@@ -184,15 +184,13 @@ static inline void closure_sync(struct closure *cl) + __closure_sync(cl); + } + +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG ++#ifdef CONFIG_DEBUG_CLOSURES + 
+-void closure_debug_init(void); + void closure_debug_create(struct closure *cl); + void closure_debug_destroy(struct closure *cl); + + #else + +-static inline void closure_debug_init(void) {} + static inline void closure_debug_create(struct closure *cl) {} + static inline void closure_debug_destroy(struct closure *cl) {} + +@@ -200,21 +198,21 @@ static inline void closure_debug_destroy(struct closure *cl) {} + + static inline void closure_set_ip(struct closure *cl) + { +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG ++#ifdef CONFIG_DEBUG_CLOSURES + cl->ip = _THIS_IP_; + #endif + } + + static inline void closure_set_ret_ip(struct closure *cl) + { +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG ++#ifdef CONFIG_DEBUG_CLOSURES + cl->ip = _RET_IP_; + #endif + } + + static inline void closure_set_waiting(struct closure *cl, unsigned long f) + { +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG ++#ifdef CONFIG_DEBUG_CLOSURES + cl->waiting_on = f; + #endif + } +@@ -243,6 +241,7 @@ static inline void closure_queue(struct closure *cl) + */ + BUILD_BUG_ON(offsetof(struct closure, fn) + != offsetof(struct work_struct, func)); ++ + if (wq) { + INIT_WORK(&cl->work, cl->work.func); + BUG_ON(!queue_work(wq, &cl->work)); +@@ -255,7 +254,7 @@ static inline void closure_queue(struct closure *cl) + */ + static inline void closure_get(struct closure *cl) + { +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG ++#ifdef CONFIG_DEBUG_CLOSURES + BUG_ON((atomic_inc_return(&cl->remaining) & + CLOSURE_REMAINING_MASK) <= 1); + #else +@@ -271,7 +270,7 @@ static inline void closure_get(struct closure *cl) + */ + static inline void closure_init(struct closure *cl, struct closure *parent) + { +- memset(cl, 0, sizeof(struct closure)); ++ cl->fn = NULL; + cl->parent = parent; + if (parent) + closure_get(parent); +@@ -375,4 +374,26 @@ static inline void closure_call(struct closure *cl, closure_fn fn, + continue_at_nobarrier(cl, fn, wq); + } + ++#define __closure_wait_event(waitlist, _cond) \ ++do { \ ++ struct closure cl; \ ++ \ ++ closure_init_stack(&cl); \ ++ \ ++ while (1) { \ ++ closure_wait(waitlist, &cl); \ ++ if (_cond) \ ++ break; \ ++ closure_sync(&cl); \ ++ } \ ++ closure_wake_up(waitlist); \ ++ closure_sync(&cl); \ ++} while (0) ++ ++#define closure_wait_event(waitlist, _cond) \ ++do { \ ++ if (!(_cond)) \ ++ __closure_wait_event(waitlist, _cond); \ ++} while (0) ++ + #endif /* _LINUX_CLOSURE_H */ +diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h +index 445e80517cab..57e7d0b94119 100644 +--- a/include/linux/compiler_attributes.h ++++ b/include/linux/compiler_attributes.h +@@ -371,4 +371,9 @@ + */ + #define __weak __attribute__((__weak__)) + ++/* ++ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-flatten-function-attribute ++ */ ++#define __flatten __attribute__((flatten)) ++ + #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ +diff --git a/include/linux/dcache.h b/include/linux/dcache.h +index f5bba51480b2..6c661059a55b 100644 +--- a/include/linux/dcache.h ++++ b/include/linux/dcache.h +@@ -248,6 +248,7 @@ extern struct dentry * d_make_root(struct inode *); + /* - the ramfs-type tree */ + extern void d_genocide(struct dentry *); + ++extern void d_mark_tmpfile(struct dentry *, struct inode *); + extern void d_tmpfile(struct dentry *, struct inode *); + + extern struct dentry *d_find_alias(struct inode *); +@@ -293,6 +294,7 @@ extern char *d_absolute_path(const struct path *, char *, int); + extern char *d_path(const struct path *, char *, int); + extern char *dentry_path_raw(const struct dentry *, 
char *, int); + extern char *dentry_path(const struct dentry *, char *, int); ++extern int prt_path(struct printbuf *, const struct path *, const char *); + + /* Allocation counts.. */ + +diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h +index fe848901fcc3..5a3cc0e1da9b 100644 +--- a/include/linux/exportfs.h ++++ b/include/linux/exportfs.h +@@ -98,6 +98,12 @@ enum fid_type { + */ + FILEID_FAT_WITH_PARENT = 0x72, + ++ /* ++ * 64 bit inode number, 32 bit subvolume, 32 bit generation number: ++ */ ++ FILEID_BCACHEFS_WITHOUT_PARENT = 0x80, ++ FILEID_BCACHEFS_WITH_PARENT = 0x81, ++ + /* + * 128 bit child FID (struct lu_fid) + * 128 bit parent FID (struct lu_fid) +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 9ad5e3520fae..1f7671a674e3 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -630,7 +630,8 @@ struct inode { + unsigned long dirtied_when; /* jiffies of first dirtying */ + unsigned long dirtied_time_when; + +- struct hlist_node i_hash; ++ struct hlist_bl_node i_hash; ++ struct hlist_bl_head *i_hash_head; + struct list_head i_io_list; /* backing dev IO list */ + #ifdef CONFIG_CGROUP_WRITEBACK + struct bdi_writeback *i_wb; /* the associated cgroup wb */ +@@ -696,7 +697,7 @@ static inline unsigned int i_blocksize(const struct inode *node) + + static inline int inode_unhashed(struct inode *inode) + { +- return hlist_unhashed(&inode->i_hash); ++ return hlist_bl_unhashed(&inode->i_hash); + } + + /* +@@ -707,7 +708,7 @@ static inline int inode_unhashed(struct inode *inode) + */ + static inline void inode_fake_hash(struct inode *inode) + { +- hlist_add_fake(&inode->i_hash); ++ hlist_bl_add_fake(&inode->i_hash); + } + + /* +@@ -2974,7 +2975,7 @@ static inline void insert_inode_hash(struct inode *inode) + extern void __remove_inode_hash(struct inode *); + static inline void remove_inode_hash(struct inode *inode) + { +- if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) ++ if (!inode_unhashed(inode) && !hlist_bl_fake(&inode->i_hash)) + __remove_inode_hash(inode); + } + +diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h +index 107613f7d792..c74b7376990d 100644 +--- a/include/linux/generic-radix-tree.h ++++ b/include/linux/generic-radix-tree.h +@@ -38,6 +38,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -116,6 +117,11 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size) + + #define __genradix_cast(_radix) (typeof((_radix)->type[0]) *) + #define __genradix_obj_size(_radix) sizeof((_radix)->type[0]) ++#define __genradix_objs_per_page(_radix) \ ++ (PAGE_SIZE / sizeof((_radix)->type[0])) ++#define __genradix_page_remainder(_radix) \ ++ (PAGE_SIZE % sizeof((_radix)->type[0])) ++ + #define __genradix_idx_to_offset(_radix, _idx) \ + __idx_to_offset(_idx, __genradix_obj_size(_radix)) + +@@ -179,11 +185,35 @@ void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t); + #define genradix_iter_peek(_iter, _radix) \ + (__genradix_cast(_radix) \ + __genradix_iter_peek(_iter, &(_radix)->tree, \ +- PAGE_SIZE / __genradix_obj_size(_radix))) ++ __genradix_objs_per_page(_radix))) ++ ++void *__genradix_iter_peek_prev(struct genradix_iter *, struct __genradix *, ++ size_t, size_t); ++ ++/** ++ * genradix_iter_peek - get first entry at or below iterator's current ++ * position ++ * @_iter: a genradix_iter ++ * @_radix: genradix being iterated over ++ * ++ * If no more entries exist at or below @_iter's current position, returns NULL ++ */ ++#define 
genradix_iter_peek_prev(_iter, _radix) \ ++ (__genradix_cast(_radix) \ ++ __genradix_iter_peek_prev(_iter, &(_radix)->tree, \ ++ __genradix_objs_per_page(_radix), \ ++ __genradix_obj_size(_radix) + \ ++ __genradix_page_remainder(_radix))) + + static inline void __genradix_iter_advance(struct genradix_iter *iter, + size_t obj_size) + { ++ if (iter->offset + obj_size < iter->offset) { ++ iter->offset = SIZE_MAX; ++ iter->pos = SIZE_MAX; ++ return; ++ } ++ + iter->offset += obj_size; + + if (!is_power_of_2(obj_size) && +@@ -196,6 +226,25 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter, + #define genradix_iter_advance(_iter, _radix) \ + __genradix_iter_advance(_iter, __genradix_obj_size(_radix)) + ++static inline void __genradix_iter_rewind(struct genradix_iter *iter, ++ size_t obj_size) ++{ ++ if (iter->offset == 0 || ++ iter->offset == SIZE_MAX) { ++ iter->offset = SIZE_MAX; ++ return; ++ } ++ ++ if ((iter->offset & (PAGE_SIZE - 1)) == 0) ++ iter->offset -= PAGE_SIZE % obj_size; ++ ++ iter->offset -= obj_size; ++ iter->pos--; ++} ++ ++#define genradix_iter_rewind(_iter, _radix) \ ++ __genradix_iter_rewind(_iter, __genradix_obj_size(_radix)) ++ + #define genradix_for_each_from(_radix, _iter, _p, _start) \ + for (_iter = genradix_iter_init(_radix, _start); \ + (_p = genradix_iter_peek(&_iter, _radix)) != NULL; \ +@@ -213,6 +262,23 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter, + #define genradix_for_each(_radix, _iter, _p) \ + genradix_for_each_from(_radix, _iter, _p, 0) + ++#define genradix_last_pos(_radix) \ ++ (SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1) ++ ++/** ++ * genradix_for_each_reverse - iterate over entry in a genradix, reverse order ++ * @_radix: genradix to iterate over ++ * @_iter: a genradix_iter to track current position ++ * @_p: pointer to genradix entry type ++ * ++ * On every iteration, @_p will point to the current entry, and @_iter.pos ++ * will be the current entry's index. 
++ */ ++#define genradix_for_each_reverse(_radix, _iter, _p) \ ++ for (_iter = genradix_iter_init(_radix, genradix_last_pos(_radix));\ ++ (_p = genradix_iter_peek_prev(&_iter, _radix)) != NULL;\ ++ genradix_iter_rewind(&_iter, _radix)) ++ + int __genradix_prealloc(struct __genradix *, size_t, gfp_t); + + /** +diff --git a/include/linux/kernel.h b/include/linux/kernel.h +index fe6efb24d151..9ba5a53c6ad5 100644 +--- a/include/linux/kernel.h ++++ b/include/linux/kernel.h +@@ -202,11 +202,17 @@ static inline void might_fault(void) { } + + void do_exit(long error_code) __noreturn; + ++struct printbuf; ++extern void prt_u64_minwidth(struct printbuf *out, u64 num, unsigned width); ++extern void prt_u64(struct printbuf *out, u64 num); + extern int num_to_str(char *buf, int size, + unsigned long long num, unsigned int width); + + /* lib/printf utilities */ + ++extern __printf(2, 3) void prt_printf(struct printbuf *out, const char *fmt, ...); ++extern __printf(2, 0) void prt_vprintf(struct printbuf *out, const char *fmt, va_list); ++ + extern __printf(2, 3) int sprintf(char *buf, const char * fmt, ...); + extern __printf(2, 0) int vsprintf(char *buf, const char *, va_list); + extern __printf(3, 4) +@@ -289,6 +295,12 @@ extern int hex_to_bin(unsigned char ch); + extern int __must_check hex2bin(u8 *dst, const char *src, size_t count); + extern char *bin2hex(char *dst, const void *src, size_t count); + ++struct printbuf; ++void prt_hex_bytes(struct printbuf *, const void *, unsigned, unsigned, unsigned); ++void prt_hex_line(struct printbuf *, const void *, size_t, int, int, bool); ++void prt_hex_dump(struct printbuf *, const void *, size_t, ++ const char *, int, unsigned, unsigned, bool); ++ + bool mac_pton(const char *s, u8 *mac); + + /* +diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h +index ae1b541446c9..8ee2bf5af131 100644 +--- a/include/linux/list_bl.h ++++ b/include/linux/list_bl.h +@@ -143,6 +143,28 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n) + } + } + ++/** ++ * hlist_bl_add_fake - create a fake list consisting of a single headless node ++ * @n: Node to make a fake list out of ++ * ++ * This makes @n appear to be its own predecessor on a headless hlist. ++ * The point of this is to allow things like hlist_bl_del() to work correctly ++ * in cases where there is no list. ++ */ ++static inline void hlist_bl_add_fake(struct hlist_bl_node *n) ++{ ++ n->pprev = &n->next; ++} ++ ++/** ++ * hlist_fake: Is this node a fake hlist_bl? ++ * @h: Node to check for being a self-referential fake hlist. 
++ */ ++static inline bool hlist_bl_fake(struct hlist_bl_node *n) ++{ ++ return n->pprev == &n->next; ++} ++ + static inline void hlist_bl_lock(struct hlist_bl_head *b) + { + bit_spin_lock(0, (unsigned long *)b); +diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h +index b6829b970093..5b90b2abd326 100644 +--- a/include/linux/lockdep.h ++++ b/include/linux/lockdep.h +@@ -335,6 +335,8 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); + #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) + #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) + ++int lock_class_is_held(struct lock_class_key *key); ++ + #else /* !CONFIG_LOCKDEP */ + + static inline void lockdep_init_task(struct task_struct *task) +@@ -423,6 +425,8 @@ extern int lockdep_is_held(const void *); + #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) + #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) + ++static inline int lock_class_is_held(struct lock_class_key *key) { return 0; } ++ + #endif /* !LOCKDEP */ + + enum xhlock_context_t { +diff --git a/include/linux/pretty-printers.h b/include/linux/pretty-printers.h +new file mode 100644 +index 000000000000..f39d8edfba02 +--- /dev/null ++++ b/include/linux/pretty-printers.h +@@ -0,0 +1,10 @@ ++/* SPDX-License-Identifier: LGPL-2.1+ */ ++/* Copyright (C) 2022 Kent Overstreet */ ++ ++#ifndef _LINUX_PRETTY_PRINTERS_H ++#define _LINUX_PRETTY_PRINTERS_H ++ ++void prt_string_option(struct printbuf *, const char * const[], size_t); ++void prt_bitflags(struct printbuf *, const char * const[], u64); ++ ++#endif /* _LINUX_PRETTY_PRINTERS_H */ +diff --git a/include/linux/printbuf.h b/include/linux/printbuf.h +new file mode 100644 +index 000000000000..861c5d75f852 +--- /dev/null ++++ b/include/linux/printbuf.h +@@ -0,0 +1,283 @@ ++/* SPDX-License-Identifier: LGPL-2.1+ */ ++/* Copyright (C) 2022 Kent Overstreet */ ++ ++#ifndef _LINUX_PRINTBUF_H ++#define _LINUX_PRINTBUF_H ++ ++/* ++ * Printbufs: Simple strings for printing to, with optional heap allocation ++ * ++ * This code has provisions for use in userspace, to aid in making other code ++ * portable between kernelspace and userspace. ++ * ++ * Basic example: ++ * struct printbuf buf = PRINTBUF; ++ * ++ * prt_printf(&buf, "foo="); ++ * foo_to_text(&buf, foo); ++ * printk("%s", buf.buf); ++ * printbuf_exit(&buf); ++ * ++ * Or ++ * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size) ++ * ++ * We can now write pretty printers instead of writing code that dumps ++ * everything to the kernel log buffer, and then those pretty-printers can be ++ * used by other code that outputs to kernel log, sysfs, debugfs, etc. ++ * ++ * Memory allocation: Outputing to a printbuf may allocate memory. This ++ * allocation is done with GFP_KERNEL, by default: use the newer ++ * memalloc_*_(save|restore) functions as needed. ++ * ++ * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations ++ * will be done with GFP_NOWAIT if printbuf->atomic is nonzero. ++ * ++ * It's allowed to grab the output buffer and free it later with kfree() instead ++ * of using printbuf_exit(), if the user just needs a heap allocated string at ++ * the end. ++ * ++ * Memory allocation failures: We don't return errors directly, because on ++ * memory allocation failure we usually don't want to bail out and unwind - we ++ * want to print what we've got, on a best-effort basis. 
But code that does want ++ * to return -ENOMEM may check printbuf.allocation_failure. ++ * ++ * Indenting, tabstops: ++ * ++ * To aid in writing multi-line pretty printers spread across multiple ++ * functions, printbufs track the current indent level. ++ * ++ * printbuf_indent_add() and printbuf_indent_sub() increase and decrease the current indent ++ * level, respectively. ++ * ++ * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from ++ * start of line. Once set, prt_tab() will output spaces up to the next tabstop. ++ * prt_tab_rjust() will also advance the current line of text up to the next ++ * tabstop, but it does so by shifting text since the previous tabstop up to the ++ * next tabstop - right justifying it. ++ * ++ * Make sure you use prt_newline() instead of \n in the format string for indent ++ * level and tabstops to work correctly. ++ * ++ * Output units: printbuf->units exists to tell pretty-printers how to output ++ * numbers: a raw value (e.g. directly from a superblock field), as bytes, or as ++ * human readable bytes. prt_units() obeys it. ++ */ ++ ++#include ++#include ++ ++enum printbuf_si { ++ PRINTBUF_UNITS_2, /* use binary powers of 2^10 */ ++ PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */ ++}; ++ ++struct printbuf { ++ char *buf; ++ unsigned size; ++ unsigned pos; ++ unsigned last_newline; ++ unsigned last_field; ++ unsigned indent; ++ /* ++ * If nonzero, allocations will be done with GFP_ATOMIC: ++ */ ++ u8 atomic; ++ bool allocation_failure:1; ++ bool heap_allocated:1; ++ enum printbuf_si si_units:1; ++ bool human_readable_units:1; ++ u8 tabstop; ++ u8 tabstops[4]; ++}; ++ ++int printbuf_make_room(struct printbuf *, unsigned); ++const char *printbuf_str(const struct printbuf *); ++void printbuf_exit(struct printbuf *); ++ ++void prt_newline(struct printbuf *); ++void printbuf_indent_add(struct printbuf *, unsigned); ++void printbuf_indent_sub(struct printbuf *, unsigned); ++void prt_tab(struct printbuf *); ++void prt_tab_rjust(struct printbuf *); ++void prt_human_readable_u64(struct printbuf *, u64); ++void prt_human_readable_s64(struct printbuf *, s64); ++void prt_units_u64(struct printbuf *, u64); ++void prt_units_s64(struct printbuf *, s64); ++ ++/* Initializer for a heap allocated printbuf: */ ++#define PRINTBUF ((struct printbuf) { .heap_allocated = true }) ++ ++/* Initializer for a printbuf that points to an external buffer: */ ++#define PRINTBUF_EXTERN(_buf, _size) \ ++((struct printbuf) { \ ++ .buf = _buf, \ ++ .size = _size, \ ++}) ++ ++/* ++ * Returns size remaining of output buffer: ++ */ ++static inline unsigned printbuf_remaining_size(struct printbuf *out) ++{ ++ return out->pos < out->size ? out->size - out->pos : 0; ++} ++ ++/* ++ * Returns number of characters we can print to the output buffer - i.e. ++ * excluding the terminating nul: ++ */ ++static inline unsigned printbuf_remaining(struct printbuf *out) ++{ ++ return out->pos < out->size ? out->size - out->pos - 1 : 0; ++} ++ ++static inline unsigned printbuf_written(struct printbuf *out) ++{ ++ return out->size ? 
min(out->pos, out->size - 1) : 0; ++} ++ ++/* ++ * Returns true if output was truncated: ++ */ ++static inline bool printbuf_overflowed(struct printbuf *out) ++{ ++ return out->pos >= out->size; ++} ++ ++static inline void printbuf_nul_terminate(struct printbuf *out) ++{ ++ printbuf_make_room(out, 1); ++ ++ if (out->pos < out->size) ++ out->buf[out->pos] = 0; ++ else if (out->size) ++ out->buf[out->size - 1] = 0; ++} ++ ++/* Doesn't call printbuf_make_room(), doesn't nul terminate: */ ++static inline void __prt_char_reserved(struct printbuf *out, char c) ++{ ++ if (printbuf_remaining(out)) ++ out->buf[out->pos] = c; ++ out->pos++; ++} ++ ++/* Doesn't nul terminate: */ ++static inline void __prt_char(struct printbuf *out, char c) ++{ ++ printbuf_make_room(out, 1); ++ __prt_char_reserved(out, c); ++} ++ ++static inline void prt_char(struct printbuf *out, char c) ++{ ++ __prt_char(out, c); ++ printbuf_nul_terminate(out); ++} ++ ++static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n) ++{ ++ unsigned i, can_print = min(n, printbuf_remaining(out)); ++ ++ for (i = 0; i < can_print; i++) ++ out->buf[out->pos++] = c; ++ out->pos += n - can_print; ++} ++ ++static inline void prt_chars(struct printbuf *out, char c, unsigned n) ++{ ++ printbuf_make_room(out, n); ++ __prt_chars_reserved(out, c, n); ++ printbuf_nul_terminate(out); ++} ++ ++static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) ++{ ++ unsigned i, can_print; ++ ++ printbuf_make_room(out, n); ++ ++ can_print = min(n, printbuf_remaining(out)); ++ ++ for (i = 0; i < can_print; i++) ++ out->buf[out->pos++] = ((char *) b)[i]; ++ out->pos += n - can_print; ++ ++ printbuf_nul_terminate(out); ++} ++ ++static inline void prt_str(struct printbuf *out, const char *str) ++{ ++ prt_bytes(out, str, strlen(str)); ++} ++ ++static inline void prt_hex_byte(struct printbuf *out, u8 byte) ++{ ++ printbuf_make_room(out, 2); ++ __prt_char_reserved(out, hex_asc_hi(byte)); ++ __prt_char_reserved(out, hex_asc_lo(byte)); ++ printbuf_nul_terminate(out); ++} ++ ++static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) ++{ ++ printbuf_make_room(out, 2); ++ __prt_char_reserved(out, hex_asc_upper_hi(byte)); ++ __prt_char_reserved(out, hex_asc_upper_lo(byte)); ++ printbuf_nul_terminate(out); ++} ++ ++/** ++ * printbuf_reset - re-use a printbuf without freeing and re-initializing it: ++ */ ++static inline void printbuf_reset(struct printbuf *buf) ++{ ++ buf->pos = 0; ++ buf->allocation_failure = 0; ++ buf->indent = 0; ++ buf->tabstop = 0; ++} ++ ++/** ++ * printbuf_atomic_inc - mark as entering an atomic section ++ */ ++static inline void printbuf_atomic_inc(struct printbuf *buf) ++{ ++ buf->atomic++; ++} ++ ++/** ++ * printbuf_atomic_dec - mark as leaving an atomic section ++ */ ++static inline void printbuf_atomic_dec(struct printbuf *buf) ++{ ++ buf->atomic--; ++} ++ ++/* ++ * This is used for the %pf(%p) sprintf format extension, where we pass a pretty ++ * printer and arguments to the pretty-printer to sprintf ++ * ++ * Instead of passing a pretty-printer function to sprintf directly, we pass it ++ * a pointer to a struct call_pp, so that sprintf can check that the magic ++ * number is present, which in turn ensures that the CALL_PP() macro has been ++ * used in order to typecheck the arguments to the pretty printer function ++ * ++ * Example usage: ++ * sprintf("%pf(%p)", CALL_PP(prt_bdev, bdev)); ++ */ ++struct call_pp { ++ unsigned long magic; ++ void *fn; ++}; ++ ++#define PP_TYPECHECK(fn, ...) 
\ ++ ({ while (0) fn((struct printbuf *) NULL, ##__VA_ARGS__); }) ++ ++#define CALL_PP_MAGIC (unsigned long) 0xce0b92d22f6b6be4 ++ ++#define CALL_PP(fn, ...) \ ++ (PP_TYPECHECK(fn, ##__VA_ARGS__), \ ++ &((struct call_pp) { CALL_PP_MAGIC, fn })), ##__VA_ARGS__ ++ ++#endif /* _LINUX_PRINTBUF_H */ +diff --git a/include/linux/sched.h b/include/linux/sched.h +index c46f3a63b758..5038c87db740 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -857,6 +857,7 @@ struct task_struct { + + struct mm_struct *mm; + struct mm_struct *active_mm; ++ struct address_space *faults_disabled_mapping; + + /* Per-thread vma caching: */ + struct vmacache vmacache; +diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h +deleted file mode 100644 +index 5b31c5147969..000000000000 +--- a/include/linux/seq_buf.h ++++ /dev/null +@@ -1,162 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _LINUX_SEQ_BUF_H +-#define _LINUX_SEQ_BUF_H +- +-#include +- +-/* +- * Trace sequences are used to allow a function to call several other functions +- * to create a string of data to use. +- */ +- +-/** +- * seq_buf - seq buffer structure +- * @buffer: pointer to the buffer +- * @size: size of the buffer +- * @len: the amount of data inside the buffer +- * @readpos: The next position to read in the buffer. +- */ +-struct seq_buf { +- char *buffer; +- size_t size; +- size_t len; +- loff_t readpos; +-}; +- +-static inline void seq_buf_clear(struct seq_buf *s) +-{ +- s->len = 0; +- s->readpos = 0; +-} +- +-static inline void +-seq_buf_init(struct seq_buf *s, char *buf, unsigned int size) +-{ +- s->buffer = buf; +- s->size = size; +- seq_buf_clear(s); +-} +- +-/* +- * seq_buf have a buffer that might overflow. When this happens +- * the len and size are set to be equal. +- */ +-static inline bool +-seq_buf_has_overflowed(struct seq_buf *s) +-{ +- return s->len > s->size; +-} +- +-static inline void +-seq_buf_set_overflow(struct seq_buf *s) +-{ +- s->len = s->size + 1; +-} +- +-/* +- * How much buffer is left on the seq_buf? +- */ +-static inline unsigned int +-seq_buf_buffer_left(struct seq_buf *s) +-{ +- if (seq_buf_has_overflowed(s)) +- return 0; +- +- return s->size - s->len; +-} +- +-/* How much buffer was written? */ +-static inline unsigned int seq_buf_used(struct seq_buf *s) +-{ +- return min(s->len, s->size); +-} +- +-/** +- * seq_buf_terminate - Make sure buffer is nul terminated +- * @s: the seq_buf descriptor to terminate. +- * +- * This makes sure that the buffer in @s is nul terminated and +- * safe to read as a string. +- * +- * Note, if this is called when the buffer has overflowed, then +- * the last byte of the buffer is zeroed, and the len will still +- * point passed it. +- * +- * After this function is called, s->buffer is safe to use +- * in string operations. +- */ +-static inline void seq_buf_terminate(struct seq_buf *s) +-{ +- if (WARN_ON(s->size == 0)) +- return; +- +- if (seq_buf_buffer_left(s)) +- s->buffer[s->len] = 0; +- else +- s->buffer[s->size - 1] = 0; +-} +- +-/** +- * seq_buf_get_buf - get buffer to write arbitrary data to +- * @s: the seq_buf handle +- * @bufp: the beginning of the buffer is stored here +- * +- * Return the number of bytes available in the buffer, or zero if +- * there's no space. 
+- */ +-static inline size_t seq_buf_get_buf(struct seq_buf *s, char **bufp) +-{ +- WARN_ON(s->len > s->size + 1); +- +- if (s->len < s->size) { +- *bufp = s->buffer + s->len; +- return s->size - s->len; +- } +- +- *bufp = NULL; +- return 0; +-} +- +-/** +- * seq_buf_commit - commit data to the buffer +- * @s: the seq_buf handle +- * @num: the number of bytes to commit +- * +- * Commit @num bytes of data written to a buffer previously acquired +- * by seq_buf_get. To signal an error condition, or that the data +- * didn't fit in the available space, pass a negative @num value. +- */ +-static inline void seq_buf_commit(struct seq_buf *s, int num) +-{ +- if (num < 0) { +- seq_buf_set_overflow(s); +- } else { +- /* num must be negative on overflow */ +- BUG_ON(s->len + num > s->size); +- s->len += num; +- } +-} +- +-extern __printf(2, 3) +-int seq_buf_printf(struct seq_buf *s, const char *fmt, ...); +-extern __printf(2, 0) +-int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args); +-extern int seq_buf_print_seq(struct seq_file *m, struct seq_buf *s); +-extern int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, +- int cnt); +-extern int seq_buf_puts(struct seq_buf *s, const char *str); +-extern int seq_buf_putc(struct seq_buf *s, unsigned char c); +-extern int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len); +-extern int seq_buf_putmem_hex(struct seq_buf *s, const void *mem, +- unsigned int len); +-extern int seq_buf_path(struct seq_buf *s, const struct path *path, const char *esc); +-extern int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, +- int prefix_type, int rowsize, int groupsize, +- const void *buf, size_t len, bool ascii); +- +-#ifdef CONFIG_BINARY_PRINTF +-extern int +-seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary); +-#endif +- +-#endif /* _LINUX_SEQ_BUF_H */ +diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h +index 76fbf92b04d9..12967748f9f7 100644 +--- a/include/linux/shrinker.h ++++ b/include/linux/shrinker.h +@@ -2,6 +2,8 @@ + #ifndef _LINUX_SHRINKER_H + #define _LINUX_SHRINKER_H + ++struct printbuf; ++ + /* + * This struct is used to pass information from page reclaim to the shrinkers. + * We consolidate the values for easier extension later. +@@ -58,10 +60,12 @@ struct shrink_control { + * @flags determine the shrinker abilities, like numa awareness + */ + struct shrinker { ++ char name[32]; + unsigned long (*count_objects)(struct shrinker *, + struct shrink_control *sc); + unsigned long (*scan_objects)(struct shrinker *, + struct shrink_control *sc); ++ void (*to_text)(struct printbuf *, struct shrinker *); + + long batch; /* reclaim batch size, 0 = default */ + int seeks; /* seeks to recreate an obj */ +@@ -75,6 +79,9 @@ struct shrinker { + #endif + /* objs pending delete, per node */ + atomic_long_t *nr_deferred; ++ ++ atomic_long_t objects_requested_to_free; ++ atomic_long_t objects_freed; + }; + #define DEFAULT_SEEKS 2 /* A good number if you don't know better. 
*/ + +@@ -94,4 +101,5 @@ extern int register_shrinker(struct shrinker *shrinker); + extern void unregister_shrinker(struct shrinker *shrinker); + extern void free_prealloced_shrinker(struct shrinker *shrinker); + extern void synchronize_shrinkers(void); ++void shrinkers_to_text(struct printbuf *); + #endif +diff --git a/include/linux/six.h b/include/linux/six.h +new file mode 100644 +index 000000000000..477c33eb00d7 +--- /dev/null ++++ b/include/linux/six.h +@@ -0,0 +1,203 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _LINUX_SIX_H ++#define _LINUX_SIX_H ++ ++/* ++ * Shared/intent/exclusive locks: sleepable read/write locks, much like rw ++ * semaphores, except with a third intermediate state, intent. Basic operations ++ * are: ++ * ++ * six_lock_read(&foo->lock); ++ * six_unlock_read(&foo->lock); ++ * ++ * six_lock_intent(&foo->lock); ++ * six_unlock_intent(&foo->lock); ++ * ++ * six_lock_write(&foo->lock); ++ * six_unlock_write(&foo->lock); ++ * ++ * Intent locks block other intent locks, but do not block read locks, and you ++ * must have an intent lock held before taking a write lock, like so: ++ * ++ * six_lock_intent(&foo->lock); ++ * six_lock_write(&foo->lock); ++ * six_unlock_write(&foo->lock); ++ * six_unlock_intent(&foo->lock); ++ * ++ * Other operations: ++ * ++ * six_trylock_read() ++ * six_trylock_intent() ++ * six_trylock_write() ++ * ++ * six_lock_downgrade(): convert from intent to read ++ * six_lock_tryupgrade(): attempt to convert from read to intent ++ * ++ * Locks also embed a sequence number, which is incremented when the lock is ++ * locked or unlocked for write. The current sequence number can be grabbed ++ * while a lock is held from lock->state.seq; then, if you drop the lock you can ++ * use six_relock_(read|intent|write)(lock, seq) to attempt to retake the lock ++ * iff it hasn't been locked for write in the meantime. ++ * ++ * There are also operations that take the lock type as a parameter, where the ++ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write: ++ * ++ * six_lock_type(lock, type) ++ * six_unlock_type(lock, type) ++ * six_relock(lock, type, seq) ++ * six_trylock_type(lock, type) ++ * six_trylock_convert(lock, from, to) ++ * ++ * A lock may be held multiple times by the same thread (for read or intent, ++ * not write). However, the six locks code does _not_ implement the actual ++ * recursive checks itself though - rather, if your code (e.g. btree iterator ++ * code) knows that the current thread already has a lock held, and for the ++ * correct type, six_lock_increment() may be used to bump up the counter for ++ * that type - the only effect is that one more call to unlock will be required ++ * before the lock is unlocked. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#define SIX_LOCK_SEPARATE_LOCKFNS ++ ++union six_lock_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ /* for waitlist_bitnr() */ ++ unsigned long l; ++ }; ++ ++ struct { ++ unsigned read_lock:27; ++ unsigned write_locking:1; ++ unsigned intent_lock:1; ++ unsigned waiters:3; ++ /* ++ * seq works much like in seqlocks: it's incremented every time ++ * we lock and unlock for write. ++ * ++ * If it's odd write lock is held, even unlocked. ++ * ++ * Thus readers can unlock, and then lock again later iff it ++ * hasn't been modified in the meantime. 
++ */ ++ u32 seq; ++ }; ++}; ++ ++enum six_lock_type { ++ SIX_LOCK_read, ++ SIX_LOCK_intent, ++ SIX_LOCK_write, ++}; ++ ++struct six_lock { ++ union six_lock_state state; ++ unsigned intent_lock_recurse; ++ struct task_struct *owner; ++ struct optimistic_spin_queue osq; ++ unsigned __percpu *readers; ++ ++ raw_spinlock_t wait_lock; ++ struct list_head wait_list[2]; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++}; ++ ++typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); ++ ++static __always_inline void __six_lock_init(struct six_lock *lock, ++ const char *name, ++ struct lock_class_key *key) ++{ ++ atomic64_set(&lock->state.counter, 0); ++ raw_spin_lock_init(&lock->wait_lock); ++ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]); ++ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]); ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ debug_check_no_locks_freed((void *) lock, sizeof(*lock)); ++ lockdep_init_map(&lock->dep_map, name, key, 0); ++#endif ++} ++ ++#define six_lock_init(lock) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __six_lock_init((lock), #lock, &__key); \ ++} while (0) ++ ++#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) ++ ++#define __SIX_LOCK(type) \ ++bool six_trylock_##type(struct six_lock *); \ ++bool six_relock_##type(struct six_lock *, u32); \ ++int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\ ++void six_unlock_##type(struct six_lock *); ++ ++__SIX_LOCK(read) ++__SIX_LOCK(intent) ++__SIX_LOCK(write) ++#undef __SIX_LOCK ++ ++#define SIX_LOCK_DISPATCH(type, fn, ...) \ ++ switch (type) { \ ++ case SIX_LOCK_read: \ ++ return fn##_read(__VA_ARGS__); \ ++ case SIX_LOCK_intent: \ ++ return fn##_intent(__VA_ARGS__); \ ++ case SIX_LOCK_write: \ ++ return fn##_write(__VA_ARGS__); \ ++ default: \ ++ BUG(); \ ++ } ++ ++static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ SIX_LOCK_DISPATCH(type, six_trylock, lock); ++} ++ ++static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq) ++{ ++ SIX_LOCK_DISPATCH(type, six_relock, lock, seq); ++} ++ ++static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); ++} ++ ++static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ SIX_LOCK_DISPATCH(type, six_unlock, lock); ++} ++ ++void six_lock_downgrade(struct six_lock *); ++bool six_lock_tryupgrade(struct six_lock *); ++bool six_trylock_convert(struct six_lock *, enum six_lock_type, ++ enum six_lock_type); ++ ++void six_lock_increment(struct six_lock *, enum six_lock_type); ++ ++void six_lock_wakeup_all(struct six_lock *); ++ ++void six_lock_pcpu_free_rcu(struct six_lock *); ++void six_lock_pcpu_free(struct six_lock *); ++void six_lock_pcpu_alloc(struct six_lock *); ++ ++#endif /* _LINUX_SIX_H */ +diff --git a/include/linux/string.h b/include/linux/string.h +index 61ec7e4f6311..22a45d553fbc 100644 +--- a/include/linux/string.h ++++ b/include/linux/string.h +@@ -195,7 +195,12 @@ int __sysfs_match_string(const char * const *array, size_t n, const char *s); + */ + #define sysfs_match_string(_a, _s) __sysfs_match_string(_a, ARRAY_SIZE(_a), _s) + ++struct printbuf; ++ + #ifdef CONFIG_BINARY_PRINTF ++void prt_vbinprintf(struct printbuf *out, const char *fmt, va_list args); ++void prt_bstrprintf(struct printbuf *out, const char *fmt, const u32 
*bin_buf); ++void prt_bprintf(struct printbuf *out, const char *fmt, ...) __printf(2, 3); + int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); + int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); + int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); +diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h +index 4d72258d42fd..52e0f1d283b9 100644 +--- a/include/linux/string_helpers.h ++++ b/include/linux/string_helpers.h +@@ -10,6 +10,7 @@ + struct device; + struct file; + struct task_struct; ++struct printbuf; + + /* Descriptions of the types of units to + * print in */ +@@ -18,8 +19,8 @@ enum string_size_units { + STRING_UNITS_2, /* use binary powers of 2^10 */ + }; + +-void string_get_size(u64 size, u64 blk_size, enum string_size_units units, +- char *buf, int len); ++int string_get_size(u64 size, u64 blk_size, enum string_size_units units, ++ char *buf, int len); + + #define UNESCAPE_SPACE BIT(0) + #define UNESCAPE_OCTAL BIT(1) +@@ -62,6 +63,8 @@ static inline int string_unescape_any_inplace(char *buf) + + #define ESCAPE_ALL_MASK GENMASK(8, 0) + ++void prt_escaped_string(struct printbuf *out, const char *src, size_t isz, ++ unsigned int flags, const char *only); + int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, + unsigned int flags, const char *only); + +@@ -71,6 +74,7 @@ static inline int string_escape_mem_any_np(const char *src, size_t isz, + return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, only); + } + ++ + static inline int string_escape_str(const char *src, char *dst, size_t sz, + unsigned int flags, const char *only) + { +diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h +index e6e95a9f07a5..48471e32f8e4 100644 +--- a/include/linux/trace_events.h ++++ b/include/linux/trace_events.h +@@ -496,7 +496,7 @@ struct dynevent_cmd; + typedef int (*dynevent_create_fn_t)(struct dynevent_cmd *cmd); + + struct dynevent_cmd { +- struct seq_buf seq; ++ struct printbuf seq; + const char *event_name; + unsigned int n_fields; + enum dynevent_type type; +diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h +index 5a2c650d9e1c..d2b51007b3b9 100644 +--- a/include/linux/trace_seq.h ++++ b/include/linux/trace_seq.h +@@ -2,10 +2,12 @@ + #ifndef _LINUX_TRACE_SEQ_H + #define _LINUX_TRACE_SEQ_H + +-#include ++#include + + #include + ++struct seq_file; ++ + /* + * Trace sequences are used to allow a function to call several other functions + * to create a string of data to use (up to a max of PAGE_SIZE). 
+@@ -13,14 +15,16 @@ + + struct trace_seq { + char buffer[PAGE_SIZE]; +- struct seq_buf seq; ++ struct printbuf seq; ++ unsigned readpos; + int full; + }; + + static inline void + trace_seq_init(struct trace_seq *s) + { +- seq_buf_init(&s->seq, s->buffer, PAGE_SIZE); ++ s->seq = PRINTBUF_EXTERN(s->buffer, PAGE_SIZE); ++ s->readpos = 0; + s->full = 0; + } + +@@ -39,7 +43,7 @@ trace_seq_init(struct trace_seq *s) + */ + static inline int trace_seq_used(struct trace_seq *s) + { +- return seq_buf_used(&s->seq); ++ return printbuf_written(&s->seq); + } + + /** +@@ -54,7 +58,7 @@ static inline int trace_seq_used(struct trace_seq *s) + static inline char * + trace_seq_buffer_ptr(struct trace_seq *s) + { +- return s->buffer + seq_buf_used(&s->seq); ++ return s->buffer + printbuf_written(&s->seq); + } + + /** +@@ -66,7 +70,7 @@ trace_seq_buffer_ptr(struct trace_seq *s) + */ + static inline bool trace_seq_has_overflowed(struct trace_seq *s) + { +- return s->full || seq_buf_has_overflowed(&s->seq); ++ return s->full || printbuf_overflowed(&s->seq); + } + + /* +@@ -87,6 +91,7 @@ extern void trace_seq_putc(struct trace_seq *s, unsigned char c); + extern void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len); + extern void trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + unsigned int len); ++struct path; + extern int trace_seq_path(struct trace_seq *s, const struct path *path); + + extern void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, +diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h +index 096d48aa3437..8d11e2e4ddc8 100644 +--- a/include/linux/vmalloc.h ++++ b/include/linux/vmalloc.h +@@ -144,6 +144,7 @@ extern void *vzalloc(unsigned long size) __alloc_size(1); + extern void *vmalloc_user(unsigned long size) __alloc_size(1); + extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1); + extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1); ++extern void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) __alloc_size(1); + extern void *vmalloc_32(unsigned long size) __alloc_size(1); + extern void *vmalloc_32_user(unsigned long size) __alloc_size(1); + extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1); +diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h +index 24a509f559ee..0b20ee6854d6 100644 +--- a/include/net/9p/9p.h ++++ b/include/net/9p/9p.h +@@ -539,12 +539,12 @@ struct p9_rstatfs { + struct p9_fcall { + u32 size; + u8 id; ++ bool used_mempool; + u16 tag; + + size_t offset; + size_t capacity; + +- struct kmem_cache *cache; + u8 *sdata; + }; + +diff --git a/include/net/9p/client.h b/include/net/9p/client.h +index ec1d1706f43c..832dcc866a20 100644 +--- a/include/net/9p/client.h ++++ b/include/net/9p/client.h +@@ -9,6 +9,7 @@ + #ifndef NET_9P_CLIENT_H + #define NET_9P_CLIENT_H + ++#include + #include + #include + +@@ -76,7 +77,7 @@ enum p9_req_status_t { + struct p9_req_t { + int status; + int t_err; +- struct kref refcount; ++ refcount_t refcount; + wait_queue_head_t wq; + struct p9_fcall tc; + struct p9_fcall rc; +@@ -107,6 +108,14 @@ struct p9_client { + void *trans; + struct kmem_cache *fcall_cache; + ++ /* ++ * We need two identical mempools because it's not safe to allocate ++ * multiple elements from the same pool (without freeing the first); ++ * that will deadlock if multiple threads need the last element at the ++ * same time. 
++ */ ++ mempool_t pools[2]; ++ + union { + struct { + int rfd; +@@ -222,20 +231,21 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode, + kgid_t gid, struct p9_qid *qid); + int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status); + int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl); +-void p9_fcall_fini(struct p9_fcall *fc); ++void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc, ++ int fc_idx); + struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag); + + static inline void p9_req_get(struct p9_req_t *r) + { +- kref_get(&r->refcount); ++ refcount_inc(&r->refcount); + } + + static inline int p9_req_try_get(struct p9_req_t *r) + { +- return kref_get_unless_zero(&r->refcount); ++ return refcount_inc_not_zero(&r->refcount); + } + +-int p9_req_put(struct p9_req_t *r); ++int p9_req_put(struct p9_client *c, struct p9_req_t *r); + + void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status); + +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +new file mode 100644 +index 000000000000..140834e7406e +--- /dev/null ++++ b/include/trace/events/bcachefs.h +@@ -0,0 +1,1048 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM bcachefs ++ ++#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) ++#define _TRACE_BCACHE_H ++ ++#include ++ ++DECLARE_EVENT_CLASS(bpos, ++ TP_PROTO(struct bpos *p), ++ TP_ARGS(p), ++ ++ TP_STRUCT__entry( ++ __field(u64, inode ) ++ __field(u64, offset ) ++ ), ++ ++ TP_fast_assign( ++ __entry->inode = p->inode; ++ __entry->offset = p->offset; ++ ), ++ ++ TP_printk("%llu:%llu", __entry->inode, __entry->offset) ++); ++ ++DECLARE_EVENT_CLASS(bkey, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k), ++ ++ TP_STRUCT__entry( ++ __field(u64, inode ) ++ __field(u64, offset ) ++ __field(u32, size ) ++ ), ++ ++ TP_fast_assign( ++ __entry->inode = k->p.inode; ++ __entry->offset = k->p.offset; ++ __entry->size = k->size; ++ ), ++ ++ TP_printk("%llu:%llu len %u", __entry->inode, ++ __entry->offset, __entry->size) ++); ++ ++DECLARE_EVENT_CLASS(bch_fs, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ ), ++ ++ TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) ++); ++ ++DECLARE_EVENT_CLASS(bio, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(sector_t, sector ) ++ __field(unsigned int, nr_sector ) ++ __array(char, rwbs, 6 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = bio->bi_bdev ? 
bio_dev(bio) : 0; ++ __entry->sector = bio->bi_iter.bi_sector; ++ __entry->nr_sector = bio->bi_iter.bi_size >> 9; ++ blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ++ ), ++ ++ TP_printk("%d,%d %s %llu + %u", ++ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, ++ (unsigned long long)__entry->sector, __entry->nr_sector) ++); ++ ++/* super-io.c: */ ++TRACE_EVENT(write_super, ++ TP_PROTO(struct bch_fs *c, unsigned long ip), ++ TP_ARGS(c, ip), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(unsigned long, ip ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->ip = ip; ++ ), ++ ++ TP_printk("%d,%d for %pS", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ (void *) __entry->ip) ++); ++ ++/* io.c: */ ++ ++DEFINE_EVENT(bio, read_split, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, read_bounce, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, read_retry, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, promote, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++/* Journal */ ++ ++DEFINE_EVENT(bch_fs, journal_full, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, journal_entry_full, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bio, journal_write, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++TRACE_EVENT(journal_reclaim_start, ++ TP_PROTO(struct bch_fs *c, bool direct, bool kicked, ++ u64 min_nr, u64 min_key_cache, ++ u64 prereserved, u64 prereserved_total, ++ u64 btree_cache_dirty, u64 btree_cache_total, ++ u64 btree_key_cache_dirty, u64 btree_key_cache_total), ++ TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total, ++ btree_cache_dirty, btree_cache_total, ++ btree_key_cache_dirty, btree_key_cache_total), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(bool, direct ) ++ __field(bool, kicked ) ++ __field(u64, min_nr ) ++ __field(u64, min_key_cache ) ++ __field(u64, prereserved ) ++ __field(u64, prereserved_total ) ++ __field(u64, btree_cache_dirty ) ++ __field(u64, btree_cache_total ) ++ __field(u64, btree_key_cache_dirty ) ++ __field(u64, btree_key_cache_total ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->direct = direct; ++ __entry->kicked = kicked; ++ __entry->min_nr = min_nr; ++ __entry->min_key_cache = min_key_cache; ++ __entry->prereserved = prereserved; ++ __entry->prereserved_total = prereserved_total; ++ __entry->btree_cache_dirty = btree_cache_dirty; ++ __entry->btree_cache_total = btree_cache_total; ++ __entry->btree_key_cache_dirty = btree_key_cache_dirty; ++ __entry->btree_key_cache_total = btree_key_cache_total; ++ ), ++ ++ TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->direct, ++ __entry->kicked, ++ __entry->min_nr, ++ __entry->min_key_cache, ++ __entry->prereserved, ++ __entry->prereserved_total, ++ __entry->btree_cache_dirty, ++ __entry->btree_cache_total, ++ __entry->btree_key_cache_dirty, ++ __entry->btree_key_cache_total) ++); ++ ++TRACE_EVENT(journal_reclaim_finish, ++ TP_PROTO(struct bch_fs *c, u64 nr_flushed), ++ TP_ARGS(c, nr_flushed), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u64, nr_flushed ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->nr_flushed = nr_flushed; ++ ), ++ ++ TP_printk("%d,%d flushed %llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->nr_flushed) 
++); ++ ++/* allocator: */ ++ ++/* bset.c: */ ++ ++DEFINE_EVENT(bpos, bkey_pack_pos_fail, ++ TP_PROTO(struct bpos *p), ++ TP_ARGS(p) ++); ++ ++/* Btree */ ++ ++DECLARE_EVENT_CLASS(btree_node, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u8, level ) ++ __field(u8, id ) ++ __field(u64, inode ) ++ __field(u64, offset ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->level = b->c.level; ++ __entry->id = b->c.btree_id; ++ __entry->inode = b->key.k.p.inode; ++ __entry->offset = b->key.k.p.offset; ++ ), ++ ++ TP_printk("%d,%d %u id %u %llu:%llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->level, __entry->id, ++ __entry->inode, __entry->offset) ++); ++ ++DEFINE_EVENT(btree_node, btree_read, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++TRACE_EVENT(btree_write, ++ TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), ++ TP_ARGS(b, bytes, sectors), ++ ++ TP_STRUCT__entry( ++ __field(enum btree_node_type, type) ++ __field(unsigned, bytes ) ++ __field(unsigned, sectors ) ++ ), ++ ++ TP_fast_assign( ++ __entry->type = btree_node_type(b); ++ __entry->bytes = bytes; ++ __entry->sectors = sectors; ++ ), ++ ++ TP_printk("bkey type %u bytes %u sectors %u", ++ __entry->type , __entry->bytes, __entry->sectors) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_alloc, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_free, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_reap, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock_fail, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, btree_node_cannibalize, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++TRACE_EVENT(btree_reserve_get_fail, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ size_t required), ++ TP_ARGS(trans_fn, caller_ip, required), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(size_t, required ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->required = required; ++ ), ++ ++ TP_printk("%s %pS required %zu", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ __entry->required) ++); ++ ++DEFINE_EVENT(btree_node, btree_split, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_compact, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_merge, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_rewrite, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_set_root, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++TRACE_EVENT(btree_cache_scan, ++ TP_PROTO(long nr_to_scan, long can_free, long ret), ++ TP_ARGS(nr_to_scan, can_free, ret), ++ ++ TP_STRUCT__entry( ++ __field(long, nr_to_scan ) ++ __field(long, can_free ) ++ __field(long, ret ) ++ ), ++ ++ TP_fast_assign( ++ 
__entry->nr_to_scan = nr_to_scan; ++ __entry->can_free = can_free; ++ __entry->ret = ret; ++ ), ++ ++ TP_printk("scanned for %li nodes, can free %li, ret %li", ++ __entry->nr_to_scan, __entry->can_free, __entry->ret) ++); ++ ++TRACE_EVENT(btree_node_relock_fail, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos, ++ unsigned long node, ++ u32 iter_lock_seq, ++ u32 node_lock_seq), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, iter_lock_seq, node_lock_seq), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(u8, btree_id ) ++ __field(u64, pos_inode ) ++ __field(u64, pos_offset ) ++ __field(u32, pos_snapshot ) ++ __field(unsigned long, node ) ++ __field(u32, iter_lock_seq ) ++ __field(u32, node_lock_seq ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->btree_id = btree_id; ++ __entry->pos_inode = pos->inode; ++ __entry->pos_offset = pos->offset; ++ __entry->pos_snapshot = pos->snapshot; ++ __entry->node = node; ++ __entry->iter_lock_seq = iter_lock_seq; ++ __entry->node_lock_seq = node_lock_seq; ++ ), ++ ++ TP_printk("%s %pS btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ __entry->btree_id, ++ __entry->pos_inode, ++ __entry->pos_offset, ++ __entry->pos_snapshot, ++ __entry->node, ++ __entry->iter_lock_seq, ++ __entry->node_lock_seq) ++); ++ ++/* Garbage collection */ ++ ++DEFINE_EVENT(bch_fs, gc_gens_start, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_gens_end, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++/* Allocator */ ++ ++TRACE_EVENT(bucket_alloc, ++ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), ++ TP_ARGS(ca, alloc_reserve), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __array(char, reserve, 16 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = ca->dev; ++ strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); ++ ), ++ ++ TP_printk("%d,%d reserve %s", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->reserve) ++); ++ ++TRACE_EVENT(bucket_alloc_fail, ++ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, ++ u64 free, ++ u64 avail, ++ u64 copygc_wait_amount, ++ s64 copygc_waiting_for, ++ u64 seen, ++ u64 open, ++ u64 need_journal_commit, ++ u64 nouse, ++ bool nonblocking, ++ const char *err), ++ TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, ++ seen, open, need_journal_commit, nouse, nonblocking, err), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __array(char, reserve, 16 ) ++ __field(u64, free ) ++ __field(u64, avail ) ++ __field(u64, copygc_wait_amount ) ++ __field(s64, copygc_waiting_for ) ++ __field(u64, seen ) ++ __field(u64, open ) ++ __field(u64, need_journal_commit ) ++ __field(u64, nouse ) ++ __field(bool, nonblocking ) ++ __array(char, err, 16 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = ca->dev; ++ strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); ++ __entry->free = free; ++ __entry->avail = avail; ++ __entry->copygc_wait_amount = copygc_wait_amount; ++ __entry->copygc_waiting_for = copygc_waiting_for; ++ __entry->seen = seen; ++ __entry->open = open; ++ __entry->need_journal_commit = need_journal_commit; ++ __entry->nouse = nouse; ++ __entry->nonblocking = nonblocking; ++ strlcpy(__entry->err, err, sizeof(__entry->err)); ++ ), ++ ++ TP_printk("%d,%d reserve %s 
free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->reserve, ++ __entry->free, ++ __entry->avail, ++ __entry->copygc_wait_amount, ++ __entry->copygc_waiting_for, ++ __entry->seen, ++ __entry->open, ++ __entry->need_journal_commit, ++ __entry->nouse, ++ __entry->nonblocking, ++ __entry->err) ++); ++ ++TRACE_EVENT(discard_buckets, ++ TP_PROTO(struct bch_fs *c, u64 seen, u64 open, ++ u64 need_journal_commit, u64 discarded, const char *err), ++ TP_ARGS(c, seen, open, need_journal_commit, discarded, err), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u64, seen ) ++ __field(u64, open ) ++ __field(u64, need_journal_commit ) ++ __field(u64, discarded ) ++ __array(char, err, 16 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->seen = seen; ++ __entry->open = open; ++ __entry->need_journal_commit = need_journal_commit; ++ __entry->discarded = discarded; ++ strlcpy(__entry->err, err, sizeof(__entry->err)); ++ ), ++ ++ TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->seen, ++ __entry->open, ++ __entry->need_journal_commit, ++ __entry->discarded, ++ __entry->err) ++); ++ ++TRACE_EVENT(invalidate_bucket, ++ TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), ++ TP_ARGS(c, dev, bucket, sectors), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u32, dev_idx ) ++ __field(u32, sectors ) ++ __field(u64, bucket ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->dev_idx = dev; ++ __entry->sectors = sectors; ++ __entry->bucket = bucket; ++ ), ++ ++ TP_printk("%d:%d invalidated %u:%llu cached sectors %u", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->dev_idx, __entry->bucket, ++ __entry->sectors) ++); ++ ++/* Moving IO */ ++ ++DEFINE_EVENT(bkey, move_extent, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++DEFINE_EVENT(bkey, move_alloc_mem_fail, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++DEFINE_EVENT(bkey, move_race, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++TRACE_EVENT(move_data, ++ TP_PROTO(struct bch_fs *c, u64 sectors_moved, ++ u64 keys_moved), ++ TP_ARGS(c, sectors_moved, keys_moved), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u64, sectors_moved ) ++ __field(u64, keys_moved ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->sectors_moved = sectors_moved; ++ __entry->keys_moved = keys_moved; ++ ), ++ ++ TP_printk("%d,%d sectors_moved %llu keys_moved %llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->sectors_moved, __entry->keys_moved) ++); ++ ++TRACE_EVENT(copygc, ++ TP_PROTO(struct bch_fs *c, ++ u64 sectors_moved, u64 sectors_not_moved, ++ u64 buckets_moved, u64 buckets_not_moved), ++ TP_ARGS(c, ++ sectors_moved, sectors_not_moved, ++ buckets_moved, buckets_not_moved), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u64, sectors_moved ) ++ __field(u64, sectors_not_moved ) ++ __field(u64, buckets_moved ) ++ __field(u64, buckets_not_moved ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->sectors_moved = sectors_moved; ++ __entry->sectors_not_moved = sectors_not_moved; ++ __entry->buckets_moved = buckets_moved; ++ __entry->buckets_not_moved = buckets_moved; ++ ), ++ ++ TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ 
__entry->sectors_moved, __entry->sectors_not_moved, ++ __entry->buckets_moved, __entry->buckets_not_moved) ++); ++ ++TRACE_EVENT(copygc_wait, ++ TP_PROTO(struct bch_fs *c, ++ u64 wait_amount, u64 until), ++ TP_ARGS(c, wait_amount, until), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u64, wait_amount ) ++ __field(u64, until ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->wait_amount = wait_amount; ++ __entry->until = until; ++ ), ++ ++ TP_printk("%d,%u waiting for %llu sectors until %llu", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->wait_amount, __entry->until) ++); ++ ++DECLARE_EVENT_CLASS(transaction_event, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ ), ++ ++ TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, transaction_commit, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, transaction_restart_ip, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, transaction_restart_injected, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_restart_journal_res_get, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_restart_journal_preres_get, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_restart_journal_reclaim, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_restart_fault_inject, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_traverse_all, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_restart_mark_replicas, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_event, trans_restart_too_many_iters, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DECLARE_EVENT_CLASS(transaction_restart_iter, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(u8, btree_id ) ++ __field(u64, pos_inode ) ++ __field(u64, pos_offset ) ++ __field(u32, pos_snapshot ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, 
sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->btree_id = btree_id; ++ __entry->pos_inode = pos->inode; ++ __entry->pos_offset = pos->offset; ++ __entry->pos_snapshot = pos->snapshot; ++ ), ++ ++ TP_printk("%s %pS btree %u pos %llu:%llu:%u", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ __entry->btree_id, ++ __entry->pos_inode, ++ __entry->pos_offset, ++ __entry->pos_snapshot) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ 
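Each DEFINE_EVENT above reuses the layout of its DECLARE_EVENT_CLASS, and the ftrace macro machinery turns every event name into a trace_<name>() helper that takes the TP_PROTO arguments. As a hedged illustration (not part of the patch itself; the surrounding call site and the trans->fn field are assumptions made for the sketch, only the generated helper name and its const char * / unsigned long arguments follow from the definitions above):

	/* illustrative sketch only, not an addition to the patch */
	static void example_emit_restart_event(struct btree_trans *trans)
	{
		/* trans->fn is assumed to hold the transaction's function name */
		trace_trans_restart_too_many_iters(trans->fn, _RET_IP_);
	}

Once compiled in, such an event can be enabled and read back through the usual tracefs files (events/<system>/trans_restart_too_many_iters/enable and trace_pipe), with the output formatted by the TP_printk() string of the transaction_event class.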
++TRACE_EVENT(trans_restart_would_deadlock, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ bool in_traverse_all, ++ unsigned reason, ++ enum btree_id have_btree_id, ++ unsigned have_iter_type, ++ struct bpos *have_pos, ++ enum btree_id want_btree_id, ++ unsigned want_iter_type, ++ struct bpos *want_pos), ++ TP_ARGS(trans_fn, caller_ip, in_traverse_all, reason, ++ have_btree_id, have_iter_type, have_pos, ++ want_btree_id, want_iter_type, want_pos), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(u8, in_traverse_all ) ++ __field(u8, reason ) ++ __field(u8, have_btree_id ) ++ __field(u8, have_iter_type ) ++ __field(u8, want_btree_id ) ++ __field(u8, want_iter_type ) ++ ++ __field(u64, have_pos_inode ) ++ __field(u64, have_pos_offset ) ++ __field(u32, have_pos_snapshot) ++ __field(u32, want_pos_snapshot) ++ __field(u64, want_pos_inode ) ++ __field(u64, want_pos_offset ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->in_traverse_all = in_traverse_all; ++ __entry->reason = reason; ++ __entry->have_btree_id = have_btree_id; ++ __entry->have_iter_type = have_iter_type; ++ __entry->want_btree_id = want_btree_id; ++ __entry->want_iter_type = want_iter_type; ++ ++ __entry->have_pos_inode = have_pos->inode; ++ __entry->have_pos_offset = have_pos->offset; ++ __entry->have_pos_snapshot = have_pos->snapshot; ++ ++ __entry->want_pos_inode = want_pos->inode; ++ __entry->want_pos_offset = want_pos->offset; ++ __entry->want_pos_snapshot = want_pos->snapshot; ++ ), ++ ++ TP_printk("%s %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ __entry->in_traverse_all, ++ __entry->reason, ++ __entry->have_btree_id, ++ __entry->have_iter_type, ++ __entry->have_pos_inode, ++ __entry->have_pos_offset, ++ __entry->have_pos_snapshot, ++ __entry->want_btree_id, ++ __entry->want_iter_type, ++ __entry->want_pos_inode, ++ __entry->want_pos_offset, ++ __entry->want_pos_snapshot) ++); ++ ++TRACE_EVENT(trans_restart_would_deadlock_write, ++ TP_PROTO(const char *trans_fn), ++ TP_ARGS(trans_fn), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ ), ++ ++ TP_printk("%s", __entry->trans_fn) ++); ++ ++TRACE_EVENT(trans_restart_mem_realloced, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ unsigned long bytes), ++ TP_ARGS(trans_fn, caller_ip, bytes), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(unsigned long, bytes ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->bytes = bytes; ++ ), ++ ++ TP_printk("%s %pS bytes %lu", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ __entry->bytes) ++); ++ ++TRACE_EVENT(trans_restart_key_cache_key_realloced, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos, ++ unsigned old_u64s, ++ unsigned new_u64s), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos, old_u64s, new_u64s), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(enum btree_id, btree_id ) ++ __field(u64, inode ) ++ __field(u64, offset ) ++ __field(u32, snapshot ) ++ __field(u32, old_u64s ) ++ 
__field(u32, new_u64s ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->btree_id = btree_id; ++ __entry->inode = pos->inode; ++ __entry->offset = pos->offset; ++ __entry->snapshot = pos->snapshot; ++ __entry->old_u64s = old_u64s; ++ __entry->new_u64s = new_u64s; ++ ), ++ ++ TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ bch2_btree_ids[__entry->btree_id], ++ __entry->inode, ++ __entry->offset, ++ __entry->snapshot, ++ __entry->old_u64s, ++ __entry->new_u64s) ++); ++ ++#endif /* _TRACE_BCACHE_H */ ++ ++/* This part must be outside protection */ ++#include +diff --git a/init/init_task.c b/init/init_task.c +index 73cc8f03511a..3e3aed110153 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -85,6 +85,7 @@ struct task_struct init_task + .nr_cpus_allowed= NR_CPUS, + .mm = NULL, + .active_mm = &init_mm, ++ .faults_disabled_mapping = NULL, + .restart_block = { + .fn = do_no_restart_syscall, + }, +diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks +index 4198f0273ecd..b2abd9a5d9ab 100644 +--- a/kernel/Kconfig.locks ++++ b/kernel/Kconfig.locks +@@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB + config MMIOWB + def_bool y if ARCH_HAS_MMIOWB + depends on SMP ++ ++config SIXLOCKS ++ bool +diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile +index d51cabf28f38..cadbf6520c4b 100644 +--- a/kernel/locking/Makefile ++++ b/kernel/locking/Makefile +@@ -32,3 +32,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o + obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o + obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o + obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o ++obj-$(CONFIG_SIXLOCKS) += six.o +diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c +index f06b91ca6482..0b1a3a949b47 100644 +--- a/kernel/locking/lockdep.c ++++ b/kernel/locking/lockdep.c +@@ -6483,6 +6483,26 @@ void debug_check_no_locks_held(void) + } + EXPORT_SYMBOL_GPL(debug_check_no_locks_held); + ++#ifdef CONFIG_LOCKDEP ++int lock_class_is_held(struct lock_class_key *key) ++{ ++ struct task_struct *curr = current; ++ struct held_lock *hlock; ++ ++ if (unlikely(!debug_locks)) ++ return 0; ++ ++ for (hlock = curr->held_locks; ++ hlock < curr->held_locks + curr->lockdep_depth; ++ hlock++) ++ if (hlock->instance->key == key) ++ return 1; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(lock_class_is_held); ++#endif ++ + #ifdef __KERNEL__ + void debug_show_all_locks(void) + { +diff --git a/kernel/locking/six.c b/kernel/locking/six.c +new file mode 100644 +index 000000000000..fca1208720b6 +--- /dev/null ++++ b/kernel/locking/six.c +@@ -0,0 +1,759 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef DEBUG ++#define EBUG_ON(cond) BUG_ON(cond) ++#else ++#define EBUG_ON(cond) do {} while (0) ++#endif ++ ++#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) ++#define six_release(l) lock_release(l, _RET_IP_) ++ ++struct six_lock_vals { ++ /* Value we add to the lock in order to take the lock: */ ++ u64 lock_val; ++ ++ /* If the lock has this value (used as a mask), taking the lock fails: */ ++ u64 lock_fail; ++ ++ /* Value we add to the lock in order to release the lock: */ ++ u64 unlock_val; ++ ++ /* Mask that indicates lock is held for this type: */ ++ u64 held_mask; ++ ++ /* Waitlist we wakeup when releasing the lock: */ ++ enum six_lock_type 
unlock_wakeup; ++}; ++ ++#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) ++#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) ++#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) ++ ++#define LOCK_VALS { \ ++ [SIX_LOCK_read] = { \ ++ .lock_val = __SIX_VAL(read_lock, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\ ++ .unlock_val = -__SIX_VAL(read_lock, 1), \ ++ .held_mask = __SIX_LOCK_HELD_read, \ ++ .unlock_wakeup = SIX_LOCK_write, \ ++ }, \ ++ [SIX_LOCK_intent] = { \ ++ .lock_val = __SIX_VAL(intent_lock, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_intent, \ ++ .unlock_val = -__SIX_VAL(intent_lock, 1), \ ++ .held_mask = __SIX_LOCK_HELD_intent, \ ++ .unlock_wakeup = SIX_LOCK_intent, \ ++ }, \ ++ [SIX_LOCK_write] = { \ ++ .lock_val = __SIX_VAL(seq, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_read, \ ++ .unlock_val = __SIX_VAL(seq, 1), \ ++ .held_mask = __SIX_LOCK_HELD_write, \ ++ .unlock_wakeup = SIX_LOCK_read, \ ++ }, \ ++} ++ ++static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, ++ union six_lock_state old) ++{ ++ if (type != SIX_LOCK_intent) ++ return; ++ ++ if (!old.intent_lock) { ++ EBUG_ON(lock->owner); ++ lock->owner = current; ++ } else { ++ EBUG_ON(lock->owner != current); ++ } ++} ++ ++static inline unsigned pcpu_read_count(struct six_lock *lock) ++{ ++ unsigned read_count = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ read_count += *per_cpu_ptr(lock->readers, cpu); ++ return read_count; ++} ++ ++struct six_lock_waiter { ++ struct list_head list; ++ struct task_struct *task; ++}; ++ ++/* This is probably up there with the more evil things I've done */ ++#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) ++ ++static inline void six_lock_wakeup(struct six_lock *lock, ++ union six_lock_state state, ++ unsigned waitlist_id) ++{ ++ if (waitlist_id == SIX_LOCK_write) { ++ if (state.write_locking && !state.read_lock) { ++ struct task_struct *p = READ_ONCE(lock->owner); ++ if (p) ++ wake_up_process(p); ++ } ++ } else { ++ struct list_head *wait_list = &lock->wait_list[waitlist_id]; ++ struct six_lock_waiter *w, *next; ++ ++ if (!(state.waiters & (1 << waitlist_id))) ++ return; ++ ++ clear_bit(waitlist_bitnr(waitlist_id), ++ (unsigned long *) &lock->state.v); ++ ++ raw_spin_lock(&lock->wait_lock); ++ ++ list_for_each_entry_safe(w, next, wait_list, list) { ++ list_del_init(&w->list); ++ ++ if (wake_up_process(w->task) && ++ waitlist_id != SIX_LOCK_read) { ++ if (!list_empty(wait_list)) ++ set_bit(waitlist_bitnr(waitlist_id), ++ (unsigned long *) &lock->state.v); ++ break; ++ } ++ } ++ ++ raw_spin_unlock(&lock->wait_lock); ++ } ++} ++ ++static __always_inline bool do_six_trylock_type(struct six_lock *lock, ++ enum six_lock_type type, ++ bool try) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old, new; ++ bool ret; ++ u64 v; ++ ++ EBUG_ON(type == SIX_LOCK_write && lock->owner != current); ++ EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1)); ++ ++ EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking))); ++ ++ /* ++ * Percpu reader mode: ++ * ++ * The basic idea behind this algorithm is that you can implement a lock ++ * between two threads without any atomics, just memory barriers: ++ * ++ * For two threads you'll need two variables, one variable for "thread a ++ * has the lock" and another for "thread b has the lock". 
++ * ++ * To take the lock, a thread sets its variable indicating that it holds ++ * the lock, then issues a full memory barrier, then reads from the ++ * other thread's variable to check if the other thread thinks it has ++ * the lock. If we raced, we backoff and retry/sleep. ++ */ ++ ++ if (type == SIX_LOCK_read && lock->readers) { ++retry: ++ preempt_disable(); ++ this_cpu_inc(*lock->readers); /* signal that we own lock */ ++ ++ smp_mb(); ++ ++ old.v = READ_ONCE(lock->state.v); ++ ret = !(old.v & l[type].lock_fail); ++ ++ this_cpu_sub(*lock->readers, !ret); ++ preempt_enable(); ++ ++ /* ++ * If we failed because a writer was trying to take the ++ * lock, issue a wakeup because we might have caused a ++ * spurious trylock failure: ++ */ ++ if (old.write_locking) { ++ struct task_struct *p = READ_ONCE(lock->owner); ++ ++ if (p) ++ wake_up_process(p); ++ } ++ ++ /* ++ * If we failed from the lock path and the waiting bit wasn't ++ * set, set it: ++ */ ++ if (!try && !ret) { ++ v = old.v; ++ ++ do { ++ new.v = old.v = v; ++ ++ if (!(old.v & l[type].lock_fail)) ++ goto retry; ++ ++ if (new.waiters & (1 << type)) ++ break; ++ ++ new.waiters |= 1 << type; ++ } while ((v = atomic64_cmpxchg(&lock->state.counter, ++ old.v, new.v)) != old.v); ++ } ++ } else if (type == SIX_LOCK_write && lock->readers) { ++ if (try) { ++ atomic64_add(__SIX_VAL(write_locking, 1), ++ &lock->state.counter); ++ smp_mb__after_atomic(); ++ } ++ ++ ret = !pcpu_read_count(lock); ++ ++ /* ++ * On success, we increment lock->seq; also we clear ++ * write_locking unless we failed from the lock path: ++ */ ++ v = 0; ++ if (ret) ++ v += __SIX_VAL(seq, 1); ++ if (ret || try) ++ v -= __SIX_VAL(write_locking, 1); ++ ++ if (try && !ret) { ++ old.v = atomic64_add_return(v, &lock->state.counter); ++ six_lock_wakeup(lock, old, SIX_LOCK_read); ++ } else { ++ atomic64_add(v, &lock->state.counter); ++ } ++ } else { ++ v = READ_ONCE(lock->state.v); ++ do { ++ new.v = old.v = v; ++ ++ if (!(old.v & l[type].lock_fail)) { ++ new.v += l[type].lock_val; ++ ++ if (type == SIX_LOCK_write) ++ new.write_locking = 0; ++ } else if (!try && type != SIX_LOCK_write && ++ !(new.waiters & (1 << type))) ++ new.waiters |= 1 << type; ++ else ++ break; /* waiting bit already set */ ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, new.v)) != old.v); ++ ++ ret = !(old.v & l[type].lock_fail); ++ ++ EBUG_ON(ret && !(lock->state.v & l[type].held_mask)); ++ } ++ ++ if (ret) ++ six_set_owner(lock, type, old); ++ ++ EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking)); ++ ++ return ret; ++} ++ ++__always_inline __flatten ++static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ if (!do_six_trylock_type(lock, type, true)) ++ return false; ++ ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 1); ++ return true; ++} ++ ++__always_inline __flatten ++static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old; ++ u64 v; ++ ++ EBUG_ON(type == SIX_LOCK_write); ++ ++ if (type == SIX_LOCK_read && ++ lock->readers) { ++ bool ret; ++ ++ preempt_disable(); ++ this_cpu_inc(*lock->readers); ++ ++ smp_mb(); ++ ++ old.v = READ_ONCE(lock->state.v); ++ ret = !(old.v & l[type].lock_fail) && old.seq == seq; ++ ++ this_cpu_sub(*lock->readers, !ret); ++ preempt_enable(); ++ ++ /* ++ * Similar to the lock path, we may have caused a spurious write ++ * lock fail and need to issue a 
wakeup: ++ */ ++ if (old.write_locking) { ++ struct task_struct *p = READ_ONCE(lock->owner); ++ ++ if (p) ++ wake_up_process(p); ++ } ++ ++ if (ret) ++ six_acquire(&lock->dep_map, 1); ++ ++ return ret; ++ } ++ ++ v = READ_ONCE(lock->state.v); ++ do { ++ old.v = v; ++ ++ if (old.seq != seq || old.v & l[type].lock_fail) ++ return false; ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, ++ old.v + l[type].lock_val)) != old.v); ++ ++ six_set_owner(lock, type, old); ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 1); ++ return true; ++} ++ ++#ifdef CONFIG_LOCK_SPIN_ON_OWNER ++ ++static inline int six_can_spin_on_owner(struct six_lock *lock) ++{ ++ struct task_struct *owner; ++ int retval = 1; ++ ++ if (need_resched()) ++ return 0; ++ ++ rcu_read_lock(); ++ owner = READ_ONCE(lock->owner); ++ if (owner) ++ retval = owner->on_cpu; ++ rcu_read_unlock(); ++ /* ++ * if lock->owner is not set, the mutex owner may have just acquired ++ * it and not set the owner yet or the mutex has been released. ++ */ ++ return retval; ++} ++ ++static inline bool six_spin_on_owner(struct six_lock *lock, ++ struct task_struct *owner) ++{ ++ bool ret = true; ++ ++ rcu_read_lock(); ++ while (lock->owner == owner) { ++ /* ++ * Ensure we emit the owner->on_cpu, dereference _after_ ++ * checking lock->owner still matches owner. If that fails, ++ * owner might point to freed memory. If it still matches, ++ * the rcu_read_lock() ensures the memory stays valid. ++ */ ++ barrier(); ++ ++ if (!owner->on_cpu || need_resched()) { ++ ret = false; ++ break; ++ } ++ ++ cpu_relax(); ++ } ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) ++{ ++ struct task_struct *task = current; ++ ++ if (type == SIX_LOCK_write) ++ return false; ++ ++ preempt_disable(); ++ if (!six_can_spin_on_owner(lock)) ++ goto fail; ++ ++ if (!osq_lock(&lock->osq)) ++ goto fail; ++ ++ while (1) { ++ struct task_struct *owner; ++ ++ /* ++ * If there's an owner, wait for it to either ++ * release the lock or go to sleep. ++ */ ++ owner = READ_ONCE(lock->owner); ++ if (owner && !six_spin_on_owner(lock, owner)) ++ break; ++ ++ if (do_six_trylock_type(lock, type, false)) { ++ osq_unlock(&lock->osq); ++ preempt_enable(); ++ return true; ++ } ++ ++ /* ++ * When there's no owner, we might have preempted between the ++ * owner acquiring the lock and setting the owner field. If ++ * we're an RT task that will live-lock because we won't let ++ * the owner complete. ++ */ ++ if (!owner && (need_resched() || rt_task(task))) ++ break; ++ ++ /* ++ * The cpu_relax() call is a compiler barrier which forces ++ * everything in this loop to be re-loaded. We don't need ++ * memory barriers as we'll eventually observe the right ++ * values at the cost of a few extra spins. ++ */ ++ cpu_relax(); ++ } ++ ++ osq_unlock(&lock->osq); ++fail: ++ preempt_enable(); ++ ++ /* ++ * If we fell out of the spin path because of need_resched(), ++ * reschedule now, before we try-lock again. This avoids getting ++ * scheduled out right after we obtained the lock. 
++ */ ++ if (need_resched()) ++ schedule(); ++ ++ return false; ++} ++ ++#else /* CONFIG_LOCK_SPIN_ON_OWNER */ ++ ++static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) ++{ ++ return false; ++} ++ ++#endif ++ ++noinline ++static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ union six_lock_state old; ++ struct six_lock_waiter wait; ++ int ret = 0; ++ ++ if (type == SIX_LOCK_write) { ++ EBUG_ON(lock->state.write_locking); ++ atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter); ++ smp_mb__after_atomic(); ++ } ++ ++ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; ++ if (ret) ++ goto out_before_sleep; ++ ++ if (six_optimistic_spin(lock, type)) ++ goto out_before_sleep; ++ ++ lock_contended(&lock->dep_map, _RET_IP_); ++ ++ INIT_LIST_HEAD(&wait.list); ++ wait.task = current; ++ ++ while (1) { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ if (type == SIX_LOCK_write) ++ EBUG_ON(lock->owner != current); ++ else if (list_empty_careful(&wait.list)) { ++ raw_spin_lock(&lock->wait_lock); ++ list_add_tail(&wait.list, &lock->wait_list[type]); ++ raw_spin_unlock(&lock->wait_lock); ++ } ++ ++ if (do_six_trylock_type(lock, type, false)) ++ break; ++ ++ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; ++ if (ret) ++ break; ++ ++ schedule(); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++ ++ if (!list_empty_careful(&wait.list)) { ++ raw_spin_lock(&lock->wait_lock); ++ list_del_init(&wait.list); ++ raw_spin_unlock(&lock->wait_lock); ++ } ++out_before_sleep: ++ if (ret && type == SIX_LOCK_write) { ++ old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1), ++ &lock->state.counter); ++ six_lock_wakeup(lock, old, SIX_LOCK_read); ++ } ++ ++ return ret; ++} ++ ++__always_inline ++static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ int ret; ++ ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 0); ++ ++ ret = do_six_trylock_type(lock, type, true) ? 
0 ++ : __six_lock_type_slowpath(lock, type, should_sleep_fn, p); ++ ++ if (ret && type != SIX_LOCK_write) ++ six_release(&lock->dep_map); ++ if (!ret) ++ lock_acquired(&lock->dep_map, _RET_IP_); ++ ++ return ret; ++} ++ ++__always_inline __flatten ++static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state state; ++ ++ EBUG_ON(type == SIX_LOCK_write && ++ !(lock->state.v & __SIX_LOCK_HELD_intent)); ++ ++ if (type != SIX_LOCK_write) ++ six_release(&lock->dep_map); ++ ++ if (type == SIX_LOCK_intent) { ++ EBUG_ON(lock->owner != current); ++ ++ if (lock->intent_lock_recurse) { ++ --lock->intent_lock_recurse; ++ return; ++ } ++ ++ lock->owner = NULL; ++ } ++ ++ if (type == SIX_LOCK_read && ++ lock->readers) { ++ smp_mb(); /* unlock barrier */ ++ this_cpu_dec(*lock->readers); ++ smp_mb(); /* between unlocking and checking for waiters */ ++ state.v = READ_ONCE(lock->state.v); ++ } else { ++ EBUG_ON(!(lock->state.v & l[type].held_mask)); ++ state.v = atomic64_add_return_release(l[type].unlock_val, ++ &lock->state.counter); ++ } ++ ++ six_lock_wakeup(lock, state, l[type].unlock_wakeup); ++} ++ ++#define __SIX_LOCK(type) \ ++bool six_trylock_##type(struct six_lock *lock) \ ++{ \ ++ return __six_trylock_type(lock, SIX_LOCK_##type); \ ++} \ ++EXPORT_SYMBOL_GPL(six_trylock_##type); \ ++ \ ++bool six_relock_##type(struct six_lock *lock, u32 seq) \ ++{ \ ++ return __six_relock_type(lock, SIX_LOCK_##type, seq); \ ++} \ ++EXPORT_SYMBOL_GPL(six_relock_##type); \ ++ \ ++int six_lock_##type(struct six_lock *lock, \ ++ six_lock_should_sleep_fn should_sleep_fn, void *p) \ ++{ \ ++ return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\ ++} \ ++EXPORT_SYMBOL_GPL(six_lock_##type); \ ++ \ ++void six_unlock_##type(struct six_lock *lock) \ ++{ \ ++ __six_unlock_type(lock, SIX_LOCK_##type); \ ++} \ ++EXPORT_SYMBOL_GPL(six_unlock_##type); ++ ++__SIX_LOCK(read) ++__SIX_LOCK(intent) ++__SIX_LOCK(write) ++ ++#undef __SIX_LOCK ++ ++/* Convert from intent to read: */ ++void six_lock_downgrade(struct six_lock *lock) ++{ ++ six_lock_increment(lock, SIX_LOCK_read); ++ six_unlock_intent(lock); ++} ++EXPORT_SYMBOL_GPL(six_lock_downgrade); ++ ++bool six_lock_tryupgrade(struct six_lock *lock) ++{ ++ union six_lock_state old, new; ++ u64 v = READ_ONCE(lock->state.v); ++ ++ do { ++ new.v = old.v = v; ++ ++ if (new.intent_lock) ++ return false; ++ ++ if (!lock->readers) { ++ EBUG_ON(!new.read_lock); ++ new.read_lock--; ++ } ++ ++ new.intent_lock = 1; ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, new.v)) != old.v); ++ ++ if (lock->readers) ++ this_cpu_dec(*lock->readers); ++ ++ six_set_owner(lock, SIX_LOCK_intent, old); ++ ++ return true; ++} ++EXPORT_SYMBOL_GPL(six_lock_tryupgrade); ++ ++bool six_trylock_convert(struct six_lock *lock, ++ enum six_lock_type from, ++ enum six_lock_type to) ++{ ++ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); ++ ++ if (to == from) ++ return true; ++ ++ if (to == SIX_LOCK_read) { ++ six_lock_downgrade(lock); ++ return true; ++ } else { ++ return six_lock_tryupgrade(lock); ++ } ++} ++EXPORT_SYMBOL_GPL(six_trylock_convert); ++ ++/* ++ * Increment read/intent lock count, assuming we already have it read or intent ++ * locked: ++ */ ++void six_lock_increment(struct six_lock *lock, enum six_lock_type type) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ ++ six_acquire(&lock->dep_map, 0); ++ ++ /* XXX: assert already locked, and that we don't overflow: */ ++ 
++ switch (type) { ++ case SIX_LOCK_read: ++ if (lock->readers) { ++ this_cpu_inc(*lock->readers); ++ } else { ++ EBUG_ON(!lock->state.read_lock && ++ !lock->state.intent_lock); ++ atomic64_add(l[type].lock_val, &lock->state.counter); ++ } ++ break; ++ case SIX_LOCK_intent: ++ EBUG_ON(!lock->state.intent_lock); ++ lock->intent_lock_recurse++; ++ break; ++ case SIX_LOCK_write: ++ BUG(); ++ break; ++ } ++} ++EXPORT_SYMBOL_GPL(six_lock_increment); ++ ++void six_lock_wakeup_all(struct six_lock *lock) ++{ ++ struct six_lock_waiter *w; ++ ++ raw_spin_lock(&lock->wait_lock); ++ ++ list_for_each_entry(w, &lock->wait_list[0], list) ++ wake_up_process(w->task); ++ list_for_each_entry(w, &lock->wait_list[1], list) ++ wake_up_process(w->task); ++ ++ raw_spin_unlock(&lock->wait_lock); ++} ++EXPORT_SYMBOL_GPL(six_lock_wakeup_all); ++ ++struct free_pcpu_rcu { ++ struct rcu_head rcu; ++ void __percpu *p; ++}; ++ ++static void free_pcpu_rcu_fn(struct rcu_head *_rcu) ++{ ++ struct free_pcpu_rcu *rcu = ++ container_of(_rcu, struct free_pcpu_rcu, rcu); ++ ++ free_percpu(rcu->p); ++ kfree(rcu); ++} ++ ++void six_lock_pcpu_free_rcu(struct six_lock *lock) ++{ ++ struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL); ++ ++ if (!rcu) ++ return; ++ ++ rcu->p = lock->readers; ++ lock->readers = NULL; ++ ++ call_rcu(&rcu->rcu, free_pcpu_rcu_fn); ++} ++EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu); ++ ++void six_lock_pcpu_free(struct six_lock *lock) ++{ ++ BUG_ON(lock->readers && pcpu_read_count(lock)); ++ BUG_ON(lock->state.read_lock); ++ ++ free_percpu(lock->readers); ++ lock->readers = NULL; ++} ++EXPORT_SYMBOL_GPL(six_lock_pcpu_free); ++ ++void six_lock_pcpu_alloc(struct six_lock *lock) ++{ ++#ifdef __KERNEL__ ++ if (!lock->readers) ++ lock->readers = alloc_percpu(unsigned); ++#endif ++} ++EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc); +diff --git a/kernel/module/main.c b/kernel/module/main.c +index 0548151dd933..55ba98a99387 100644 +--- a/kernel/module/main.c ++++ b/kernel/module/main.c +@@ -1608,9 +1608,7 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) + + void * __weak module_alloc(unsigned long size) + { +- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, +- GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, +- NUMA_NO_NODE, __builtin_return_address(0)); ++ return vmalloc_exec(size, GFP_KERNEL); + } + + bool __weak module_init_section(const char *name) +diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c +index 9ed5ce989415..3428568bb3f1 100644 +--- a/kernel/stacktrace.c ++++ b/kernel/stacktrace.c +@@ -151,6 +151,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, + put_task_stack(tsk); + return c.len; + } ++EXPORT_SYMBOL(stack_trace_save_tsk); + + /** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array +@@ -301,6 +302,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task, + save_stack_trace_tsk(task, &trace); + return trace.nr_entries; + } ++EXPORT_SYMBOL(stack_trace_save_tsk); + + /** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index b8dd54627075..26cfe909f9af 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -1673,15 +1673,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) + { + int len; + +- if (trace_seq_used(s) <= s->seq.readpos) ++ if (trace_seq_used(s) <= s->readpos) + return -EBUSY; + +- len = trace_seq_used(s) - s->seq.readpos; 
++ len = trace_seq_used(s) - s->readpos; + if (cnt > len) + cnt = len; +- memcpy(buf, s->buffer + s->seq.readpos, cnt); ++ memcpy(buf, s->buffer + s->readpos, cnt); + +- s->seq.readpos += cnt; ++ s->readpos += cnt; + return cnt; + } + +@@ -3728,11 +3728,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, + + static const char *show_buffer(struct trace_seq *s) + { +- struct seq_buf *seq = &s->seq; +- +- seq_buf_terminate(seq); +- +- return seq->buffer; ++ return printbuf_str(&s->seq); + } + + static DEFINE_STATIC_KEY_FALSE(trace_no_verify); +@@ -6759,12 +6755,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, + trace_access_lock(iter->cpu_file); + while (trace_find_next_entry_inc(iter) != NULL) { + enum print_line_t ret; +- int save_len = iter->seq.seq.len; ++ int save_pos = iter->seq.seq.pos; + + ret = print_trace_line(iter); + if (ret == TRACE_TYPE_PARTIAL_LINE) { + /* don't print partial lines */ +- iter->seq.seq.len = save_len; ++ iter->seq.seq.pos = save_pos; + break; + } + if (ret != TRACE_TYPE_NO_CONSUME) +@@ -6786,7 +6782,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, + + /* Now copy what we have to the user */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); +- if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq)) ++ if (iter->seq.readpos >= trace_seq_used(&iter->seq)) + trace_seq_init(&iter->seq); + + /* +@@ -6812,16 +6808,15 @@ static size_t + tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) + { + size_t count; +- int save_len; + int ret; + + /* Seq buffer is page-sized, exactly what we need. */ + for (;;) { +- save_len = iter->seq.seq.len; ++ unsigned save_pos = iter->seq.seq.pos; + ret = print_trace_line(iter); + + if (trace_seq_has_overflowed(&iter->seq)) { +- iter->seq.seq.len = save_len; ++ iter->seq.seq.pos = save_pos; + break; + } + +@@ -6831,14 +6826,14 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) + * anyway to be safe. + */ + if (ret == TRACE_TYPE_PARTIAL_LINE) { +- iter->seq.seq.len = save_len; ++ iter->seq.seq.pos = save_pos; + break; + } + +- count = trace_seq_used(&iter->seq) - save_len; ++ count = trace_seq_used(&iter->seq) - save_pos; + if (rem < count) { + rem = 0; +- iter->seq.seq.len = save_len; ++ iter->seq.seq.pos = save_pos; + break; + } + +@@ -9827,20 +9822,8 @@ static struct notifier_block trace_die_notifier = { + void + trace_printk_seq(struct trace_seq *s) + { +- /* Probably should print a warning here. */ +- if (s->seq.len >= TRACE_MAX_PRINT) +- s->seq.len = TRACE_MAX_PRINT; +- +- /* +- * More paranoid code. Although the buffer size is set to +- * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just +- * an extra layer of protection. +- */ +- if (WARN_ON_ONCE(s->seq.len >= s->seq.size)) +- s->seq.len = s->seq.size - 1; +- + /* should be zero ended, but we are paranoid. 
*/ +- s->buffer[s->seq.len] = 0; ++ printbuf_nul_terminate(&s->seq); + + printk(KERN_TRACE "%s", s->buffer); + +diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c +index 076b447a1b88..30a106c16871 100644 +--- a/kernel/trace/trace_dynevent.c ++++ b/kernel/trace/trace_dynevent.c +@@ -290,21 +290,19 @@ int dynevent_arg_add(struct dynevent_cmd *cmd, + struct dynevent_arg *arg, + dynevent_check_arg_fn_t check_arg) + { +- int ret = 0; +- + if (check_arg) { +- ret = check_arg(arg); ++ int ret = check_arg(arg); + if (ret) + return ret; + } + +- ret = seq_buf_printf(&cmd->seq, " %s%c", arg->str, arg->separator); +- if (ret) { ++ prt_printf(&cmd->seq, " %s%c", arg->str, arg->separator); ++ if (printbuf_overflowed(&cmd->seq)) { + pr_err("String is too long: %s%c\n", arg->str, arg->separator); + return -E2BIG; + } + +- return ret; ++ return 0; + } + + /** +@@ -335,25 +333,23 @@ int dynevent_arg_pair_add(struct dynevent_cmd *cmd, + struct dynevent_arg_pair *arg_pair, + dynevent_check_arg_fn_t check_arg) + { +- int ret = 0; +- + if (check_arg) { +- ret = check_arg(arg_pair); ++ int ret = check_arg(arg_pair); + if (ret) + return ret; + } + +- ret = seq_buf_printf(&cmd->seq, " %s%c%s%c", arg_pair->lhs, +- arg_pair->operator, arg_pair->rhs, +- arg_pair->separator); +- if (ret) { ++ prt_printf(&cmd->seq, " %s%c%s%c", arg_pair->lhs, ++ arg_pair->operator, arg_pair->rhs, ++ arg_pair->separator); ++ if (printbuf_overflowed(&cmd->seq)) { + pr_err("field string is too long: %s%c%s%c\n", arg_pair->lhs, + arg_pair->operator, arg_pair->rhs, + arg_pair->separator); + return -E2BIG; + } + +- return ret; ++ return 0; + } + + /** +@@ -368,15 +364,13 @@ int dynevent_arg_pair_add(struct dynevent_cmd *cmd, + */ + int dynevent_str_add(struct dynevent_cmd *cmd, const char *str) + { +- int ret = 0; +- +- ret = seq_buf_puts(&cmd->seq, str); +- if (ret) { ++ prt_str(&cmd->seq, str); ++ if (printbuf_overflowed(&cmd->seq)) { + pr_err("String is too long: %s\n", str); + return -E2BIG; + } + +- return ret; ++ return 0; + } + + /** +@@ -405,7 +399,7 @@ void dynevent_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen, + { + memset(cmd, '\0', sizeof(*cmd)); + +- seq_buf_init(&cmd->seq, buf, maxlen); ++ cmd->seq = PRINTBUF_EXTERN(buf, maxlen); + cmd->type = type; + cmd->run_command = run_command; + } +diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c +index 4b1057ab9d96..9d5137df1a15 100644 +--- a/kernel/trace/trace_events_filter.c ++++ b/kernel/trace/trace_events_filter.c +@@ -1059,7 +1059,7 @@ static void append_filter_err(struct trace_array *tr, + FILT_ERR_ERRNO, 0); + } + trace_seq_putc(s, 0); +- buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); ++ buf = kstrdup(printbuf_str(&s->seq), GFP_KERNEL); + if (buf) { + kfree(filter->filter_string); + filter->filter_string = buf; +diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c +index 5e8c07aef071..ddb2a2737b82 100644 +--- a/kernel/trace/trace_events_synth.c ++++ b/kernel/trace/trace_events_synth.c +@@ -5,13 +5,14 @@ + * Copyright (C) 2015, 2020 Tom Zanussi + */ + +-#include + #include +-#include ++#include + #include ++#include ++#include ++#include + #include + #include +-#include + #include + + /* for gfp flag names */ +@@ -611,7 +612,7 @@ static struct synth_field *parse_synth_field(int argc, char **argv, + const char *prefix = NULL, *field_type = argv[0], *field_name, *array; + struct synth_field *field; + int len, ret = -ENOMEM; +- struct seq_buf s; ++ struct printbuf s; + ssize_t 
size; + + if (!strcmp(field_type, "unsigned")) { +@@ -666,17 +667,15 @@ static struct synth_field *parse_synth_field(int argc, char **argv, + if (!field->type) + goto free; + +- seq_buf_init(&s, field->type, len); ++ s = PRINTBUF_EXTERN(field->type, len); + if (prefix) +- seq_buf_puts(&s, prefix); +- seq_buf_puts(&s, field_type); ++ prt_str(&s, prefix); ++ prt_str(&s, field_type); + if (array) +- seq_buf_puts(&s, array); +- if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) ++ prt_str(&s, array); ++ if (WARN_ON_ONCE(!printbuf_remaining(&s))) + goto free; + +- s.buffer[s.len] = '\0'; +- + size = synth_field_size(field->type); + if (size < 0) { + if (array) +@@ -694,13 +693,12 @@ static struct synth_field *parse_synth_field(int argc, char **argv, + if (!type) + goto free; + +- seq_buf_init(&s, type, len); +- seq_buf_puts(&s, "__data_loc "); +- seq_buf_puts(&s, field->type); ++ s = PRINTBUF_EXTERN(type, len); ++ prt_str(&s, "__data_loc "); ++ prt_str(&s, field->type); + +- if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) ++ if (WARN_ON_ONCE(!printbuf_remaining(&s))) + goto free; +- s.buffer[s.len] = '\0'; + + kfree(field->type); + field->type = type; +@@ -1514,7 +1512,7 @@ static int synth_event_run_command(struct dynevent_cmd *cmd) + struct synth_event *se; + int ret; + +- ret = create_or_delete_synth_event(cmd->seq.buffer); ++ ret = create_or_delete_synth_event(cmd->seq.buf); + if (ret) + return ret; + +diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c +index 203204cadf92..9f270fdde99b 100644 +--- a/kernel/trace/trace_functions_graph.c ++++ b/kernel/trace/trace_functions_graph.c +@@ -1022,9 +1022,9 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, + goto out; + + /* Strip ending newline */ +- if (s->buffer[s->seq.len - 1] == '\n') { +- s->buffer[s->seq.len - 1] = '\0'; +- s->seq.len--; ++ if (s->buffer[s->seq.pos - 1] == '\n') { ++ s->buffer[s->seq.pos - 1] = '\0'; ++ s->seq.pos--; + } + + trace_seq_puts(s, " */\n"); +diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c +index a245ea673715..c9f03c2d7c91 100644 +--- a/kernel/trace/trace_kprobe.c ++++ b/kernel/trace/trace_kprobe.c +@@ -915,7 +915,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) + + static int trace_kprobe_run_command(struct dynevent_cmd *cmd) + { +- return create_or_delete_trace_kprobe(cmd->seq.buffer); ++ return create_or_delete_trace_kprobe(printbuf_str(&cmd->seq)); + } + + /** +diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c +index 9c90b3a7dce2..48c08f29c342 100644 +--- a/kernel/trace/trace_seq.c ++++ b/kernel/trace/trace_seq.c +@@ -25,11 +25,9 @@ + */ + #include + #include ++#include + #include + +-/* How much buffer is left on the trace_seq? */ +-#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq) +- + /* + * trace_seq should work with being initialized with 0s. + */ +@@ -54,7 +52,7 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) + + __trace_seq_init(s); + +- ret = seq_buf_print_seq(m, &s->seq); ++ ret = seq_write(m, s->seq.buf, printbuf_written(&s->seq)); + + /* + * Only reset this buffer if we successfully wrote to the +@@ -80,7 +78,7 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) + */ + void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) + { +- unsigned int save_len = s->seq.len; ++ unsigned int save_pos = s->seq.pos; + va_list ap; + + if (s->full) +@@ -89,12 +87,12 @@ void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
+ __trace_seq_init(s); + + va_start(ap, fmt); +- seq_buf_vprintf(&s->seq, fmt, ap); ++ prt_vprintf(&s->seq, fmt, ap); + va_end(ap); + + /* If we can't write it all, don't bother writing anything */ +- if (unlikely(seq_buf_has_overflowed(&s->seq))) { +- s->seq.len = save_len; ++ if (unlikely(printbuf_overflowed(&s->seq))) { ++ s->seq.pos = save_pos; + s->full = 1; + } + } +@@ -111,17 +109,17 @@ EXPORT_SYMBOL_GPL(trace_seq_printf); + void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) + { +- unsigned int save_len = s->seq.len; ++ unsigned int save_pos = s->seq.pos; + + if (s->full) + return; + + __trace_seq_init(s); + +- seq_buf_printf(&s->seq, "%*pb", nmaskbits, maskp); ++ prt_printf(&s->seq, "%*pb", nmaskbits, maskp); + +- if (unlikely(seq_buf_has_overflowed(&s->seq))) { +- s->seq.len = save_len; ++ if (unlikely(printbuf_overflowed(&s->seq))) { ++ s->seq.pos = save_pos; + s->full = 1; + } + } +@@ -140,18 +138,18 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask); + */ + void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) + { +- unsigned int save_len = s->seq.len; ++ unsigned int save_pos = s->seq.pos; + + if (s->full) + return; + + __trace_seq_init(s); + +- seq_buf_vprintf(&s->seq, fmt, args); ++ prt_vprintf(&s->seq, fmt, args); + + /* If we can't write it all, don't bother writing anything */ +- if (unlikely(seq_buf_has_overflowed(&s->seq))) { +- s->seq.len = save_len; ++ if (unlikely(printbuf_overflowed(&s->seq))) { ++ s->seq.pos = save_pos; + s->full = 1; + } + } +@@ -174,18 +172,18 @@ EXPORT_SYMBOL_GPL(trace_seq_vprintf); + */ + void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) + { +- unsigned int save_len = s->seq.len; ++ unsigned int save_pos = s->seq.pos; + + if (s->full) + return; + + __trace_seq_init(s); + +- seq_buf_bprintf(&s->seq, fmt, binary); ++ prt_bstrprintf(&s->seq, fmt, binary); + + /* If we can't write it all, don't bother writing anything */ +- if (unlikely(seq_buf_has_overflowed(&s->seq))) { +- s->seq.len = save_len; ++ if (unlikely(!printbuf_overflowed(&s->seq))) { ++ s->seq.pos = save_pos; + s->full = 1; + return; + } +@@ -211,12 +209,12 @@ void trace_seq_puts(struct trace_seq *s, const char *str) + + __trace_seq_init(s); + +- if (len > TRACE_SEQ_BUF_LEFT(s)) { ++ if (len > printbuf_remaining(&s->seq)) { + s->full = 1; + return; + } + +- seq_buf_putmem(&s->seq, str, len); ++ prt_bytes(&s->seq, str, len); + } + EXPORT_SYMBOL_GPL(trace_seq_puts); + +@@ -237,12 +235,12 @@ void trace_seq_putc(struct trace_seq *s, unsigned char c) + + __trace_seq_init(s); + +- if (TRACE_SEQ_BUF_LEFT(s) < 1) { ++ if (!printbuf_remaining(&s->seq)) { + s->full = 1; + return; + } + +- seq_buf_putc(&s->seq, c); ++ prt_char(&s->seq, c); + } + EXPORT_SYMBOL_GPL(trace_seq_putc); + +@@ -263,12 +261,12 @@ void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) + + __trace_seq_init(s); + +- if (len > TRACE_SEQ_BUF_LEFT(s)) { ++ if (len > printbuf_remaining(&s->seq)) { + s->full = 1; + return; + } + +- seq_buf_putmem(&s->seq, mem, len); ++ prt_bytes(&s->seq, mem, len); + } + EXPORT_SYMBOL_GPL(trace_seq_putmem); + +@@ -285,24 +283,17 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem); + void trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + unsigned int len) + { +- unsigned int save_len = s->seq.len; ++ unsigned int save_pos = s->seq.pos; + + if (s->full) + return; + + __trace_seq_init(s); + +- /* Each byte is represented by two chars */ +- if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) { +- s->full = 1; +- 
return; +- } ++ prt_hex_bytes(&s->seq, mem, len, 8, ' '); + +- /* The added spaces can still cause an overflow */ +- seq_buf_putmem_hex(&s->seq, mem, len); +- +- if (unlikely(seq_buf_has_overflowed(&s->seq))) { +- s->seq.len = save_len; ++ if (unlikely(printbuf_overflowed(&s->seq))) { ++ s->seq.pos = save_pos; + s->full = 1; + return; + } +@@ -323,22 +314,22 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); + */ + int trace_seq_path(struct trace_seq *s, const struct path *path) + { +- unsigned int save_len = s->seq.len; ++ unsigned int save_pos = s->seq.pos; + + if (s->full) + return 0; + + __trace_seq_init(s); + +- if (TRACE_SEQ_BUF_LEFT(s) < 1) { ++ if (printbuf_remaining(&s->seq) < 1) { + s->full = 1; + return 0; + } + +- seq_buf_path(&s->seq, path, "\n"); ++ prt_path(&s->seq, path, "\n"); + +- if (unlikely(seq_buf_has_overflowed(&s->seq))) { +- s->seq.len = save_len; ++ if (unlikely(printbuf_overflowed(&s->seq))) { ++ s->seq.pos = save_pos; + s->full = 1; + return 0; + } +@@ -369,8 +360,25 @@ EXPORT_SYMBOL_GPL(trace_seq_path); + */ + int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) + { ++ int ret, len; ++ + __trace_seq_init(s); +- return seq_buf_to_user(&s->seq, ubuf, cnt); ++ ++ len = printbuf_written(&s->seq); ++ if (len <= s->readpos) ++ return -EBUSY; ++ ++ len -= s->readpos; ++ if (cnt > len) ++ cnt = len; ++ ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); ++ if (ret == cnt) ++ return -EFAULT; ++ ++ cnt -= ret; ++ ++ s->readpos += cnt; ++ return cnt; + } + EXPORT_SYMBOL_GPL(trace_seq_to_user); + +@@ -378,24 +386,19 @@ int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) + { +- unsigned int save_len = s->seq.len; ++ unsigned int save_pos = s->seq.pos; + + if (s->full) + return 0; + + __trace_seq_init(s); + +- if (TRACE_SEQ_BUF_LEFT(s) < 1) { +- s->full = 1; +- return 0; +- } +- +- seq_buf_hex_dump(&(s->seq), prefix_str, +- prefix_type, rowsize, groupsize, +- buf, len, ascii); ++ prt_hex_dump(&s->seq, buf, len, ++ prefix_str, prefix_type, ++ rowsize, groupsize, ascii); + +- if (unlikely(seq_buf_has_overflowed(&s->seq))) { +- s->seq.len = save_len; ++ if (unlikely(printbuf_overflowed(&s->seq))) { ++ s->seq.pos = save_pos; + s->full = 1; + return 0; + } +diff --git a/lib/Kconfig b/lib/Kconfig +index eaaad4d85bf2..8eb7050fb422 100644 +--- a/lib/Kconfig ++++ b/lib/Kconfig +@@ -491,6 +491,9 @@ config ASSOCIATIVE_ARRAY + + for more information. + ++config CLOSURES ++ bool ++ + config HAS_IOMEM + bool + depends on !NO_IOMEM +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 2e24db4bff19..1d4ed12a5355 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -1646,6 +1646,15 @@ config DEBUG_CREDENTIALS + + source "kernel/rcu/Kconfig.debug" + ++config DEBUG_CLOSURES ++ bool "Debug closures (bcache async widgits)" ++ depends on CLOSURES ++ select DEBUG_FS ++ help ++ Keeps all active closures in a linked list and provides a debugfs ++ interface to list them, which makes it possible to see asynchronous ++ operations that get stuck. 
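As a hedged illustration of what this option makes visible (the function and variable names below are invented for the sketch, and it assumes closure_init() registers the closure for debugging the way the bcache closure code does):

	/* illustrative sketch only, not an addition to the patch */
	static struct closure example_cl;

	static void example_start_async_op(void)
	{
		closure_init(&example_cl, NULL);	/* take the initial reference */
		/*
		 * Until a matching closure_put(&example_cl) drops that reference,
		 * the closure stays on the global closure_list and is listed by
		 * the "closures" debugfs file created further down in this patch.
		 */
	}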
++ + config DEBUG_WQ_FORCE_RR_CPU + bool "Force round-robin CPU selection for unbound work items" + depends on DEBUG_KERNEL +diff --git a/lib/Makefile b/lib/Makefile +index f99bf61f8bbc..d24209a59df9 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -30,11 +30,11 @@ endif + lib-y := ctype.o string.o vsprintf.o cmdline.o \ + rbtree.o radix-tree.o timerqueue.o xarray.o \ + idr.o extable.o sha1.o irq_regs.o argv_split.o \ +- flex_proportions.o ratelimit.o show_mem.o \ ++ flex_proportions.o ratelimit.o \ + is_single_threaded.o plist.o decompress.o kobject_uevent.o \ +- earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ ++ earlycpio.o siphash.o dec_and_lock.o \ + nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o \ +- buildid.o ++ buildid.o printbuf.o pretty-printers.o + + lib-$(CONFIG_PRINTK) += dump_stack.o + lib-$(CONFIG_SMP) += cpumask.o +@@ -241,6 +241,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o + + obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o + ++obj-$(CONFIG_CLOSURES) += closure.o ++ + obj-$(CONFIG_DQL) += dynamic_queue_limits.o + + obj-$(CONFIG_GLOB) += glob.o +diff --git a/drivers/md/bcache/closure.c b/lib/closure.c +similarity index 88% +rename from drivers/md/bcache/closure.c +rename to lib/closure.c +index d8d9394a6beb..b38ded00b9b0 100644 +--- a/drivers/md/bcache/closure.c ++++ b/lib/closure.c +@@ -6,13 +6,12 @@ + * Copyright 2012 Google, Inc. + */ + ++#include + #include +-#include ++#include + #include + #include + +-#include "closure.h" +- + static inline void closure_put_after_sub(struct closure *cl, int flags) + { + int r = flags & CLOSURE_REMAINING_MASK; +@@ -45,6 +44,7 @@ void closure_sub(struct closure *cl, int v) + { + closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); + } ++EXPORT_SYMBOL(closure_sub); + + /* + * closure_put - decrement a closure's refcount +@@ -53,6 +53,7 @@ void closure_put(struct closure *cl) + { + closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); + } ++EXPORT_SYMBOL(closure_put); + + /* + * closure_wake_up - wake up all closures on a wait list, without memory barrier +@@ -74,6 +75,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) + closure_sub(cl, CLOSURE_WAITING + 1); + } + } ++EXPORT_SYMBOL(__closure_wake_up); + + /** + * closure_wait - add a closure to a waitlist +@@ -93,6 +95,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) + + return true; + } ++EXPORT_SYMBOL(closure_wait); + + struct closure_syncer { + struct task_struct *task; +@@ -127,8 +130,9 @@ void __sched __closure_sync(struct closure *cl) + + __set_current_state(TASK_RUNNING); + } ++EXPORT_SYMBOL(__closure_sync); + +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG ++#ifdef CONFIG_DEBUG_CLOSURES + + static LIST_HEAD(closure_list); + static DEFINE_SPINLOCK(closure_list_lock); +@@ -144,6 +148,7 @@ void closure_debug_create(struct closure *cl) + list_add(&cl->all, &closure_list); + spin_unlock_irqrestore(&closure_list_lock, flags); + } ++EXPORT_SYMBOL(closure_debug_create); + + void closure_debug_destroy(struct closure *cl) + { +@@ -156,8 +161,7 @@ void closure_debug_destroy(struct closure *cl) + list_del(&cl->all); + spin_unlock_irqrestore(&closure_list_lock, flags); + } +- +-static struct dentry *closure_debug; ++EXPORT_SYMBOL(closure_debug_destroy); + + static int debug_show(struct seq_file *f, void *data) + { +@@ -181,7 +185,7 @@ static int debug_show(struct seq_file *f, void *data) + seq_printf(f, " W %pS\n", + (void *) cl->waiting_on); + +- seq_printf(f, "\n"); ++ seq_puts(f, "\n"); + } + + spin_unlock_irq(&closure_list_lock); 
+@@ -190,18 +194,11 @@ static int debug_show(struct seq_file *f, void *data) + + DEFINE_SHOW_ATTRIBUTE(debug); + +-void __init closure_debug_init(void) ++static int __init closure_debug_init(void) + { +- if (!IS_ERR_OR_NULL(bcache_debug)) +- /* +- * it is unnecessary to check return value of +- * debugfs_create_file(), we should not care +- * about this. +- */ +- closure_debug = debugfs_create_file( +- "closures", 0400, bcache_debug, NULL, &debug_fops); ++ debugfs_create_file("closures", 0400, NULL, NULL, &debug_fops); ++ return 0; + } +-#endif ++late_initcall(closure_debug_init) + +-MODULE_AUTHOR("Kent Overstreet "); +-MODULE_LICENSE("GPL"); ++#endif +diff --git a/lib/errname.c b/lib/errname.c +index 05cbf731545f..82ea4778f478 100644 +--- a/lib/errname.c ++++ b/lib/errname.c +@@ -222,3 +222,4 @@ const char *errname(int err) + + return err > 0 ? name + 1 : name; + } ++EXPORT_SYMBOL(errname); +diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c +index f25eb111c051..41f1bcdc4488 100644 +--- a/lib/generic-radix-tree.c ++++ b/lib/generic-radix-tree.c +@@ -1,4 +1,5 @@ + ++#include + #include + #include + #include +@@ -166,6 +167,10 @@ void *__genradix_iter_peek(struct genradix_iter *iter, + struct genradix_root *r; + struct genradix_node *n; + unsigned level, i; ++ ++ if (iter->offset == SIZE_MAX) ++ return NULL; ++ + restart: + r = READ_ONCE(radix->root); + if (!r) +@@ -184,10 +189,17 @@ void *__genradix_iter_peek(struct genradix_iter *iter, + (GENRADIX_ARY - 1); + + while (!n->children[i]) { ++ size_t objs_per_ptr = genradix_depth_size(level); ++ ++ if (iter->offset + objs_per_ptr < iter->offset) { ++ iter->offset = SIZE_MAX; ++ iter->pos = SIZE_MAX; ++ return NULL; ++ } ++ + i++; +- iter->offset = round_down(iter->offset + +- genradix_depth_size(level), +- genradix_depth_size(level)); ++ iter->offset = round_down(iter->offset + objs_per_ptr, ++ objs_per_ptr); + iter->pos = (iter->offset >> PAGE_SHIFT) * + objs_per_page; + if (i == GENRADIX_ARY) +@@ -201,6 +213,64 @@ void *__genradix_iter_peek(struct genradix_iter *iter, + } + EXPORT_SYMBOL(__genradix_iter_peek); + ++void *__genradix_iter_peek_prev(struct genradix_iter *iter, ++ struct __genradix *radix, ++ size_t objs_per_page, ++ size_t obj_size_plus_page_remainder) ++{ ++ struct genradix_root *r; ++ struct genradix_node *n; ++ unsigned level, i; ++ ++ if (iter->offset == SIZE_MAX) ++ return NULL; ++ ++restart: ++ r = READ_ONCE(radix->root); ++ if (!r) ++ return NULL; ++ ++ n = genradix_root_to_node(r); ++ level = genradix_root_to_depth(r); ++ ++ if (ilog2(iter->offset) >= genradix_depth_shift(level)) { ++ iter->offset = genradix_depth_size(level); ++ iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; ++ ++ iter->offset -= obj_size_plus_page_remainder; ++ iter->pos--; ++ } ++ ++ while (level) { ++ level--; ++ ++ i = (iter->offset >> genradix_depth_shift(level)) & ++ (GENRADIX_ARY - 1); ++ ++ while (!n->children[i]) { ++ size_t objs_per_ptr = genradix_depth_size(level); ++ ++ iter->offset = round_down(iter->offset, objs_per_ptr); ++ iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; ++ ++ if (!iter->offset) ++ return NULL; ++ ++ iter->offset -= obj_size_plus_page_remainder; ++ iter->pos--; ++ ++ if (!i) ++ goto restart; ++ --i; ++ } ++ ++ n = n->children[i]; ++ } ++ ++ return &n->data[iter->offset & (PAGE_SIZE - 1)]; ++} ++EXPORT_SYMBOL(__genradix_iter_peek_prev); ++ + static void genradix_free_recurse(struct genradix_node *n, unsigned level) + { + if (level) { +diff --git a/lib/hexdump.c b/lib/hexdump.c +index 
06833d404398..9556f15ad295 100644 +--- a/lib/hexdump.c ++++ b/lib/hexdump.c +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + #include + + const char hex_asc[] = "0123456789abcdef"; +@@ -79,32 +80,40 @@ int hex2bin(u8 *dst, const char *src, size_t count) + EXPORT_SYMBOL(hex2bin); + + /** +- * bin2hex - convert binary data to an ascii hexadecimal string +- * @dst: ascii hexadecimal result +- * @src: binary data +- * @count: binary data length ++ * prt_hex_bytes - Print a string of hex bytes, with optional separator ++ * ++ * @out: The printbuf to output to ++ * @addr: Buffer to print ++ * @nr: Number of bytes to print ++ * @separator: Optional separator character between each byte + */ +-char *bin2hex(char *dst, const void *src, size_t count) ++void prt_hex_bytes(struct printbuf *out, const void *buf, unsigned len, ++ unsigned groupsize, unsigned separator) + { +- const unsigned char *_src = src; ++ const u8 *ptr = buf; ++ unsigned i; + +- while (count--) +- dst = hex_byte_pack(dst, *_src++); +- return dst; ++ if (!groupsize) ++ groupsize = 1; ++ ++ for (i = 0; i < len ; ++i) { ++ if (i && separator && !(i % groupsize)) ++ __prt_char(out, separator); ++ prt_hex_byte(out, ptr[i]); ++ } + } +-EXPORT_SYMBOL(bin2hex); ++EXPORT_SYMBOL(prt_hex_bytes); + + /** +- * hex_dump_to_buffer - convert a blob of data to "hex ASCII" in memory ++ * prt_hex_line - convert a blob of data to "hex ASCII" in memory ++ * @out: printbuf to output to + * @buf: data blob to dump + * @len: number of bytes in the @buf + * @rowsize: number of bytes to print per line; must be 16 or 32 + * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) +- * @linebuf: where to put the converted data +- * @linebuflen: total size of @linebuf, including space for terminating NUL + * @ascii: include ASCII after the hex output + * +- * hex_dump_to_buffer() works on one "line" of output at a time, i.e., ++ * prt_hex_line() works on one "line" of output at a time, i.e., + * 16 or 32 bytes of input data converted to hex + ASCII output. + * + * Given a buffer of u8 data, hex_dump_to_buffer() converts the input data +@@ -117,22 +126,13 @@ EXPORT_SYMBOL(bin2hex); + * + * example output buffer: + * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO +- * +- * Return: +- * The amount of bytes placed in the buffer without terminating NUL. If the +- * output was truncated, then the return value is the number of bytes +- * (excluding the terminating NUL) which would have been written to the final +- * string if enough space had been available. + */ +-int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, +- char *linebuf, size_t linebuflen, bool ascii) ++void prt_hex_line(struct printbuf *out, const void *buf, size_t len, ++ int rowsize, int groupsize, bool ascii) + { ++ unsigned saved_pos = out->pos; + const u8 *ptr = buf; +- int ngroups; +- u8 ch; +- int j, lx = 0; +- int ascii_column; +- int ret; ++ int i, ngroups; + + if (rowsize != 16 && rowsize != 32) + rowsize = 16; +@@ -145,84 +145,127 @@ int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, + groupsize = 1; + + ngroups = len / groupsize; +- ascii_column = rowsize * 2 + rowsize / groupsize + 1; +- +- if (!linebuflen) +- goto overflow1; + + if (!len) +- goto nil; +- +- if (groupsize == 8) { +- const u64 *ptr8 = buf; +- +- for (j = 0; j < ngroups; j++) { +- ret = snprintf(linebuf + lx, linebuflen - lx, +- "%s%16.16llx", j ? 
" " : "", +- get_unaligned(ptr8 + j)); +- if (ret >= linebuflen - lx) +- goto overflow1; +- lx += ret; +- } +- } else if (groupsize == 4) { +- const u32 *ptr4 = buf; +- +- for (j = 0; j < ngroups; j++) { +- ret = snprintf(linebuf + lx, linebuflen - lx, +- "%s%8.8x", j ? " " : "", +- get_unaligned(ptr4 + j)); +- if (ret >= linebuflen - lx) +- goto overflow1; +- lx += ret; +- } +- } else if (groupsize == 2) { +- const u16 *ptr2 = buf; +- +- for (j = 0; j < ngroups; j++) { +- ret = snprintf(linebuf + lx, linebuflen - lx, +- "%s%4.4x", j ? " " : "", +- get_unaligned(ptr2 + j)); +- if (ret >= linebuflen - lx) +- goto overflow1; +- lx += ret; +- } +- } else { +- for (j = 0; j < len; j++) { +- if (linebuflen < lx + 2) +- goto overflow2; +- ch = ptr[j]; +- linebuf[lx++] = hex_asc_hi(ch); +- if (linebuflen < lx + 2) +- goto overflow2; +- linebuf[lx++] = hex_asc_lo(ch); +- if (linebuflen < lx + 2) +- goto overflow2; +- linebuf[lx++] = ' '; ++ return; ++ ++ prt_hex_bytes(out, ptr, len, groupsize, ' '); ++ ++ if (ascii) { ++ unsigned ascii_column = rowsize * 2 + rowsize / groupsize + 1; ++ ++ prt_chars(out, ' ', max_t(int, 0, ascii_column - (out->pos - saved_pos))); ++ ++ for (i = 0; i < len; i++) { ++ u8 ch = ptr[i]; ++ prt_char(out, isascii(ch) && isprint(ch) ? ch : '.'); + } +- if (j) +- lx--; + } +- if (!ascii) +- goto nil; ++} ++EXPORT_SYMBOL(prt_hex_line); + +- while (lx < ascii_column) { +- if (linebuflen < lx + 2) +- goto overflow2; +- linebuf[lx++] = ' '; +- } +- for (j = 0; j < len; j++) { +- if (linebuflen < lx + 2) +- goto overflow2; +- ch = ptr[j]; +- linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.'; ++/** ++ * prt_hex_dump - print multiline formatted hex dump ++ * @out: printbuf to output to ++ * @buf: data blob to dump ++ * @len: number of bytes in the @buf ++ * @prefix_str: string to prefix each line with; ++ * caller supplies trailing spaces for alignment if desired ++ * @prefix_type: controls whether prefix of an offset, address, or none ++ * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) ++ * @rowsize: number of bytes to print per line; must be 16 or 32 ++ * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) ++ * @ascii: include ASCII after the hex output ++ * ++ * Function is an analogue of print_hex_dump() and thus has similar interface. ++ * ++ * linebuf size is maximal length for one line. ++ * 32 * 3 - maximum bytes per line, each printed into 2 chars + 1 for ++ * separating space ++ * 2 - spaces separating hex dump and ascii representation ++ * 32 - ascii representation ++ * 1 - terminating '\0' ++ */ ++void prt_hex_dump(struct printbuf *out, const void *buf, size_t len, ++ const char *prefix_str, int prefix_type, ++ unsigned rowsize, unsigned groupsize, bool ascii) ++{ ++ const u8 *ptr = buf; ++ size_t i; ++ ++ if (rowsize != 16 && rowsize != 32) ++ rowsize = 16; ++ ++ for (i = 0; i < len; i += rowsize) { ++ prt_str(out, prefix_str); ++ ++ switch (prefix_type) { ++ case DUMP_PREFIX_ADDRESS: ++ prt_printf(out, "%p: ", ptr + i); ++ break; ++ case DUMP_PREFIX_OFFSET: ++ prt_printf(out, "%.8zx: ", i); ++ break; ++ } ++ ++ prt_hex_line(out, ptr + i, min_t(size_t, len - i, rowsize), ++ rowsize, groupsize, ascii); ++ prt_char(out, '\n'); + } +-nil: +- linebuf[lx] = '\0'; +- return lx; +-overflow2: +- linebuf[lx++] = '\0'; +-overflow1: +- return ascii ? 
ascii_column + len : (groupsize * 2 + 1) * ngroups - 1; ++} ++ ++/** ++ * bin2hex - convert binary data to an ascii hexadecimal string ++ * @dst: ascii hexadecimal result ++ * @src: binary data ++ * @count: binary data length ++ */ ++char *bin2hex(char *dst, const void *src, size_t count) ++{ ++ struct printbuf out = PRINTBUF_EXTERN(dst, count * 4); ++ ++ prt_hex_bytes(&out, src, count, 0, 0); ++ return dst + out.pos; ++} ++EXPORT_SYMBOL(bin2hex); ++ ++/** ++ * hex_dump_to_buffer - convert a blob of data to "hex ASCII" in memory ++ * @buf: data blob to dump ++ * @len: number of bytes in the @buf ++ * @rowsize: number of bytes to print per line; must be 16 or 32 ++ * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) ++ * @linebuf: where to put the converted data ++ * @linebuflen: total size of @linebuf, including space for terminating NUL ++ * @ascii: include ASCII after the hex output ++ * ++ * hex_dump_to_buffer() works on one "line" of output at a time, i.e., ++ * 16 or 32 bytes of input data converted to hex + ASCII output. ++ * ++ * Given a buffer of u8 data, hex_dump_to_buffer() converts the input data ++ * to a hex + ASCII dump at the supplied memory location. ++ * The converted output is always NUL-terminated. ++ * ++ * E.g.: ++ * hex_dump_to_buffer(frame->data, frame->len, 16, 1, ++ * linebuf, sizeof(linebuf), true); ++ * ++ * example output buffer: ++ * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO ++ * ++ * Return: ++ * The amount of bytes placed in the buffer without terminating NUL. If the ++ * output was truncated, then the return value is the number of bytes ++ * (excluding the terminating NUL) which would have been written to the final ++ * string if enough space had been available. ++ */ ++int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, ++ char *linebuf, size_t linebuflen, bool ascii) ++{ ++ struct printbuf out = PRINTBUF_EXTERN(linebuf, linebuflen); ++ ++ prt_hex_line(&out, buf, len, rowsize, groupsize, ascii); ++ return out.pos; + } + EXPORT_SYMBOL(hex_dump_to_buffer); + +@@ -262,6 +305,11 @@ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, + int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) + { ++ /* ++ * XXX: this code does the exact same thing as prt_hex_dump(): we should ++ * be able to call that and printk() the result, except printk is ++ * restricted to 1024 bytes of output per call ++ */ + const u8 *ptr = buf; + int i, linelen, remaining = len; + unsigned char linebuf[32 * 3 + 2 + 32 + 1]; +diff --git a/lib/pretty-printers.c b/lib/pretty-printers.c +new file mode 100644 +index 000000000000..addbac95e065 +--- /dev/null ++++ b/lib/pretty-printers.c +@@ -0,0 +1,60 @@ ++// SPDX-License-Identifier: LGPL-2.1+ ++/* Copyright (C) 2022 Kent Overstreet */ ++ ++#include ++#include ++#include ++#include ++ ++/** ++ * prt_string_option - Given a list of strings, print out the list and indicate ++ * which option is selected, with square brackets (sysfs style) ++ * ++ * @out: The printbuf to output to ++ * @list: List of strings to choose from ++ * @selected: The option to highlight, with square brackets ++ */ ++void prt_string_option(struct printbuf *out, ++ const char * const list[], ++ size_t selected) ++{ ++ size_t i; ++ ++ for (i = 0; list[i]; i++) { ++ if (i) ++ prt_char(out, ' '); ++ if (i == selected) ++ prt_char(out, '['); ++ prt_str(out, list[i]); ++ if (i == selected) ++ prt_char(out, ']'); ++ } ++} ++EXPORT_SYMBOL(prt_string_option); ++ ++/** 
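/*
 * Illustrative aside, not part of the patch: prt_string_option() above pairs
 * naturally with PRINTBUF_EXTERN() (used elsewhere in this patch) for
 * sysfs-style "pick one" output.  A minimal sketch under stated assumptions:
 * the header paths, the mode names and demo_show_mode() are all made up for
 * the example.
 */
#include <linux/printbuf.h>		/* assumed header for the printbuf API */
#include <linux/pretty-printers.h>	/* assumed header for prt_string_option() */

static ssize_t demo_show_mode(char *page, size_t selected)
{
	static const char * const modes[] = {
		"writethrough", "writeback", "none", NULL	/* list must be NULL terminated */
	};
	struct printbuf out = PRINTBUF_EXTERN(page, PAGE_SIZE);

	/* prints e.g. "writethrough [writeback] none" */
	prt_string_option(&out, modes, selected);
	prt_char(&out, '\n');

	return out.pos;		/* number of bytes written */
}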
++ * prt_bitflags: Given a bitmap and a list of names for each bit, print out which ++ * bits are on, comma separated ++ * ++ * @out: The printbuf to output to ++ * @list: List of names for each bit ++ * @flags: Bits to print ++ */ ++void prt_bitflags(struct printbuf *out, ++ const char * const list[], u64 flags) ++{ ++ unsigned bit, nr = 0; ++ bool first = true; ++ ++ while (list[nr]) ++ nr++; ++ ++ while (flags && (bit = __ffs(flags)) < nr) { ++ if (!first) ++ prt_char(out, ','); ++ first = false; ++ prt_str(out, list[bit]); ++ flags ^= 1 << bit; ++ } ++} ++EXPORT_SYMBOL(prt_bitflags); +diff --git a/lib/printbuf.c b/lib/printbuf.c +new file mode 100644 +index 000000000000..047470025748 +--- /dev/null ++++ b/lib/printbuf.c +@@ -0,0 +1,258 @@ ++// SPDX-License-Identifier: LGPL-2.1+ ++/* Copyright (C) 2022 Kent Overstreet */ ++ ++#ifdef __KERNEL__ ++#include ++#include ++#else ++#define EXPORT_SYMBOL(x) ++#endif ++ ++#include ++#include ++#include ++#include ++ ++static inline size_t printbuf_linelen(struct printbuf *buf) ++{ ++ return buf->pos - buf->last_newline; ++} ++ ++int printbuf_make_room(struct printbuf *out, unsigned extra) ++{ ++ unsigned new_size; ++ char *buf; ++ ++ if (!out->heap_allocated) ++ return 0; ++ ++ /* Reserved space for terminating nul: */ ++ extra += 1; ++ ++ if (out->pos + extra < out->size) ++ return 0; ++ ++ new_size = roundup_pow_of_two(out->size + extra); ++ ++ /* ++ * Note: output buffer must be freeable with kfree(), it's not required ++ * that the user use printbuf_exit(). ++ */ ++ buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT); ++ ++ if (!buf) { ++ out->allocation_failure = true; ++ return -ENOMEM; ++ } ++ ++ out->buf = buf; ++ out->size = new_size; ++ return 0; ++} ++EXPORT_SYMBOL(printbuf_make_room); ++ ++/** ++ * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null ++ * terminated ++ */ ++const char *printbuf_str(const struct printbuf *buf) ++{ ++ /* ++ * If we've written to a printbuf then it's guaranteed to be a null ++ * terminated string - but if we haven't, then we might not have ++ * allocated a buffer at all: ++ */ ++ return buf->pos ++ ? buf->buf ++ : ""; ++} ++EXPORT_SYMBOL(printbuf_str); ++ ++/** ++ * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it ++ * against accidental use. ++ */ ++void printbuf_exit(struct printbuf *buf) ++{ ++ if (buf->heap_allocated) { ++ kfree(buf->buf); ++ buf->buf = ERR_PTR(-EINTR); /* poison value */ ++ } ++} ++EXPORT_SYMBOL(printbuf_exit); ++ ++void prt_newline(struct printbuf *buf) ++{ ++ unsigned i; ++ ++ printbuf_make_room(buf, 1 + buf->indent); ++ ++ __prt_char(buf, '\n'); ++ ++ buf->last_newline = buf->pos; ++ ++ for (i = 0; i < buf->indent; i++) ++ __prt_char(buf, ' '); ++ ++ printbuf_nul_terminate(buf); ++ ++ buf->last_field = buf->pos; ++ buf->tabstop = 0; ++} ++EXPORT_SYMBOL(prt_newline); ++ ++/** ++ * printbuf_indent_add - add to the current indent level ++ * ++ * @buf: printbuf to control ++ * @spaces: number of spaces to add to the current indent level ++ * ++ * Subsequent lines, and the current line if the output position is at the start ++ * of the current line, will be indented by @spaces more spaces. 
++ */ ++void printbuf_indent_add(struct printbuf *buf, unsigned spaces) ++{ ++ if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) ++ spaces = 0; ++ ++ buf->indent += spaces; ++ while (spaces--) ++ prt_char(buf, ' '); ++} ++EXPORT_SYMBOL(printbuf_indent_add); ++ ++/** ++ * printbuf_indent_sub - subtract from the current indent level ++ * ++ * @buf: printbuf to control ++ * @spaces: number of spaces to subtract from the current indent level ++ * ++ * Subsequent lines, and the current line if the output position is at the start ++ * of the current line, will be indented by @spaces less spaces. ++ */ ++void printbuf_indent_sub(struct printbuf *buf, unsigned spaces) ++{ ++ if (WARN_ON_ONCE(spaces > buf->indent)) ++ spaces = buf->indent; ++ ++ if (buf->last_newline + buf->indent == buf->pos) { ++ buf->pos -= spaces; ++ printbuf_nul_terminate(buf); ++ } ++ buf->indent -= spaces; ++} ++EXPORT_SYMBOL(printbuf_indent_sub); ++ ++/** ++ * prt_tab - Advance printbuf to the next tabstop ++ * ++ * @buf: printbuf to control ++ * ++ * Advance output to the next tabstop by printing spaces. ++ */ ++void prt_tab(struct printbuf *out) ++{ ++ int spaces = max_t(int, 0, out->tabstops[out->tabstop] - printbuf_linelen(out)); ++ ++ BUG_ON(out->tabstop > ARRAY_SIZE(out->tabstops)); ++ ++ prt_chars(out, ' ', spaces); ++ ++ out->last_field = out->pos; ++ out->tabstop++; ++} ++EXPORT_SYMBOL(prt_tab); ++ ++/** ++ * prt_tab_rjust - Advance printbuf to the next tabstop, right justifying ++ * previous output ++ * ++ * @buf: printbuf to control ++ * ++ * Advance output to the next tabstop by inserting spaces immediately after the ++ * previous tabstop, right justifying previously outputted text. ++ */ ++void prt_tab_rjust(struct printbuf *buf) ++{ ++ BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); ++ ++ if (printbuf_linelen(buf) < buf->tabstops[buf->tabstop]) { ++ unsigned move = buf->pos - buf->last_field; ++ unsigned shift = buf->tabstops[buf->tabstop] - ++ printbuf_linelen(buf); ++ ++ printbuf_make_room(buf, shift); ++ ++ if (buf->last_field + shift < buf->size) ++ memmove(buf->buf + buf->last_field + shift, ++ buf->buf + buf->last_field, ++ min(move, buf->size - 1 - buf->last_field - shift)); ++ ++ if (buf->last_field < buf->size) ++ memset(buf->buf + buf->last_field, ' ', ++ min(shift, buf->size - buf->last_field)); ++ ++ buf->pos += shift; ++ printbuf_nul_terminate(buf); ++ } ++ ++ buf->last_field = buf->pos; ++ buf->tabstop++; ++} ++EXPORT_SYMBOL(prt_tab_rjust); ++ ++/** ++ * prt_human_readable_u64 - Print out a u64 in human readable units ++ * ++ * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units ++ */ ++void prt_human_readable_u64(struct printbuf *buf, u64 v) ++{ ++ printbuf_make_room(buf, 10); ++ buf->pos += string_get_size(v, 1, !buf->si_units, ++ buf->buf + buf->pos, ++ printbuf_remaining_size(buf)); ++} ++EXPORT_SYMBOL(prt_human_readable_u64); ++ ++/** ++ * prt_human_readable_s64 - Print out a s64 in human readable units ++ * ++ * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units ++ */ ++void prt_human_readable_s64(struct printbuf *buf, s64 v) ++{ ++ if (v < 0) ++ prt_char(buf, '-'); ++ prt_human_readable_u64(buf, abs(v)); ++} ++EXPORT_SYMBOL(prt_human_readable_s64); ++ ++/** ++ * prt_units_u64 - Print out a u64 according to printbuf unit options ++ * ++ * Units are either raw (default), or human reabable units (controlled via ++ * @buf->human_readable_units) ++ */ ++void prt_units_u64(struct printbuf *out, u64 v) ++{ ++ if (out->human_readable_units) ++ 
prt_human_readable_u64(out, v); ++ else ++ prt_printf(out, "%llu", v); ++} ++EXPORT_SYMBOL(prt_units_u64); ++ ++/** ++ * prt_units_s64 - Print out a s64 according to printbuf unit options ++ * ++ * Units are either raw (default), or human reabable units (controlled via ++ * @buf->human_readable_units) ++ */ ++void prt_units_s64(struct printbuf *out, s64 v) ++{ ++ if (v < 0) ++ prt_char(out, '-'); ++ prt_units_u64(out, abs(v)); ++} ++EXPORT_SYMBOL(prt_units_s64); +diff --git a/lib/seq_buf.c b/lib/seq_buf.c +deleted file mode 100644 +index 0a68f7aa85d6..000000000000 +--- a/lib/seq_buf.c ++++ /dev/null +@@ -1,397 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-/* +- * seq_buf.c +- * +- * Copyright (C) 2014 Red Hat Inc, Steven Rostedt +- * +- * The seq_buf is a handy tool that allows you to pass a descriptor around +- * to a buffer that other functions can write to. It is similar to the +- * seq_file functionality but has some differences. +- * +- * To use it, the seq_buf must be initialized with seq_buf_init(). +- * This will set up the counters within the descriptor. You can call +- * seq_buf_init() more than once to reset the seq_buf to start +- * from scratch. +- */ +-#include +-#include +-#include +- +-/** +- * seq_buf_can_fit - can the new data fit in the current buffer? +- * @s: the seq_buf descriptor +- * @len: The length to see if it can fit in the current buffer +- * +- * Returns true if there's enough unused space in the seq_buf buffer +- * to fit the amount of new data according to @len. +- */ +-static bool seq_buf_can_fit(struct seq_buf *s, size_t len) +-{ +- return s->len + len <= s->size; +-} +- +-/** +- * seq_buf_print_seq - move the contents of seq_buf into a seq_file +- * @m: the seq_file descriptor that is the destination +- * @s: the seq_buf descriptor that is the source. +- * +- * Returns zero on success, non zero otherwise +- */ +-int seq_buf_print_seq(struct seq_file *m, struct seq_buf *s) +-{ +- unsigned int len = seq_buf_used(s); +- +- return seq_write(m, s->buffer, len); +-} +- +-/** +- * seq_buf_vprintf - sequence printing of information. +- * @s: seq_buf descriptor +- * @fmt: printf format string +- * @args: va_list of arguments from a printf() type function +- * +- * Writes a vnprintf() format into the sequencce buffer. +- * +- * Returns zero on success, -1 on overflow. +- */ +-int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args) +-{ +- int len; +- +- WARN_ON(s->size == 0); +- +- if (s->len < s->size) { +- len = vsnprintf(s->buffer + s->len, s->size - s->len, fmt, args); +- if (s->len + len < s->size) { +- s->len += len; +- return 0; +- } +- } +- seq_buf_set_overflow(s); +- return -1; +-} +- +-/** +- * seq_buf_printf - sequence printing of information +- * @s: seq_buf descriptor +- * @fmt: printf format string +- * +- * Writes a printf() format into the sequence buffer. +- * +- * Returns zero on success, -1 on overflow. +- */ +-int seq_buf_printf(struct seq_buf *s, const char *fmt, ...) +-{ +- va_list ap; +- int ret; +- +- va_start(ap, fmt); +- ret = seq_buf_vprintf(s, fmt, ap); +- va_end(ap); +- +- return ret; +-} +-EXPORT_SYMBOL_GPL(seq_buf_printf); +- +-#ifdef CONFIG_BINARY_PRINTF +-/** +- * seq_buf_bprintf - Write the printf string from binary arguments +- * @s: seq_buf descriptor +- * @fmt: The format string for the @binary arguments +- * @binary: The binary arguments for @fmt. 
+- * +- * When recording in a fast path, a printf may be recorded with just +- * saving the format and the arguments as they were passed to the +- * function, instead of wasting cycles converting the arguments into +- * ASCII characters. Instead, the arguments are saved in a 32 bit +- * word array that is defined by the format string constraints. +- * +- * This function will take the format and the binary array and finish +- * the conversion into the ASCII string within the buffer. +- * +- * Returns zero on success, -1 on overflow. +- */ +-int seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary) +-{ +- unsigned int len = seq_buf_buffer_left(s); +- int ret; +- +- WARN_ON(s->size == 0); +- +- if (s->len < s->size) { +- ret = bstr_printf(s->buffer + s->len, len, fmt, binary); +- if (s->len + ret < s->size) { +- s->len += ret; +- return 0; +- } +- } +- seq_buf_set_overflow(s); +- return -1; +-} +-#endif /* CONFIG_BINARY_PRINTF */ +- +-/** +- * seq_buf_puts - sequence printing of simple string +- * @s: seq_buf descriptor +- * @str: simple string to record +- * +- * Copy a simple string into the sequence buffer. +- * +- * Returns zero on success, -1 on overflow +- */ +-int seq_buf_puts(struct seq_buf *s, const char *str) +-{ +- size_t len = strlen(str); +- +- WARN_ON(s->size == 0); +- +- /* Add 1 to len for the trailing null byte which must be there */ +- len += 1; +- +- if (seq_buf_can_fit(s, len)) { +- memcpy(s->buffer + s->len, str, len); +- /* Don't count the trailing null byte against the capacity */ +- s->len += len - 1; +- return 0; +- } +- seq_buf_set_overflow(s); +- return -1; +-} +- +-/** +- * seq_buf_putc - sequence printing of simple character +- * @s: seq_buf descriptor +- * @c: simple character to record +- * +- * Copy a single character into the sequence buffer. +- * +- * Returns zero on success, -1 on overflow +- */ +-int seq_buf_putc(struct seq_buf *s, unsigned char c) +-{ +- WARN_ON(s->size == 0); +- +- if (seq_buf_can_fit(s, 1)) { +- s->buffer[s->len++] = c; +- return 0; +- } +- seq_buf_set_overflow(s); +- return -1; +-} +- +-/** +- * seq_buf_putmem - write raw data into the sequenc buffer +- * @s: seq_buf descriptor +- * @mem: The raw memory to copy into the buffer +- * @len: The length of the raw memory to copy (in bytes) +- * +- * There may be cases where raw memory needs to be written into the +- * buffer and a strcpy() would not work. Using this function allows +- * for such cases. +- * +- * Returns zero on success, -1 on overflow +- */ +-int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len) +-{ +- WARN_ON(s->size == 0); +- +- if (seq_buf_can_fit(s, len)) { +- memcpy(s->buffer + s->len, mem, len); +- s->len += len; +- return 0; +- } +- seq_buf_set_overflow(s); +- return -1; +-} +- +-#define MAX_MEMHEX_BYTES 8U +-#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) +- +-/** +- * seq_buf_putmem_hex - write raw memory into the buffer in ASCII hex +- * @s: seq_buf descriptor +- * @mem: The raw memory to write its hex ASCII representation of +- * @len: The length of the raw memory to copy (in bytes) +- * +- * This is similar to seq_buf_putmem() except instead of just copying the +- * raw memory into the buffer it writes its ASCII representation of it +- * in hex characters. 
+- * +- * Returns zero on success, -1 on overflow +- */ +-int seq_buf_putmem_hex(struct seq_buf *s, const void *mem, +- unsigned int len) +-{ +- unsigned char hex[HEX_CHARS]; +- const unsigned char *data = mem; +- unsigned int start_len; +- int i, j; +- +- WARN_ON(s->size == 0); +- +- BUILD_BUG_ON(MAX_MEMHEX_BYTES * 2 >= HEX_CHARS); +- +- while (len) { +- start_len = min(len, MAX_MEMHEX_BYTES); +-#ifdef __BIG_ENDIAN +- for (i = 0, j = 0; i < start_len; i++) { +-#else +- for (i = start_len-1, j = 0; i >= 0; i--) { +-#endif +- hex[j++] = hex_asc_hi(data[i]); +- hex[j++] = hex_asc_lo(data[i]); +- } +- if (WARN_ON_ONCE(j == 0 || j/2 > len)) +- break; +- +- /* j increments twice per loop */ +- hex[j++] = ' '; +- +- seq_buf_putmem(s, hex, j); +- if (seq_buf_has_overflowed(s)) +- return -1; +- +- len -= start_len; +- data += start_len; +- } +- return 0; +-} +- +-/** +- * seq_buf_path - copy a path into the sequence buffer +- * @s: seq_buf descriptor +- * @path: path to write into the sequence buffer. +- * @esc: set of characters to escape in the output +- * +- * Write a path name into the sequence buffer. +- * +- * Returns the number of written bytes on success, -1 on overflow +- */ +-int seq_buf_path(struct seq_buf *s, const struct path *path, const char *esc) +-{ +- char *buf; +- size_t size = seq_buf_get_buf(s, &buf); +- int res = -1; +- +- WARN_ON(s->size == 0); +- +- if (size) { +- char *p = d_path(path, buf, size); +- if (!IS_ERR(p)) { +- char *end = mangle_path(buf, p, esc); +- if (end) +- res = end - buf; +- } +- } +- seq_buf_commit(s, res); +- +- return res; +-} +- +-/** +- * seq_buf_to_user - copy the sequence buffer to user space +- * @s: seq_buf descriptor +- * @ubuf: The userspace memory location to copy to +- * @cnt: The amount to copy +- * +- * Copies the sequence buffer into the userspace memory pointed to +- * by @ubuf. It starts from the last read position (@s->readpos) +- * and writes up to @cnt characters or till it reaches the end of +- * the content in the buffer (@s->len), which ever comes first. +- * +- * On success, it returns a positive number of the number of bytes +- * it copied. +- * +- * On failure it returns -EBUSY if all of the content in the +- * sequence has been already read, which includes nothing in the +- * sequence (@s->len == @s->readpos). +- * +- * Returns -EFAULT if the copy to userspace fails. +- */ +-int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, int cnt) +-{ +- int len; +- int ret; +- +- if (!cnt) +- return 0; +- +- len = seq_buf_used(s); +- +- if (len <= s->readpos) +- return -EBUSY; +- +- len -= s->readpos; +- if (cnt > len) +- cnt = len; +- ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); +- if (ret == cnt) +- return -EFAULT; +- +- cnt -= ret; +- +- s->readpos += cnt; +- return cnt; +-} +- +-/** +- * seq_buf_hex_dump - print formatted hex dump into the sequence buffer +- * @s: seq_buf descriptor +- * @prefix_str: string to prefix each line with; +- * caller supplies trailing spaces for alignment if desired +- * @prefix_type: controls whether prefix of an offset, address, or none +- * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) +- * @rowsize: number of bytes to print per line; must be 16 or 32 +- * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) +- * @buf: data blob to dump +- * @len: number of bytes in the @buf +- * @ascii: include ASCII after the hex output +- * +- * Function is an analogue of print_hex_dump() and thus has similar interface. 
+- * +- * linebuf size is maximal length for one line. +- * 32 * 3 - maximum bytes per line, each printed into 2 chars + 1 for +- * separating space +- * 2 - spaces separating hex dump and ascii representation +- * 32 - ascii representation +- * 1 - terminating '\0' +- * +- * Returns zero on success, -1 on overflow +- */ +-int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, int prefix_type, +- int rowsize, int groupsize, +- const void *buf, size_t len, bool ascii) +-{ +- const u8 *ptr = buf; +- int i, linelen, remaining = len; +- unsigned char linebuf[32 * 3 + 2 + 32 + 1]; +- int ret; +- +- if (rowsize != 16 && rowsize != 32) +- rowsize = 16; +- +- for (i = 0; i < len; i += rowsize) { +- linelen = min(remaining, rowsize); +- remaining -= rowsize; +- +- hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, +- linebuf, sizeof(linebuf), ascii); +- +- switch (prefix_type) { +- case DUMP_PREFIX_ADDRESS: +- ret = seq_buf_printf(s, "%s%p: %s\n", +- prefix_str, ptr + i, linebuf); +- break; +- case DUMP_PREFIX_OFFSET: +- ret = seq_buf_printf(s, "%s%.8x: %s\n", +- prefix_str, i, linebuf); +- break; +- default: +- ret = seq_buf_printf(s, "%s%s\n", prefix_str, linebuf); +- break; +- } +- if (ret) +- return ret; +- } +- return 0; +-} +diff --git a/lib/string_helpers.c b/lib/string_helpers.c +index 5ed3beb066e6..d247bf945f16 100644 +--- a/lib/string_helpers.c ++++ b/lib/string_helpers.c +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -32,8 +33,8 @@ + * at least 9 bytes and will always be zero terminated. + * + */ +-void string_get_size(u64 size, u64 blk_size, const enum string_size_units units, +- char *buf, int len) ++int string_get_size(u64 size, u64 blk_size, const enum string_size_units units, ++ char *buf, int len) + { + static const char *const units_10[] = { + "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" +@@ -126,8 +127,7 @@ void string_get_size(u64 size, u64 blk_size, const enum string_size_units units, + else + unit = units_str[units][i]; + +- snprintf(buf, len, "%u%s %s", (u32)size, +- tmp, unit); ++ return snprintf(buf, len, "%u%s %s", (u32)size, tmp, unit); + } + EXPORT_SYMBOL(string_get_size); + +@@ -301,19 +301,14 @@ int string_unescape(char *src, char *dst, size_t size, unsigned int flags) + } + EXPORT_SYMBOL(string_unescape); + +-static bool escape_passthrough(unsigned char c, char **dst, char *end) ++static bool escape_passthrough(struct printbuf *out, unsigned char c) + { +- char *out = *dst; +- +- if (out < end) +- *out = c; +- *dst = out + 1; ++ prt_char(out, c); + return true; + } + +-static bool escape_space(unsigned char c, char **dst, char *end) ++static bool escape_space(struct printbuf *out, unsigned char c) + { +- char *out = *dst; + unsigned char to; + + switch (c) { +@@ -336,20 +331,13 @@ static bool escape_space(unsigned char c, char **dst, char *end) + return false; + } + +- if (out < end) +- *out = '\\'; +- ++out; +- if (out < end) +- *out = to; +- ++out; +- +- *dst = out; ++ prt_char(out, '\\'); ++ prt_char(out, to); + return true; + } + +-static bool escape_special(unsigned char c, char **dst, char *end) ++static bool escape_special(struct printbuf *out, unsigned char c) + { +- char *out = *dst; + unsigned char to; + + switch (c) { +@@ -369,83 +357,43 @@ static bool escape_special(unsigned char c, char **dst, char *end) + return false; + } + +- if (out < end) +- *out = '\\'; +- ++out; +- if (out < end) +- *out = to; +- ++out; +- +- *dst = out; ++ prt_char(out, '\\'); ++ prt_char(out, to); + return 
true; + } + +-static bool escape_null(unsigned char c, char **dst, char *end) ++static bool escape_null(struct printbuf *out, unsigned char c) + { +- char *out = *dst; +- + if (c) + return false; + +- if (out < end) +- *out = '\\'; +- ++out; +- if (out < end) +- *out = '0'; +- ++out; +- +- *dst = out; ++ prt_char(out, '\\'); ++ prt_char(out, '0'); + return true; + } + +-static bool escape_octal(unsigned char c, char **dst, char *end) ++static bool escape_octal(struct printbuf *out, unsigned char c) + { +- char *out = *dst; +- +- if (out < end) +- *out = '\\'; +- ++out; +- if (out < end) +- *out = ((c >> 6) & 0x07) + '0'; +- ++out; +- if (out < end) +- *out = ((c >> 3) & 0x07) + '0'; +- ++out; +- if (out < end) +- *out = ((c >> 0) & 0x07) + '0'; +- ++out; +- +- *dst = out; ++ prt_char(out, '\\'); ++ prt_char(out, ((c >> 6) & 0x07) + '0'); ++ prt_char(out, ((c >> 3) & 0x07) + '0'); ++ prt_char(out, ((c >> 0) & 0x07) + '0'); + return true; + } + +-static bool escape_hex(unsigned char c, char **dst, char *end) ++static bool escape_hex(struct printbuf *out, unsigned char c) + { +- char *out = *dst; +- +- if (out < end) +- *out = '\\'; +- ++out; +- if (out < end) +- *out = 'x'; +- ++out; +- if (out < end) +- *out = hex_asc_hi(c); +- ++out; +- if (out < end) +- *out = hex_asc_lo(c); +- ++out; +- +- *dst = out; ++ prt_char(out, '\\'); ++ prt_char(out, 'x'); ++ prt_hex_byte(out, c); + return true; + } + + /** +- * string_escape_mem - quote characters in the given memory buffer ++ * prt_escaped_string - quote characters in the given memory buffer ++ * @out: printbuf to output to (escaped) + * @src: source buffer (unescaped) + * @isz: source buffer size +- * @dst: destination buffer (escaped) +- * @osz: destination buffer size + * @flags: combination of the flags + * @only: NULL-terminated string containing characters used to limit + * the selected escape class. If characters are included in @only +@@ -510,18 +458,11 @@ static bool escape_hex(unsigned char c, char **dst, char *end) + * or %ESCAPE_HEX, because they cover most of the other character classes. + * %ESCAPE_NAP can utilize %ESCAPE_SPACE or %ESCAPE_SPECIAL in addition to + * the above. +- * +- * Return: +- * The total size of the escaped output that would be generated for +- * the given input and flags. To check whether the output was +- * truncated, compare the return value to osz. There is room left in +- * dst for a '\0' terminator if and only if ret < osz. + */ +-int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, +- unsigned int flags, const char *only) ++void prt_escaped_string(struct printbuf *out, ++ const char *src, size_t isz, ++ unsigned int flags, const char *only) + { +- char *p = dst; +- char *end = p + osz; + bool is_dict = only && *only; + bool is_append = flags & ESCAPE_APPEND; + +@@ -549,41 +490,126 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, + * %ESCAPE_NA cases. 
+ */ + if (!(is_append || in_dict) && is_dict && +- escape_passthrough(c, &p, end)) ++ escape_passthrough(out, c)) + continue; + + if (!(is_append && in_dict) && isascii(c) && isprint(c) && +- flags & ESCAPE_NAP && escape_passthrough(c, &p, end)) ++ flags & ESCAPE_NAP && escape_passthrough(out, c)) + continue; + + if (!(is_append && in_dict) && isprint(c) && +- flags & ESCAPE_NP && escape_passthrough(c, &p, end)) ++ flags & ESCAPE_NP && escape_passthrough(out, c)) + continue; + + if (!(is_append && in_dict) && isascii(c) && +- flags & ESCAPE_NA && escape_passthrough(c, &p, end)) ++ flags & ESCAPE_NA && escape_passthrough(out, c)) + continue; + +- if (flags & ESCAPE_SPACE && escape_space(c, &p, end)) ++ if (flags & ESCAPE_SPACE && escape_space(out, c)) + continue; + +- if (flags & ESCAPE_SPECIAL && escape_special(c, &p, end)) ++ if (flags & ESCAPE_SPECIAL && escape_special(out, c)) + continue; + +- if (flags & ESCAPE_NULL && escape_null(c, &p, end)) ++ if (flags & ESCAPE_NULL && escape_null(out, c)) + continue; + + /* ESCAPE_OCTAL and ESCAPE_HEX always go last */ +- if (flags & ESCAPE_OCTAL && escape_octal(c, &p, end)) ++ if (flags & ESCAPE_OCTAL && escape_octal(out, c)) + continue; + +- if (flags & ESCAPE_HEX && escape_hex(c, &p, end)) ++ if (flags & ESCAPE_HEX && escape_hex(out, c)) + continue; + +- escape_passthrough(c, &p, end); ++ escape_passthrough(out, c); + } ++} ++EXPORT_SYMBOL(prt_escaped_string); ++ ++/** ++ * string_escape_mem - quote characters in the given memory buffer ++ * @src: source buffer (unescaped) ++ * @isz: source buffer size ++ * @dst: destination buffer (escaped) ++ * @osz: destination buffer size ++ * @flags: combination of the flags ++ * @only: NULL-terminated string containing characters used to limit ++ * the selected escape class. If characters are included in @only ++ * that would not normally be escaped by the classes selected ++ * in @flags, they will be copied to @dst unescaped. ++ * ++ * Description: ++ * The process of escaping byte buffer includes several parts. They are applied ++ * in the following sequence. ++ * ++ * 1. The character is not matched to the one from @only string and thus ++ * must go as-is to the output. ++ * 2. The character is matched to the printable and ASCII classes, if asked, ++ * and in case of match it passes through to the output. ++ * 3. The character is matched to the printable or ASCII class, if asked, ++ * and in case of match it passes through to the output. ++ * 4. The character is checked if it falls into the class given by @flags. ++ * %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any ++ * character. Note that they actually can't go together, otherwise ++ * %ESCAPE_HEX will be ignored. ++ * ++ * Caller must provide valid source and destination pointers. Be aware that ++ * destination buffer will not be NULL-terminated, thus caller have to append ++ * it if needs. 
The supported flags are:: ++ * ++ * %ESCAPE_SPACE: (special white space, not space itself) ++ * '\f' - form feed ++ * '\n' - new line ++ * '\r' - carriage return ++ * '\t' - horizontal tab ++ * '\v' - vertical tab ++ * %ESCAPE_SPECIAL: ++ * '\"' - double quote ++ * '\\' - backslash ++ * '\a' - alert (BEL) ++ * '\e' - escape ++ * %ESCAPE_NULL: ++ * '\0' - null ++ * %ESCAPE_OCTAL: ++ * '\NNN' - byte with octal value NNN (3 digits) ++ * %ESCAPE_ANY: ++ * all previous together ++ * %ESCAPE_NP: ++ * escape only non-printable characters, checked by isprint() ++ * %ESCAPE_ANY_NP: ++ * all previous together ++ * %ESCAPE_HEX: ++ * '\xHH' - byte with hexadecimal value HH (2 digits) ++ * %ESCAPE_NA: ++ * escape only non-ascii characters, checked by isascii() ++ * %ESCAPE_NAP: ++ * escape only non-printable or non-ascii characters ++ * %ESCAPE_APPEND: ++ * append characters from @only to be escaped by the given classes ++ * ++ * %ESCAPE_APPEND would help to pass additional characters to the escaped, when ++ * one of %ESCAPE_NP, %ESCAPE_NA, or %ESCAPE_NAP is provided. ++ * ++ * One notable caveat, the %ESCAPE_NAP, %ESCAPE_NP and %ESCAPE_NA have the ++ * higher priority than the rest of the flags (%ESCAPE_NAP is the highest). ++ * It doesn't make much sense to use either of them without %ESCAPE_OCTAL ++ * or %ESCAPE_HEX, because they cover most of the other character classes. ++ * %ESCAPE_NAP can utilize %ESCAPE_SPACE or %ESCAPE_SPECIAL in addition to ++ * the above. ++ * ++ * Return: ++ * The total size of the escaped output that would be generated for ++ * the given input and flags. To check whether the output was ++ * truncated, compare the return value to osz. There is room left in ++ * dst for a '\0' terminator if and only if ret < osz. ++ */ ++int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, ++ unsigned int flags, const char *only) ++{ ++ struct printbuf out = PRINTBUF_EXTERN(dst, osz); + +- return p - dst; ++ prt_escaped_string(&out, src, isz, flags, only); ++ return out.pos; + } + EXPORT_SYMBOL(string_escape_mem); + +diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c +index 5144899d3c6b..f9e97879dcdf 100644 +--- a/lib/test_hexdump.c ++++ b/lib/test_hexdump.c +@@ -25,36 +25,19 @@ static const char * const test_data_1[] __initconst = { + "4c", "d1", "19", "99", "43", "b1", "af", "0c", + }; + +-static const char * const test_data_2_le[] __initconst = { +- "32be", "7bdb", "180a", "b293", +- "ba70", "24c4", "837d", "9b34", +- "9ca6", "ad31", "0f9c", "e9ac", +- "d14c", "9919", "b143", "0caf", +-}; +- +-static const char * const test_data_2_be[] __initconst = { ++static const char * const test_data_2[] __initconst = { + "be32", "db7b", "0a18", "93b2", + "70ba", "c424", "7d83", "349b", + "a69c", "31ad", "9c0f", "ace9", + "4cd1", "1999", "43b1", "af0c", + }; + +-static const char * const test_data_4_le[] __initconst = { +- "7bdb32be", "b293180a", "24c4ba70", "9b34837d", +- "ad319ca6", "e9ac0f9c", "9919d14c", "0cafb143", +-}; +- +-static const char * const test_data_4_be[] __initconst = { ++static const char * const test_data_4[] __initconst = { + "be32db7b", "0a1893b2", "70bac424", "7d83349b", + "a69c31ad", "9c0face9", "4cd11999", "43b1af0c", + }; + +-static const char * const test_data_8_le[] __initconst = { +- "b293180a7bdb32be", "9b34837d24c4ba70", +- "e9ac0f9cad319ca6", "0cafb1439919d14c", +-}; +- +-static const char * const test_data_8_be[] __initconst = { ++static const char * const test_data_8[] __initconst = { + "be32db7b0a1893b2", "70bac4247d83349b", + 
"a69c31ad9c0face9", "4cd1199943b1af0c", + }; +@@ -73,7 +56,6 @@ static void __init test_hexdump_prepare_test(size_t len, int rowsize, + size_t l = len; + int gs = groupsize, rs = rowsize; + unsigned int i; +- const bool is_be = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN); + + if (rs != 16 && rs != 32) + rs = 16; +@@ -85,11 +67,11 @@ static void __init test_hexdump_prepare_test(size_t len, int rowsize, + gs = 1; + + if (gs == 8) +- result = is_be ? test_data_8_be : test_data_8_le; ++ result = test_data_8; + else if (gs == 4) +- result = is_be ? test_data_4_be : test_data_4_le; ++ result = test_data_4; + else if (gs == 2) +- result = is_be ? test_data_2_be : test_data_2_le; ++ result = test_data_2; + else + result = test_data_1; + +diff --git a/lib/test_printf.c b/lib/test_printf.c +index 07309c45f327..ac5f9f0eb4e0 100644 +--- a/lib/test_printf.c ++++ b/lib/test_printf.c +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -78,12 +79,6 @@ do_test(int bufsize, const char *expect, int elen, + return 1; + } + +- if (memchr_inv(test_buffer + written + 1, FILL_CHAR, BUF_SIZE + PAD_SIZE - (written + 1))) { +- pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote beyond the nul-terminator\n", +- bufsize, fmt); +- return 1; +- } +- + if (memcmp(test_buffer, expect, written)) { + pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote '%s', expected '%.*s'\n", + bufsize, fmt, test_buffer, written, expect); +@@ -783,6 +778,31 @@ test_pointer(void) + fourcc_pointer(); + } + ++static void printf_test_fn_0(struct printbuf *out) ++{ ++ prt_str(out, "0"); ++} ++ ++static void printf_test_fn_1(struct printbuf *out, void *p) ++{ ++ int *i = p; ++ ++ prt_printf(out, "%i", *i); ++} ++ ++static void __init ++test_fn(void) ++{ ++ int i = 1; ++ ++ test("0", "%pf()", CALL_PP(printf_test_fn_0)); ++ test("1", "%pf(%p)", CALL_PP(printf_test_fn_1, &i)); ++ /* ++ * Not tested, so we don't fail the build with -Werror: ++ */ ++ //test("1", "%(%p)", printf_test_fn, &i); ++} ++ + static void __init selftest(void) + { + alloced_buffer = kmalloc(BUF_SIZE + 2*PAD_SIZE, GFP_KERNEL); +@@ -794,6 +814,7 @@ static void __init selftest(void) + test_number(); + test_string(); + test_pointer(); ++ test_fn(); + + kfree(alloced_buffer); + } +diff --git a/lib/vsprintf.c b/lib/vsprintf.c +index 3c1853a9d1c0..d92a212db2f5 100644 +--- a/lib/vsprintf.c ++++ b/lib/vsprintf.c +@@ -44,6 +44,7 @@ + #ifdef CONFIG_BLOCK + #include + #endif ++#include + + #include "../mm/internal.h" /* For the trace_print_flags arrays */ + +@@ -52,6 +53,7 @@ + #include + + #include ++#include + #include "kstrtox.h" + + /* Disable pointer hashing if requested */ +@@ -367,41 +369,51 @@ char *put_dec(char *buf, unsigned long long n) + + #endif + +-/* +- * Convert passed number to decimal string. +- * Returns the length of string. On buffer overflow, returns 0. +- * +- * If speed is not important, use snprintf(). It's easy to read the code. ++/** ++ * prt_u64_minwidth - print a u64, in decimal, with zero padding ++ * @out: printbuf to output to ++ * @num: u64 to print ++ * @width: minimum width + */ +-int num_to_str(char *buf, int size, unsigned long long num, unsigned int width) ++void prt_u64_minwidth(struct printbuf *out, u64 num, unsigned width) + { + /* put_dec requires 2-byte alignment of the buffer. 
*/ + char tmp[sizeof(num) * 3] __aligned(2); +- int idx, len; ++ unsigned len = put_dec(tmp, num) - tmp; + +- /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */ +- if (num <= 9) { +- tmp[0] = '0' + num; +- len = 1; +- } else { +- len = put_dec(tmp, num) - tmp; +- } ++ printbuf_make_room(out, max(len, width)); + +- if (len > size || width > size) +- return 0; ++ if (width > len) ++ __prt_chars_reserved(out, '0', width - len); + +- if (width > len) { +- width = width - len; +- for (idx = 0; idx < width; idx++) +- buf[idx] = ' '; +- } else { +- width = 0; +- } ++ while (len) ++ __prt_char_reserved(out, tmp[--len]); ++ printbuf_nul_terminate(out); ++} + +- for (idx = 0; idx < len; ++idx) +- buf[idx + width] = tmp[len - idx - 1]; ++/** ++ * prt_u64 - print a simple u64, in decimal ++ * @out: printbuf to output to ++ * @num: u64 to print ++ */ ++void prt_u64(struct printbuf *out, u64 num) ++{ ++ prt_u64_minwidth(out, num, 0); ++} ++ ++/* ++ * Convert passed number to decimal string. ++ * Returns the length of string. On buffer overflow, returns 0. ++ * ++ * Consider switching to printbufs and using prt_u64() or prt_u64_minwith() ++ * instead. ++ */ ++int num_to_str(char *buf, int size, unsigned long long num, unsigned int width) ++{ ++ struct printbuf out = PRINTBUF_EXTERN(buf, size); + +- return len + width; ++ prt_u64_minwidth(&out, num, width); ++ return out.pos; + } + + #define SIGN 1 /* unsigned/signed, must be 1 */ +@@ -435,7 +447,8 @@ enum format_type { + FORMAT_TYPE_UINT, + FORMAT_TYPE_INT, + FORMAT_TYPE_SIZE_T, +- FORMAT_TYPE_PTRDIFF ++ FORMAT_TYPE_PTRDIFF, ++ FORMAT_TYPE_FN, + }; + + struct printf_spec { +@@ -451,128 +464,103 @@ static_assert(sizeof(struct printf_spec) == 8); + #define PRECISION_MAX ((1 << 15) - 1) + + static noinline_for_stack +-char *number(char *buf, char *end, unsigned long long num, +- struct printf_spec spec) ++void number(struct printbuf *out, unsigned long long num, ++ struct printf_spec spec) + { + /* put_dec requires 2-byte alignment of the buffer. */ + char tmp[3 * sizeof(num)] __aligned(2); +- char sign; +- char locase; ++ char sign = 0; ++ /* locase = 0 or 0x20. ORing digits or letters with 'locase' ++ * produces same digits or (maybe lowercased) letters */ ++ char locase = (spec.flags & SMALL); + int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); +- int i; + bool is_zero = num == 0LL; + int field_width = spec.field_width; + int precision = spec.precision; ++ int nr_digits = 0; ++ int output_bytes = 0; + +- /* locase = 0 or 0x20. 
ORing digits or letters with 'locase' +- * produces same digits or (maybe lowercased) letters */ +- locase = (spec.flags & SMALL); + if (spec.flags & LEFT) + spec.flags &= ~ZEROPAD; +- sign = 0; + if (spec.flags & SIGN) { + if ((signed long long)num < 0) { + sign = '-'; + num = -(signed long long)num; +- field_width--; ++ output_bytes++; + } else if (spec.flags & PLUS) { + sign = '+'; +- field_width--; ++ output_bytes++; + } else if (spec.flags & SPACE) { + sign = ' '; +- field_width--; ++ output_bytes++; + } + } + if (need_pfx) { + if (spec.base == 16) +- field_width -= 2; ++ output_bytes += 2; + else if (!is_zero) +- field_width--; ++ output_bytes++; + } + + /* generate full string in tmp[], in reverse order */ +- i = 0; +- if (num < spec.base) +- tmp[i++] = hex_asc_upper[num] | locase; +- else if (spec.base != 10) { /* 8 or 16 */ ++ if (spec.base == 10) { ++ nr_digits = put_dec(tmp, num) - tmp; ++ } else { /* 8 or 16 */ + int mask = spec.base - 1; +- int shift = 3; ++ int shift = ilog2((unsigned) spec.base); + +- if (spec.base == 16) +- shift = 4; + do { +- tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase); ++ tmp[nr_digits++] = (hex_asc_upper[((unsigned char)num) & mask] | locase); + num >>= shift; + } while (num); +- } else { /* base 10 */ +- i = put_dec(tmp, num) - tmp; + } + + /* printing 100 using %2d gives "100", not "00" */ +- if (i > precision) +- precision = i; ++ precision = max(nr_digits, precision); ++ output_bytes += precision; ++ field_width = max(0, field_width - output_bytes); ++ ++ printbuf_make_room(out, field_width + output_bytes); ++ + /* leading space padding */ +- field_width -= precision; +- if (!(spec.flags & (ZEROPAD | LEFT))) { +- while (--field_width >= 0) { +- if (buf < end) +- *buf = ' '; +- ++buf; +- } ++ if (!(spec.flags & (ZEROPAD | LEFT)) && field_width) { ++ __prt_chars_reserved(out, ' ', field_width); ++ field_width = 0; + } ++ + /* sign */ +- if (sign) { +- if (buf < end) +- *buf = sign; +- ++buf; +- } ++ if (sign) ++ __prt_char_reserved(out, sign); ++ + /* "0x" / "0" prefix */ + if (need_pfx) { +- if (spec.base == 16 || !is_zero) { +- if (buf < end) +- *buf = '0'; +- ++buf; +- } +- if (spec.base == 16) { +- if (buf < end) +- *buf = ('X' | locase); +- ++buf; +- } ++ if (spec.base == 16 || !is_zero) ++ __prt_char_reserved(out, '0'); ++ if (spec.base == 16) ++ __prt_char_reserved(out, 'X' | locase); + } +- /* zero or space padding */ +- if (!(spec.flags & LEFT)) { +- char c = ' ' + (spec.flags & ZEROPAD); + +- while (--field_width >= 0) { +- if (buf < end) +- *buf = c; +- ++buf; +- } +- } +- /* hmm even more zero padding? 
*/ +- while (i <= --precision) { +- if (buf < end) +- *buf = '0'; +- ++buf; +- } ++ /* zero padding */ ++ if (!(spec.flags & LEFT) && field_width) ++ __prt_chars_reserved(out, '0', field_width); ++ ++ /* zero padding from precision */ ++ if (precision > nr_digits) ++ __prt_chars_reserved(out, '0', precision - nr_digits); ++ + /* actual digits of result */ +- while (--i >= 0) { +- if (buf < end) +- *buf = tmp[i]; +- ++buf; +- } ++ while (--nr_digits >= 0) ++ __prt_char_reserved(out, tmp[nr_digits]); ++ + /* trailing space padding */ +- while (--field_width >= 0) { +- if (buf < end) +- *buf = ' '; +- ++buf; +- } ++ if ((spec.flags & LEFT) && field_width) ++ __prt_chars_reserved(out, ' ', field_width); + +- return buf; ++ printbuf_nul_terminate(out); + } + + static noinline_for_stack +-char *special_hex_number(char *buf, char *end, unsigned long long num, int size) ++void special_hex_number(struct printbuf *out, unsigned long long num, int size) + { + struct printf_spec spec; + +@@ -582,25 +570,28 @@ char *special_hex_number(char *buf, char *end, unsigned long long num, int size) + spec.base = 16; + spec.precision = -1; + +- return number(buf, end, num, spec); ++ number(out, num, spec); + } + +-static void move_right(char *buf, char *end, unsigned len, unsigned spaces) ++/* ++ * inserts @spaces spaces @len from the end of @out ++ */ ++static void move_right(struct printbuf *out, ++ unsigned len, unsigned spaces) + { +- size_t size; +- if (buf >= end) /* nowhere to put anything */ +- return; +- size = end - buf; +- if (size <= spaces) { +- memset(buf, ' ', size); +- return; +- } +- if (len) { +- if (len > size - spaces) +- len = size - spaces; +- memmove(buf + spaces, buf, len); +- } +- memset(buf, ' ', spaces); ++ unsigned move_src = out->pos - len; ++ unsigned move_dst = move_src + spaces; ++ unsigned remaining_from_dst = move_dst < out->size ? out->size - move_dst : 0; ++ unsigned remaining_from_src = move_src < out->size ? out->size - move_src : 0; ++ ++ BUG_ON(len > out->pos); ++ ++ memmove(out->buf + move_dst, ++ out->buf + move_src, ++ min(remaining_from_dst, len)); ++ memset(out->buf + move_src, ' ', ++ min(remaining_from_src, spaces)); ++ out->pos += spaces; + } + + /* +@@ -612,67 +603,68 @@ static void move_right(char *buf, char *end, unsigned len, unsigned spaces) + * Returns: new buffer position after padding. + */ + static noinline_for_stack +-char *widen_string(char *buf, int n, char *end, struct printf_spec spec) ++void widen_string(struct printbuf *out, int n, ++ struct printf_spec spec) + { + unsigned spaces; + + if (likely(n >= spec.field_width)) +- return buf; ++ return; + /* we want to pad the sucker */ + spaces = spec.field_width - n; +- if (!(spec.flags & LEFT)) { +- move_right(buf - n, end, n, spaces); +- return buf + spaces; +- } +- while (spaces--) { +- if (buf < end) +- *buf = ' '; +- ++buf; ++ if (!(spec.flags & LEFT)) ++ move_right(out, n, spaces); ++ else ++ prt_chars(out, ' ', spaces); ++} ++ ++static void do_width_precision(struct printbuf *out, unsigned prev_pos, ++ struct printf_spec spec) ++{ ++ unsigned n = out->pos - prev_pos; ++ ++ if (n > spec.precision) { ++ out->pos -= n - spec.precision; ++ n = spec.precision; + } +- return buf; ++ ++ widen_string(out, n, spec); + } + + /* Handle string from a well known address. 
*/ +-static char *string_nocheck(char *buf, char *end, const char *s, +- struct printf_spec spec) ++static void string_nocheck(struct printbuf *out, ++ const char *s, ++ struct printf_spec spec) + { +- int len = 0; +- int lim = spec.precision; ++ int len = strnlen(s, spec.precision); + +- while (lim--) { +- char c = *s++; +- if (!c) +- break; +- if (buf < end) +- *buf = c; +- ++buf; +- ++len; +- } +- return widen_string(buf, len, end, spec); ++ prt_bytes(out, s, len); ++ widen_string(out, len, spec); + } + +-static char *err_ptr(char *buf, char *end, void *ptr, +- struct printf_spec spec) ++static void err_ptr(struct printbuf *out, void *ptr, ++ struct printf_spec spec) + { + int err = PTR_ERR(ptr); + const char *sym = errname(err); + +- if (sym) +- return string_nocheck(buf, end, sym, spec); +- +- /* +- * Somebody passed ERR_PTR(-1234) or some other non-existing +- * Efoo - or perhaps CONFIG_SYMBOLIC_ERRNAME=n. Fall back to +- * printing it as its decimal representation. +- */ +- spec.flags |= SIGN; +- spec.base = 10; +- return number(buf, end, err, spec); ++ if (sym) { ++ string_nocheck(out, sym, spec); ++ } else { ++ /* ++ * Somebody passed ERR_PTR(-1234) or some other non-existing ++ * Efoo - or perhaps CONFIG_SYMBOLIC_ERRNAME=n. Fall back to ++ * printing it as its decimal representation. ++ */ ++ spec.flags |= SIGN; ++ spec.base = 10; ++ number(out, err, spec); ++ } + } + + /* Be careful: error messages must fit into the given buffer. */ +-static char *error_string(char *buf, char *end, const char *s, +- struct printf_spec spec) ++static void error_string_spec(struct printbuf *out, const char *s, ++ struct printf_spec spec) + { + /* + * Hard limit to avoid a completely insane messages. It actually +@@ -682,7 +674,7 @@ static char *error_string(char *buf, char *end, const char *s, + if (spec.precision == -1) + spec.precision = 2 * sizeof(void *); + +- return string_nocheck(buf, end, s, spec); ++ string_nocheck(out, s, spec); + } + + /* +@@ -701,14 +693,15 @@ static const char *check_pointer_msg(const void *ptr) + return NULL; + } + +-static int check_pointer(char **buf, char *end, const void *ptr, ++static int check_pointer_spec(struct printbuf *out, ++ const void *ptr, + struct printf_spec spec) + { + const char *err_msg; + + err_msg = check_pointer_msg(ptr); + if (err_msg) { +- *buf = error_string(*buf, end, err_msg, spec); ++ error_string_spec(out, err_msg, spec); + return -EFAULT; + } + +@@ -716,18 +709,50 @@ static int check_pointer(char **buf, char *end, const void *ptr, + } + + static noinline_for_stack +-char *string(char *buf, char *end, const char *s, +- struct printf_spec spec) ++void string_spec(struct printbuf *out, ++ const char *s, ++ struct printf_spec spec) + { +- if (check_pointer(&buf, end, s, spec)) +- return buf; ++ if (check_pointer_spec(out, s, spec)) ++ return; + +- return string_nocheck(buf, end, s, spec); ++ string_nocheck(out, s, spec); + } + +-static char *pointer_string(char *buf, char *end, +- const void *ptr, +- struct printf_spec spec) ++static void error_string(struct printbuf *out, const char *s) ++{ ++ /* ++ * Hard limit to avoid a completely insane messages. It actually ++ * works pretty well because most error messages are in ++ * the many pointer format modifiers. 
++ */ ++ prt_bytes(out, s, min(strlen(s), 2 * sizeof(void *))); ++} ++ ++static int check_pointer(struct printbuf *out, const void *ptr) ++{ ++ const char *err_msg; ++ ++ err_msg = check_pointer_msg(ptr); ++ if (err_msg) { ++ error_string(out, err_msg); ++ return -EFAULT; ++ } ++ ++ return 0; ++} ++ ++static void string(struct printbuf *out, const char *s) ++{ ++ if (check_pointer(out, s)) ++ return; ++ ++ prt_str(out, s); ++} ++ ++static void pointer_string(struct printbuf *out, ++ const void *ptr, ++ struct printf_spec spec) + { + spec.base = 16; + spec.flags |= SMALL; +@@ -736,7 +761,7 @@ static char *pointer_string(char *buf, char *end, + spec.flags |= ZEROPAD; + } + +- return number(buf, end, (unsigned long int)ptr, spec); ++ number(out, (unsigned long int)ptr, spec); + } + + /* Make pointers available for printing early in the boot sequence. */ +@@ -801,8 +826,9 @@ int ptr_to_hashval(const void *ptr, unsigned long *hashval_out) + return __ptr_to_hashval(ptr, hashval_out); + } + +-static char *ptr_to_id(char *buf, char *end, const void *ptr, +- struct printf_spec spec) ++static void ptr_to_id(struct printbuf *out, ++ const void *ptr, ++ struct printf_spec spec) + { + const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)"; + unsigned long hashval; +@@ -813,47 +839,49 @@ static char *ptr_to_id(char *buf, char *end, const void *ptr, + * as they are not actual addresses. + */ + if (IS_ERR_OR_NULL(ptr)) +- return pointer_string(buf, end, ptr, spec); ++ return pointer_string(out, ptr, spec); + + /* When debugging early boot use non-cryptographically secure hash. */ + if (unlikely(debug_boot_weak_hash)) { + hashval = hash_long((unsigned long)ptr, 32); +- return pointer_string(buf, end, (const void *)hashval, spec); ++ return pointer_string(out, (const void *)hashval, spec); + } + + ret = __ptr_to_hashval(ptr, &hashval); + if (ret) { + spec.field_width = 2 * sizeof(ptr); + /* string length must be less than default_width */ +- return error_string(buf, end, str, spec); ++ return error_string_spec(out, str, spec); + } + +- return pointer_string(buf, end, (const void *)hashval, spec); ++ pointer_string(out, (const void *)hashval, spec); + } + +-static char *default_pointer(char *buf, char *end, const void *ptr, +- struct printf_spec spec) ++static void default_pointer(struct printbuf *out, ++ const void *ptr, ++ struct printf_spec spec) + { + /* + * default is to _not_ leak addresses, so hash before printing, + * unless no_hash_pointers is specified on the command line. + */ + if (unlikely(no_hash_pointers)) +- return pointer_string(buf, end, ptr, spec); ++ return pointer_string(out, ptr, spec); + +- return ptr_to_id(buf, end, ptr, spec); ++ return ptr_to_id(out, ptr, spec); + } + + int kptr_restrict __read_mostly; + + static noinline_for_stack +-char *restricted_pointer(char *buf, char *end, const void *ptr, +- struct printf_spec spec) ++void restricted_pointer(struct printbuf *out, ++ const void *ptr, ++ struct printf_spec spec) + { + switch (kptr_restrict) { + case 0: + /* Handle as %p, hash and do _not_ leak addresses. 
*/ +- return default_pointer(buf, end, ptr, spec); ++ return default_pointer(out, ptr, spec); + case 1: { + const struct cred *cred; + +@@ -864,7 +892,7 @@ char *restricted_pointer(char *buf, char *end, const void *ptr, + if (in_irq() || in_serving_softirq() || in_nmi()) { + if (spec.field_width == -1) + spec.field_width = 2 * sizeof(ptr); +- return error_string(buf, end, "pK-error", spec); ++ return error_string_spec(out, "pK-error", spec); + } + + /* +@@ -890,17 +918,16 @@ char *restricted_pointer(char *buf, char *end, const void *ptr, + break; + } + +- return pointer_string(buf, end, ptr, spec); ++ return pointer_string(out, ptr, spec); + } + + static noinline_for_stack +-char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec, +- const char *fmt) ++void dentry_name(struct printbuf *out, const struct dentry *d, ++ const char *fmt) + { +- const char *array[4], *s; ++ const char *array[4]; + const struct dentry *p; +- int depth; +- int i, n; ++ int i, depth; + + switch (fmt[1]) { + case '2': case '3': case '4': +@@ -912,9 +939,9 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp + + rcu_read_lock(); + for (i = 0; i < depth; i++, d = p) { +- if (check_pointer(&buf, end, d, spec)) { ++ if (check_pointer(out, d)) { + rcu_read_unlock(); +- return buf; ++ return; + } + + p = READ_ONCE(d->d_parent); +@@ -926,58 +953,46 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp + break; + } + } +- s = array[--i]; +- for (n = 0; n != spec.precision; n++, buf++) { +- char c = *s++; +- if (!c) { +- if (!i) +- break; +- c = '/'; +- s = array[--i]; +- } +- if (buf < end) +- *buf = c; ++ while (1) { ++ prt_str(out, array[--i]); ++ if (!i) ++ break; ++ prt_char(out, '/'); + } + rcu_read_unlock(); +- return widen_string(buf, n, end, spec); + } + + static noinline_for_stack +-char *file_dentry_name(char *buf, char *end, const struct file *f, +- struct printf_spec spec, const char *fmt) ++void file_dentry_name(struct printbuf *out, const struct file *f, ++ const char *fmt) + { +- if (check_pointer(&buf, end, f, spec)) +- return buf; ++ if (check_pointer(out, f)) ++ return; + +- return dentry_name(buf, end, f->f_path.dentry, spec, fmt); ++ return dentry_name(out, f->f_path.dentry, fmt); + } + #ifdef CONFIG_BLOCK + static noinline_for_stack +-char *bdev_name(char *buf, char *end, struct block_device *bdev, +- struct printf_spec spec, const char *fmt) ++void bdev_name(struct printbuf *out, struct block_device *bdev) + { + struct gendisk *hd; + +- if (check_pointer(&buf, end, bdev, spec)) +- return buf; ++ if (check_pointer(out, bdev)) ++ return; + + hd = bdev->bd_disk; +- buf = string(buf, end, hd->disk_name, spec); ++ string(out, hd->disk_name); + if (bdev->bd_partno) { +- if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) { +- if (buf < end) +- *buf = 'p'; +- buf++; +- } +- buf = number(buf, end, bdev->bd_partno, spec); ++ if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) ++ prt_char(out, 'p'); ++ prt_u64(out, bdev->bd_partno); + } +- return buf; + } + #endif + + static noinline_for_stack +-char *symbol_string(char *buf, char *end, void *ptr, +- struct printf_spec spec, const char *fmt) ++void symbol_string(struct printbuf *out, void *ptr, ++ const char *fmt) + { + unsigned long value; + #ifdef CONFIG_KALLSYMS +@@ -1000,17 +1015,12 @@ char *symbol_string(char *buf, char *end, void *ptr, + else + sprint_symbol_no_offset(sym, value); + +- return string_nocheck(buf, end, sym, spec); ++ prt_str(out, sym); + #else +- 
return special_hex_number(buf, end, value, sizeof(void *)); ++ special_hex_number(out, value, sizeof(void *)); + #endif + } + +-static const struct printf_spec default_str_spec = { +- .field_width = -1, +- .precision = -1, +-}; +- + static const struct printf_spec default_flag_spec = { + .base = 16, + .precision = -1, +@@ -1022,23 +1032,9 @@ static const struct printf_spec default_dec_spec = { + .precision = -1, + }; + +-static const struct printf_spec default_dec02_spec = { +- .base = 10, +- .field_width = 2, +- .precision = -1, +- .flags = ZEROPAD, +-}; +- +-static const struct printf_spec default_dec04_spec = { +- .base = 10, +- .field_width = 4, +- .precision = -1, +- .flags = ZEROPAD, +-}; +- + static noinline_for_stack +-char *resource_string(char *buf, char *end, struct resource *res, +- struct printf_spec spec, const char *fmt) ++void resource_string(struct printbuf *out, struct resource *res, ++ int decode) + { + #ifndef IO_RSRC_PRINTK_SIZE + #define IO_RSRC_PRINTK_SIZE 6 +@@ -1077,80 +1073,79 @@ char *resource_string(char *buf, char *end, struct resource *res, + #define FLAG_BUF_SIZE (2 * sizeof(res->flags)) + #define DECODED_BUF_SIZE sizeof("[mem - 64bit pref window disabled]") + #define RAW_BUF_SIZE sizeof("[mem - flags 0x]") +- char sym[max(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE, +- 2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)]; +- +- char *p = sym, *pend = sym + sizeof(sym); +- int decode = (fmt[0] == 'R') ? 1 : 0; + const struct printf_spec *specp; + +- if (check_pointer(&buf, end, res, spec)) +- return buf; ++ if (check_pointer(out, res)) ++ return; + +- *p++ = '['; ++ prt_char(out, '['); + if (res->flags & IORESOURCE_IO) { +- p = string_nocheck(p, pend, "io ", str_spec); ++ string_nocheck(out, "io ", str_spec); + specp = &io_spec; + } else if (res->flags & IORESOURCE_MEM) { +- p = string_nocheck(p, pend, "mem ", str_spec); ++ string_nocheck(out, "mem ", str_spec); + specp = &mem_spec; + } else if (res->flags & IORESOURCE_IRQ) { +- p = string_nocheck(p, pend, "irq ", str_spec); ++ string_nocheck(out, "irq ", str_spec); + specp = &default_dec_spec; + } else if (res->flags & IORESOURCE_DMA) { +- p = string_nocheck(p, pend, "dma ", str_spec); ++ string_nocheck(out, "dma ", str_spec); + specp = &default_dec_spec; + } else if (res->flags & IORESOURCE_BUS) { +- p = string_nocheck(p, pend, "bus ", str_spec); ++ string_nocheck(out, "bus ", str_spec); + specp = &bus_spec; + } else { +- p = string_nocheck(p, pend, "??? ", str_spec); ++ string_nocheck(out, "??? 
", str_spec); + specp = &mem_spec; + decode = 0; + } + if (decode && res->flags & IORESOURCE_UNSET) { +- p = string_nocheck(p, pend, "size ", str_spec); +- p = number(p, pend, resource_size(res), *specp); ++ string_nocheck(out, "size ", str_spec); ++ number(out, resource_size(res), *specp); + } else { +- p = number(p, pend, res->start, *specp); ++ number(out, res->start, *specp); + if (res->start != res->end) { +- *p++ = '-'; +- p = number(p, pend, res->end, *specp); ++ prt_char(out, '-'); ++ number(out, res->end, *specp); + } + } + if (decode) { + if (res->flags & IORESOURCE_MEM_64) +- p = string_nocheck(p, pend, " 64bit", str_spec); ++ string_nocheck(out, " 64bit", str_spec); + if (res->flags & IORESOURCE_PREFETCH) +- p = string_nocheck(p, pend, " pref", str_spec); ++ string_nocheck(out, " pref", str_spec); + if (res->flags & IORESOURCE_WINDOW) +- p = string_nocheck(p, pend, " window", str_spec); ++ string_nocheck(out, " window", str_spec); + if (res->flags & IORESOURCE_DISABLED) +- p = string_nocheck(p, pend, " disabled", str_spec); ++ string_nocheck(out, " disabled", str_spec); + } else { +- p = string_nocheck(p, pend, " flags ", str_spec); +- p = number(p, pend, res->flags, default_flag_spec); ++ string_nocheck(out, " flags ", str_spec); ++ number(out, res->flags, default_flag_spec); + } +- *p++ = ']'; +- *p = '\0'; ++ prt_char(out, ']'); + +- return string_nocheck(buf, end, sym, spec); ++ printbuf_nul_terminate(out); + } + + static noinline_for_stack +-char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec, +- const char *fmt) ++void hex_string(struct printbuf *out, const u8 *addr, ++ int len, const char *fmt) + { +- int i, len = 1; /* if we pass '%ph[CDN]', field width remains +- negative value, fallback to the default */ + char separator; + +- if (spec.field_width == 0) +- /* nothing to print */ +- return buf; ++ /* nothing to print */ ++ if (len == 0) ++ return; ++ ++ /* if we pass '%ph[CDN]', field width remains ++ negative value, fallback to the default */ ++ if (len < 0) ++ len = 1; + +- if (check_pointer(&buf, end, addr, spec)) +- return buf; ++ len = min(len, 64); ++ ++ if (check_pointer(out, addr)) ++ return; + + switch (fmt[1]) { + case 'C': +@@ -1167,41 +1162,21 @@ char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec, + break; + } + +- if (spec.field_width > 0) +- len = min_t(int, spec.field_width, 64); +- +- for (i = 0; i < len; ++i) { +- if (buf < end) +- *buf = hex_asc_hi(addr[i]); +- ++buf; +- if (buf < end) +- *buf = hex_asc_lo(addr[i]); +- ++buf; +- +- if (separator && i != len - 1) { +- if (buf < end) +- *buf = separator; +- ++buf; +- } +- } +- +- return buf; ++ prt_hex_bytes(out, addr, len, 1, separator); + } + + static noinline_for_stack +-char *bitmap_string(char *buf, char *end, unsigned long *bitmap, +- struct printf_spec spec, const char *fmt) ++void bitmap_string(struct printbuf *out, unsigned long *bitmap, int nr_bits) + { ++ struct printf_spec spec = { .flags = SMALL | ZEROPAD, .base = 16 }; + const int CHUNKSZ = 32; +- int nr_bits = max_t(int, spec.field_width, 0); + int i, chunksz; + bool first = true; + +- if (check_pointer(&buf, end, bitmap, spec)) +- return buf; ++ nr_bits = max(nr_bits, 0); + +- /* reused to print numbers */ +- spec = (struct printf_spec){ .flags = SMALL | ZEROPAD, .base = 16 }; ++ if (check_pointer(out, bitmap)) ++ return; + + chunksz = nr_bits & (CHUNKSZ - 1); + if (chunksz == 0) +@@ -1217,63 +1192,53 @@ char *bitmap_string(char *buf, char *end, unsigned long *bitmap, + bit = i % BITS_PER_LONG; 
+ val = (bitmap[word] >> bit) & chunkmask; + +- if (!first) { +- if (buf < end) +- *buf = ','; +- buf++; +- } ++ if (!first) ++ prt_char(out, ','); + first = false; + + spec.field_width = DIV_ROUND_UP(chunksz, 4); +- buf = number(buf, end, val, spec); ++ number(out, val, spec); + + chunksz = CHUNKSZ; + } +- return buf; + } + + static noinline_for_stack +-char *bitmap_list_string(char *buf, char *end, unsigned long *bitmap, +- struct printf_spec spec, const char *fmt) ++void bitmap_list_string(struct printbuf *out, unsigned long *bitmap, ++ int nr_bits) + { +- int nr_bits = max_t(int, spec.field_width, 0); + bool first = true; + int rbot, rtop; + +- if (check_pointer(&buf, end, bitmap, spec)) +- return buf; ++ nr_bits = max(nr_bits, 0); ++ ++ if (check_pointer(out, bitmap)) ++ return ; + + for_each_set_bitrange(rbot, rtop, bitmap, nr_bits) { +- if (!first) { +- if (buf < end) +- *buf = ','; +- buf++; +- } ++ if (!first) ++ prt_char(out, ','); + first = false; + +- buf = number(buf, end, rbot, default_dec_spec); ++ prt_u64(out, rbot); + if (rtop == rbot + 1) + continue; + +- if (buf < end) +- *buf = '-'; +- buf = number(++buf, end, rtop - 1, default_dec_spec); ++ prt_char(out, '-'); ++ prt_u64(out, rtop - 1); + } +- return buf; + } + + static noinline_for_stack +-char *mac_address_string(char *buf, char *end, u8 *addr, +- struct printf_spec spec, const char *fmt) ++void mac_address_string(struct printbuf *out, u8 *addr, ++ const char *fmt) + { +- char mac_addr[sizeof("xx:xx:xx:xx:xx:xx")]; +- char *p = mac_addr; + int i; + char separator; + bool reversed = false; + +- if (check_pointer(&buf, end, addr, spec)) +- return buf; ++ if (check_pointer(out, addr)) ++ return; + + switch (fmt[1]) { + case 'F': +@@ -1291,25 +1256,23 @@ char *mac_address_string(char *buf, char *end, u8 *addr, + + for (i = 0; i < 6; i++) { + if (reversed) +- p = hex_byte_pack(p, addr[5 - i]); ++ prt_hex_byte(out, addr[5 - i]); + else +- p = hex_byte_pack(p, addr[i]); ++ prt_hex_byte(out, addr[i]); + + if (fmt[0] == 'M' && i != 5) +- *p++ = separator; ++ prt_char(out, separator); + } +- *p = '\0'; +- +- return string_nocheck(buf, end, mac_addr, spec); + } + + static noinline_for_stack +-char *ip4_string(char *p, const u8 *addr, const char *fmt) ++void ip4_string(struct printbuf *out, const u8 *addr, const char *fmt) + { +- int i; +- bool leading_zeros = (fmt[0] == 'i'); +- int index; +- int step; ++ struct printf_spec spec = default_dec_spec; ++ int i, index, step; ++ ++ if (fmt[0] == 'i') ++ spec.precision = 3; + + switch (fmt[2]) { + case 'h': +@@ -1333,28 +1296,15 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt) + break; + } + for (i = 0; i < 4; i++) { +- char temp[4] __aligned(2); /* hold each IP quad in reverse order */ +- int digits = put_dec_trunc8(temp, addr[index]) - temp; +- if (leading_zeros) { +- if (digits < 3) +- *p++ = '0'; +- if (digits < 2) +- *p++ = '0'; +- } +- /* reverse the digits in the quad */ +- while (digits--) +- *p++ = temp[digits]; +- if (i < 3) +- *p++ = '.'; ++ if (i) ++ prt_char(out, '.'); ++ number(out, addr[index], spec); + index += step; + } +- *p = '\0'; +- +- return p; + } + + static noinline_for_stack +-char *ip6_compressed_string(char *p, const char *addr) ++void ip6_compressed_string(struct printbuf *out, const char *addr) + { + int i, j, range; + unsigned char zerolength[8]; +@@ -1398,14 +1348,14 @@ char *ip6_compressed_string(char *p, const char *addr) + for (i = 0; i < range; i++) { + if (i == colonpos) { + if (needcolon || i == 0) +- *p++ = ':'; +- *p++ = ':'; ++ 
__prt_char(out, ':'); ++ __prt_char(out, ':'); + needcolon = false; + i += longest - 1; + continue; + } + if (needcolon) { +- *p++ = ':'; ++ __prt_char(out, ':'); + needcolon = false; + } + /* hex u16 without leading 0s */ +@@ -1414,81 +1364,56 @@ char *ip6_compressed_string(char *p, const char *addr) + lo = word & 0xff; + if (hi) { + if (hi > 0x0f) +- p = hex_byte_pack(p, hi); ++ prt_hex_byte(out, hi); + else +- *p++ = hex_asc_lo(hi); +- p = hex_byte_pack(p, lo); ++ __prt_char(out, hex_asc_lo(hi)); ++ prt_hex_byte(out, lo); + } + else if (lo > 0x0f) +- p = hex_byte_pack(p, lo); ++ prt_hex_byte(out, lo); + else +- *p++ = hex_asc_lo(lo); ++ __prt_char(out, hex_asc_lo(lo)); + needcolon = true; + } + + if (useIPv4) { + if (needcolon) +- *p++ = ':'; +- p = ip4_string(p, &in6.s6_addr[12], "I4"); ++ __prt_char(out, ':'); ++ ip4_string(out, &in6.s6_addr[12], "I4"); + } +- *p = '\0'; +- +- return p; + } + + static noinline_for_stack +-char *ip6_string(char *p, const char *addr, const char *fmt) ++void ip6_string(struct printbuf *out, const char *addr, const char *fmt) + { + int i; + + for (i = 0; i < 8; i++) { +- p = hex_byte_pack(p, *addr++); +- p = hex_byte_pack(p, *addr++); ++ prt_hex_byte(out, *addr++); ++ prt_hex_byte(out, *addr++); + if (fmt[0] == 'I' && i != 7) +- *p++ = ':'; ++ prt_char(out, ':'); + } +- *p = '\0'; +- +- return p; + } + + static noinline_for_stack +-char *ip6_addr_string(char *buf, char *end, const u8 *addr, +- struct printf_spec spec, const char *fmt) ++void ip6_addr_string(struct printbuf *out, const u8 *addr, ++ const char *fmt) + { +- char ip6_addr[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255")]; +- + if (fmt[0] == 'I' && fmt[2] == 'c') +- ip6_compressed_string(ip6_addr, addr); ++ ip6_compressed_string(out, addr); + else +- ip6_string(ip6_addr, addr, fmt); +- +- return string_nocheck(buf, end, ip6_addr, spec); +-} +- +-static noinline_for_stack +-char *ip4_addr_string(char *buf, char *end, const u8 *addr, +- struct printf_spec spec, const char *fmt) +-{ +- char ip4_addr[sizeof("255.255.255.255")]; +- +- ip4_string(ip4_addr, addr, fmt); +- +- return string_nocheck(buf, end, ip4_addr, spec); ++ ip6_string(out, addr, fmt); + } + + static noinline_for_stack +-char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa, +- struct printf_spec spec, const char *fmt) ++void ip6_addr_string_sa(struct printbuf *out, ++ const struct sockaddr_in6 *sa, ++ const char *fmt) + { + bool have_p = false, have_s = false, have_f = false, have_c = false; +- char ip6_addr[sizeof("[xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255]") + +- sizeof(":12345") + sizeof("/123456789") + +- sizeof("%1234567890")]; +- char *p = ip6_addr, *pend = ip6_addr + sizeof(ip6_addr); + const u8 *addr = (const u8 *) &sa->sin6_addr; + char fmt6[2] = { fmt[0], '6' }; +- u8 off = 0; + + fmt++; + while (isalpha(*++fmt)) { +@@ -1508,44 +1433,36 @@ char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa, + } + } + +- if (have_p || have_s || have_f) { +- *p = '['; +- off = 1; +- } ++ if (have_p || have_s || have_f) ++ prt_char(out, '['); + + if (fmt6[0] == 'I' && have_c) +- p = ip6_compressed_string(ip6_addr + off, addr); ++ ip6_compressed_string(out, addr); + else +- p = ip6_string(ip6_addr + off, addr, fmt6); ++ ip6_string(out, addr, fmt6); + + if (have_p || have_s || have_f) +- *p++ = ']'; ++ prt_char(out, ']'); + + if (have_p) { +- *p++ = ':'; +- p = number(p, pend, ntohs(sa->sin6_port), spec); ++ prt_char(out, ':'); ++ prt_u64(out, ntohs(sa->sin6_port)); + } + if (have_f) 
{ +- *p++ = '/'; +- p = number(p, pend, ntohl(sa->sin6_flowinfo & +- IPV6_FLOWINFO_MASK), spec); ++ prt_char(out, '/'); ++ prt_u64(out, ntohl(sa->sin6_flowinfo & IPV6_FLOWINFO_MASK)); + } + if (have_s) { +- *p++ = '%'; +- p = number(p, pend, sa->sin6_scope_id, spec); ++ prt_char(out, '%'); ++ prt_u64(out, sa->sin6_scope_id); + } +- *p = '\0'; +- +- return string_nocheck(buf, end, ip6_addr, spec); + } + + static noinline_for_stack +-char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa, +- struct printf_spec spec, const char *fmt) ++void ip4_addr_string_sa(struct printbuf *out, const struct sockaddr_in *sa, ++ const char *fmt) + { + bool have_p = false; +- char *p, ip4_addr[sizeof("255.255.255.255") + sizeof(":12345")]; +- char *pend = ip4_addr + sizeof(ip4_addr); + const u8 *addr = (const u8 *) &sa->sin_addr.s_addr; + char fmt4[3] = { fmt[0], '4', 0 }; + +@@ -1564,30 +1481,27 @@ char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa, + } + } + +- p = ip4_string(ip4_addr, addr, fmt4); ++ ip4_string(out, addr, fmt4); + if (have_p) { +- *p++ = ':'; +- p = number(p, pend, ntohs(sa->sin_port), spec); ++ prt_char(out, ':'); ++ prt_u64(out, ntohs(sa->sin_port)); + } +- *p = '\0'; +- +- return string_nocheck(buf, end, ip4_addr, spec); + } + + static noinline_for_stack +-char *ip_addr_string(char *buf, char *end, const void *ptr, +- struct printf_spec spec, const char *fmt) ++void ip_addr_string(struct printbuf *out, const void *ptr, ++ const char *fmt) + { + char *err_fmt_msg; + +- if (check_pointer(&buf, end, ptr, spec)) +- return buf; ++ if (check_pointer(out, ptr)) ++ return; + + switch (fmt[1]) { + case '6': +- return ip6_addr_string(buf, end, ptr, spec, fmt); ++ return ip6_addr_string(out, ptr, fmt); + case '4': +- return ip4_addr_string(buf, end, ptr, spec, fmt); ++ return ip4_string(out, ptr, fmt); + case 'S': { + const union { + struct sockaddr raw; +@@ -1597,21 +1511,21 @@ char *ip_addr_string(char *buf, char *end, const void *ptr, + + switch (sa->raw.sa_family) { + case AF_INET: +- return ip4_addr_string_sa(buf, end, &sa->v4, spec, fmt); ++ return ip4_addr_string_sa(out, &sa->v4, fmt); + case AF_INET6: +- return ip6_addr_string_sa(buf, end, &sa->v6, spec, fmt); ++ return ip6_addr_string_sa(out, &sa->v6, fmt); + default: +- return error_string(buf, end, "(einval)", spec); ++ return error_string(out, "(einval)"); + }} + } + + err_fmt_msg = fmt[0] == 'i' ? "(%pi?)" : "(%pI?)"; +- return error_string(buf, end, err_fmt_msg, spec); ++ error_string(out, err_fmt_msg); + } + + static noinline_for_stack +-char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, +- const char *fmt) ++void escaped_string(struct printbuf *out, u8 *addr, ++ struct printf_spec spec, const char *fmt) + { + bool found = true; + int count = 1; +@@ -1619,10 +1533,10 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, + int len; + + if (spec.field_width == 0) +- return buf; /* nothing to print */ ++ return; /* nothing to print */ + +- if (check_pointer(&buf, end, addr, spec)) +- return buf; ++ if (check_pointer_spec(out, addr, spec)) ++ return; + + do { + switch (fmt[count++]) { +@@ -1657,44 +1571,32 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, + flags = ESCAPE_ANY_NP; + + len = spec.field_width < 0 ? 1 : spec.field_width; +- +- /* +- * string_escape_mem() writes as many characters as it can to +- * the given buffer, and returns the total size of the output +- * had the buffer been big enough. 
+- */ +- buf += string_escape_mem(addr, len, buf, buf < end ? end - buf : 0, flags, NULL); +- +- return buf; ++ prt_escaped_string(out, addr, len, flags, NULL); + } + +-static char *va_format(char *buf, char *end, struct va_format *va_fmt, +- struct printf_spec spec, const char *fmt) ++static void va_format(struct printbuf *out, ++ struct va_format *va_fmt, ++ struct printf_spec spec, const char *fmt) + { + va_list va; + +- if (check_pointer(&buf, end, va_fmt, spec)) +- return buf; ++ if (check_pointer_spec(out, va_fmt, spec)) ++ return; + + va_copy(va, *va_fmt->va); +- buf += vsnprintf(buf, end > buf ? end - buf : 0, va_fmt->fmt, va); ++ prt_vprintf(out, va_fmt->fmt, va); + va_end(va); +- +- return buf; + } + + static noinline_for_stack +-char *uuid_string(char *buf, char *end, const u8 *addr, +- struct printf_spec spec, const char *fmt) ++void uuid_string(struct printbuf *out, const u8 *addr, const char *fmt) + { +- char uuid[UUID_STRING_LEN + 1]; +- char *p = uuid; + int i; + const u8 *index = uuid_index; + bool uc = false; + +- if (check_pointer(&buf, end, addr, spec)) +- return buf; ++ if (check_pointer(out, addr)) ++ return; + + switch (*(++fmt)) { + case 'L': +@@ -1710,60 +1612,54 @@ char *uuid_string(char *buf, char *end, const u8 *addr, + + for (i = 0; i < 16; i++) { + if (uc) +- p = hex_byte_pack_upper(p, addr[index[i]]); ++ prt_hex_byte_upper(out, addr[index[i]]); + else +- p = hex_byte_pack(p, addr[index[i]]); ++ prt_hex_byte(out, addr[index[i]]); + switch (i) { + case 3: + case 5: + case 7: + case 9: +- *p++ = '-'; ++ prt_char(out, '-'); + break; + } + } +- +- *p = 0; +- +- return string_nocheck(buf, end, uuid, spec); + } + + static noinline_for_stack +-char *netdev_bits(char *buf, char *end, const void *addr, +- struct printf_spec spec, const char *fmt) ++void netdev_bits(struct printbuf *out, const void *addr, ++ const char *fmt) + { + unsigned long long num; + int size; + +- if (check_pointer(&buf, end, addr, spec)) +- return buf; ++ if (check_pointer(out, addr)) ++ return; + + switch (fmt[1]) { + case 'F': + num = *(const netdev_features_t *)addr; + size = sizeof(netdev_features_t); ++ special_hex_number(out, num, size); + break; + default: +- return error_string(buf, end, "(%pN?)", spec); ++ error_string(out, "(%pN?)"); ++ break; + } +- +- return special_hex_number(buf, end, num, size); + } + + static noinline_for_stack +-char *fourcc_string(char *buf, char *end, const u32 *fourcc, +- struct printf_spec spec, const char *fmt) ++void fourcc_string(struct printbuf *out, const u32 *fourcc, ++ const char *fmt) + { +- char output[sizeof("0123 little-endian (0x01234567)")]; +- char *p = output; + unsigned int i; + u32 orig, val; + + if (fmt[1] != 'c' || fmt[2] != 'c') +- return error_string(buf, end, "(%p4?)", spec); ++ return error_string(out, "(%p4?)"); + +- if (check_pointer(&buf, end, fourcc, spec)) +- return buf; ++ if (check_pointer(out, fourcc)) ++ return; + + orig = get_unaligned(fourcc); + val = orig & ~BIT(31); +@@ -1772,31 +1668,27 @@ char *fourcc_string(char *buf, char *end, const u32 *fourcc, + unsigned char c = val >> (i * 8); + + /* Print non-control ASCII characters as-is, dot otherwise */ +- *p++ = isascii(c) && isprint(c) ? c : '.'; ++ prt_char(out, isascii(c) && isprint(c) ? c : '.'); + } + +- *p++ = ' '; +- strcpy(p, orig & BIT(31) ? 
"big-endian" : "little-endian"); +- p += strlen(p); +- +- *p++ = ' '; +- *p++ = '('; +- p = special_hex_number(p, output + sizeof(output) - 2, orig, sizeof(u32)); +- *p++ = ')'; +- *p = '\0'; ++ prt_char(out, ' '); ++ prt_str(out, orig & BIT(31) ? "big-endian" : "little-endian"); + +- return string(buf, end, output, spec); ++ prt_char(out, ' '); ++ prt_char(out, '('); ++ special_hex_number(out, orig, sizeof(u32)); ++ prt_char(out, ')'); + } + + static noinline_for_stack +-char *address_val(char *buf, char *end, const void *addr, +- struct printf_spec spec, const char *fmt) ++void address_val(struct printbuf *out, const void *addr, ++ const char *fmt) + { + unsigned long long num; + int size; + +- if (check_pointer(&buf, end, addr, spec)) +- return buf; ++ if (check_pointer(out, addr)) ++ return; + + switch (fmt[1]) { + case 'd': +@@ -1810,55 +1702,44 @@ char *address_val(char *buf, char *end, const void *addr, + break; + } + +- return special_hex_number(buf, end, num, size); ++ special_hex_number(out, num, size); + } + + static noinline_for_stack +-char *date_str(char *buf, char *end, const struct rtc_time *tm, bool r) ++void date_str(struct printbuf *out, ++ const struct rtc_time *tm, bool r) + { + int year = tm->tm_year + (r ? 0 : 1900); + int mon = tm->tm_mon + (r ? 0 : 1); + +- buf = number(buf, end, year, default_dec04_spec); +- if (buf < end) +- *buf = '-'; +- buf++; +- +- buf = number(buf, end, mon, default_dec02_spec); +- if (buf < end) +- *buf = '-'; +- buf++; +- +- return number(buf, end, tm->tm_mday, default_dec02_spec); ++ prt_u64_minwidth(out, year, 4); ++ prt_char(out, '-'); ++ prt_u64_minwidth(out, mon, 2); ++ prt_char(out, '-'); ++ prt_u64_minwidth(out, tm->tm_mday, 2); + } + + static noinline_for_stack +-char *time_str(char *buf, char *end, const struct rtc_time *tm, bool r) ++void time_str(struct printbuf *out, const struct rtc_time *tm, bool r) + { +- buf = number(buf, end, tm->tm_hour, default_dec02_spec); +- if (buf < end) +- *buf = ':'; +- buf++; +- +- buf = number(buf, end, tm->tm_min, default_dec02_spec); +- if (buf < end) +- *buf = ':'; +- buf++; +- +- return number(buf, end, tm->tm_sec, default_dec02_spec); ++ prt_u64_minwidth(out, tm->tm_hour, 2); ++ prt_char(out, ':'); ++ prt_u64_minwidth(out, tm->tm_min, 2); ++ prt_char(out, ':'); ++ prt_u64_minwidth(out, tm->tm_sec, 2); + } + + static noinline_for_stack +-char *rtc_str(char *buf, char *end, const struct rtc_time *tm, +- struct printf_spec spec, const char *fmt) ++void rtc_str(struct printbuf *out, const struct rtc_time *tm, ++ const char *fmt) + { + bool have_t = true, have_d = true; + bool raw = false, iso8601_separator = true; + bool found = true; + int count = 2; + +- if (check_pointer(&buf, end, tm, spec)) +- return buf; ++ if (check_pointer(out, tm)) ++ return; + + switch (fmt[count]) { + case 'd': +@@ -1886,21 +1767,16 @@ char *rtc_str(char *buf, char *end, const struct rtc_time *tm, + } while (found); + + if (have_d) +- buf = date_str(buf, end, tm, raw); +- if (have_d && have_t) { +- if (buf < end) +- *buf = iso8601_separator ? 'T' : ' '; +- buf++; +- } ++ date_str(out, tm, raw); ++ if (have_d && have_t) ++ prt_char(out, iso8601_separator ? 
'T' : ' '); + if (have_t) +- buf = time_str(buf, end, tm, raw); +- +- return buf; ++ time_str(out, tm, raw); + } + + static noinline_for_stack +-char *time64_str(char *buf, char *end, const time64_t time, +- struct printf_spec spec, const char *fmt) ++void time64_str(struct printbuf *out, const time64_t time, ++ const char *fmt) + { + struct rtc_time rtc_time; + struct tm tm; +@@ -1918,47 +1794,47 @@ char *time64_str(char *buf, char *end, const time64_t time, + + rtc_time.tm_isdst = 0; + +- return rtc_str(buf, end, &rtc_time, spec, fmt); ++ rtc_str(out, &rtc_time, fmt); + } + + static noinline_for_stack +-char *time_and_date(char *buf, char *end, void *ptr, struct printf_spec spec, +- const char *fmt) ++void time_and_date(struct printbuf *out, void *ptr, ++ const char *fmt) + { + switch (fmt[1]) { + case 'R': +- return rtc_str(buf, end, (const struct rtc_time *)ptr, spec, fmt); ++ return rtc_str(out, (const struct rtc_time *)ptr, fmt); + case 'T': +- return time64_str(buf, end, *(const time64_t *)ptr, spec, fmt); ++ return time64_str(out, *(const time64_t *)ptr, fmt); + default: +- return error_string(buf, end, "(%pt?)", spec); ++ return error_string(out, "(%pt?)"); + } + } + + static noinline_for_stack +-char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec, +- const char *fmt) ++void clock(struct printbuf *out, struct clk *clk, ++ struct printf_spec spec, const char *fmt) + { + if (!IS_ENABLED(CONFIG_HAVE_CLK)) +- return error_string(buf, end, "(%pC?)", spec); ++ return error_string_spec(out, "(%pC?)", spec); + +- if (check_pointer(&buf, end, clk, spec)) +- return buf; ++ if (check_pointer_spec(out, clk, spec)) ++ return; + + switch (fmt[1]) { + case 'n': + default: + #ifdef CONFIG_COMMON_CLK +- return string(buf, end, __clk_get_name(clk), spec); ++ return string_spec(out, __clk_get_name(clk), spec); + #else +- return ptr_to_id(buf, end, clk, spec); ++ return ptr_to_id(out, clk, spec); + #endif + } + } + + static +-char *format_flags(char *buf, char *end, unsigned long flags, +- const struct trace_print_flags *names) ++void format_flags(struct printbuf *out, unsigned long flags, ++ const struct trace_print_flags *names) + { + unsigned long mask; + +@@ -1967,20 +1843,15 @@ char *format_flags(char *buf, char *end, unsigned long flags, + if ((flags & mask) != mask) + continue; + +- buf = string(buf, end, names->name, default_str_spec); ++ string(out, names->name); + + flags &= ~mask; +- if (flags) { +- if (buf < end) +- *buf = '|'; +- buf++; +- } ++ if (flags) ++ prt_char(out, '|'); + } + + if (flags) +- buf = number(buf, end, flags, default_flag_spec); +- +- return buf; ++ number(out, flags, default_flag_spec); + } + + struct page_flags_fields { +@@ -2005,20 +1876,18 @@ static const struct page_flags_fields pff[] = { + }; + + static +-char *format_page_flags(char *buf, char *end, unsigned long flags) ++void format_page_flags(struct printbuf *out, unsigned long flags) + { + unsigned long main_flags = flags & PAGEFLAGS_MASK; + bool append = false; + int i; + +- buf = number(buf, end, flags, default_flag_spec); +- if (buf < end) +- *buf = '('; +- buf++; ++ number(out, flags, default_flag_spec); ++ prt_char(out, '('); + + /* Page flags from the main area. 
*/ + if (main_flags) { +- buf = format_flags(buf, end, main_flags, pageflag_names); ++ format_flags(out, main_flags, pageflag_names); + append = true; + } + +@@ -2029,41 +1898,31 @@ char *format_page_flags(char *buf, char *end, unsigned long flags) + continue; + + /* Format: Flag Name + '=' (equals sign) + Number + '|' (separator) */ +- if (append) { +- if (buf < end) +- *buf = '|'; +- buf++; +- } ++ if (append) ++ prt_char(out, '|'); + +- buf = string(buf, end, pff[i].name, default_str_spec); +- if (buf < end) +- *buf = '='; +- buf++; +- buf = number(buf, end, (flags >> pff[i].shift) & pff[i].mask, +- *pff[i].spec); ++ string(out, pff[i].name); ++ prt_char(out, '='); ++ number(out, (flags >> pff[i].shift) & pff[i].mask, *pff[i].spec); + + append = true; + } +- if (buf < end) +- *buf = ')'; +- buf++; +- +- return buf; ++ prt_char(out, ')'); + } + + static noinline_for_stack +-char *flags_string(char *buf, char *end, void *flags_ptr, +- struct printf_spec spec, const char *fmt) ++void flags_string(struct printbuf *out, void *flags_ptr, ++ const char *fmt) + { + unsigned long flags; + const struct trace_print_flags *names; + +- if (check_pointer(&buf, end, flags_ptr, spec)) +- return buf; ++ if (check_pointer(out, flags_ptr)) ++ return; + + switch (fmt[1]) { + case 'p': +- return format_page_flags(buf, end, *(unsigned long *)flags_ptr); ++ return format_page_flags(out, *(unsigned long *)flags_ptr); + case 'v': + flags = *(unsigned long *)flags_ptr; + names = vmaflag_names; +@@ -2073,15 +1932,15 @@ char *flags_string(char *buf, char *end, void *flags_ptr, + names = gfpflag_names; + break; + default: +- return error_string(buf, end, "(%pG?)", spec); ++ return error_string(out, "(%pG?)"); + } + +- return format_flags(buf, end, flags, names); ++ return format_flags(out, flags, names); + } + + static noinline_for_stack +-char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf, +- char *end) ++void fwnode_full_name_string(struct printbuf *out, ++ struct fwnode_handle *fwnode) + { + int depth; + +@@ -2090,39 +1949,30 @@ char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf, + struct fwnode_handle *__fwnode = + fwnode_get_nth_parent(fwnode, depth); + +- buf = string(buf, end, fwnode_get_name_prefix(__fwnode), +- default_str_spec); +- buf = string(buf, end, fwnode_get_name(__fwnode), +- default_str_spec); ++ string(out, fwnode_get_name_prefix(__fwnode)); ++ string(out, fwnode_get_name(__fwnode)); + + fwnode_handle_put(__fwnode); + } +- +- return buf; + } + + static noinline_for_stack +-char *device_node_string(char *buf, char *end, struct device_node *dn, +- struct printf_spec spec, const char *fmt) ++void device_node_string(struct printbuf *out, struct device_node *dn, ++ const char *fmt) + { +- char tbuf[sizeof("xxxx") + 1]; + const char *p; + int ret; +- char *buf_start = buf; + struct property *prop; + bool has_mult, pass; + +- struct printf_spec str_spec = spec; +- str_spec.field_width = -1; +- + if (fmt[0] != 'F') +- return error_string(buf, end, "(%pO?)", spec); ++ return error_string(out, "(%pO?)"); + + if (!IS_ENABLED(CONFIG_OF)) +- return error_string(buf, end, "(%pOF?)", spec); ++ return error_string(out, "(%pOF?)"); + +- if (check_pointer(&buf, end, dn, spec)) +- return buf; ++ if (check_pointer(out, dn)) ++ return; + + /* simple case without anything any more format specifiers */ + fmt++; +@@ -2130,55 +1980,48 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, + fmt = "f"; + + for (pass = false; strspn(fmt,"fnpPFcC"); fmt++, pass = 
true) { +- int precision; +- if (pass) { +- if (buf < end) +- *buf = ':'; +- buf++; +- } ++ if (pass) ++ prt_char(out, ':'); + + switch (*fmt) { + case 'f': /* full_name */ +- buf = fwnode_full_name_string(of_fwnode_handle(dn), buf, +- end); ++ fwnode_full_name_string(out, of_fwnode_handle(dn)); + break; +- case 'n': /* name */ +- p = fwnode_get_name(of_fwnode_handle(dn)); +- precision = str_spec.precision; +- str_spec.precision = strchrnul(p, '@') - p; +- buf = string(buf, end, p, str_spec); +- str_spec.precision = precision; ++ case 'n': { /* name */ ++ const char *name = fwnode_get_name(of_fwnode_handle(dn)); ++ unsigned len = strchrnul(name, '@') - name; ++ ++ prt_bytes(out, name, len); + break; ++ } + case 'p': /* phandle */ +- buf = number(buf, end, (unsigned int)dn->phandle, default_dec_spec); ++ prt_u64(out, dn->phandle); + break; + case 'P': /* path-spec */ + p = fwnode_get_name(of_fwnode_handle(dn)); + if (!p[1]) + p = "/"; +- buf = string(buf, end, p, str_spec); ++ string(out, p); + break; + case 'F': /* flags */ +- tbuf[0] = of_node_check_flag(dn, OF_DYNAMIC) ? 'D' : '-'; +- tbuf[1] = of_node_check_flag(dn, OF_DETACHED) ? 'd' : '-'; +- tbuf[2] = of_node_check_flag(dn, OF_POPULATED) ? 'P' : '-'; +- tbuf[3] = of_node_check_flag(dn, OF_POPULATED_BUS) ? 'B' : '-'; +- tbuf[4] = 0; +- buf = string_nocheck(buf, end, tbuf, str_spec); +- break; +- case 'c': /* major compatible string */ ++ prt_char(out, of_node_check_flag(dn, OF_DYNAMIC) ? 'D' : '-'); ++ prt_char(out, of_node_check_flag(dn, OF_DETACHED) ? 'd' : '-'); ++ prt_char(out, of_node_check_flag(dn, OF_POPULATED) ? 'P' : '-'); ++ prt_char(out, of_node_check_flag(dn, OF_POPULATED_BUS) ? 'B' : '-'); ++ break; ++ case 'c': /* major compatible string_spec */ + ret = of_property_read_string(dn, "compatible", &p); + if (!ret) +- buf = string(buf, end, p, str_spec); ++ string(out, p); + break; +- case 'C': /* full compatible string */ ++ case 'C': /* full compatible string_spec */ + has_mult = false; + of_property_for_each_string(dn, "compatible", prop, p) { + if (has_mult) +- buf = string_nocheck(buf, end, ",", str_spec); +- buf = string_nocheck(buf, end, "\"", str_spec); +- buf = string(buf, end, p, str_spec); +- buf = string_nocheck(buf, end, "\"", str_spec); ++ prt_char(out, ','); ++ prt_char(out, '\"'); ++ string(out, p); ++ prt_char(out, '\"'); + + has_mult = true; + } +@@ -2187,38 +2030,30 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, + break; + } + } +- +- return widen_string(buf, buf - buf_start, end, spec); + } + + static noinline_for_stack +-char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode, +- struct printf_spec spec, const char *fmt) ++void fwnode_string(struct printbuf *out, ++ struct fwnode_handle *fwnode, ++ const char *fmt) + { +- struct printf_spec str_spec = spec; +- char *buf_start = buf; +- +- str_spec.field_width = -1; +- + if (*fmt != 'w') +- return error_string(buf, end, "(%pf?)", spec); ++ return error_string(out, "(%pf?)"); + +- if (check_pointer(&buf, end, fwnode, spec)) +- return buf; ++ if (check_pointer(out, fwnode)) ++ return; + + fmt++; + + switch (*fmt) { + case 'P': /* name */ +- buf = string(buf, end, fwnode_get_name(fwnode), str_spec); ++ string(out, fwnode_get_name(fwnode)); + break; + case 'f': /* full_name */ + default: +- buf = fwnode_full_name_string(fwnode, buf, end); ++ fwnode_full_name_string(out, fwnode); + break; + } +- +- return widen_string(buf, buf - buf_start, end, spec); + } + + int __init no_hash_pointers_enable(char *str) +@@ -2374,33 
+2209,40 @@ early_param("no_hash_pointers", no_hash_pointers_enable); + * rendering it useful as a unique identifier. + */ + static noinline_for_stack +-char *pointer(const char *fmt, char *buf, char *end, void *ptr, +- struct printf_spec spec) ++void pointer(struct printbuf *out, const char *fmt, ++ void *ptr, struct printf_spec spec) + { ++ unsigned prev_pos = out->pos; ++ + switch (*fmt) { + case 'S': + case 's': + ptr = dereference_symbol_descriptor(ptr); + fallthrough; + case 'B': +- return symbol_string(buf, end, ptr, spec, fmt); ++ symbol_string(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 'R': + case 'r': +- return resource_string(buf, end, ptr, spec, fmt); ++ resource_string(out, ptr, fmt[0] == 'R'); ++ return do_width_precision(out, prev_pos, spec); + case 'h': +- return hex_string(buf, end, ptr, spec, fmt); ++ /* Uses field_width but _not_ as field size */ ++ return hex_string(out, ptr, spec.field_width, fmt); + case 'b': ++ /* Uses field_width but _not_ as field size */ + switch (fmt[1]) { + case 'l': +- return bitmap_list_string(buf, end, ptr, spec, fmt); ++ return bitmap_list_string(out, ptr, spec.field_width); + default: +- return bitmap_string(buf, end, ptr, spec, fmt); ++ return bitmap_string(out, ptr, spec.field_width); + } + case 'M': /* Colon separated: 00:01:02:03:04:05 */ + case 'm': /* Contiguous: 000102030405 */ + /* [mM]F (FDDI) */ + /* [mM]R (Reverse order; Bluetooth) */ +- return mac_address_string(buf, end, ptr, spec, fmt); ++ mac_address_string(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 'I': /* Formatted IP supported + * 4: 1.2.3.4 + * 6: 0001:0203:...:0708 +@@ -2410,57 +2252,69 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, + * 4: 001.002.003.004 + * 6: 000102...0f + */ +- return ip_addr_string(buf, end, ptr, spec, fmt); ++ ip_addr_string(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 'E': +- return escaped_string(buf, end, ptr, spec, fmt); ++ return escaped_string(out, ptr, spec, fmt); + case 'U': +- return uuid_string(buf, end, ptr, spec, fmt); ++ uuid_string(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 'V': +- return va_format(buf, end, ptr, spec, fmt); ++ return va_format(out, ptr, spec, fmt); + case 'K': +- return restricted_pointer(buf, end, ptr, spec); ++ return restricted_pointer(out, ptr, spec); + case 'N': +- return netdev_bits(buf, end, ptr, spec, fmt); ++ netdev_bits(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case '4': +- return fourcc_string(buf, end, ptr, spec, fmt); ++ fourcc_string(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 'a': +- return address_val(buf, end, ptr, spec, fmt); ++ address_val(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 'd': +- return dentry_name(buf, end, ptr, spec, fmt); ++ dentry_name(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 't': +- return time_and_date(buf, end, ptr, spec, fmt); ++ time_and_date(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 'C': +- return clock(buf, end, ptr, spec, fmt); ++ return clock(out, ptr, spec, fmt); + case 'D': +- return file_dentry_name(buf, end, ptr, spec, fmt); ++ file_dentry_name(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + #ifdef CONFIG_BLOCK + case 'g': +- return bdev_name(buf, end, ptr, spec, fmt); ++ bdev_name(out, ptr); ++ return do_width_precision(out, prev_pos, spec); + #endif + + case 
'G': +- return flags_string(buf, end, ptr, spec, fmt); ++ flags_string(out, ptr, fmt); ++ return do_width_precision(out, prev_pos, spec); + case 'O': +- return device_node_string(buf, end, ptr, spec, fmt + 1); ++ device_node_string(out, ptr, fmt + 1); ++ return do_width_precision(out, prev_pos, spec); + case 'f': +- return fwnode_string(buf, end, ptr, spec, fmt + 1); ++ fwnode_string(out, ptr, fmt + 1); ++ return do_width_precision(out, prev_pos, spec); + case 'x': +- return pointer_string(buf, end, ptr, spec); ++ return pointer_string(out, ptr, spec); + case 'e': + /* %pe with a non-ERR_PTR gets treated as plain %p */ + if (!IS_ERR(ptr)) +- return default_pointer(buf, end, ptr, spec); +- return err_ptr(buf, end, ptr, spec); ++ return default_pointer(out, ptr, spec); ++ return err_ptr(out, ptr, spec); + case 'u': + case 'k': + switch (fmt[1]) { + case 's': +- return string(buf, end, ptr, spec); ++ return string_spec(out, ptr, spec); + default: +- return error_string(buf, end, "(einval)", spec); ++ return error_string_spec(out, "(einval)", spec); + } + default: +- return default_pointer(buf, end, ptr, spec); ++ return default_pointer(out, ptr, spec); + } + } + +@@ -2599,8 +2453,14 @@ int format_decode(const char *fmt, struct printf_spec *spec) + return ++fmt - start; + + case 'p': +- spec->type = FORMAT_TYPE_PTR; +- return ++fmt - start; ++ fmt++; ++ if (fmt[0] == 'f' && ++ fmt[1] == '(') { ++ fmt += 2; ++ spec->type = FORMAT_TYPE_FN; ++ } else ++ spec->type = FORMAT_TYPE_PTR; ++ return fmt - start; + + case '%': + spec->type = FORMAT_TYPE_PERCENT_CHAR; +@@ -2681,53 +2541,89 @@ set_precision(struct printf_spec *spec, int prec) + } + } + ++static void call_prt_fn(struct printbuf *out, struct call_pp *call_pp, void **fn_args, unsigned nr_args) ++{ ++ typedef void (*printf_fn_0)(struct printbuf *); ++ typedef void (*printf_fn_1)(struct printbuf *, void *); ++ typedef void (*printf_fn_2)(struct printbuf *, void *, void *); ++ typedef void (*printf_fn_3)(struct printbuf *, void *, void *, void *); ++ typedef void (*printf_fn_4)(struct printbuf *, void *, void *, void *, void *); ++ typedef void (*printf_fn_5)(struct printbuf *, void *, void *, void *, void *, void *); ++ typedef void (*printf_fn_6)(struct printbuf *, void *, void *, void *, void *, void *, void *); ++ typedef void (*printf_fn_7)(struct printbuf *, void *, void *, void *, void *, void *, void *, void *); ++ typedef void (*printf_fn_8)(struct printbuf *, void *, void *, void *, void *, void *, void *, void *, void *); ++ void *fn; ++ unsigned i; ++ ++ if (check_pointer(out, call_pp)) ++ return; ++ ++ if (call_pp->magic != CALL_PP_MAGIC) { ++ error_string(out, "bad pretty-printer magic"); ++ return; ++ } ++ ++ fn = call_pp->fn; ++ if (check_pointer(out, fn)) ++ return; ++ ++ for (i = 0; i < nr_args; i++) ++ if (check_pointer(out, fn_args[i])) ++ return; ++ ++ switch (nr_args) { ++ case 0: ++ ((printf_fn_0)fn)(out); ++ break; ++ case 1: ++ ((printf_fn_1)fn)(out, fn_args[0]); ++ break; ++ case 2: ++ ((printf_fn_2)fn)(out, fn_args[0], fn_args[1]); ++ break; ++ case 3: ++ ((printf_fn_3)fn)(out, fn_args[0], fn_args[1], fn_args[2]); ++ break; ++ case 4: ++ ((printf_fn_4)fn)(out, fn_args[0], fn_args[1], fn_args[2], fn_args[3]); ++ break; ++ case 5: ++ ((printf_fn_5)fn)(out, fn_args[0], fn_args[1], fn_args[2], fn_args[3], fn_args[4]); ++ break; ++ case 6: ++ ((printf_fn_6)fn)(out, fn_args[0], fn_args[1], fn_args[2], fn_args[3], fn_args[4], fn_args[5]); ++ break; ++ case 7: ++ ((printf_fn_7)fn)(out, fn_args[0], fn_args[1], fn_args[2], 
fn_args[3], fn_args[4], fn_args[5], fn_args[6]); ++ break; ++ case 8: ++ ((printf_fn_8)fn)(out, fn_args[0], fn_args[1], fn_args[2], fn_args[3], fn_args[4], fn_args[5], fn_args[6], fn_args[7]); ++ break; ++ } ++} ++ + /** +- * vsnprintf - Format a string and place it in a buffer +- * @buf: The buffer to place the result into +- * @size: The size of the buffer, including the trailing null space ++ * prt_vprintf - Format a string, outputting to a printbuf ++ * @out: The printbuf to output to + * @fmt: The format string to use + * @args: Arguments for the format string + * +- * This function generally follows C99 vsnprintf, but has some +- * extensions and a few limitations: ++ * prt_vprintf works much like the traditional vsnprintf(), but outputs to a ++ * printbuf instead of raw pointer/size. + * +- * - ``%n`` is unsupported +- * - ``%p*`` is handled by pointer() +- * +- * See pointer() or Documentation/core-api/printk-formats.rst for more +- * extensive description. ++ * If you're not already dealing with a va_list consider using prt_printf(). + * +- * **Please update the documentation in both places when making changes** +- * +- * The return value is the number of characters which would +- * be generated for the given input, excluding the trailing +- * '\0', as per ISO C99. If you want to have the exact +- * number of characters written into @buf as return value +- * (not including the trailing '\0'), use vscnprintf(). If the +- * return is greater than or equal to @size, the resulting +- * string is truncated. +- * +- * If you're not already dealing with a va_list consider using snprintf(). ++ * See the vsnprintf() documentation for format string extensions over C99. + */ +-int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) ++void prt_vprintf(struct printbuf *out, const char *fmt, va_list args) + { + unsigned long long num; +- char *str, *end; + struct printf_spec spec = {0}; + + /* Reject out-of-range values early. Large positive sizes are + used for unknown buffer sizes. 
*/ +- if (WARN_ON_ONCE(size > INT_MAX)) +- return 0; +- +- str = buf; +- end = buf + size; +- +- /* Make sure end is always >= buf */ +- if (end < buf) { +- end = ((void *)-1); +- size = end - buf; +- } ++ if (WARN_ON_ONCE(out->size > INT_MAX)) ++ return; + + while (*fmt) { + const char *old_fmt = fmt; +@@ -2736,16 +2632,9 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) + fmt += read; + + switch (spec.type) { +- case FORMAT_TYPE_NONE: { +- int copy = read; +- if (str < end) { +- if (copy > end - str) +- copy = end - str; +- memcpy(str, old_fmt, copy); +- } +- str += read; ++ case FORMAT_TYPE_NONE: ++ prt_bytes(out, old_fmt, read); + break; +- } + + case FORMAT_TYPE_WIDTH: + set_field_width(&spec, va_arg(args, int)); +@@ -2755,44 +2644,60 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) + set_precision(&spec, va_arg(args, int)); + break; + +- case FORMAT_TYPE_CHAR: { +- char c; ++ case FORMAT_TYPE_CHAR: ++ if (spec.field_width > 0 && !(spec.flags & LEFT)) ++ prt_chars(out, spec.field_width, ' '); + +- if (!(spec.flags & LEFT)) { +- while (--spec.field_width > 0) { +- if (str < end) +- *str = ' '; +- ++str; ++ __prt_char(out, (unsigned char) va_arg(args, int)); + +- } +- } +- c = (unsigned char) va_arg(args, int); +- if (str < end) +- *str = c; +- ++str; +- while (--spec.field_width > 0) { +- if (str < end) +- *str = ' '; +- ++str; +- } ++ if (spec.field_width > 0 && (spec.flags & LEFT)) ++ prt_chars(out, spec.field_width, ' '); ++ spec.field_width = 0; + break; +- } + + case FORMAT_TYPE_STR: +- str = string(str, end, va_arg(args, char *), spec); ++ /* ++ * we can't use string() then do_width_precision ++ * afterwards: people use the field width for passing ++ * non nul terminated strings ++ */ ++ string_spec(out, va_arg(args, char *), spec); + break; + + case FORMAT_TYPE_PTR: +- str = pointer(fmt, str, end, va_arg(args, void *), +- spec); ++ pointer(out, fmt, va_arg(args, void *), spec); + while (isalnum(*fmt)) + fmt++; + break; + ++ case FORMAT_TYPE_FN: { ++ unsigned nr_args = 0; ++ void *fn_args[8]; ++ void *fn = va_arg(args, void *); ++ ++ while (*fmt != ')') { ++ if (nr_args) { ++ if (fmt[0] != ',') ++ goto out; ++ fmt++; ++ } ++ ++ if (fmt[0] != '%' || fmt[1] != 'p') ++ goto out; ++ fmt += 2; ++ ++ if (WARN_ON_ONCE(nr_args == ARRAY_SIZE(fn_args))) ++ goto out; ++ fn_args[nr_args++] = va_arg(args, void *); ++ } ++ ++ call_prt_fn(out, fn, fn_args, nr_args); ++ fmt++; /* past trailing ) */ ++ break; ++ } ++ + case FORMAT_TYPE_PERCENT_CHAR: +- if (str < end) +- *str = '%'; +- ++str; ++ __prt_char(out, '%'); + break; + + case FORMAT_TYPE_INVALID: +@@ -2845,21 +2750,70 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) + num = va_arg(args, unsigned int); + } + +- str = number(str, end, num, spec); ++ number(out, num, spec); + } + } +- + out: +- if (size > 0) { +- if (str < end) +- *str = '\0'; +- else +- end[-1] = '\0'; +- } ++ printbuf_nul_terminate(out); ++} ++EXPORT_SYMBOL(prt_vprintf); + +- /* the trailing null byte doesn't count towards the total */ +- return str-buf; ++/** ++ * prt_printf - Format a string, outputting to a printbuf ++ * @out: The printbuf to output to ++ * @fmt: The format string to use ++ * @args: Arguments for the format string ++ * ++ * ++ * prt_printf works much like the traditional sprintf(), but outputs to a ++ * printbuf instead of raw pointer/size. ++ * ++ * See the vsnprintf() documentation for format string extensions over C99. 
++ */ ++void prt_printf(struct printbuf *out, const char *fmt, ...) ++{ ++ va_list args; ++ ++ va_start(args, fmt); ++ prt_vprintf(out, fmt, args); ++ va_end(args); ++} ++EXPORT_SYMBOL(prt_printf); ++ ++/** ++ * vsnprintf - Format a string and place it in a buffer ++ * @buf: The buffer to place the result into ++ * @size: The size of the buffer, including the trailing null space ++ * @fmt: The format string to use ++ * @args: Arguments for the format string ++ * ++ * This function generally follows C99 vsnprintf, but has some ++ * extensions and a few limitations: ++ * ++ * - ``%n`` is unsupported ++ * - ``%p*`` is handled by pointer() ++ * ++ * See pointer() or Documentation/core-api/printk-formats.rst for more ++ * extensive description. ++ * ++ * **Please update the documentation in both places when making changes** ++ * ++ * The return value is the number of characters which would ++ * be generated for the given input, excluding the trailing ++ * '\0', as per ISO C99. If you want to have the exact ++ * number of characters written into @buf as return value ++ * (not including the trailing '\0'), use vscnprintf(). If the ++ * return is greater than or equal to @size, the resulting ++ * string is truncated. ++ * ++ * If you're not already dealing with a va_list consider using snprintf(). ++ */ ++int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) ++{ ++ struct printbuf out = PRINTBUF_EXTERN(buf, size); + ++ prt_vprintf(&out, fmt, args); ++ return out.pos; + } + EXPORT_SYMBOL(vsnprintf); + +@@ -2997,53 +2951,46 @@ EXPORT_SYMBOL(sprintf); + * bstr_printf() - Binary data to text string + */ + ++static inline void printbuf_align(struct printbuf *out, unsigned align) ++{ ++ /* Assumes output buffer is correctly aligned: */ ++ out->pos += align - 1; ++ out->pos &= ~(align - 1); ++} ++ + /** +- * vbin_printf - Parse a format string and place args' binary value in a buffer +- * @bin_buf: The buffer to place args' binary value +- * @size: The size of the buffer(by words(32bits), not characters) ++ * prt_vbinprintf - Parse a format string and place args' binary value in a buffer ++ * @out: The buffer to place args' binary value + * @fmt: The format string to use + * @args: Arguments for the format string + * + * The format follows C99 vsnprintf, except %n is ignored, and its argument + * is skipped. + * +- * The return value is the number of words(32bits) which would be generated for +- * the given input. +- * + * NOTE: + * If the return value is greater than @size, the resulting bin_buf is NOT + * valid for bstr_printf(). 
+ */ +-int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) ++void prt_vbinprintf(struct printbuf *out, const char *fmt, va_list args) + { + struct printf_spec spec = {0}; +- char *str, *end; + int width; + +- str = (char *)bin_buf; +- end = (char *)(bin_buf + size); +- + #define save_arg(type) \ + ({ \ + unsigned long long value; \ + if (sizeof(type) == 8) { \ +- unsigned long long val8; \ +- str = PTR_ALIGN(str, sizeof(u32)); \ +- val8 = va_arg(args, unsigned long long); \ +- if (str + sizeof(type) <= end) { \ +- *(u32 *)str = *(u32 *)&val8; \ +- *(u32 *)(str + 4) = *((u32 *)&val8 + 1); \ +- } \ ++ u64 val8 = va_arg(args, u64); \ ++ printbuf_align(out, sizeof(u32)); \ ++ prt_bytes(out, (u32 *) &val8, 4); \ ++ prt_bytes(out, ((u32 *) &val8) + 1, 4); \ + value = val8; \ + } else { \ +- unsigned int val4; \ +- str = PTR_ALIGN(str, sizeof(type)); \ +- val4 = va_arg(args, int); \ +- if (str + sizeof(type) <= end) \ +- *(typeof(type) *)str = (type)(long)val4; \ ++ u32 val4 = va_arg(args, u32); \ ++ printbuf_align(out, sizeof(type)); \ ++ prt_bytes(out, &val4, sizeof(type)); \ + value = (unsigned long long)val4; \ + } \ +- str += sizeof(type); \ + value; \ + }) + +@@ -3074,16 +3021,12 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) + case FORMAT_TYPE_STR: { + const char *save_str = va_arg(args, char *); + const char *err_msg; +- size_t len; + + err_msg = check_pointer_msg(save_str); + if (err_msg) + save_str = err_msg; + +- len = strlen(save_str) + 1; +- if (str + len < end) +- memcpy(str, save_str, len); +- str += len; ++ prt_str(out, save_str); + break; + } + +@@ -3103,12 +3046,7 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) + save_arg(void *); + break; + } +- str = pointer(fmt, str, end, va_arg(args, void *), +- spec); +- if (str + 1 < end) +- *str++ = '\0'; +- else +- end[-1] = '\0'; /* Must be nul terminated */ ++ pointer(out, fmt, va_arg(args, void *), spec); + } + /* skip all alphanumeric pointer suffixes */ + while (isalnum(*fmt)) +@@ -3146,15 +3084,15 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) + } + + out: +- return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf; ++ printbuf_nul_terminate(out); ++ printbuf_align(out, 4); + #undef save_arg + } +-EXPORT_SYMBOL_GPL(vbin_printf); ++EXPORT_SYMBOL_GPL(prt_vbinprintf); + + /** +- * bstr_printf - Format a string from binary arguments and place it in a buffer ++ * prt_bstrprintf - Format a string from binary arguments and place it in a buffer + * @buf: The buffer to place the result into +- * @size: The size of the buffer, including the trailing null space + * @fmt: The format string to use + * @bin_buf: Binary arguments for the format string + * +@@ -3164,26 +3102,14 @@ EXPORT_SYMBOL_GPL(vbin_printf); + * + * The format follows C99 vsnprintf, but has some extensions: + * see vsnprintf comment for details. +- * +- * The return value is the number of characters which would +- * be generated for the given input, excluding the trailing +- * '\0', as per ISO C99. If you want to have the exact +- * number of characters written into @buf as return value +- * (not including the trailing '\0'), use vscnprintf(). If the +- * return is greater than or equal to @size, the resulting +- * string is truncated. 
+ */ +-int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) ++void prt_bstrprintf(struct printbuf *out, const char *fmt, const u32 *bin_buf) + { + struct printf_spec spec = {0}; +- char *str, *end; + const char *args = (const char *)bin_buf; + +- if (WARN_ON_ONCE(size > INT_MAX)) +- return 0; +- +- str = buf; +- end = buf + size; ++ if (WARN_ON_ONCE(out->size > INT_MAX)) ++ return; + + #define get_arg(type) \ + ({ \ +@@ -3200,12 +3126,6 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) + value; \ + }) + +- /* Make sure end is always >= buf */ +- if (end < buf) { +- end = ((void *)-1); +- size = end - buf; +- } +- + while (*fmt) { + const char *old_fmt = fmt; + int read = format_decode(fmt, &spec); +@@ -3213,16 +3133,9 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) + fmt += read; + + switch (spec.type) { +- case FORMAT_TYPE_NONE: { +- int copy = read; +- if (str < end) { +- if (copy > end - str) +- copy = end - str; +- memcpy(str, old_fmt, copy); +- } +- str += read; ++ case FORMAT_TYPE_NONE: ++ prt_bytes(out, old_fmt, read); + break; +- } + + case FORMAT_TYPE_WIDTH: + set_field_width(&spec, get_arg(int)); +@@ -3232,38 +3145,24 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) + set_precision(&spec, get_arg(int)); + break; + +- case FORMAT_TYPE_CHAR: { +- char c; +- +- if (!(spec.flags & LEFT)) { +- while (--spec.field_width > 0) { +- if (str < end) +- *str = ' '; +- ++str; +- } +- } +- c = (unsigned char) get_arg(char); +- if (str < end) +- *str = c; +- ++str; +- while (--spec.field_width > 0) { +- if (str < end) +- *str = ' '; +- ++str; +- } ++ case FORMAT_TYPE_CHAR: ++ if (!(spec.flags & LEFT)) ++ prt_chars(out, spec.field_width, ' '); ++ __prt_char(out, (unsigned char) get_arg(char)); ++ if ((spec.flags & LEFT)) ++ prt_chars(out, spec.field_width, ' '); + break; +- } + + case FORMAT_TYPE_STR: { + const char *str_arg = args; + args += strlen(str_arg) + 1; +- str = string(str, end, (char *)str_arg, spec); ++ string_spec(out, (char *)str_arg, spec); + break; + } + + case FORMAT_TYPE_PTR: { + bool process = false; +- int copy, len; ++ int len; + /* Non function dereferences were already done */ + switch (*fmt) { + case 'S': +@@ -3279,17 +3178,12 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) + break; + } + /* Pointer dereference was already processed */ +- if (str < end) { +- len = copy = strlen(args); +- if (copy > end - str) +- copy = end - str; +- memcpy(str, args, copy); +- str += len; +- args += len + 1; +- } ++ len = strlen(args); ++ prt_bytes(out, args, len); ++ args += len + 1; + } + if (process) +- str = pointer(fmt, str, end, get_arg(void *), spec); ++ pointer(out, fmt, get_arg(void *), spec); + + while (isalnum(*fmt)) + fmt++; +@@ -3297,9 +3191,7 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) + } + + case FORMAT_TYPE_PERCENT_CHAR: +- if (str < end) +- *str = '%'; +- ++str; ++ __prt_char(out, '%'); + break; + + case FORMAT_TYPE_INVALID: +@@ -3342,23 +3234,87 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) + num = get_arg(int); + } + +- str = number(str, end, num, spec); ++ number(out, num, spec); + } /* default: */ + } /* switch(spec.type) */ + } /* while(*fmt) */ + + out: +- if (size > 0) { +- if (str < end) +- *str = '\0'; +- else +- end[-1] = '\0'; +- } +- + #undef get_arg ++ printbuf_nul_terminate(out); ++} ++EXPORT_SYMBOL_GPL(prt_bstrprintf); ++ ++/** ++ * 
prt_bprintf - Parse a format string and place args' binary value in a buffer ++ * @out: The buffer to place args' binary value ++ * @fmt: The format string to use ++ * @...: Arguments for the format string ++ */ ++void prt_bprintf(struct printbuf *out, const char *fmt, ...) ++{ ++ va_list args; ++ ++ va_start(args, fmt); ++ prt_vbinprintf(out, fmt, args); ++ va_end(args); ++} ++EXPORT_SYMBOL_GPL(prt_bprintf); ++ ++/** ++ * vbin_printf - Parse a format string and place args' binary value in a buffer ++ * @bin_buf: The buffer to place args' binary value ++ * @size: The size of the buffer(by words(32bits), not characters) ++ * @fmt: The format string to use ++ * @args: Arguments for the format string ++ * ++ * The format follows C99 vsnprintf, except %n is ignored, and its argument ++ * is skipped. ++ * ++ * The return value is the number of words(32bits) which would be generated for ++ * the given input. ++ * ++ * NOTE: ++ * If the return value is greater than @size, the resulting bin_buf is NOT ++ * valid for bstr_printf(). ++ */ ++int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) ++{ ++ struct printbuf out = PRINTBUF_EXTERN((char *) bin_buf, size); ++ ++ prt_vbinprintf(&out, fmt, args); ++ return out.pos; ++} ++EXPORT_SYMBOL_GPL(vbin_printf); ++ ++/** ++ * bstr_printf - Format a string from binary arguments and place it in a buffer ++ * @buf: The buffer to place the result into ++ * @size: The size of the buffer, including the trailing null space ++ * @fmt: The format string to use ++ * @bin_buf: Binary arguments for the format string ++ * ++ * This function like C99 vsnprintf, but the difference is that vsnprintf gets ++ * arguments from stack, and bstr_printf gets arguments from @bin_buf which is ++ * a binary buffer that generated by vbin_printf. ++ * ++ * The format follows C99 vsnprintf, but has some extensions: ++ * see vsnprintf comment for details. ++ * ++ * The return value is the number of characters which would ++ * be generated for the given input, excluding the trailing ++ * '\0', as per ISO C99. If you want to have the exact ++ * number of characters written into @buf as return value ++ * (not including the trailing '\0'), use vscnprintf(). If the ++ * return is greater than or equal to @size, the resulting ++ * string is truncated. 
++ */ ++int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) ++{ ++ struct printbuf out = PRINTBUF_EXTERN(buf, size); + +- /* the trailing null byte doesn't count towards the total */ +- return str - buf; ++ prt_bstrprintf(&out, fmt, bin_buf); ++ return out.pos; + } + EXPORT_SYMBOL_GPL(bstr_printf); + +diff --git a/mm/Makefile b/mm/Makefile +index 6f9ffa968a1a..9731f495bbce 100644 +--- a/mm/Makefile ++++ b/mm/Makefile +@@ -54,7 +54,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ + mm_init.o percpu.o slab_common.o \ + compaction.o vmacache.o \ + interval_tree.o list_lru.o workingset.o \ +- debug.o gup.o mmap_lock.o $(mmu-y) ++ debug.o gup.o mmap_lock.o show_mem.o $(mmu-y) + + # Give 'page_alloc' its own module-parameter namespace + page-alloc-y := page_alloc.o +diff --git a/mm/filemap.c b/mm/filemap.c +index ffdfbc8b0e3c..8b9e18f79f2b 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -2223,6 +2223,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, + + return ret; + } ++EXPORT_SYMBOL(find_get_pages_range); + + /** + * find_get_pages_contig - gang contiguous pagecache lookup +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 618c366a2f07..660ddd48267d 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -62,7 +62,7 @@ + #include + #include + #include +-#include ++#include + #include "internal.h" + #include + #include +@@ -1462,13 +1462,9 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, + + static char *memory_stat_format(struct mem_cgroup *memcg) + { +- struct seq_buf s; ++ struct printbuf buf = PRINTBUF; + int i; + +- seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE); +- if (!s.buffer) +- return NULL; +- + /* + * Provide statistics on the state of the memory subsystem as + * well as cumulative event counters that show past behavior. 
+@@ -1485,37 +1481,37 @@ static char *memory_stat_format(struct mem_cgroup *memcg) + u64 size; + + size = memcg_page_state_output(memcg, memory_stats[i].idx); +- seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size); ++ prt_printf(&buf, "%s %llu\n", memory_stats[i].name, size); + + if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { + size += memcg_page_state_output(memcg, + NR_SLAB_RECLAIMABLE_B); +- seq_buf_printf(&s, "slab %llu\n", size); ++ prt_printf(&buf, "slab %llu\n", size); + } + } + + /* Accumulated memory events */ + +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT), +- memcg_events(memcg, PGFAULT)); +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT), +- memcg_events(memcg, PGMAJFAULT)); +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL), +- memcg_events(memcg, PGREFILL)); +- seq_buf_printf(&s, "pgscan %lu\n", +- memcg_events(memcg, PGSCAN_KSWAPD) + +- memcg_events(memcg, PGSCAN_DIRECT)); +- seq_buf_printf(&s, "pgsteal %lu\n", +- memcg_events(memcg, PGSTEAL_KSWAPD) + +- memcg_events(memcg, PGSTEAL_DIRECT)); +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE), +- memcg_events(memcg, PGACTIVATE)); +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE), +- memcg_events(memcg, PGDEACTIVATE)); +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE), +- memcg_events(memcg, PGLAZYFREE)); +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED), +- memcg_events(memcg, PGLAZYFREED)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(PGFAULT), ++ memcg_events(memcg, PGFAULT)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(PGMAJFAULT), ++ memcg_events(memcg, PGMAJFAULT)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(PGREFILL), ++ memcg_events(memcg, PGREFILL)); ++ prt_printf(&buf, "pgscan %lu\n", ++ memcg_events(memcg, PGSCAN_KSWAPD) + ++ memcg_events(memcg, PGSCAN_DIRECT)); ++ prt_printf(&buf, "pgsteal %lu\n", ++ memcg_events(memcg, PGSTEAL_KSWAPD) + ++ memcg_events(memcg, PGSTEAL_DIRECT)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(PGACTIVATE), ++ memcg_events(memcg, PGACTIVATE)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(PGDEACTIVATE), ++ memcg_events(memcg, PGDEACTIVATE)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(PGLAZYFREE), ++ memcg_events(memcg, PGLAZYFREE)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(PGLAZYFREED), ++ memcg_events(memcg, PGLAZYFREED)); + + #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPIN), +@@ -1525,16 +1521,18 @@ static char *memory_stat_format(struct mem_cgroup *memcg) + #endif + + #ifdef CONFIG_TRANSPARENT_HUGEPAGE +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC), +- memcg_events(memcg, THP_FAULT_ALLOC)); +- seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC), +- memcg_events(memcg, THP_COLLAPSE_ALLOC)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC), ++ memcg_events(memcg, THP_FAULT_ALLOC)); ++ prt_printf(&buf, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC), ++ memcg_events(memcg, THP_COLLAPSE_ALLOC)); + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +- /* The above should easily fit into one page */ +- WARN_ON_ONCE(seq_buf_has_overflowed(&s)); ++ if (buf.allocation_failure) { ++ printbuf_exit(&buf); ++ return NULL; ++ } + +- return s.buffer; ++ return buf.buf; + } + + #define K(x) ((x) << (PAGE_SHIFT-10)) +diff --git a/mm/nommu.c b/mm/nommu.c +index 9d7afc2d959e..dd53020262d8 100644 +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@ -281,6 +281,24 @@ void *vzalloc_node(unsigned long size, int node) 
+ } + EXPORT_SYMBOL(vzalloc_node); + ++/** ++ * vmalloc_exec - allocate virtually contiguous, executable memory ++ * @size: allocation size ++ * ++ * Kernel-internal function to allocate enough pages to cover @size ++ * the page level allocator and map them into contiguous and ++ * executable kernel virtual space. ++ * ++ * For tight control over page level allocator and protection flags ++ * use __vmalloc() instead. ++ */ ++ ++void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) ++{ ++ return __vmalloc(size, gfp_mask); ++} ++EXPORT_SYMBOL_GPL(vmalloc_exec); ++ + /** + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) + * @size: allocation size +diff --git a/mm/oom_kill.c b/mm/oom_kill.c +index 3c6cf9e3cd66..e4dca11dc54a 100644 +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -168,27 +168,6 @@ static bool oom_unkillable_task(struct task_struct *p) + return false; + } + +-/* +- * Check whether unreclaimable slab amount is greater than +- * all user memory(LRU pages). +- * dump_unreclaimable_slab() could help in the case that +- * oom due to too much unreclaimable slab used by kernel. +-*/ +-static bool should_dump_unreclaim_slab(void) +-{ +- unsigned long nr_lru; +- +- nr_lru = global_node_page_state(NR_ACTIVE_ANON) + +- global_node_page_state(NR_INACTIVE_ANON) + +- global_node_page_state(NR_ACTIVE_FILE) + +- global_node_page_state(NR_INACTIVE_FILE) + +- global_node_page_state(NR_ISOLATED_ANON) + +- global_node_page_state(NR_ISOLATED_FILE) + +- global_node_page_state(NR_UNEVICTABLE); +- +- return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru); +-} +- + /** + * oom_badness - heuristic function to determine which candidate task to kill + * @p: task struct of which task we should calculate +@@ -462,8 +441,6 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) + mem_cgroup_print_oom_meminfo(oc->memcg); + else { + show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); +- if (should_dump_unreclaim_slab()) +- dump_unreclaimable_slab(); + } + if (sysctl_oom_dump_tasks) + dump_tasks(oc); +diff --git a/lib/show_mem.c b/mm/show_mem.c +similarity index 83% +rename from lib/show_mem.c +rename to mm/show_mem.c +index 1c26c14ffbb9..47225158ce49 100644 +--- a/lib/show_mem.c ++++ b/mm/show_mem.c +@@ -7,6 +7,9 @@ + + #include + #include ++#include ++ ++#include "slab.h" + + void show_mem(unsigned int filter, nodemask_t *nodemask) + { +@@ -41,4 +44,9 @@ void show_mem(unsigned int filter, nodemask_t *nodemask) + #ifdef CONFIG_MEMORY_FAILURE + printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); + #endif ++ printk("Unreclaimable slab info:\n"); ++ printk("%pf()", CALL_PP(dump_unreclaimable_slab)); ++ ++ printk("Shrinkers:\n"); ++ printk("%pf()", CALL_PP(shrinkers_to_text)); + } +diff --git a/mm/slab.h b/mm/slab.h +index db9fb5c8dae7..502616394f7f 100644 +--- a/mm/slab.h ++++ b/mm/slab.h +@@ -806,10 +806,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) + + #endif + ++struct printbuf; ++ + #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) +-void dump_unreclaimable_slab(void); ++void dump_unreclaimable_slab(struct printbuf *); + #else +-static inline void dump_unreclaimable_slab(void) ++static inline void dump_unreclaimable_slab(struct printbuf *out) + { + } + #endif +diff --git a/mm/slab_common.c b/mm/slab_common.c +index 77c3adf40e50..3be0d468a599 100644 +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + + #define CREATE_TRACE_POINTS + 
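/*
 * Editorial note (not part of the patch): the memory_stat_format() hunks
 * above replace a fixed-size, PAGE_SIZE seq_buf with a heap-backed printbuf
 * that grows as needed. A minimal sketch of that calling convention,
 * assuming the printbuf API introduced earlier in this patch (PRINTBUF,
 * prt_printf, allocation_failure, printbuf_exit); format_some_stats() and
 * its counters are illustrative names only:
 */
static char *format_some_stats(void)
{
	struct printbuf buf = PRINTBUF;		/* starts empty, allocates on demand */

	prt_printf(&buf, "widgets %lu\n", 42UL);
	prt_printf(&buf, "gadgets %lu\n", 7UL);

	if (buf.allocation_failure) {		/* an allocation failed mid-format */
		printbuf_exit(&buf);
		return NULL;
	}

	return buf.buf;				/* caller owns the returned string */
}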
#include +@@ -1085,10 +1086,15 @@ static int slab_show(struct seq_file *m, void *p) + return 0; + } + +-void dump_unreclaimable_slab(void) ++void dump_unreclaimable_slab(struct printbuf *out) + { + struct kmem_cache *s; + struct slabinfo sinfo; ++ struct slab_by_mem { ++ struct kmem_cache *s; ++ size_t total, active; ++ } slabs_by_mem[10], n; ++ int i, nr = 0; + + /* + * Here acquiring slab_mutex is risky since we don't prefer to get +@@ -1098,12 +1104,11 @@ void dump_unreclaimable_slab(void) + * without acquiring the mutex. + */ + if (!mutex_trylock(&slab_mutex)) { +- pr_warn("excessive unreclaimable slab but cannot dump stats\n"); ++ prt_str(out, "excessive unreclaimable slab but cannot dump stats\n"); + return; + } + +- pr_info("Unreclaimable slab info:\n"); +- pr_info("Name Used Total\n"); ++ printbuf_atomic_inc(out); + + list_for_each_entry(s, &slab_caches, list) { + if (s->flags & SLAB_RECLAIM_ACCOUNT) +@@ -1111,11 +1116,43 @@ void dump_unreclaimable_slab(void) + + get_slabinfo(s, &sinfo); + +- if (sinfo.num_objs > 0) +- pr_info("%-17s %10luKB %10luKB\n", s->name, +- (sinfo.active_objs * s->size) / 1024, +- (sinfo.num_objs * s->size) / 1024); ++ if (!sinfo.num_objs) ++ continue; ++ ++ n.s = s; ++ n.total = sinfo.num_objs * s->size; ++ n.active = sinfo.active_objs * s->size; ++ ++ for (i = 0; i < nr; i++) ++ if (n.total < slabs_by_mem[i].total) ++ break; ++ ++ if (nr < ARRAY_SIZE(slabs_by_mem)) { ++ memmove(&slabs_by_mem[i + 1], ++ &slabs_by_mem[i], ++ sizeof(slabs_by_mem[0]) * (nr - i)); ++ nr++; ++ } else if (i) { ++ i--; ++ memmove(&slabs_by_mem[0], ++ &slabs_by_mem[1], ++ sizeof(slabs_by_mem[0]) * i); ++ } else { ++ continue; ++ } ++ ++ slabs_by_mem[i] = n; ++ } ++ ++ for (i = nr - 1; i >= 0; --i) { ++ prt_printf(out, "%-17s total: ", slabs_by_mem[i].s->name); ++ prt_human_readable_u64(out, slabs_by_mem[i].total); ++ prt_printf(out, " active: "); ++ prt_human_readable_u64(out, slabs_by_mem[i].active); ++ prt_newline(out); + } ++ ++ printbuf_atomic_dec(out); + mutex_unlock(&slab_mutex); + } + +diff --git a/mm/vmalloc.c b/mm/vmalloc.c +index effd1ff6a4b4..ea6375c960a2 100644 +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -3361,6 +3361,27 @@ void *vzalloc_node(unsigned long size, int node) + } + EXPORT_SYMBOL(vzalloc_node); + ++/** ++ * vmalloc_exec - allocate virtually contiguous, executable memory ++ * @size: allocation size ++ * ++ * Kernel-internal function to allocate enough pages to cover @size ++ * the page level allocator and map them into contiguous and ++ * executable kernel virtual space. ++ * ++ * For tight control over page level allocator and protection flags ++ * use __vmalloc() instead. 
++ * ++ * Return: pointer to the allocated memory or %NULL on error ++ */ ++void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) ++{ ++ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, ++ gfp_mask, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, ++ NUMA_NO_NODE, __builtin_return_address(0)); ++} ++EXPORT_SYMBOL_GPL(vmalloc_exec); ++ + #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) + #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) + #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) +diff --git a/mm/vmscan.c b/mm/vmscan.c +index f7d9a683e3a7..0ea3ce8e258f 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -50,6 +50,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -699,6 +700,89 @@ void synchronize_shrinkers(void) + } + EXPORT_SYMBOL(synchronize_shrinkers); + ++void shrinker_to_text(struct printbuf *out, struct shrinker *shrinker) ++{ ++ struct shrink_control sc = { .gfp_mask = GFP_KERNEL, }; ++ ++ if (shrinker->name[0]) ++ prt_str(out, shrinker->name); ++ else ++ prt_printf(out, "%ps:", shrinker->scan_objects); ++ prt_newline(out); ++ printbuf_indent_add(out, 2); ++ ++ prt_printf(out, "objects: %lu", shrinker->count_objects(shrinker, &sc)); ++ prt_newline(out); ++ prt_printf(out, "requested to free: %lu", atomic_long_read(&shrinker->objects_requested_to_free)); ++ prt_newline(out); ++ prt_printf(out, "objects freed: %lu", atomic_long_read(&shrinker->objects_freed)); ++ prt_newline(out); ++ ++ if (shrinker->to_text) { ++ shrinker->to_text(out, shrinker); ++ prt_newline(out); ++ } ++ ++ printbuf_indent_sub(out, 2); ++} ++ ++/** ++ * shrinkers_to_text - Report on shrinkers with highest usage ++ * ++ * This reports on the top 10 shrinkers, by object counts, in sorted order: ++ * intended to be used for OOM reporting. 
++ */ ++void shrinkers_to_text(struct printbuf *out) ++{ ++ struct shrinker *shrinker; ++ struct shrinker_by_mem { ++ struct shrinker *shrinker; ++ unsigned long mem; ++ } shrinkers_by_mem[10]; ++ int i, nr = 0; ++ ++ if (!down_read_trylock(&shrinker_rwsem)) { ++ prt_str(out, "(couldn't take shrinker lock)"); ++ return; ++ } ++ ++ list_for_each_entry(shrinker, &shrinker_list, list) { ++ struct shrink_control sc = { .gfp_mask = GFP_KERNEL, }; ++ unsigned long mem = shrinker->count_objects(shrinker, &sc); ++ ++ if (!mem || mem == SHRINK_STOP || mem == SHRINK_EMPTY) ++ continue; ++ ++ for (i = 0; i < nr; i++) ++ if (mem < shrinkers_by_mem[i].mem) ++ break; ++ ++ if (nr < ARRAY_SIZE(shrinkers_by_mem)) { ++ memmove(&shrinkers_by_mem[i + 1], ++ &shrinkers_by_mem[i], ++ sizeof(shrinkers_by_mem[0]) * (nr - i)); ++ nr++; ++ } else if (i) { ++ i--; ++ memmove(&shrinkers_by_mem[0], ++ &shrinkers_by_mem[1], ++ sizeof(shrinkers_by_mem[0]) * i); ++ } else { ++ continue; ++ } ++ ++ shrinkers_by_mem[i] = (struct shrinker_by_mem) { ++ .shrinker = shrinker, ++ .mem = mem, ++ }; ++ } ++ ++ for (i = nr - 1; i >= 0; --i) ++ shrinker_to_text(out, shrinkers_by_mem[i].shrinker); ++ ++ up_read(&shrinker_rwsem); ++} ++ + #define SHRINK_BATCH 128 + + static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, +@@ -765,12 +849,16 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, + unsigned long ret; + unsigned long nr_to_scan = min(batch_size, total_scan); + ++ atomic_long_add(nr_to_scan, &shrinker->objects_requested_to_free); ++ + shrinkctl->nr_to_scan = nr_to_scan; + shrinkctl->nr_scanned = nr_to_scan; + ret = shrinker->scan_objects(shrinker, shrinkctl); + if (ret == SHRINK_STOP) + break; ++ + freed += ret; ++ atomic_long_add(ret, &shrinker->objects_freed); + + count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); + total_scan -= shrinkctl->nr_scanned; +diff --git a/net/9p/client.c b/net/9p/client.c +index 8bba0d9cf975..e14074d031c6 100644 +--- a/net/9p/client.c ++++ b/net/9p/client.c +@@ -218,23 +218,29 @@ static int parse_opts(char *opts, struct p9_client *clnt) + return ret; + } + +-static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, +- int alloc_msize) ++static void p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, ++ int fc_idx, unsigned alloc_msize) + { +- if (likely(c->fcall_cache) && alloc_msize == c->msize) { +- fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS); +- fc->cache = c->fcall_cache; +- } else { +- fc->sdata = kmalloc(alloc_msize, GFP_NOFS); +- fc->cache = NULL; +- } +- if (!fc->sdata) +- return -ENOMEM; ++ gfp_t gfp = GFP_NOFS|__GFP_NOWARN; ++ ++ BUG_ON(alloc_msize > c->msize); ++ ++ fc->sdata = NULL; ++ fc->used_mempool = false; + fc->capacity = alloc_msize; +- return 0; ++ ++ if (alloc_msize < c->msize) ++ fc->sdata = kmalloc(alloc_msize, gfp); ++ ++ if (!fc->sdata) { ++ fc->sdata = mempool_alloc(&c->pools[fc_idx], gfp); ++ fc->used_mempool = true; ++ fc->capacity = c->msize; ++ } + } + +-void p9_fcall_fini(struct p9_fcall *fc) ++void p9_fcall_fini(struct p9_client *c, struct p9_fcall *fc, ++ int fc_idx) + { + /* sdata can be NULL for interrupted requests in trans_rdma, + * and kmem_cache_free does not do NULL-check for us +@@ -242,8 +248,8 @@ void p9_fcall_fini(struct p9_fcall *fc) + if (unlikely(!fc->sdata)) + return; + +- if (fc->cache) +- kmem_cache_free(fc->cache, fc->sdata); ++ if (fc->used_mempool) ++ mempool_free(fc->sdata, &c->pools[fc_idx]); + else + kfree(fc->sdata); + } +@@ -270,10 +276,8 @@ p9_tag_alloc(struct p9_client 
*c, int8_t type, unsigned int max_size) + if (!req) + return ERR_PTR(-ENOMEM); + +- if (p9_fcall_init(c, &req->tc, alloc_msize)) +- goto free_req; +- if (p9_fcall_init(c, &req->rc, alloc_msize)) +- goto free; ++ p9_fcall_init(c, &req->tc, 0, alloc_msize); ++ p9_fcall_init(c, &req->rc, 1, alloc_msize); + + p9pdu_reset(&req->tc); + p9pdu_reset(&req->rc); +@@ -305,14 +309,13 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size) + * callback), so p9_client_cb eats the second ref there + * as the pointer is duplicated directly by virtqueue_add_sgs() + */ +- refcount_set(&req->refcount.refcount, 2); ++ refcount_set(&req->refcount, 2); + + return req; + + free: +- p9_fcall_fini(&req->tc); +- p9_fcall_fini(&req->rc); +-free_req: ++ p9_fcall_fini(c, &req->tc, 0); ++ p9_fcall_fini(c, &req->rc, 1); + kmem_cache_free(p9_req_cache, req); + return ERR_PTR(-ENOMEM); + } +@@ -341,7 +344,7 @@ struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag) + if (!p9_req_try_get(req)) + goto again; + if (req->tc.tag != tag) { +- p9_req_put(req); ++ p9_req_put(c, req); + goto again; + } + } +@@ -367,21 +370,18 @@ static int p9_tag_remove(struct p9_client *c, struct p9_req_t *r) + spin_lock_irqsave(&c->lock, flags); + idr_remove(&c->reqs, tag); + spin_unlock_irqrestore(&c->lock, flags); +- return p9_req_put(r); +-} +- +-static void p9_req_free(struct kref *ref) +-{ +- struct p9_req_t *r = container_of(ref, struct p9_req_t, refcount); +- +- p9_fcall_fini(&r->tc); +- p9_fcall_fini(&r->rc); +- kmem_cache_free(p9_req_cache, r); ++ return p9_req_put(c, r); + } + +-int p9_req_put(struct p9_req_t *r) ++int p9_req_put(struct p9_client *c, struct p9_req_t *r) + { +- return kref_put(&r->refcount, p9_req_free); ++ if (refcount_dec_and_test(&r->refcount)) { ++ p9_fcall_fini(c, &r->tc, 0); ++ p9_fcall_fini(c, &r->rc, 1); ++ kmem_cache_free(p9_req_cache, r); ++ return 1; ++ } ++ return 0; + } + EXPORT_SYMBOL(p9_req_put); + +@@ -426,7 +426,7 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) + + wake_up(&req->wq); + p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag); +- p9_req_put(req); ++ p9_req_put(c, req); + } + EXPORT_SYMBOL(p9_client_cb); + +@@ -709,7 +709,7 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, + reterr: + p9_tag_remove(c, req); + /* We have to put also the 2nd reference as it won't be used */ +- p9_req_put(req); ++ p9_req_put(c, req); + return ERR_PTR(err); + } + +@@ -746,7 +746,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) 
+ err = c->trans_mod->request(c, req); + if (err < 0) { + /* write won't happen */ +- p9_req_put(req); ++ p9_req_put(c, req); + if (err != -ERESTARTSYS && err != -EFAULT) + c->status = Disconnected; + goto recalc_sigpending; +@@ -1002,7 +1002,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) + char *client_id; + + err = 0; +- clnt = kmalloc(sizeof(*clnt), GFP_KERNEL); ++ clnt = kzalloc(sizeof(*clnt), GFP_KERNEL); + if (!clnt) + return ERR_PTR(-ENOMEM); + +@@ -1053,10 +1053,6 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) + goto close_trans; + } + +- err = p9_client_version(clnt); +- if (err) +- goto close_trans; +- + /* P9_HDRSZ + 4 is the smallest packet header we can have that is + * followed by data accessed from userspace by read + */ +@@ -1066,6 +1062,15 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) + clnt->msize - (P9_HDRSZ + 4), + NULL); + ++ err = mempool_init_slab_pool(&clnt->pools[0], 4, clnt->fcall_cache) ?: ++ mempool_init_slab_pool(&clnt->pools[1], 4, clnt->fcall_cache); ++ if (err) ++ goto close_trans; ++ ++ err = p9_client_version(clnt); ++ if (err) ++ goto close_trans; ++ + return clnt; + + close_trans: +@@ -1073,6 +1078,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) + put_trans: + v9fs_put_trans(clnt->trans_mod); + free_client: ++ mempool_exit(&clnt->pools[1]); ++ mempool_exit(&clnt->pools[0]); + kfree(clnt); + return ERR_PTR(err); + } +@@ -1097,6 +1104,8 @@ void p9_client_destroy(struct p9_client *clnt) + + p9_tag_cleanup(clnt); + ++ mempool_exit(&clnt->pools[1]); ++ mempool_exit(&clnt->pools[0]); + kmem_cache_destroy(clnt->fcall_cache); + kfree(clnt); + } +diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c +index 8f8f95e39b03..007c3f45fe05 100644 +--- a/net/9p/trans_fd.c ++++ b/net/9p/trans_fd.c +@@ -378,7 +378,7 @@ static void p9_read_work(struct work_struct *work) + m->rc.sdata = NULL; + m->rc.offset = 0; + m->rc.capacity = 0; +- p9_req_put(m->rreq); ++ p9_req_put(m->client, m->rreq); + m->rreq = NULL; + } + +@@ -492,7 +492,7 @@ static void p9_write_work(struct work_struct *work) + m->wpos += err; + if (m->wpos == m->wsize) { + m->wpos = m->wsize = 0; +- p9_req_put(m->wreq); ++ p9_req_put(m->client, m->wreq); + m->wreq = NULL; + } + +@@ -695,7 +695,7 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) + if (req->status == REQ_STATUS_UNSENT) { + list_del(&req->req_list); + req->status = REQ_STATUS_FLSHD; +- p9_req_put(req); ++ p9_req_put(client, req); + ret = 0; + } + spin_unlock(&client->lock); +@@ -722,7 +722,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) + list_del(&req->req_list); + req->status = REQ_STATUS_FLSHD; + spin_unlock(&client->lock); +- p9_req_put(req); ++ p9_req_put(client, req); + + return 0; + } +@@ -883,12 +883,12 @@ static void p9_conn_destroy(struct p9_conn *m) + p9_mux_poll_stop(m); + cancel_work_sync(&m->rq); + if (m->rreq) { +- p9_req_put(m->rreq); ++ p9_req_put(m->client, m->rreq); + m->rreq = NULL; + } + cancel_work_sync(&m->wq); + if (m->wreq) { +- p9_req_put(m->wreq); ++ p9_req_put(m->client, m->wreq); + m->wreq = NULL; + } + +diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c +index 88e563826674..99d878d70d56 100644 +--- a/net/9p/trans_rdma.c ++++ b/net/9p/trans_rdma.c +@@ -350,7 +350,7 @@ send_done(struct ib_cq *cq, struct ib_wc *wc) + c->busa, c->req->tc.size, + DMA_TO_DEVICE); + up(&rdma->sq_sem); +- p9_req_put(c->req); ++ p9_req_put(client, c->req); + kfree(c); + } + 
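/*
 * Editorial note (not part of the patch): the p9_fcall_init()/p9_fcall_fini()
 * hunks above move 9p message buffers from "kmalloc or fail" to "kmalloc,
 * then fall back to a per-client mempool", so a request can always make
 * forward progress under memory pressure. A minimal sketch of that pattern
 * with hypothetical names (struct msg_buf, buf_alloc, buf_free); only
 * kmalloc/kfree and mempool_alloc/mempool_free are real kernel APIs. The
 * pool's elements are assumed to be full-size (max message) buffers, as in
 * the patch:
 */
struct msg_buf {
	void	*data;
	bool	used_mempool;
};

static void buf_alloc(struct msg_buf *b, size_t size, mempool_t *pool)
{
	gfp_t gfp = GFP_NOFS | __GFP_NOWARN;

	b->used_mempool = false;
	b->data = kmalloc(size, gfp);		/* opportunistic allocation */

	if (!b->data) {				/* fall back to the reserved pool */
		b->data = mempool_alloc(pool, gfp);
		b->used_mempool = true;
	}
}

static void buf_free(struct msg_buf *b, mempool_t *pool)
{
	if (b->used_mempool)
		mempool_free(b->data, pool);
	else
		kfree(b->data);
}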
+@@ -431,7 +431,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) + if (unlikely(atomic_read(&rdma->excess_rc) > 0)) { + if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) { + /* Got one! */ +- p9_fcall_fini(&req->rc); ++ p9_fcall_fini(client, &req->rc, 1); + req->rc.sdata = NULL; + goto dont_need_post_recv; + } else { +diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c +index b24a4fb0f0a2..147972bf2e79 100644 +--- a/net/9p/trans_virtio.c ++++ b/net/9p/trans_virtio.c +@@ -199,7 +199,7 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req) + /* Reply won't come, so drop req ref */ + static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req) + { +- p9_req_put(req); ++ p9_req_put(client, req); + return 0; + } + +@@ -523,7 +523,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, + kvfree(out_pages); + if (!kicked) { + /* reply won't come */ +- p9_req_put(req); ++ p9_req_put(client, req); + } + return err; + } +diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c +index 833cd3792c51..227f89cc7237 100644 +--- a/net/9p/trans_xen.c ++++ b/net/9p/trans_xen.c +@@ -163,7 +163,7 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) + ring->intf->out_prod = prod; + spin_unlock_irqrestore(&ring->lock, flags); + notify_remote_via_irq(ring->irq); +- p9_req_put(p9_req); ++ p9_req_put(client, p9_req); + + return 0; + } +diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c +index 4d1a947367f9..a2097955dace 100644 +--- a/tools/testing/nvdimm/test/ndtest.c ++++ b/tools/testing/nvdimm/test/ndtest.c +@@ -12,7 +12,7 @@ + #include + #include + #include +-#include ++#include + + #include "../watermark.h" + #include "nfit_test.h" +@@ -740,32 +740,30 @@ static ssize_t flags_show(struct device *dev, + { + struct nvdimm *nvdimm = to_nvdimm(dev); + struct ndtest_dimm *dimm = nvdimm_provider_data(nvdimm); +- struct seq_buf s; ++ struct printbuf s = PRINTBUF_EXTERN(buf, PAGE_SIZE); + u64 flags; + + flags = dimm->flags; + +- seq_buf_init(&s, buf, PAGE_SIZE); + if (flags & PAPR_PMEM_UNARMED_MASK) +- seq_buf_printf(&s, "not_armed "); ++ prt_printf(&s, "not_armed "); + + if (flags & PAPR_PMEM_BAD_SHUTDOWN_MASK) +- seq_buf_printf(&s, "flush_fail "); ++ prt_printf(&s, "flush_fail "); + + if (flags & PAPR_PMEM_BAD_RESTORE_MASK) +- seq_buf_printf(&s, "restore_fail "); ++ prt_printf(&s, "restore_fail "); + + if (flags & PAPR_PMEM_SAVE_MASK) +- seq_buf_printf(&s, "save_fail "); ++ prt_printf(&s, "save_fail "); + + if (flags & PAPR_PMEM_SMART_EVENT_MASK) +- seq_buf_printf(&s, "smart_notify "); ++ prt_printf(&s, "smart_notify "); + ++ if (printbuf_written(&s)) ++ prt_printf(&s, "\n"); + +- if (seq_buf_used(&s)) +- seq_buf_printf(&s, "\n"); +- +- return seq_buf_used(&s); ++ return printbuf_written(&s); + } + static DEVICE_ATTR_RO(flags); + +-- +2.37.1 +
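/*
 * Editorial note (not part of the patch): the dump_unreclaimable_slab() and
 * shrinkers_to_text() hunks above share one bookkeeping pattern: keep a
 * fixed-size table of the N largest entries, sorted ascending, then report
 * it largest-first. A standalone sketch of that pattern in plain C; the
 * entry/top_add names and sample data are hypothetical, only the top-N
 * insertion logic mirrors the patch:
 */
#include <stdio.h>
#include <string.h>

#define NR_TOP 10
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

struct entry {
	const char	*name;
	unsigned long	size;
};

static struct entry top[NR_TOP];
static int nr;

static void top_add(struct entry n)
{
	int i;

	/* find insertion point: the table is kept sorted, smallest first */
	for (i = 0; i < nr; i++)
		if (n.size < top[i].size)
			break;

	if (nr < (int) ARRAY_SIZE(top)) {
		/* room left: shift the larger entries up */
		memmove(&top[i + 1], &top[i], sizeof(top[0]) * (nr - i));
		nr++;
	} else if (i) {
		/* table full: drop the current smallest entry */
		i--;
		memmove(&top[0], &top[1], sizeof(top[0]) * i);
	} else {
		return;		/* smaller than everything already tracked */
	}

	top[i] = n;
}

int main(void)
{
	struct entry samples[] = {
		{ "dentry",	 4096 }, { "inode_cache", 8192 },
		{ "kmalloc-64",	  512 }, { "buffer_head", 2048 },
	};
	int i;

	for (i = 0; i < (int) ARRAY_SIZE(samples); i++)
		top_add(samples[i]);

	for (i = nr - 1; i >= 0; i--)	/* largest first, as in the patch */
		printf("%-17s %lu\n", top[i].name, top[i].size);

	return 0;
}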